From 73d7b7edd24060e4f59ff2b11a156be36b497d75 Mon Sep 17 00:00:00 2001 From: "Barry E. Moore II" Date: Thu, 2 Jun 2016 19:03:34 -0400 Subject: [PATCH 001/359] BUG: df.to_string with formatters, header and index False --- pandas/formats/format.py | 2 +- pandas/tests/formats/test_printing.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 27d8b553013b9..f9682708b06a0 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -586,7 +586,7 @@ def to_string(self): self._chk_truncate() strcols = self._to_str_columns() text = self.adj.adjoin(1, *strcols) - if not self.index: + if not self.index and self.header: text = text.replace('\n ', '\n').strip() self.buf.writelines(text) diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index 3bcceca1f50a7..f93f37f27b2c5 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -9,6 +9,17 @@ _multiprocess_can_split_ = True +def test_to_string_formatters_index_header(): + from pandas import DataFrame + frame = DataFrame(data={0: 0, 1: 0}, index=[0]) + expected = ' 0 0' + + formatter = lambda x: '{:4d}'.format(x) + + string = frame.to_string(formatters=[formatter, formatter], index=False, + header=False) + assert(string == expected) + def test_adjoin(): data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] expected = 'a dd ggg\nb ee hhh\nc ff iii' From 0c6226cbbc319ec22cf4c957bdcc055eaa7aea99 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 2 Jun 2016 19:15:18 -0400 Subject: [PATCH 002/359] ENH: Add support for compact_ints and use_unsigned in Python engine Title is self-explanatory. xref #12686 - I don't quite understand why these are marked (if at all) as internal to the C engine only, as the benefits for having these options accepted for the Python engine is quite clear based on the documentation I added as well. Implementation simply just calls the already-written function in `pandas/parsers.pyx` - as it isn't specific to the `TextReader` class, crossing over to grab this function from Cython (instead of duplicating in pure Python) seems reasonable while maintaining that separation between the C and Python engines. Author: gfyoung Closes #13323 from gfyoung/python-engine-compact-ints and squashes the following commits: 95f7ba8 [gfyoung] ENH: Add support for compact_ints and use_unsigned in Python engine --- doc/source/io.rst | 11 +++ doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 35 ++++++++- pandas/io/tests/parser/c_parser_only.py | 46 ++++-------- pandas/io/tests/parser/common.py | 43 +++++++++++ pandas/io/tests/parser/test_unsupported.py | 21 ++++++ pandas/parser.pyx | 72 +----------------- pandas/src/inference.pyx | 85 ++++++++++++++++++++++ pandas/tests/test_infer_and_convert.py | 36 +++++++++ 9 files changed, 246 insertions(+), 104 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6cf41bbc50fb5..4eb42e1fb918d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -176,6 +176,17 @@ low_memory : boolean, default ``True`` Note that the entire file is read into a single DataFrame regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. 
(Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the + parser will attempt to cast it as the smallest integer ``dtype`` possible, either + signed or unsigned depending on the specification from the ``use_unsigned`` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether + the column should be compacted to the smallest signed or unsigned integer dtype. NA and Missing Data Handling ++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 950bf397f43b5..b87cdd91aa464 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -292,6 +292,7 @@ Other API changes Deprecations ^^^^^^^^^^^^ +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bba8ad3ccd72b..2c8726f588522 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -227,6 +227,20 @@ Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If compact_ints is True, then for any column that is of integer dtype, + the parser will attempt to cast it as the smallest integer dtype possible, + either signed or unsigned depending on the specification from the + `use_unsigned` parameter. + +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. `compact_ints=True`), specify + whether the column should be compacted to the smallest signed or unsigned + integer dtype. 
Returns ------- @@ -425,8 +439,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ 'as_recarray', - 'compact_ints', - 'use_unsigned', 'low_memory', 'memory_map', 'buffer_lines', @@ -435,6 +447,10 @@ def _read(filepath_or_buffer, kwds): 'dtype', 'float_precision', ]) +_deprecated_args = set([ + 'compact_ints', + 'use_unsigned', +]) def _make_parser_function(name, sep=','): @@ -789,6 +805,12 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + for arg in _deprecated_args: + if result[arg] != _c_parser_defaults[arg]: + warnings.warn("The '{arg}' argument has been deprecated " + "and will be removed in a future version" + .format(arg=arg), FutureWarning, stacklevel=2) + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): @@ -1206,6 +1228,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) + + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: + cvals = lib.downcast_int64( + cvals, _parser.na_values, + self.use_unsigned) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) @@ -1648,8 +1676,11 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.compact_ints = kwds['compact_ints'] + self.use_unsigned = kwds['use_unsigned'] self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] + self.comment = kwds['comment'] self._comment_lines = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 7fca37cef473e..b7ef754004e18 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -172,28 +172,8 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints(self): - if compat.is_platform_windows() and not self.low_memory: - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows() and self.low_memory: + if compat.is_platform_windows(): raise nose.SkipTest( "segfaults on win-64, only when all tests are run") @@ -201,16 +181,20 @@ def test_compact_ints_as_recarray(self): '1,1,0,0\n' '0,1,0,1') - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = 
self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) def test_pass_dtype(self): data = """\ diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 44892dc17c47b..f8c7241fdf88a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1330,3 +1330,46 @@ def test_raise_on_no_columns(self): # test with more than a single newline data = "\n\n\n" self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + def test_compact_ints_use_unsigned(self): + # see gh-13323 + data = 'a,b,c\n1,9,258' + + # sanity check + expected = DataFrame({ + 'a': np.array([1], dtype=np.int64), + 'b': np.array([9], dtype=np.int64), + 'c': np.array([258], dtype=np.int64), + }) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.int8), + 'b': np.array([9], dtype=np.int8), + 'c': np.array([258], dtype=np.int16), + }) + + # default behaviour for 'use_unsigned' + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True) + tm.assert_frame_equal(out, expected) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=False) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.uint8), + 'b': np.array([9], dtype=np.uint8), + 'c': np.array([258], dtype=np.uint16), + }) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=True) + tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 3c1c45831e7b4..e820924d2be8b 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -117,6 +117,27 @@ def test_python_engine(self): with tm.assertRaisesRegexp(ValueError, msg): read_csv(StringIO(data), engine=engine, **kwargs) + +class TestDeprecatedFeatures(tm.TestCase): + def test_deprecated_args(self): + data = '1,2,3' + + # deprecated arguments with non-default values + deprecated = { + 'compact_ints': True, + 'use_unsigned': True, + } + + engines = 'c', 'python' + + for engine in engines: + for arg, non_default_val in deprecated.items(): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + kwargs = {arg: non_default_val} + read_csv(StringIO(data), engine=engine, + **kwargs) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 729e5af528b80..d7ddaee658fe7 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1018,7 +1018,7 @@ cdef class TextReader: col_res = _maybe_upcast(col_res) if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = downcast_int64(col_res, self.use_unsigned) + col_res = lib.downcast_int64(col_res, na_values, 
self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1866,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser): raise CParserError(message) -def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0): - cdef: - Py_ssize_t i, n = len(arr) - int64_t mx = INT64_MIN + 1, mn = INT64_MAX - int64_t NA = na_values[np.int64] - int64_t val - ndarray[uint8_t] mask - int na_count = 0 - - _mask = np.empty(n, dtype=bool) - mask = _mask.view(np.uint8) - - for i in range(n): - val = arr[i] - - if val == NA: - mask[i] = 1 - na_count += 1 - continue - - # not NA - mask[i] = 0 - - if val > mx: - mx = val - - if val < mn: - mn = val - - if mn >= 0 and use_unsigned: - if mx <= UINT8_MAX - 1: - result = arr.astype(np.uint8) - if na_count: - np.putmask(result, _mask, na_values[np.uint8]) - return result - - if mx <= UINT16_MAX - 1: - result = arr.astype(np.uint16) - if na_count: - np.putmask(result, _mask, na_values[np.uint16]) - return result - - if mx <= UINT32_MAX - 1: - result = arr.astype(np.uint32) - if na_count: - np.putmask(result, _mask, na_values[np.uint32]) - return result - - else: - if mn >= INT8_MIN + 1 and mx <= INT8_MAX: - result = arr.astype(np.int8) - if na_count: - np.putmask(result, _mask, na_values[np.int8]) - return result - - if mn >= INT16_MIN + 1 and mx <= INT16_MAX: - result = arr.astype(np.int16) - if na_count: - np.putmask(result, _mask, na_values[np.int16]) - return result - - if mn >= INT32_MIN + 1 and mx <= INT32_MAX: - result = arr.astype(np.int32) - if na_count: - np.putmask(result, _mask, na_values[np.int32]) - return result - - return arr - - def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5f7c5478b5d87..262e036ff44f1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -6,6 +6,20 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + # core.common import for fast inference checks def is_float(object obj): return util.is_float_object(obj) @@ -1240,3 +1254,74 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +def downcast_int64(ndarray[int64_t] arr, object na_values, + bint use_unsigned=0): + cdef: + Py_ssize_t i, n = len(arr) + int64_t mx = INT64_MIN + 1, mn = INT64_MAX + int64_t NA = na_values[np.int64] + int64_t val + ndarray[uint8_t] mask + int na_count = 0 + + _mask = np.empty(n, dtype=bool) + mask = _mask.view(np.uint8) + + for i in range(n): + val = arr[i] + + if val == NA: + mask[i] = 1 + na_count += 1 + continue + + # not NA + mask[i] = 0 + + if val > mx: + mx = val + + if val < mn: + mn = val + + if mn >= 0 and use_unsigned: + if mx <= UINT8_MAX - 1: + result = arr.astype(np.uint8) + if na_count: + np.putmask(result, _mask, na_values[np.uint8]) + return result + + if mx <= UINT16_MAX - 1: + result = arr.astype(np.uint16) + if na_count: + np.putmask(result, _mask, na_values[np.uint16]) + return result + + if mx <= UINT32_MAX - 1: + result = arr.astype(np.uint32) + if na_count: + np.putmask(result, _mask, na_values[np.uint32]) + return result + + else: + if mn >= INT8_MIN + 1 and mx <= INT8_MAX: + result = arr.astype(np.int8) + if na_count: + 
np.putmask(result, _mask, na_values[np.int8]) + return result + + if mn >= INT16_MIN + 1 and mx <= INT16_MAX: + result = arr.astype(np.int16) + if na_count: + np.putmask(result, _mask, na_values[np.int16]) + return result + + if mn >= INT32_MIN + 1 and mx <= INT32_MAX: + result = arr.astype(np.int32) + if na_count: + np.putmask(result, _mask, na_values[np.int32]) + return result + + return arr diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index 68eac12e5ec4c..a6941369b35be 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -401,6 +401,42 @@ def test_convert_sql_column_decimals(self): expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') self.assert_numpy_array_equal(result, expected) + def test_convert_downcast_int64(self): + from pandas.parser import na_values + + arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) + + # default argument + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + result = lib.downcast_int64(arr, na_values, use_unsigned=False) + self.assert_numpy_array_equal(result, expected) + + expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + # still cast to int8 despite use_unsigned=True + # because of the negative number as an element + arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + int8_na = na_values[np.int8] + int64_na = na_values[np.int64] + arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) + expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + if __name__ == '__main__': import nose From 69e72b046ca23402218911cedc2bb609d8979e87 Mon Sep 17 00:00:00 2001 From: "Barry E. 
Moore II" Date: Thu, 2 Jun 2016 20:59:54 -0400 Subject: [PATCH 003/359] BUG: Fix issue #13032, annotate test --- pandas/formats/format.py | 2 -- pandas/tests/formats/test_printing.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index f9682708b06a0..b9e2cbad272da 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -586,8 +586,6 @@ def to_string(self): self._chk_truncate() strcols = self._to_str_columns() text = self.adj.adjoin(1, *strcols) - if not self.index and self.header: - text = text.replace('\n ', '\n').strip() self.buf.writelines(text) if self.should_show_dimensions: diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index f93f37f27b2c5..880f7413544dd 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import nose from pandas import compat +from pandas import DataFrame import pandas.formats.printing as printing import pandas.formats.format as fmt import pandas.util.testing as tm @@ -8,9 +9,8 @@ _multiprocess_can_split_ = True - +# Added due to issue #13032 as part of PR #13350 def test_to_string_formatters_index_header(): - from pandas import DataFrame frame = DataFrame(data={0: 0, 1: 0}, index=[0]) expected = ' 0 0' From 30719ef90feb1e38b775463115394f022cfdf59e Mon Sep 17 00:00:00 2001 From: "Barry E. Moore II" Date: Thu, 2 Jun 2016 23:31:32 -0400 Subject: [PATCH 004/359] BUG: spacing issue complete --- pandas/formats/format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index b9e2cbad272da..accf597ef6455 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -706,7 +706,7 @@ def space_format(x, y): fmt_columns = columns.format() dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [[' ' + x if not self._get_formatter(i) and + str_columns = [[x if not self._get_formatter(i) and need_leadsp[x] else x] for i, (col, x) in enumerate(zip(columns, fmt_columns))] @@ -2206,7 +2206,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or (lambda x: '% d' % x) + formatter = self.formatter or (lambda x: '%d' % x) fmt_values = [formatter(x) for x in self.values] return fmt_values From 49d1013ec2c2c909d77e2ca3548d85b68fef37d1 Mon Sep 17 00:00:00 2001 From: "Barry E. 
Moore II" Date: Fri, 3 Jun 2016 00:53:41 -0400 Subject: [PATCH 005/359] BUG: hunt down remaining leading whitespace --- pandas/formats/format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index accf597ef6455..6ede3768fcef7 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2060,11 +2060,11 @@ def _format(x): fmt_values = [] for i, v in enumerate(vals): if not is_float[i] and leading_space: - fmt_values.append(' %s' % _format(v)) + fmt_values.append('%s' % _format(v)) elif is_float[i]: fmt_values.append(float_format(v)) else: - fmt_values.append(' %s' % _format(v)) + fmt_values.append('%s' % _format(v)) return fmt_values From 2061e9e5fbbd890c484b53232b0747e08d7d1739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Fri, 3 Jun 2016 11:00:50 -0400 Subject: [PATCH 006/359] BUG: Fix series comparison operators when dealing with zero rank numpy arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #13006 Author: Gábor Lipták Closes #13307 from gliptak/seriescomp1 and squashes the following commits: 4967db4 [Gábor Lipták] Fix series comparison operators when dealing with zero rank numpy arrays --- doc/source/whatsnew/v0.18.2.txt | 3 ++- pandas/core/ops.py | 5 ++++- pandas/tests/series/test_operators.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b87cdd91aa464..2f6afa8ed2ad0 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -101,7 +101,7 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) - Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) .. 
_whatsnew_0182.api.tolist: @@ -368,6 +368,7 @@ Bug Fixes - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) +- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index d1bb67fa0bc13..f27a83f50e115 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -754,7 +754,10 @@ def wrapper(self, other, axis=None): elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented elif isinstance(other, (np.ndarray, pd.Index)): - if len(self) != len(other): + # do not check length of zerodim array + # as it will broadcast + if (not lib.isscalar(lib.item_from_zerodim(other)) and + len(self) != len(other)): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 3588faa8b42f1..1e23c87fdb4ca 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -264,6 +264,18 @@ def test_operators_timedelta64(self): rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1)) self.assertEqual(rs[2], value) + def test_operator_series_comparison_zerorank(self): + # GH 13006 + result = np.float64(0) > pd.Series([1, 2, 3]) + expected = 0.0 > pd.Series([1, 2, 3]) + self.assert_series_equal(result, expected) + result = pd.Series([1, 2, 3]) < np.float64(0) + expected = pd.Series([1, 2, 3]) < 0.0 + self.assert_series_equal(result, expected) + result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2]) + expected = 0.0 > pd.Series([1, 2, 3]) + self.assert_series_equal(result, expected) + def test_timedeltas_with_DateOffset(self): # GH 4532 From 103f7d31e1b850e532ed85a4b53ef222d1271c54 Mon Sep 17 00:00:00 2001 From: Chris Warth Date: Fri, 3 Jun 2016 16:57:24 -0400 Subject: [PATCH 007/359] DOC: Add example usage to DataFrame.filter Author: Chris Warth Closes #12399 from cswarth/doc/df_filter and squashes the following commits: f48e9ff [Chris Warth] DOC: Add example usage to DataFrame.filter --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/generic.py | 56 ++++++++++++++++--- .../tests/frame/test_axis_select_reindex.py | 16 ++++++ 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 2f6afa8ed2ad0..7493150370e9f 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -286,6 +286,7 @@ Other API changes - ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) - ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) .. 
_whatsnew_0182.deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ecaaebc2b523..0852c5a293f4e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2357,7 +2357,11 @@ def _reindex_axis(self, new_index, fill_method, axis, copy): def filter(self, items=None, like=None, regex=None, axis=None): """ - Restrict the info axis to set of items or wildcard + Subset rows or columns of dataframe according to labels in + the specified index. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. Parameters ---------- @@ -2367,19 +2371,57 @@ def filter(self, items=None, like=None, regex=None, axis=None): Keep info axis where "arg in col == True" regex : string (regular expression) Keep info axis with re.search(regex, col) == True - axis : int or None - The axis to filter on. By default this is the info axis. The "info - axis" is the axis that is used when indexing with ``[]``. For - example, ``df = DataFrame({'a': [1, 2, 3, 4]]}); df['a']``. So, - the ``DataFrame`` columns are the info axis. + axis : int or string axis name + The axis to filter on. By default this is the info axis, + 'index' for Series, 'columns' for DataFrame + + Returns + ------- + same type as input object + + Examples + -------- + >>> df + one two three + mouse 1 2 3 + rabbit 4 5 6 + + >>> # select columns by name + >>> df.filter(items=['one', 'three']) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select columns by regular expression + >>> df.filter(regex='e$', axis=1) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select rows containing 'bbi' + >>> df.filter(like='bbi', axis=0) + one two three + rabbit 4 5 6 + + See Also + -------- + pandas.DataFrame.select Notes ----- - Arguments are mutually exclusive, but this is not checked for + The ``items``, ``like``, and ``regex`` parameters are + enforced to be mutually exclusive. + ``axis`` defaults to the info axis that is used when indexing + with ``[]``. 
""" import re + nkw = sum([x is not None for x in [items, like, regex]]) + if nkw > 1: + raise TypeError('Keyword arguments `items`, `like`, or `regex` ' + 'are mutually exclusive') + if axis is None: axis = self._info_axis_name axis_name = self._get_axis_name(axis) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 07fe28f13b7d0..9da1b31d259c5 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -661,8 +661,24 @@ def test_filter(self): assert_frame_equal(filtered, expected) # pass in None + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter() with assertRaisesRegexp(TypeError, 'Must pass'): self.frame.filter(items=None) + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter(axis=1) + + # test mutually exclusive arguments + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$', like='bbi') + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$', axis=1) + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$') + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], like='bbi', axis=0) + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], like='bbi') # objects filtered = self.mixed_frame.filter(like='foo') From faf9b7d3218bc25068692ebc273f4c6942382a84 Mon Sep 17 00:00:00 2001 From: babakkeyvani Date: Sun, 5 Jun 2016 09:50:35 -0400 Subject: [PATCH 008/359] DOC: Fixed a minor typo Author: babakkeyvani Closes #13366 from bkeyvani/master and squashes the following commits: 029ade7 [babakkeyvani] DOC: Fixed a minor typo --- doc/README.rst | 2 +- doc/source/contributing.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/README.rst b/doc/README.rst index 06d95e6b9c44d..a93ad32a4c8f8 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -160,7 +160,7 @@ and `Good as first PR `_ where you could start out. -Or maybe you have an idea of you own, by using pandas, looking for something +Or maybe you have an idea of your own, by using pandas, looking for something in the documentation and thinking 'this can be improved', let's do something about that! diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index e64ff4c155132..a9b86925666b7 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -21,7 +21,7 @@ and `Difficulty Novice `_ where you could start out. -Or maybe through using *pandas* you have an idea of you own or are looking for something +Or maybe through using *pandas* you have an idea of your own or are looking for something in the documentation and thinking 'this can be improved'...you can do something about it! From eca7891c5e6bf1ea8fd1460ab6be171769616a73 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 5 Jun 2016 09:54:49 -0400 Subject: [PATCH 009/359] DOC: document doublequote in read_csv Title is self-explanatory. 
Author: gfyoung Closes #13368 from gfyoung/doublequote-doc and squashes the following commits: f3e01fc [gfyoung] DOC: document doublequote in read_csv --- doc/source/io.rst | 4 ++++ pandas/io/parsers.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4eb42e1fb918d..79867d33c5838 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -273,6 +273,10 @@ quoting : int or ``csv.QUOTE_*`` instance, default ``None`` ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL`` behavior. +doublequote : boolean, default ``True`` + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, + indicate whether or not to interpret two consecutive ``quotechar`` elements + **inside** a field as a single ``quotechar`` element. escapechar : str (length 1), default ``None`` One-character string used to escape delimiter when quoting is ``QUOTE_NONE``. comment : str, default ``None`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2c8726f588522..150e5ba5e1521 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -192,6 +192,10 @@ Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior. +doublequote : boolean, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. escapechar : str (length 1), default None One-character string used to escape delimiter when quoting is QUOTE_NONE. comment : str, default None From 863cbc571b17a1734d813a45201b8158643ce3e2 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 5 Jun 2016 09:56:35 -0400 Subject: [PATCH 010/359] DEPR, DOC: Deprecate buffer_lines in read_csv `buffer_lines` is not respected, as it is determined internally via a heuristic involving `table_width` (see here for how it is computed). Author: gfyoung Closes #13360 from gfyoung/buffer-lines-depr-doc and squashes the following commits: a72ecbe [gfyoung] DEPR, DOC: Deprecate buffer_lines in read_csv --- doc/source/io.rst | 6 ++++++ doc/source/whatsnew/v0.18.2.txt | 3 ++- pandas/io/parsers.py | 11 +++++++++-- pandas/io/tests/parser/test_parsers.py | 2 -- pandas/io/tests/parser/test_unsupported.py | 5 +++++ 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 79867d33c5838..f559c3cb3ebaf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -176,6 +176,12 @@ low_memory : boolean, default ``True`` Note that the entire file is read into a single DataFrame regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser + + If ``low_memory`` is ``True``, specify the number of rows to be read for + each chunk. 
(Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 7493150370e9f..2f841fa6b6e18 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -293,7 +293,8 @@ Other API changes Deprecations ^^^^^^^^^^^^ -- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`) +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) +- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 150e5ba5e1521..a851a5f48f5e6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -231,6 +231,12 @@ Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser + + If low_memory is True, specify the number of rows to be read for each + chunk. (Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version @@ -238,7 +244,6 @@ the parser will attempt to cast it as the smallest integer dtype possible, either signed or unsigned depending on the specification from the `use_unsigned` parameter. - use_unsigned : boolean, default False DEPRECATED: this argument will be removed in a future version @@ -452,6 +457,7 @@ def _read(filepath_or_buffer, kwds): 'float_precision', ]) _deprecated_args = set([ + 'buffer_lines', 'compact_ints', 'use_unsigned', ]) @@ -810,7 +816,8 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) for arg in _deprecated_args: - if result[arg] != _c_parser_defaults[arg]: + parser_default = _c_parser_defaults[arg] + if result.get(arg, parser_default) != parser_default: warnings.warn("The '{arg}' argument has been deprecated " "and will be removed in a future version" .format(arg=arg), FutureWarning, stacklevel=2) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index ea8ce9b616f36..fda7b28769647 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -72,14 +72,12 @@ def read_csv(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine kwds['low_memory'] = self.low_memory - kwds['buffer_lines'] = 2 return read_csv(*args, **kwds) def read_table(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine kwds['low_memory'] = True - kwds['buffer_lines'] = 2 return read_table(*args, **kwds) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index e820924d2be8b..97862ffa90cef 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -124,6 +124,7 @@ def test_deprecated_args(self): # deprecated arguments with non-default values deprecated = { + 'buffer_lines': True, 'compact_ints': True, 'use_unsigned': True, } @@ -132,6 +133,10 @@ def test_deprecated_args(self): for engine in engines: for arg, non_default_val in deprecated.items(): + if engine == 'python' 
and arg == 'buffer_lines': + # unsupported --> exception is raised first + continue + with tm.assert_produces_warning( FutureWarning, check_stacklevel=False): kwargs = {arg: non_default_val} From 5a9b498e43a41744470732438e9422a407b0b380 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Sun, 5 Jun 2016 10:04:11 -0400 Subject: [PATCH 011/359] BUG: Make pd.read_hdf('data.h5') work when pandas object stored contained categorical columns closes #13231 Author: Christian Hudon Closes #13359 from chrish42/gh13231 and squashes the following commits: e839638 [Christian Hudon] Raise a better exception when the HDF file is empty and kwy=None. 611aa28 [Christian Hudon] Formatting fixes. e7c8313 [Christian Hudon] Add changelog entry. df10016 [Christian Hudon] Make logic that detects if there is only one dataset in a HDF5 file work when storing a dataframe that contains categorical data. 2f41aef [Christian Hudon] Tweak comment to be clearer. b3a5773 [Christian Hudon] Add test that fails for GitHub bug #13231 02f90d5 [Christian Hudon] Use if-expression. --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/pytables.py | 33 +++++++++++++++++++++++++++----- pandas/io/tests/test_pytables.py | 25 ++++++++++++++++++++++-- 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 2f841fa6b6e18..93aedce07da9d 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -342,6 +342,7 @@ Bug Fixes - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) - Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`) +- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fcf5125d956c6..cbe04349b5105 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -331,11 +331,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): try: if key is None: - keys = store.keys() - if len(keys) != 1: - raise ValueError('key must be provided when HDF file contains ' - 'multiple datasets.') - key = keys[0] + groups = store.groups() + if len(groups) == 0: + raise ValueError('No dataset in HDF5 file.') + candidate_only_group = groups[0] + + # For the HDF file to have only one dataset, all other groups + # should then be metadata groups for that candidate group. (This + # assumes that the groups() method enumerates parent groups + # before their children.) 
+ for group_to_check in groups[1:]: + if not _is_metadata_of(group_to_check, candidate_only_group): + raise ValueError('key must be provided when HDF5 file ' + 'contains multiple datasets.') + key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except: # if there is an error, close the store @@ -347,6 +356,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): raise +def _is_metadata_of(group, parent_group): + """Check if a given group is a metadata group for a given parent_group.""" + if group._v_depth <= parent_group._v_depth: + return False + + current = group + while current._v_depth > 1: + parent = current._v_parent + if parent == parent_group and current._v_name == 'meta': + return True + current = current._v_parent + return False + + class HDFStore(StringMixin): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 96b66265ea586..9c13162bd774c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -46,8 +46,8 @@ from distutils.version import LooseVersion -_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ - and 'blosc' or 'zlib' +_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' + else 'zlib') _multiprocess_can_split_ = False @@ -4877,6 +4877,9 @@ def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), columns=list('ABCDE')) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a') reread = read_hdf(path) @@ -4884,6 +4887,24 @@ def test_read_nokey(self): df.to_hdf(path, 'df2', mode='a') self.assertRaises(ValueError, read_hdf, path) + def test_read_nokey_table(self): + # GH13231 + df = DataFrame({'i': range(5), + 'c': Series(list('abacd'), dtype='category')}) + + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='a', format='table') + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, 'df2', mode='a', format='table') + self.assertRaises(ValueError, read_hdf, path) + + def test_read_nokey_empty(self): + with ensure_clean_path(self.path) as path: + store = HDFStore(path) + store.close() + self.assertRaises(ValueError, read_hdf, path) + def test_read_from_pathlib_path(self): # GH11773 From e90d411714e7deac73e3e6b763ba9dccd3549871 Mon Sep 17 00:00:00 2001 From: Stewart Henderson Date: Sun, 5 Jun 2016 13:06:10 -0500 Subject: [PATCH 012/359] DOC: remove obsolete cron job script (#13369) * Typo correction * removed deprecated script --- ci/cron/go_doc.sh | 99 ----------------------------------------------- 1 file changed, 99 deletions(-) delete mode 100755 ci/cron/go_doc.sh diff --git a/ci/cron/go_doc.sh b/ci/cron/go_doc.sh deleted file mode 100755 index 89659577d0e7f..0000000000000 --- a/ci/cron/go_doc.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# This is a one-command cron job for setting up -# a virtualenv-based, linux-based, py2-based environment -# for building the Pandas documentation. -# -# The first run will install all required deps from pypi -# into the venv including monsters like scipy. -# You may want to set it up yourself to speed up the -# process. -# -# This is meant to be run as a cron job under a dedicated -# user account whose HOME directory contains this script. -# a CI directory will be created under it and all files -# stored within it. 
-# -# The hardcoded dep versions will gradually become obsolete -# You may need to tweak them -# -# @y-p, Jan/2014 - -# disto latex is sometimes finicky. Optionall use -# a local texlive install -export PATH=/mnt/debian/texlive/2013/bin/x86_64-linux:$PATH - -# Having ccache will speed things up -export PATH=/usr/lib64/ccache/:$PATH - -# limit disk usage -ccache -M 200M - -BASEDIR="$HOME/CI" -REPO_URL="https://github.com/pydata/pandas" -REPO_LOC="$BASEDIR/pandas" - -if [ ! -d $BASEDIR ]; then - mkdir -p $BASEDIR - virtualenv $BASEDIR/venv -fi - -source $BASEDIR/venv/bin/activate - -pip install numpy==1.7.2 -pip install cython==0.20.0 -pip install python-dateutil==2.2 -pip install --pre pytz==2013.9 -pip install sphinx==1.1.3 -pip install numexpr==2.2.2 - -pip install matplotlib==1.3.0 -pip install lxml==3.2.5 -pip install beautifulsoup4==4.3.2 -pip install html5lib==0.99 - -# You'll need R as well -pip install rpy2==2.3.9 - -pip install tables==3.0.0 -pip install bottleneck==0.7.0 -pip install ipython==0.13.2 - -# only if you have too -pip install scipy==0.13.2 - -pip install openpyxl==1.6.2 -pip install xlrd==0.9.2 -pip install xlwt==0.7.5 -pip install xlsxwriter==0.5.1 -pip install sqlalchemy==0.8.3 - -if [ ! -d "$REPO_LOC" ]; then - git clone "$REPO_URL" "$REPO_LOC" -fi - -cd "$REPO_LOC" -git reset --hard -git clean -df -git checkout master -git pull origin -make - -source $BASEDIR/venv/bin/activate -export PATH="/usr/lib64/ccache/:$PATH" -pip uninstall pandas -yq -pip install "$REPO_LOC" - -cd "$REPO_LOC"/doc - -python make.py clean -python make.py html -if [ ! $? == 0 ]; then - exit 1 -fi -python make.py zip_html -# usually requires manual intervention -# python make.py latex - -# If you have access: -# python make.py upload_dev From b722222f5ea760a3f3df4d063309949eb4956674 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 5 Jun 2016 17:56:40 -0400 Subject: [PATCH 013/359] CLN: remove old skiplist code Author: Jeff Reback Closes #13372 from jreback/skiplist and squashes the following commits: e05ea24 [Jeff Reback] CLN: remove old skiplist code --- pandas/algos.pyx | 44 -------------------------------------------- 1 file changed, 44 deletions(-) diff --git a/pandas/algos.pyx b/pandas/algos.pyx index a31b35ba4afc6..7884d9c41845c 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1505,52 +1505,8 @@ def roll_kurt(ndarray[double_t] input, #------------------------------------------------------------------------------- # Rolling median, min, max -ctypedef double_t (* skiplist_f)(object sl, int n, int p) - -cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): - cdef ndarray[double_t] input = arg - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = op(skiplist, nobs, minp) - - return output - from skiplist cimport * - @cython.boundscheck(False) @cython.wraparound(False) def roll_median_c(ndarray[float64_t] arg, int win, int minp): From 600a7dc666fd3adca79f8d59924a4670409a981e Mon Sep 17 
00:00:00 2001 From: gfyoung Date: Mon, 6 Jun 2016 08:08:51 -0400 Subject: [PATCH 014/359] DOC: actually document float_precision in read_csv So I wasn't 100% correct when I said that `float_precision` was documented here. It was well documented internally for `TextParser` and in a section for `io.rst`, but it wasn't listed formally in the parameters for the `read_csv` documentation. Author: gfyoung Closes #13377 from gfyoung/float-precision-doc and squashes the following commits: a9eed16 [gfyoung] DOC: actually document float_precision in read_csv --- doc/source/io.rst | 4 ++++ pandas/io/parsers.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index f559c3cb3ebaf..6aa2df3549914 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -269,6 +269,10 @@ thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` Character to recognize as decimal point. E.g. use ``','`` for European data. +float_precision : string, default None + Specifies which converter the C engine should use for floating-point values. + The options are ``None`` for the ordinary converter, ``high`` for the + high-precision converter, and ``round_trip`` for the round-trip converter. lineterminator : str (length 1), default ``None`` Character to break file into lines. Only valid with C parser. quotechar : str (length 1) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a851a5f48f5e6..04b488aff5c0c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -183,6 +183,11 @@ Thousands separator decimal : str, default '.' Character to recognize as decimal point (e.g. use ',' for European data). +float_precision : string, default None + Specifies which converter the C engine should use for floating-point + values. The options are `None` for the ordinary converter, + `high` for the high-precision converter, and `round_trip` for the + round-trip converter. lineterminator : str (length 1), default None Character to break file into lines. Only valid with C parser. quotechar : str (length 1), optional From 6edf4471d1e55ce6b587a7e37dd540d787716413 Mon Sep 17 00:00:00 2001 From: Mike Graham Date: Mon, 6 Jun 2016 08:10:10 -0400 Subject: [PATCH 015/359] DOC: Fix wording/grammar for rolling's win_type argument. Author: Mike Graham Closes #13376 from mikegraham/master and squashes the following commits: ec0c88e [Mike Graham] DOC: Fix wording/grammar for rolling's win_type argument. --- pandas/core/window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index cd66d4e30c351..bf3fd69c6340b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -280,7 +280,7 @@ class Window(_Window): center : boolean, default False Set the labels at the center of the window. win_type : string, default None - prove a window type, see the notes below + Provide a window type. See the notes below. 
axis : int, default 0 Returns From 27448d9dca193af9c2280b8efb2d7df3813a9d33 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 6 Jun 2016 10:33:23 -0400 Subject: [PATCH 016/359] CLN: extract window functions from algox.pyx and create window.pyx (#13380) --- pandas/algos.pyx | 939 +---------------------------------------- pandas/core/window.py | 50 +-- pandas/lib.pxd | 2 + pandas/src/util.pxd | 14 + pandas/window.pyx | 954 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 31 +- 6 files changed, 1009 insertions(+), 981 deletions(-) create mode 100644 pandas/window.pyx diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 7884d9c41845c..f1fd0204e2fd2 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -31,38 +31,17 @@ float16 = np.dtype(np.float16) float32 = np.dtype(np.float32) float64 = np.dtype(np.float64) -cdef np.int8_t MINint8 = np.iinfo(np.int8).min -cdef np.int16_t MINint16 = np.iinfo(np.int16).min -cdef np.int32_t MINint32 = np.iinfo(np.int32).min -cdef np.int64_t MINint64 = np.iinfo(np.int64).min -cdef np.float16_t MINfloat16 = np.NINF -cdef np.float32_t MINfloat32 = np.NINF -cdef np.float64_t MINfloat64 = np.NINF - -cdef np.int8_t MAXint8 = np.iinfo(np.int8).max -cdef np.int16_t MAXint16 = np.iinfo(np.int16).max -cdef np.int32_t MAXint32 = np.iinfo(np.int32).max -cdef np.int64_t MAXint64 = np.iinfo(np.int64).max -cdef np.float16_t MAXfloat16 = np.inf -cdef np.float32_t MAXfloat32 = np.inf -cdef np.float64_t MAXfloat64 = np.inf - cdef double NaN = np.NaN cdef double nan = NaN -cdef inline int int_max(int a, int b): return a if a >= b else b -cdef inline int int_min(int a, int b): return a if a <= b else b - - cdef extern from "src/headers/math.h": double sqrt(double x) nogil double fabs(double) nogil - int signbit(double) nogil -from pandas import lib - -include "skiplist.pyx" +# this is our util.pxd +from util cimport numeric +from pandas import lib cdef: int TIEBREAK_AVERAGE = 0 @@ -720,57 +699,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', # return result -# Cython implementations of rolling sum, mean, variance, skewness, -# other statistical moment functions -# -# Misc implementation notes -# ------------------------- -# -# - In Cython x * x is faster than x ** 2 for C types, this should be -# periodically revisited to see if it's still true. -# -# - - -def _check_minp(win, minp, N, floor=1): - if minp > win: - raise ValueError('min_periods (%d) must be <= window (%d)' - % (minp, win)) - elif minp > N: - minp = N + 1 - elif minp < 0: - raise ValueError('min_periods must be >= 0') - return max(minp, floor) - -# original C implementation by N. Devillard. -# This code in public domain. -# Function : kth_smallest() -# In : array of elements, # of elements in the array, rank k -# Out : one element -# Job : find the kth smallest element in the array - -# Reference: - -# Author: Wirth, Niklaus -# Title: Algorithms + data structures = programs -# Publisher: Englewood Cliffs: Prentice-Hall, 1976 -# Physical description: 366 p. 
-# Series: Prentice-Hall Series in Automatic Computation - - -ctypedef fused numeric: - int8_t - int16_t - int32_t - int64_t - - uint8_t - uint16_t - uint32_t - uint64_t - - float32_t - float64_t - cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1: cdef numeric t @@ -894,263 +822,6 @@ def min_subseq(ndarray[double_t] arr): return (s, e, -m) -#------------------------------------------------------------------------------- -# Rolling sum -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_sum(ndarray[double_t] input, int win, int minp): - cdef double val, prev, sum_x = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - sum_x += val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - sum_x += val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - - if nobs >= minp: - output[i] = sum_x - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling mean -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_mean(ndarray[double_t] input, - int win, int minp): - cdef: - double val, prev, result, sum_x = 0 - Py_ssize_t nobs = 0, i, neg_ct = 0 - Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - if signbit(prev): - neg_ct -= 1 - - if nobs >= minp: - result = sum_x / nobs - if neg_ct == 0 and result < 0: - # all positive - output[i] = 0 - elif neg_ct == nobs and result > 0: - # all negative - output[i] = 0 - else: - output[i] = result - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Exponentially weighted moving average - -def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): - """ - Compute exponentially-weighted moving average using center-of-mass. - - Parameters - ---------- - input : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - - Returns - ------- - y : ndarray - """ - - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) - - cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - cdef Py_ssize_t i, nobs - - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha - - weighted_avg = input[0] - is_observation = (weighted_avg == weighted_avg) - nobs = int(is_observation) - output[0] = weighted_avg if (nobs >= minp) else NaN - old_wt = 1. 
- - for i from 1 <= i < N: - cur = input[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - if is_observation or (not ignore_na): - old_wt *= old_wt_factor - if is_observation: - if weighted_avg != cur: # avoid numerical errors on constant series - weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif is_observation: - weighted_avg = cur - - output[i] = weighted_avg if (nobs >= minp) else NaN - - return output - -#------------------------------------------------------------------------------- -# Exponentially weighted moving covariance - -def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): - """ - Compute exponentially-weighted moving variance using center-of-mass. - - Parameters - ---------- - input_x : ndarray (float64 type) - input_y : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - bias: int - - Returns - ------- - y : ndarray - """ - - cdef Py_ssize_t N = len(input_x) - if len(input_y) != N: - raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) - - cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y - cdef Py_ssize_t i, nobs - - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha - - mean_x = input_x[0] - mean_y = input_y[0] - is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) - nobs = int(is_observation) - if not is_observation: - mean_x = NaN - mean_y = NaN - output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN - cov = 0. - sum_wt = 1. - sum_wt2 = 1. - old_wt = 1. - - for i from 1 <= i < N: - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - if mean_x != cur_x: # avoid numerical errors on constant series - mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) - if mean_y != cur_y: # avoid numerical errors on constant series - mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y - - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - output[i] = ((numerator / denominator) * cov) if (denominator > 0.) 
else NaN - else: - output[i] = cov - else: - output[i] = NaN - - return output - #---------------------------------------------------------------------- # Pairwise correlation/covariance @@ -1273,613 +944,9 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): return result -#---------------------------------------------------------------------- -# Rolling variance - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - """ - Numerically stable implementation using Welford's method. - """ - cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta - cdef Py_ssize_t i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - minp = _check_minp(win, minp, N) - - # Check for windows larger than array, addresses #7297 - win = min(win, N) - - with nogil: - # Over the first window, observations can only be added, never removed - for i from 0 <= i < win: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - - output[i] = val - - # After the first window, observations can both be added and removed - for i from win <= i < N: - val = input[i] - prev = input[i - win] - - if val == val: - if prev == prev: - # Adding one observation and removing another one - delta = val - prev - prev -= mean_x - mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta - else: - # Adding one observation and not removing any - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - elif prev == prev: - # Adding no new observation, but removing one - nobs -= 1 - if nobs: - delta = (prev - mean_x) - mean_x -= delta / nobs - ssqdm_x -= delta * (prev - mean_x) - else: - mean_x = 0 - ssqdm_x = 0 - # Variance is unchanged if no observation is added or removed - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - - output[i] = val - - return output - - -#------------------------------------------------------------------------------- -# Rolling skewness -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_skew(ndarray[double_t] input, int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - # 3 components of the skewness equation - cdef double A, B, C, R - - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - - nobs -= 1 - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: - output[i] = NaN - else: - R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) - else: - output[i] = 
NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling kurtosis -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_kurt(ndarray[double_t] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K - - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev - - nobs -= 1 - - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - if B == 0 or nobs < 4: - output[i] = NaN - - else: - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) - - output[i] = K - - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling median, min, max - -from skiplist cimport * - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_median_c(ndarray[float64_t] arg, int win, int minp): - cdef: - double val, res, prev - bint err=0 - int ret=0 - skiplist_t *sl - Py_ssize_t midpoint, nobs = 0, i - - - cdef Py_ssize_t N = len(arg) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - sl = skiplist_init(win) - if sl == NULL: - raise MemoryError("skiplist_init failed") - - minp = _check_minp(win, minp, N) - - with nogil: - for i from 0 <= i < minp - 1: - val = arg[i] - - # Not NaN - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - output[i] = NaN - - with nogil: - if not err: - for i from minp - 1 <= i < N: - - val = arg[i] - - if i > win - 1: - prev = arg[i - win] - - if prev == prev: - skiplist_remove(sl, prev) - nobs -= 1 - - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - - if nobs >= minp: - midpoint = nobs / 2 - if nobs % 2: - res = skiplist_get(sl, midpoint, &ret) - else: - res = (skiplist_get(sl, midpoint, &ret) + - skiplist_get(sl, (midpoint - 1), &ret)) / 2 - else: - res = NaN - - output[i] = res - - skiplist_destroy(sl) - if err: - raise MemoryError("skiplist_insert failed") - return output - -#---------------------------------------------------------------------- - -# Moving maximum / minimum code taken from Bottleneck under the terms -# of its Simplified BSD license -# https://github.com/kwgoodman/bottleneck - -from libc cimport stdlib - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_max(ndarray[numeric] a, int window, int minp): - """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
- - Parameters - ---------- - a: numpy array - window: int, size of rolling window - minp: if number of observations in window - is below this, output a NaN - """ - return _roll_min_max(a, window, minp, 1) - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_min(ndarray[numeric] a, int window, int minp): - """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. - - Parameters - ---------- - a: numpy array - window: int, size of rolling window - minp: if number of observations in window - is below this, output a NaN - """ - return _roll_min_max(a, window, minp, 0) - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef _roll_min_max(ndarray[numeric] a, int window, int minp, bint is_max): - "Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs." - cdef numeric ai, aold - cdef Py_ssize_t count - cdef Py_ssize_t* death - cdef numeric* ring - cdef numeric* minvalue - cdef numeric* end - cdef numeric* last - cdef Py_ssize_t i0 - cdef np.npy_intp *dim - dim = PyArray_DIMS(a) - cdef Py_ssize_t n0 = dim[0] - cdef np.npy_intp *dims = [n0] - cdef bint should_replace - cdef np.ndarray[numeric, ndim=1] y = PyArray_EMPTY(1, dims, PyArray_TYPE(a), 0) - - if window < 1: - raise ValueError('Invalid window size %d' - % (window)) - - if minp > window: - raise ValueError('Invalid min_periods size %d greater than window %d' - % (minp, window)) - - minp = _check_minp(window, minp, n0) - with nogil: - ring = stdlib.malloc(window * sizeof(numeric)) - death = stdlib.malloc(window * sizeof(Py_ssize_t)) - end = ring + window - last = ring - - minvalue = ring - ai = a[0] - if numeric in cython.floating: - if ai == ai: - minvalue[0] = ai - elif is_max: - minvalue[0] = MINfloat64 - else: - minvalue[0] = MAXfloat64 - else: - minvalue[0] = ai - death[0] = window - - count = 0 - for i0 in range(n0): - ai = a[i0] - if numeric in cython.floating: - if ai == ai: - count += 1 - elif is_max: - ai = MINfloat64 - else: - ai = MAXfloat64 - else: - count += 1 - if i0 >= window: - aold = a[i0 - window] - if aold == aold: - count -= 1 - if death[minvalue-ring] == i0: - minvalue += 1 - if minvalue >= end: - minvalue = ring - should_replace = ai >= minvalue[0] if is_max else ai <= minvalue[0] - if should_replace: - minvalue[0] = ai - death[minvalue-ring] = i0 + window - last = minvalue - else: - should_replace = last[0] <= ai if is_max else last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - should_replace = last[0] <= ai if is_max else last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i0 + window - if numeric in cython.floating: - if count >= minp: - y[i0] = minvalue[0] - else: - y[i0] = NaN - else: - y[i0] = minvalue[0] - - for i0 in range(minp - 1): - if numeric in cython.floating: - y[i0] = NaN - else: - y[i0] = 0 - - stdlib.free(ring) - stdlib.free(death) - return y - -def roll_quantile(ndarray[float64_t, cast=True] input, int win, - int minp, double quantile): - """ - O(N log(window)) implementation using skip list - """ - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - 
prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - if nobs >= minp: - idx = int((quantile / 1.) * (nobs - 1)) - output[i] = skiplist.get(idx) - else: - output[i] = NaN - - return output - -def roll_generic(ndarray[float64_t, cast=True] input, - int win, int minp, int offset, - object func, object args, object kwargs): - cdef ndarray[double_t] output, counts, bufarr - cdef Py_ssize_t i, n - cdef float64_t *buf - cdef float64_t *oldbuf - - if not input.flags.c_contiguous: - input = input.copy('C') - - n = len(input) - if n == 0: - return input - - minp = _check_minp(win, minp, n, floor=0) - output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] - - # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, n) - offset): - if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) - else: - output[i] = NaN - - # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data - for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf - if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) - else: - output[i] = NaN - bufarr.data = oldbuf - - # truncated windows at the end - for i from int_max(n - offset, 0) <= i < n: - if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) - else: - output[i] = NaN - - return output - - -def roll_window(ndarray[float64_t, ndim=1, cast=True] input, - ndarray[float64_t, ndim=1, cast=True] weights, - int minp, bint avg=True): - """ - Assume len(weights) << len(input) - """ - cdef: - ndarray[double_t] output, tot_wgt, counts - Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k - float64_t val_in, val_win, c, w - - in_n = len(input) - win_n = len(weights) - output = np.zeros(in_n, dtype=float) - counts = np.zeros(in_n, dtype=float) - if avg: - tot_wgt = np.zeros(in_n, dtype=float) - - minp = _check_minp(len(weights), minp, in_n) - - if avg: - for win_i from 0 <= win_i < win_n: - val_win = weights[win_i] - if val_win != val_win: - continue - - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - tot_wgt[in_i + (win_n - win_i) - 1] += val_win - - for in_i from 0 <= in_i < in_n: - c = counts[in_i] - if c < minp: - output[in_i] = NaN - else: - w = tot_wgt[in_i] - if w == 0: - output[in_i] = NaN - else: - output[in_i] /= tot_wgt[in_i] - - else: - for win_i from 0 <= win_i < win_n: - val_win = weights[win_i] - if val_win != val_win: - continue - - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] - - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - - for in_i from 0 <= in_i < in_n: - c = counts[in_i] - if c < minp: - output[in_i] = NaN - - return output - - #---------------------------------------------------------------------- # group operations - @cython.wraparound(False) @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): diff --git a/pandas/core/window.py b/pandas/core/window.py index bf3fd69c6340b..fbc56335aabd9 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -16,7 +16,7 @@ from pandas.core.base import 
(PandasObject, SelectionMixin, GroupByMixin) import pandas.core.common as com -import pandas.algos as algos +import pandas._window as _window from pandas import compat from pandas.compat.numpy import function as nv from pandas.util.decorators import Substitution, Appender @@ -407,9 +407,10 @@ def _apply_window(self, mean=True, how=None, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, len(window)) - return algos.roll_window(np.concatenate((arg, additional_nans)) - if center else arg, window, minp, - avg=mean) + return _window.roll_window(np.concatenate((arg, + additional_nans)) + if center else arg, window, minp, + avg=mean) result = np.apply_along_axis(f, self.axis, values) @@ -532,11 +533,10 @@ def _apply(self, func, name=None, window=None, center=None, # if we have a string function name, wrap it if isinstance(func, compat.string_types): - if not hasattr(algos, func): + cfunc = getattr(_window, func, None) + if cfunc is None: raise ValueError("we do not support this function " - "algos.{0}".format(func)) - - cfunc = getattr(algos, func) + "in _window.{0}".format(func)) def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) @@ -617,8 +617,8 @@ def apply(self, func, args=(), kwargs={}): def f(arg, window, min_periods): minp = _use_window(min_periods, window) - return algos.roll_generic(arg, window, minp, offset, func, args, - kwargs) + return _window.roll_generic(arg, window, minp, offset, func, args, + kwargs) return self._apply(f, func, args=args, kwargs=kwargs, center=False) @@ -687,7 +687,7 @@ def std(self, ddof=1, *args, **kwargs): def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(algos.roll_var(arg, window, minp, ddof)) + return _zsqrt(_window.roll_var(arg, window, minp, ddof)) return self._apply(f, 'std', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -732,7 +732,7 @@ def quantile(self, quantile, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return algos.roll_quantile(arg, window, minp, quantile) + return _window.roll_quantile(arg, window, minp, quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) @@ -1278,11 +1278,10 @@ def _apply(self, func, how=None, **kwargs): # if we have a string function name, wrap it if isinstance(func, compat.string_types): - if not hasattr(algos, func): + cfunc = getattr(_window, func, None) + if cfunc is None: raise ValueError("we do not support this function " - "algos.{0}".format(func)) - - cfunc = getattr(algos, func) + "in _window.{0}".format(func)) def func(arg): return cfunc(arg, self.com, int(self.adjust), @@ -1317,9 +1316,9 @@ def var(self, bias=False, *args, **kwargs): nv.validate_window_func('var', args, kwargs) def f(arg): - return algos.ewmcov(arg, arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + return _window.ewmcov(arg, arg, self.com, int(self.adjust), + int(self.ignore_na), int(self.min_periods), + int(bias)) return self._apply(f, **kwargs) @@ -1337,9 +1336,9 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = algos.ewmcov(X._prep_values(), Y._prep_values(), self.com, - int(self.adjust), int(self.ignore_na), - int(self.min_periods), int(bias)) + cov = _window.ewmcov(X._prep_values(), Y._prep_values(), self.com, + int(self.adjust), int(self.ignore_na), + int(self.min_periods), int(bias)) return X._wrap_result(cov) return 
_flex_binary_moment(self._selected_obj, other._selected_obj, @@ -1361,9 +1360,10 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return algos.ewmcov(x, y, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - 1) + return _window.ewmcov(x, y, self.com, int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1) x_values = X._prep_values() y_values = Y._prep_values() diff --git a/pandas/lib.pxd b/pandas/lib.pxd index ba52e4cc47c89..36c91faa00036 100644 --- a/pandas/lib.pxd +++ b/pandas/lib.pxd @@ -1 +1,3 @@ +# prototypes for sharing + cdef bint is_null_datetimelike(v) diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index 84b331f1e8e6f..96a23a91cc7c2 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -24,6 +24,20 @@ cdef extern from "numpy_helper.h": object sarr_from_data(cnp.dtype, int length, void* data) inline object unbox_if_zerodim(object arr) +ctypedef fused numeric: + cnp.int8_t + cnp.int16_t + cnp.int32_t + cnp.int64_t + + cnp.uint8_t + cnp.uint16_t + cnp.uint32_t + cnp.uint64_t + + cnp.float32_t + cnp.float64_t + cdef inline object get_value_at(ndarray arr, object loc): cdef: Py_ssize_t i, sz diff --git a/pandas/window.pyx b/pandas/window.pyx new file mode 100644 index 0000000000000..bfe9152477a40 --- /dev/null +++ b/pandas/window.pyx @@ -0,0 +1,954 @@ +from numpy cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cimport util + +from libc.stdlib cimport malloc, free + +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef np.int8_t MINint8 = np.iinfo(np.int8).min +cdef np.int16_t MINint16 = np.iinfo(np.int16).min +cdef np.int32_t MINint32 = np.iinfo(np.int32).min +cdef np.int64_t MINint64 = np.iinfo(np.int64).min +cdef np.float16_t MINfloat16 = np.NINF +cdef np.float32_t MINfloat32 = np.NINF +cdef np.float64_t MINfloat64 = np.NINF + +cdef np.int8_t MAXint8 = np.iinfo(np.int8).max +cdef np.int16_t MAXint16 = np.iinfo(np.int16).max +cdef np.int32_t MAXint32 = np.iinfo(np.int32).max +cdef np.int64_t MAXint64 = np.iinfo(np.int64).max +cdef np.float16_t MAXfloat16 = np.inf +cdef np.float32_t MAXfloat32 = np.inf +cdef np.float64_t MAXfloat64 = np.inf + +cdef double NaN = np.NaN +cdef double nan = NaN + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +# this is our util.pxd +from util cimport numeric + +cdef extern from "src/headers/math.h": + double sqrt(double x) nogil + int signbit(double) nogil + +include "skiplist.pyx" + +# Cython implementations of rolling sum, mean, variance, skewness, +# other statistical moment functions +# +# Misc implementation notes +# ------------------------- +# +# - In Cython x * x is faster than x ** 2 for C types, this should be +# periodically revisited to see if it's still true. 
+# +# - + +def _check_minp(win, minp, N, floor=1): + if minp > win: + raise ValueError('min_periods (%d) must be <= window (%d)' + % (minp, win)) + elif minp > N: + minp = N + 1 + elif minp < 0: + raise ValueError('min_periods must be >= 0') + return max(minp, floor) + +# original C implementation by N. Devillard. +# This code in public domain. +# Function : kth_smallest() +# In : array of elements, # of elements in the array, rank k +# Out : one element +# Job : find the kth smallest element in the array + +# Reference: + +# Author: Wirth, Niklaus +# Title: Algorithms + data structures = programs +# Publisher: Englewood Cliffs: Prentice-Hall, 1976 +# Physical description: 366 p. +# Series: Prentice-Hall Series in Automatic Computation + +#------------------------------------------------------------------------------- +# Rolling sum +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_sum(ndarray[double_t] input, int win, int minp): + cdef double val, prev, sum_x = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(win, minp, N) + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + sum_x += val + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + + if nobs >= minp: + output[i] = sum_x + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling mean +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_mean(ndarray[double_t] input, + int win, int minp): + cdef: + double val, prev, result, sum_x = 0 + Py_ssize_t nobs = 0, i, neg_ct = 0 + Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + minp = _check_minp(win, minp, N) + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + if signbit(val): + neg_ct += 1 + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + sum_x += val + if signbit(val): + neg_ct += 1 + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + if signbit(prev): + neg_ct -= 1 + + if nobs >= minp: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + # all positive + output[i] = 0 + elif neg_ct == nobs and result > 0: + # all negative + output[i] = 0 + else: + output[i] = result + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Exponentially weighted moving average + +def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): + """ + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. 
if adjust else alpha + + weighted_avg = input[0] + is_observation = (weighted_avg == weighted_avg) + nobs = int(is_observation) + output[0] = weighted_avg if (nobs >= minp) else NaN + old_wt = 1. + + for i from 1 <= i < N: + cur = input[i] + is_observation = (cur == cur) + nobs += int(is_observation) + if weighted_avg == weighted_avg: + if is_observation or (not ignore_na): + old_wt *= old_wt_factor + if is_observation: + if weighted_avg != cur: # avoid numerical errors on constant series + weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. + elif is_observation: + weighted_avg = cur + + output[i] = weighted_avg if (nobs >= minp) else NaN + + return output + +#------------------------------------------------------------------------------- +# Exponentially weighted moving covariance + +def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, + double_t com, int adjust, int ignore_na, int minp, int bias): + """ + Compute exponentially-weighted moving variance using center-of-mass. + + Parameters + ---------- + input_x : ndarray (float64 type) + input_y : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + bias: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input_x) + if len(input_y) != N: + raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + mean_x = input_x[0] + mean_y = input_y[0] + is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) + nobs = int(is_observation) + if not is_observation: + mean_x = NaN + mean_y = NaN + output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN + cov = 0. + sum_wt = 1. + sum_wt2 = 1. + old_wt = 1. + + for i from 1 <= i < N: + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += int(is_observation) + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + if mean_x != cur_x: # avoid numerical errors on constant series + mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) + if mean_y != cur_y: # avoid numerical errors on constant series + mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. + elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + output[i] = ((numerator / denominator) * cov) if (denominator > 0.) 
else NaN + else: + output[i] = cov + else: + output[i] = NaN + + return output + +#---------------------------------------------------------------------- +# Rolling variance + +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + cdef Py_ssize_t i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(win, minp, N) + + # Check for windows larger than array, addresses #7297 + win = min(win, N) + + with nogil: + # Over the first window, observations can only be added, never removed + for i from 0 <= i < win: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) + + if (nobs >= minp) and (nobs > ddof): + #pathological case + if nobs == 1: + val = 0 + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 + else: + val = NaN + + output[i] = val + + # After the first window, observations can both be added and removed + for i from win <= i < N: + val = input[i] + prev = input[i - win] + + if val == val: + if prev == prev: + # Adding one observation and removing another one + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + else: + # Adding one observation and not removing any + nobs += 1 + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) + elif prev == prev: + # Adding no new observation, but removing one + nobs -= 1 + if nobs: + delta = (prev - mean_x) + mean_x -= delta / nobs + ssqdm_x -= delta * (prev - mean_x) + else: + mean_x = 0 + ssqdm_x = 0 + # Variance is unchanged if no observation is added or removed + + if (nobs >= minp) and (nobs > ddof): + #pathological case + if nobs == 1: + val = 0 + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 + else: + val = NaN + + output[i] = val + + return output + + +#------------------------------------------------------------------------------- +# Rolling skewness +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_skew(ndarray[double_t] input, int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + # 3 components of the skewness equation + cdef double A, B, C, R + + minp = _check_minp(win, minp, N) + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + + nobs -= 1 + if nobs >= minp: + A = x / nobs + B = xx / nobs - A * A + C = xxx / nobs - A * A * A - 3 * A * B + if B <= 0 or nobs < 3: + output[i] = NaN + else: + R = sqrt(B) + output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / + ((nobs-2) * R * R * R)) + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling kurtosis +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_kurt(ndarray[double_t] 
input, + int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + # 5 components of the kurtosis equation + cdef double A, B, C, D, R, K + + minp = _check_minp(win, minp, N) + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + + # seriously don't ask me why this is faster + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + xxxx -= prev * prev * prev * prev + + nobs -= 1 + + if nobs >= minp: + A = x / nobs + R = A * A + B = xx / nobs - R + R = R * A + C = xxx / nobs - R - 3 * A * B + R = R * A + D = xxxx / nobs - R - 6*B*A*A - 4*C*A + + if B == 0 or nobs < 4: + output[i] = NaN + + else: + K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) + K = K / ((nobs - 2.)*(nobs-3.)) + + output[i] = K + + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling median, min, max + +from skiplist cimport * + +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_median_c(ndarray[float64_t] arg, int win, int minp): + cdef: + double val, res, prev + bint err=0 + int ret=0 + skiplist_t *sl + Py_ssize_t midpoint, nobs = 0, i + + + cdef Py_ssize_t N = len(arg) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + if sl == NULL: + raise MemoryError("skiplist_init failed") + + minp = _check_minp(win, minp, N) + + with nogil: + for i from 0 <= i < minp - 1: + val = arg[i] + + # Not NaN + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + output[i] = NaN + + with nogil: + if not err: + for i from minp - 1 <= i < N: + + val = arg[i] + + if i > win - 1: + prev = arg[i - win] + + if prev == prev: + skiplist_remove(sl, prev) + nobs -= 1 + + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + if nobs >= minp: + midpoint = nobs / 2 + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) + else: + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN + + output[i] = res + + skiplist_destroy(sl) + if err: + raise MemoryError("skiplist_insert failed") + return output + +#---------------------------------------------------------------------- + +# Moving maximum / minimum code taken from Bottleneck under the terms +# of its Simplified BSD license +# https://github.com/kwgoodman/bottleneck + +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_max(ndarray[numeric] a, int window, int minp): + """ + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + + Parameters + ---------- + a: numpy array + window: int, size of rolling window + minp: if number of observations in window + is below this, output a NaN + """ + return _roll_min_max(a, window, minp, 1) + +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_min(ndarray[numeric] a, int window, int minp): + """ + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
+ + Parameters + ---------- + a: numpy array + window: int, size of rolling window + minp: if number of observations in window + is below this, output a NaN + """ + return _roll_min_max(a, window, minp, 0) + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef _roll_min_max(ndarray[numeric] a, int window, int minp, bint is_max): + "Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs." + cdef numeric ai, aold + cdef Py_ssize_t count + cdef Py_ssize_t* death + cdef numeric* ring + cdef numeric* minvalue + cdef numeric* end + cdef numeric* last + cdef Py_ssize_t i0 + cdef np.npy_intp *dim + dim = PyArray_DIMS(a) + cdef Py_ssize_t n0 = dim[0] + cdef np.npy_intp *dims = [n0] + cdef bint should_replace + cdef np.ndarray[numeric, ndim=1] y = PyArray_EMPTY(1, dims, PyArray_TYPE(a), 0) + + if window < 1: + raise ValueError('Invalid window size %d' + % (window)) + + if minp > window: + raise ValueError('Invalid min_periods size %d greater than window %d' + % (minp, window)) + + minp = _check_minp(window, minp, n0) + with nogil: + ring = malloc(window * sizeof(numeric)) + death = malloc(window * sizeof(Py_ssize_t)) + end = ring + window + last = ring + + minvalue = ring + ai = a[0] + if numeric in cython.floating: + if ai == ai: + minvalue[0] = ai + elif is_max: + minvalue[0] = MINfloat64 + else: + minvalue[0] = MAXfloat64 + else: + minvalue[0] = ai + death[0] = window + + count = 0 + for i0 in range(n0): + ai = a[i0] + if numeric in cython.floating: + if ai == ai: + count += 1 + elif is_max: + ai = MINfloat64 + else: + ai = MAXfloat64 + else: + count += 1 + if i0 >= window: + aold = a[i0 - window] + if aold == aold: + count -= 1 + if death[minvalue-ring] == i0: + minvalue += 1 + if minvalue >= end: + minvalue = ring + should_replace = ai >= minvalue[0] if is_max else ai <= minvalue[0] + if should_replace: + minvalue[0] = ai + death[minvalue-ring] = i0 + window + last = minvalue + else: + should_replace = last[0] <= ai if is_max else last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 + should_replace = last[0] <= ai if is_max else last[0] >= ai + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i0 + window + if numeric in cython.floating: + if count >= minp: + y[i0] = minvalue[0] + else: + y[i0] = NaN + else: + y[i0] = minvalue[0] + + for i0 in range(minp - 1): + if numeric in cython.floating: + y[i0] = NaN + else: + y[i0] = 0 + + free(ring) + free(death) + return y + +def roll_quantile(ndarray[float64_t, cast=True] input, int win, + int minp, double quantile): + """ + O(N log(window)) implementation using skip list + """ + cdef double val, prev, midpoint + cdef IndexableSkiplist skiplist + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + skiplist = IndexableSkiplist(win) + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + + if prev == prev: + skiplist.remove(prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist.insert(val) + + if nobs >= minp: + idx = int((quantile / 1.) 
* (nobs - 1)) + output[i] = skiplist.get(idx) + else: + output[i] = NaN + + return output + +def roll_generic(ndarray[float64_t, cast=True] input, + int win, int minp, int offset, + object func, object args, object kwargs): + cdef ndarray[double_t] output, counts, bufarr + cdef Py_ssize_t i, n + cdef float64_t *buf + cdef float64_t *oldbuf + + if not input.flags.c_contiguous: + input = input.copy('C') + + n = len(input) + if n == 0: + return input + + minp = _check_minp(win, minp, n, floor=0) + output = np.empty(n, dtype=float) + counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] + + # truncated windows at the beginning, through first full-length window + for i from 0 <= i < (int_min(win, n) - offset): + if counts[i] >= minp: + output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) + else: + output[i] = NaN + + # remaining full-length windows + buf = input.data + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + for i from (win - offset) <= i < (n - offset): + buf = buf + 1 + bufarr.data = buf + if counts[i] >= minp: + output[i] = func(bufarr, *args, **kwargs) + else: + output[i] = NaN + bufarr.data = oldbuf + + # truncated windows at the end + for i from int_max(n - offset, 0) <= i < n: + if counts[i] >= minp: + output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) + else: + output[i] = NaN + + return output + + +def roll_window(ndarray[float64_t, ndim=1, cast=True] input, + ndarray[float64_t, ndim=1, cast=True] weights, + int minp, bint avg=True): + """ + Assume len(weights) << len(input) + """ + cdef: + ndarray[double_t] output, tot_wgt, counts + Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k + float64_t val_in, val_win, c, w + + in_n = len(input) + win_n = len(weights) + output = np.zeros(in_n, dtype=float) + counts = np.zeros(in_n, dtype=float) + if avg: + tot_wgt = np.zeros(in_n, dtype=float) + + minp = _check_minp(len(weights), minp, in_n) + + if avg: + for win_i from 0 <= win_i < win_n: + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: + val_in = input[in_i] + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + tot_wgt[in_i + (win_n - win_i) - 1] += val_win + + for in_i from 0 <= in_i < in_n: + c = counts[in_i] + if c < minp: + output[in_i] = NaN + else: + w = tot_wgt[in_i] + if w == 0: + output[in_i] = NaN + else: + output[in_i] /= tot_wgt[in_i] + + else: + for win_i from 0 <= win_i < win_n: + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: + val_in = input[in_i] + + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + + for in_i from 0 <= in_i < in_n: + c = counts[in_i] + if c < minp: + output[in_i] = NaN + + return output diff --git a/setup.py b/setup.py index 596fe62ff0781..1d189364239a9 100755 --- a/setup.py +++ b/setup.py @@ -270,6 +270,7 @@ class CheckSDist(sdist_class): 'pandas/tslib.pyx', 'pandas/index.pyx', 'pandas/algos.pyx', + 'pandas/window.pyx', 'pandas/parser.pyx', 'pandas/src/period.pyx', 'pandas/src/sparse.pyx', @@ -425,17 +426,23 @@ def pxd(name): 'sources': ['pandas/src/datetime/np_datetime.c', 'pandas/src/datetime/np_datetime_strings.c']}, algos={'pyxfile': 'algos', - 'pxdfiles': ['src/skiplist'], + 'pxdfiles': ['src/util'], 'depends': [srcpath('generated', suffix='.pyx'), - 
srcpath('join', suffix='.pyx'), - 'pandas/src/skiplist.pyx', - 'pandas/src/skiplist.h']}, + srcpath('join', suffix='.pyx')]}, + _window={'pyxfile': 'window', + 'pxdfiles': ['src/skiplist','src/util'], + 'depends': ['pandas/src/skiplist.pyx', + 'pandas/src/skiplist.h']}, parser={'pyxfile': 'parser', 'depends': ['pandas/src/parser/tokenizer.h', 'pandas/src/parser/io.h', 'pandas/src/numpy_helper.h'], 'sources': ['pandas/src/parser/tokenizer.c', 'pandas/src/parser/io.c']}, + _sparse={'pyxfile': 'src/sparse', + 'depends': [srcpath('sparse', suffix='.pyx')]}, + _testing={'pyxfile': 'src/testing', + 'depends': [srcpath('testing', suffix='.pyx')]}, ) ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'} @@ -461,22 +468,6 @@ def pxd(name): extensions.append(obj) -sparse_ext = Extension('pandas._sparse', - sources=[srcpath('sparse', suffix=suffix)], - include_dirs=[], - libraries=libraries, - extra_compile_args=extra_compile_args) - -extensions.extend([sparse_ext]) - -testing_ext = Extension('pandas._testing', - sources=[srcpath('testing', suffix=suffix)], - include_dirs=[], - libraries=libraries, - extra_compile_args=extra_compile_args) - -extensions.extend([testing_ext]) - #---------------------------------------------------------------------- # msgpack From b1bfd2fd59e5f6f5fc52bc5d934837262f293bee Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 6 Jun 2016 19:10:21 -0400 Subject: [PATCH 017/359] DEPR: Deprecate as_recarray in read_csv 1) Documented and deprecate `as_recarray` 2) Added `as_recarray` functionality to Python engine 3) Fixed bug in C engine in which `usecols` was not being respected in combination with `as_recarray` Author: gfyoung Closes #13373 from gfyoung/as-recarray-python-engine and squashes the following commits: abaeaef [gfyoung] ENH: Support as_recarray better in read_csv --- doc/source/io.rst | 11 ++- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 50 ++++++++-- pandas/io/tests/parser/c_parser_only.py | 34 ++----- pandas/io/tests/parser/common.py | 105 +++++++++++++++++++-- pandas/io/tests/parser/header.py | 10 +- pandas/io/tests/parser/test_textreader.py | 9 -- pandas/io/tests/parser/test_unsupported.py | 1 + pandas/parser.pyx | 8 +- 9 files changed, 164 insertions(+), 65 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6aa2df3549914..6802a448c4e14 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -134,6 +134,14 @@ usecols : array-like, default ``None`` inferred from the document header row(s). For example, a valid `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter results in much faster parsing time and lower memory usage. +as_recarray : boolean, default ``False`` + DEPRECATED: this argument will be removed in a future version. Please call + ``pd.read_csv(...).to_records()`` instead. + + Return a NumPy recarray instead of a DataFrame after parsing the data. If + set to ``True``, this option takes precedence over the ``squeeze`` parameter. + In addition, as row indices are not available in such a format, the ``index_col`` + parameter will be ignored. squeeze : boolean, default ``False`` If the parsed data only contains one column then return a Series. prefix : str, default ``None`` @@ -179,9 +187,6 @@ low_memory : boolean, default ``True`` buffer_lines : int, default None DEPRECATED: this argument will be removed in a future version because its value is not respected by the parser - - If ``low_memory`` is ``True``, specify the number of rows to be read for - each chunk. 
(Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 93aedce07da9d..1e95af2df247b 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -295,6 +295,7 @@ Deprecations - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) +- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 04b488aff5c0c..0f0e1848750c0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,7 +2,8 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas.compat import (range, lrange, StringIO, lzip, zip, + string_types, map, OrderedDict) from pandas import compat from collections import defaultdict import re @@ -87,6 +88,14 @@ inferred from the document header row(s). For example, a valid `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter results in much faster parsing time and lower memory usage. +as_recarray : boolean, default False + DEPRECATED: this argument will be removed in a future version. Please call + `pd.read_csv(...).to_records()` instead. + + Return a NumPy recarray instead of a DataFrame after parsing the data. + If set to True, this option takes precedence over the `squeeze` parameter. + In addition, as row indices are not available in such a format, the + `index_col` parameter will be ignored. squeeze : boolean, default False If the parsed data only contains one column then return a Series prefix : str, default None @@ -239,9 +248,6 @@ buffer_lines : int, default None DEPRECATED: this argument will be removed in a future version because its value is not respected by the parser - - If low_memory is True, specify the number of rows to be read for each - chunk. (Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version @@ -452,7 +458,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ - 'as_recarray', 'low_memory', 'memory_map', 'buffer_lines', @@ -462,6 +467,7 @@ def _read(filepath_or_buffer, kwds): 'float_precision', ]) _deprecated_args = set([ + 'as_recarray', 'buffer_lines', 'compact_ints', 'use_unsigned', @@ -820,12 +826,22 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + depr_warning = '' + for arg in _deprecated_args: parser_default = _c_parser_defaults[arg] + msg = ("The '{arg}' argument has been deprecated " + "and will be removed in a future version." + .format(arg=arg)) + + if arg == 'as_recarray': + msg += ' Please call pd.to_csv(...).to_records() instead.' 
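# A hedged, illustrative sketch (an assumption, not lines from this diff) of
# the migration the deprecation message above points users toward.
# The in-memory buffer below stands in for a real CSV file.
import pandas as pd
from pandas.compat import StringIO

data = StringIO('a,b\n1,x\n2,y')

# deprecated spelling: pd.read_csv(data, as_recarray=True)
# suggested replacement: parse to a DataFrame, then convert explicitly
records = pd.read_csv(data).to_records(index=False)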
+ if result.get(arg, parser_default) != parser_default: - warnings.warn("The '{arg}' argument has been deprecated " - "and will be removed in a future version" - .format(arg=arg), FutureWarning, stacklevel=2) + depr_warning += msg + '\n\n' + + if depr_warning != '': + warnings.warn(depr_warning, FutureWarning, stacklevel=2) if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") @@ -973,6 +989,7 @@ def __init__(self, kwds): self.na_fvalues = kwds.get('na_fvalues') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') + self.as_recarray = kwds.get('as_recarray', False) self.tupleize_cols = kwds.get('tupleize_cols', False) self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) @@ -1304,7 +1321,6 @@ def __init__(self, src, **kwds): self.kwds = kwds kwds = kwds.copy() - self.as_recarray = kwds.get('as_recarray', False) ParserBase.__init__(self, kwds) if 'utf-16' in (kwds.get('encoding') or ''): @@ -1889,6 +1905,9 @@ def read(self, rows=None): columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) + if self.as_recarray: + return self._to_recarray(data, columns) + index, columns = self._make_index(data, alldata, columns, indexnamerow) return index, columns, data @@ -1928,6 +1947,19 @@ def _convert_data(self, data): return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose, clean_conv) + def _to_recarray(self, data, columns): + dtypes = [] + o = OrderedDict() + + # use the columns to "order" the keys + # in the unordered 'data' dictionary + for col in columns: + dtypes.append((str(col), data[col].dtype)) + o[col] = data[col] + + tuples = lzip(*o.values()) + return np.array(tuples, dtypes) + def _infer_columns(self): names = self.names num_original_columns = 0 diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index b7ef754004e18..90103064774c1 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -172,30 +172,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows(): - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False): - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False): - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_pass_dtype(self): data = """\ one,two @@ -220,10 +196,12 @@ def test_pass_dtype_as_recarray(self): 3,4.5 4,5.5""" - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, - as_recarray=True) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'S1') + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 1: 'S1'}, as_recarray=True) + 
self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'S1') def test_empty_pass_dtype(self): data = 'one,two' diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f8c7241fdf88a..fdaac71f59386 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -608,10 +608,6 @@ def test_url(self): @tm.slow def test_file(self): - - # FILE - if sys.version_info[:2] < (2, 6): - raise nose.SkipTest("file:// not supported with Python < 2.6") dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salary.table.csv') local_table = self.read_table(localtable) @@ -925,8 +921,8 @@ def test_empty_with_nrows_chunksize(self): StringIO('foo,bar\n'), chunksize=10))) tm.assert_frame_equal(result, expected) - # 'as_recarray' is not supported yet for the Python parser - if self.engine == 'c': + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): result = self.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True) result = DataFrame(result[2], columns=result[1], @@ -934,11 +930,13 @@ def test_empty_with_nrows_chunksize(self): tm.assert_frame_equal(DataFrame.from_records( result), expected, check_index_type=False) - result = next(iter(self.read_csv( - StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = next(iter(self.read_csv(StringIO('foo,bar\n'), + chunksize=10, as_recarray=True))) result = DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(DataFrame.from_records( - result), expected, check_index_type=False) + tm.assert_frame_equal(DataFrame.from_records(result), expected, + check_index_type=False) def test_eof_states(self): # see gh-10728, gh-10548 @@ -1373,3 +1371,90 @@ def test_compact_ints_use_unsigned(self): out = self.read_csv(StringIO(data), compact_ints=True, use_unsigned=True) tm.assert_frame_equal(out, expected) + + def test_compact_ints_as_recarray(self): + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_as_recarray(self): + # basic test + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + data = 'a,b\n1,a\n2,b' + expected = np.array([(1, 'a'), (2, 'b')], + dtype=[('a', ' Date: Mon, 6 Jun 2016 19:17:30 -0400 Subject: [PATCH 018/359] [BUG] Reading multiindex, incorrectly names columns without name. closes #12453 Author: Jozef Brandys Closes #13115 from brandys11/excel_multiindex_empty_name and squashes the following commits: 7953aee [Jozef Brandys] [BUG] Reading multiindex, incorrectly names columns without name. 
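A minimal sketch of the round-trip this change affects, assuming an Excel
writer engine such as xlsxwriter is installed ('multi.xlsx' is only an
illustrative file name); the resulting labels follow the new test below:

    import pandas as pd

    df = pd.DataFrame({('Zero', ''): {0: 0},
                       ('One', 'x'): {0: 1},
                       ('Two', 'X'): {0: 3},
                       ('Two', 'Y'): {0: 7}})
    df.to_excel('multi.xlsx')

    # Previously the blank second-level label under 'Zero' was forward-filled
    # from the neighbouring column even though the parent label changed; with
    # this fix it reads back as an 'Unnamed: ..._level_1' placeholder.
    roundtrip = pd.read_excel('multi.xlsx', header=[0, 1])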
--- pandas/formats/format.py | 47 +++++++++++++++++++++-------------- pandas/io/excel.py | 32 ++++++++++++++++++++---- pandas/io/tests/test_excel.py | 40 +++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 23 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 27d8b553013b9..923ac25f0ebed 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1277,29 +1277,41 @@ def _write_hierarchical_rows(self, fmt_values, indent): def _get_level_lengths(levels, sentinel=''): - from itertools import groupby + """For each index in each level the function returns lengths of indexes. - def _make_grouper(): - record = {'count': 0} + Parameters + ---------- + levels : list of lists + List of values on for level. + sentinel : string, optional + Value which states that no new index starts on there. - def grouper(x): - if x != sentinel: - record['count'] += 1 - return record['count'] + Returns + ---------- + Returns list of maps. For each level returns map of indexes (key is index + in row and value is length of index). + """ + if len(levels) == 0: + return [] - return grouper + control = [True for x in levels[0]] result = [] - for lev in levels: - i = 0 - f = _make_grouper() - recs = {} - for key, gpr in groupby(lev, f): - values = list(gpr) - recs[i] = len(values) - i += len(values) + for level in levels: + last_index = 0 - result.append(recs) + lengths = {} + for i, key in enumerate(level): + if control[i] and key == sentinel: + pass + else: + control[i] = False + lengths[last_index] = i - last_index + last_index = i + + lengths[last_index] = len(level) - last_index + + result.append(lengths) return result @@ -1762,7 +1774,6 @@ def _format_value(self, val): return val def _format_header_mi(self): - if self.columns.nlevels > 1: if not self.index: raise NotImplementedError("Writing to Excel with MultiIndex" diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 4c26480a0f583..775465ea9372d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -431,10 +431,13 @@ def _parse_cell(cell_contents, cell_typ): if header is not None: if com.is_list_like(header): header_names = [] + control_row = [True for x in data[0]] for row in header: if com.is_integer(skiprows): row += skiprows - data[row] = _fill_mi_header(data[row]) + + data[row], control_row = _fill_mi_header( + data[row], control_row) header_name, data[row] = _pop_header_name( data[row], index_col) header_names.append(header_name) @@ -511,16 +514,35 @@ def _trim_excel_header(row): return row -def _fill_mi_header(row): - # forward fill blanks entries - # from headers if parsing as MultiIndex +def _fill_mi_header(row, control_row): + """Forward fills blank entries in row, but only inside the same parent index + + Used for creating headers in Multiindex. + Parameters + ---------- + row : list + List of items in a single row. + constrol_row : list of boolean + Helps to determine if particular column is in same parent index as the + previous value. Used to stop propagation of empty cells between + different indexes. 
+ + Returns + ---------- + Returns changed row and control_row + """ last = row[0] for i in range(1, len(row)): + if not control_row[i]: + last = row[i] + if row[i] == '' or row[i] is None: row[i] = last else: + control_row[i] = False last = row[i] - return row + + return row, control_row # fill blank if index_col not None diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index b7e5360a6f3db..55a7f5350719d 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -725,6 +725,46 @@ def test_read_excel_multiindex(self): header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) + def test_read_excel_multiindex_empty_level(self): + # GH 12453 + _skip_if_no_xlsxwriter() + with ensure_clean('.xlsx') as path: + df = DataFrame({ + ('Zero', ''): {0: 0}, + ('One', 'x'): {0: 1}, + ('Two', 'X'): {0: 3}, + ('Two', 'Y'): {0: 7} + }) + + expected = DataFrame({ + ('Zero', 'Unnamed: 3_level_1'): {0: 0}, + ('One', u'x'): {0: 1}, + ('Two', u'X'): {0: 3}, + ('Two', u'Y'): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ('Beg', ''): {0: 0}, + ('Middle', 'x'): {0: 1}, + ('Tail', 'X'): {0: 3}, + ('Tail', 'Y'): {0: 7} + }) + + expected = pd.DataFrame({ + ('Beg', 'Unnamed: 0_level_1'): {0: 0}, + ('Middle', u'x'): {0: 1}, + ('Tail', u'X'): {0: 3}, + ('Tail', u'Y'): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + def test_excel_multindex_roundtrip(self): # GH 4679 _skip_if_no_xlsxwriter() From 158ae5bf1e2c15945590f8938bc97d3512ce162b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 7 Jun 2016 23:18:39 +0100 Subject: [PATCH 019/359] COMPAT, TST: allow numpy array comparisons with complex dtypes (#13392) Traces back to bug in NumPy v1.7.1 in which the 'array_equivalent' method could not compare NumPy arrays with complicated dtypes. As pandas relies on this function to check NumPy array equality during testing, this commit adds a fallback method for doing so. Closes gh-13388. --- pandas/core/common.py | 19 ++++++++++++++++++- pandas/tests/test_common.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index d26c59e62de30..28bae362a3411 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -349,7 +349,24 @@ def array_equivalent(left, right, strict_nan=False): right = right.view('i8') # NaNs cannot occur otherwise. - return np.array_equal(left, right) + try: + return np.array_equal(left, right) + except AttributeError: + # see gh-13388 + # + # NumPy v1.7.1 has a bug in its array_equal + # function that prevents it from correctly + # comparing two arrays with complex dtypes. 
+ # This bug is corrected in v1.8.0, so remove + # this try-except block as soon as we stop + # supporting NumPy versions < 1.8.0 + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.tolist() + right = right.tolist() + + return left == right def _iterable_not_string(x): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index ad43dc1c09ef1..56b1b542d547e 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -832,6 +832,24 @@ def test_is_timedelta(): assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) +def test_array_equivalent_compat(): + # see gh-13388 + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + assert (com.array_equivalent(m, n, strict_nan=True)) + assert (com.array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)]) + assert (not com.array_equivalent(m, n, strict_nan=True)) + assert (not com.array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) + assert (not com.array_equivalent(m, n, strict_nan=True)) + assert (not com.array_equivalent(m, n, strict_nan=False)) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 5407249a99e4c6e3d9f28465a940cd48a44799db Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 8 Jun 2016 07:22:34 -0400 Subject: [PATCH 020/359] DOC, ENH: Support memory_map for Python engine Title is self-explanatory. Author: gfyoung Closes #13381 from gfyoung/memory-map-python-engine and squashes the following commits: 5278fb5 [gfyoung] DOC, ENH: Support memory_map for Python engine --- doc/source/io.rst | 4 ++ doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/common.py | 49 ++++++++++++++++++++++- pandas/io/parsers.py | 9 ++++- pandas/io/tests/data/test_mmap.csv | 5 +++ pandas/io/tests/parser/c_parser_only.py | 4 -- pandas/io/tests/parser/common.py | 11 +++++ pandas/io/tests/parser/data/test_mmap.csv | 4 ++ pandas/io/tests/test_common.py | 47 ++++++++++++++++++++++ 9 files changed, 127 insertions(+), 7 deletions(-) create mode 100644 pandas/io/tests/data/test_mmap.csv create mode 100644 pandas/io/tests/parser/data/test_mmap.csv diff --git a/doc/source/io.rst b/doc/source/io.rst index 6802a448c4e14..61625104f5c1d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -198,6 +198,10 @@ use_unsigned : boolean, default False If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether the column should be compacted to the smallest signed or unsigned integer dtype. +memory_map : boolean, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. 
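A rough usage sketch of the option described above (the file path is invented); ``memory_map`` is an ordinary keyword to ``read_csv`` and, with this change, is honoured by the Python engine as well::

    import pandas as pd

    # Map the file directly into memory instead of going through buffered I/O;
    # the resulting DataFrame is identical to memory_map=False.
    df = pd.read_csv('large_file.csv', engine='python', memory_map=True)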
NA and Missing Data Handling ++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 1e95af2df247b..5aee616241406 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -76,6 +76,7 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/common.py b/pandas/io/common.py index cf4bba6e97afb..76395928eb011 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,6 +4,7 @@ import os import csv import codecs +import mmap import zipfile from contextlib import contextmanager, closing @@ -276,7 +277,7 @@ def ZipFile(*args, **kwargs): ZipFile = zipfile.ZipFile -def _get_handle(path, mode, encoding=None, compression=None): +def _get_handle(path, mode, encoding=None, compression=None, memory_map=False): """Gets file handle for given path and mode. """ if compression is not None: @@ -324,9 +325,55 @@ def _get_handle(path, mode, encoding=None, compression=None): else: f = open(path, mode) + if memory_map and hasattr(f, 'fileno'): + try: + f = MMapWrapper(f) + except Exception: + # we catch any errors that may have occurred + # because that is consistent with the lower-level + # functionality of the C engine (pd.read_csv), so + # leave the file handler as is then + pass + return f +class MMapWrapper(BaseIterator): + """ + Wrapper for the Python's mmap class so that it can be properly read in + by Python's csv.reader class. + + Parameters + ---------- + f : file object + File object to be mapped onto memory. Must support the 'fileno' + method or have an equivalent attribute + + """ + + def __init__(self, f): + self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + + def __getattr__(self, name): + return getattr(self.mmap, name) + + def __next__(self): + newline = self.mmap.readline() + + # readline returns bytes, not str, in Python 3, + # but Python's CSV reader expects str, so convert + # the output to str before continuing + if compat.PY3: + newline = compat.bytes_to_str(newline) + + # mmap doesn't raise if reading past the allocated + # data but instead returns an empty string, so raise + # if that is returned + if newline == '': + raise StopIteration + return newline + + class UTF8Recoder(BaseIterator): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0f0e1848750c0..4e954979f7d08 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -261,6 +261,10 @@ If integer columns are being compacted (i.e. `compact_ints=True`), specify whether the column should be compacted to the smallest signed or unsigned integer dtype. +memory_map : boolean, default False + If a filepath is provided for `filepath_or_buffer`, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. 
Returns ------- @@ -459,7 +463,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ 'low_memory', - 'memory_map', 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', @@ -1683,6 +1686,7 @@ def __init__(self, f, **kwds): self.encoding = kwds['encoding'] self.compression = kwds['compression'] + self.memory_map = kwds['memory_map'] self.skiprows = kwds['skiprows'] self.skip_footer = kwds['skip_footer'] @@ -1718,7 +1722,8 @@ def __init__(self, f, **kwds): if isinstance(f, compat.string_types): f = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression) + compression=self.compression, + memory_map=self.memory_map) elif self.compression: f = _wrap_compressed(f, self.compression, self.encoding) # in Python 3, convert BytesIO or fileobjects passed with an encoding diff --git a/pandas/io/tests/data/test_mmap.csv b/pandas/io/tests/data/test_mmap.csv new file mode 100644 index 0000000000000..cc2cd7c30349b --- /dev/null +++ b/pandas/io/tests/data/test_mmap.csv @@ -0,0 +1,5 @@ +a,b,c +1,one,I +2,two,II + +3,three,III diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 90103064774c1..b6048051edc4d 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -285,10 +285,6 @@ def test_usecols_dtypes(self): self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) self.assertTrue((result2.dtypes == [object, np.float]).all()) - def test_memory_map(self): - # it works! - self.read_csv(self.csv1, memory_map=True) - def test_disable_bool_parsing(self): # #2090 diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index fdaac71f59386..670f3df6f3984 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1458,3 +1458,14 @@ def test_as_recarray(self): out = self.read_csv(StringIO(data), as_recarray=True, usecols=['a']) tm.assert_numpy_array_equal(out, expected) + + def test_memory_map(self): + mmap_file = os.path.join(self.dirpath, 'test_mmap.csv') + expected = DataFrame({ + 'a': [1, 2, 3], + 'b': ['one', 'two', 'three'], + 'c': ['I', 'II', 'III'] + }) + + out = self.read_csv(mmap_file, memory_map=True) + tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/data/test_mmap.csv b/pandas/io/tests/parser/data/test_mmap.csv new file mode 100644 index 0000000000000..2885fc2bfbd69 --- /dev/null +++ b/pandas/io/tests/parser/data/test_mmap.csv @@ -0,0 +1,4 @@ +a,b,c +1,one,I +2,two,II +3,three,III diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 8615b75d87626..b70fca3ed2d20 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -2,6 +2,7 @@ Tests for the pandas.io.common functionalities """ from pandas.compat import StringIO +import mmap import os from os.path import isabs @@ -87,3 +88,49 @@ def test_iterator(self): tm.assert_frame_equal(first, expected.iloc[[0]]) expected.index = [0 for i in range(len(expected))] tm.assert_frame_equal(concat(it), expected.iloc[1:]) + + +class TestMMapWrapper(tm.TestCase): + + def setUp(self): + self.mmap_file = os.path.join(tm.get_data_path(), + 'test_mmap.csv') + + def test_constructor_bad_file(self): + non_file = StringIO('I am not a file') + non_file.fileno = lambda: -1 + + msg = "Invalid argument" + tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file) + + target = open(self.mmap_file, 'r') + target.close() + + msg = "I/O operation on closed file" + 
tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target) + + def test_get_attr(self): + target = open(self.mmap_file, 'r') + wrapper = common.MMapWrapper(target) + + attrs = dir(wrapper.mmap) + attrs = [attr for attr in attrs + if not attr.startswith('__')] + attrs.append('__next__') + + for attr in attrs: + self.assertTrue(hasattr(wrapper, attr)) + + self.assertFalse(hasattr(wrapper, 'foo')) + + def test_next(self): + target = open(self.mmap_file, 'r') + wrapper = common.MMapWrapper(target) + + lines = target.readlines() + + for line in lines: + next_line = next(wrapper) + self.assertEqual(next_line, line) + + self.assertRaises(StopIteration, next, wrapper) From d5bea25274b5faad5d2014b279a1da6451ebe1c7 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 8 Jun 2016 07:26:55 -0400 Subject: [PATCH 021/359] API/ENH: union Categorical Author: Chris Closes #13361 from chris-b1/union-categorical and squashes the following commits: 568784f [Chris] versionadded; empty case 17209f9 [Chris] Doc updates; use Index.append 4499cda [Chris] move tests, adress feedback 77e7963 [Chris] doc notes 7b37c34 [Chris] cleanup impl, add asv ccaeb76 [Chris] API/ENH: union Categorical --- asv_bench/benchmarks/categoricals.py | 15 ++++++++ doc/source/categorical.rst | 25 ++++++++++++++ doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/tools/tests/test_concat.py | 51 +++++++++++++++++++++++++++- pandas/types/concat.py | 51 ++++++++++++++++++++++++++++ pandas/util/testing.py | 36 +++++++++++++++++--- 6 files changed, 173 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 244af3a577fe2..bf1e1b3f40ab0 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,8 @@ from .pandas_vb_common import * +try: + from pandas.types.concat import union_categoricals +except ImportError: + pass import string @@ -12,6 +16,17 @@ def time_concat_categorical(self): concat([self.s, self.s]) +class union_categorical(object): + goal_time = 0.2 + + def setup(self): + self.a = pd.Categorical((list('aabbcd') * 1000000)) + self.b = pd.Categorical((list('bbcdjk') * 1000000)) + + def time_union_categorical(self): + union_categoricals([self.a, self.b]) + + class categorical_value_counts(object): goal_time = 1 diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index b518bc947c2da..e971f1f28903f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -648,6 +648,31 @@ In this case the categories are not the same and so an error is raised: The same applies to ``df.append(df_different)``. +.. _categorical.union: + +Unioning +~~~~~~~~ + +.. versionadded:: 0.18.2 + +If you want to combine categoricals that do not necessarily have +the same categories, the `union_categorical` function will +combine a list-like of categoricals. The new categories +will be the union of the categories being combined. + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +.. note:: + + `union_categoricals` only works with unordered categoricals + and will raise if any are ordered. 
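For example, a minimal sketch of the error raised when ordered categoricals are passed, matching the note above::

    import pandas as pd
    from pandas.types.concat import union_categoricals

    a = pd.Categorical(['a', 'b'], ordered=True)
    b = pd.Categorical(['b', 'c'], ordered=True)

    # Raises TypeError: Can only combine unordered Categoricals
    union_categoricals([a, b])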
+ + Getting Data In/Out ------------------- diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 5aee616241406..749eb088b6ef7 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -91,7 +91,7 @@ Other enhancements - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - +- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 9d9b0635e0f35..a8c86657a48cc 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -9,7 +9,8 @@ from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex) + DatetimeIndex, Categorical) +from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, @@ -919,6 +920,54 @@ def test_concat_keys_with_none(self): keys=['b', 'c', 'd', 'e']) tm.assert_frame_equal(result, expected) + def test_union_categorical(self): + # GH 13361 + data = [ + (list('abc'), list('abd'), list('abcabd')), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + + (pd.date_range('2014-01-01', '2014-01-05'), + pd.date_range('2014-01-06', '2014-01-07'), + pd.date_range('2014-01-01', '2014-01-07')), + + (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), + pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), + pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), + + (pd.period_range('2014-01-01', '2014-01-05'), + pd.period_range('2014-01-06', '2014-01-07'), + pd.period_range('2014-01-01', '2014-01-07')), + ] + + for a, b, combined in data: + result = union_categoricals([Categorical(a), Categorical(b)]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) + + # new categories ordered by appearance + s = Categorical(['x', 'y', 'z']) + s2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([s, s2]).categories + expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_index_equal(result, expected) + + # can't be ordered + s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 4]) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + with tm.assertRaises(ValueError): + union_categoricals([]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 5cd7abb6889b7..53db9ddf79a5c 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -201,6 +201,57 @@ def convert_categorical(x): return 
Categorical(concatted, rawcats) +def union_categoricals(to_union): + """ + Combine list-like of Categoricals, unioning categories. All + must have the same dtype, and none can be ordered. + + .. versionadded 0.18.2 + + Parameters + ---------- + to_union : list-like of Categoricals + + Returns + ------- + Categorical + A single array, categories will be ordered as they + appear in the list + + Raises + ------ + TypeError + If any of the categoricals are ordered or all do not + have the same dtype + ValueError + Emmpty list of categoricals passed + """ + from pandas import Index, Categorical + + if len(to_union) == 0: + raise ValueError('No Categoricals to union') + + first = to_union[0] + if any(c.ordered for c in to_union): + raise TypeError("Can only combine unordered Categoricals") + + if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) + for c in to_union): + raise TypeError("dtype of categories must be the same") + + cats = first.categories + unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() + categories = Index(unique_cats) + + new_codes = [] + for c in to_union: + indexer = categories.get_indexer(c.categories) + new_codes.append(indexer.take(c.codes)) + codes = np.concatenate(new_codes) + return Categorical(codes, categories=categories, ordered=False, + fastpath=True) + + def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 03ccfcab24f58..d13873fcf2c84 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -963,14 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical'): + obj='Categorical', check_category_order=True): + """Test that categoricals are eqivalent + + Parameters + ---------- + left, right : Categorical + Categoricals to compare + check_dtype : bool, default True + Check that integer dtype of the codes are the same + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. 
+ """ assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - assert_index_equal(left.categories, right.categories, - obj='{0}.categories'.format(obj)) - assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, - obj='{0}.codes'.format(obj)) + if check_category_order: + assert_index_equal(left.categories, right.categories, + obj='{0}.categories'.format(obj)) + assert_numpy_array_equal(left.codes, right.codes, + check_dtype=check_dtype, + obj='{0}.codes'.format(obj)) + else: + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), + obj='{0}.categories'.format(obj)) + assert_index_equal(left.categories.take(left.codes), + right.categories.take(right.codes), + obj='{0}.values'.format(obj)) assert_attr_equal('ordered', left, right, obj=obj) From d405bf26a03b3ecdaa30b62c4b1dd5cb22930329 Mon Sep 17 00:00:00 2001 From: Geraint Duck Date: Wed, 8 Jun 2016 16:27:07 +0100 Subject: [PATCH 022/359] BUG: Fix for Series.str.extractall (single group with quantifier) closes #13382 closes #13397 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/strings.py | 2 ++ pandas/tests/test_strings.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 749eb088b6ef7..8b80901774828 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -336,6 +336,7 @@ Bug Fixes - Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) +- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5b1b8bd05af42..2f9f8ec936e78 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -708,6 +708,8 @@ def str_extractall(arr, pat, flags=0): subject_key = (subject_key, ) for match_i, match_tuple in enumerate(regex.findall(subject)): + if isinstance(match_tuple, compat.string_types): + match_tuple = (match_tuple,) na_tuple = [np.NaN if group == "" else group for group in match_tuple] match_list.append(na_tuple) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 3d1851966afd0..73f9809a7f042 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -977,6 +977,20 @@ def test_extractall_single_group(self): e = DataFrame(['a', 'b', 'd', 'c'], i) tm.assert_frame_equal(r, e) + def test_extractall_single_group_with_quantifier(self): + # extractall(one un-named group with quantifier) returns + # DataFrame with one un-named column (GH13382). + s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name') + r = s.str.extractall(r'([a-z]+)') + i = MultiIndex.from_tuples([ + (0, 0), + (1, 0), + (2, 0), + (2, 1), + ], names=(None, "match")) + e = DataFrame(['ab', 'abc', 'd', 'cd'], i) + tm.assert_frame_equal(r, e) + def test_extractall_no_matches(self): s = Series(['a3', 'b3', 'd4c2'], name='series_name') # one un-named group. From 3eb4784dbf1757f2c3383beff2f70c5da47a93fe Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 9 Jun 2016 07:35:59 -0400 Subject: [PATCH 023/359] BUG: don't raise on empty usecols Title is self-explanatory. 
Author: gfyoung Closes #13402 from gfyoung/empty-usecols-bug and squashes the following commits: 8eed8d1 [gfyoung] BUG: don't raise on empty usecols --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 3 ++- pandas/io/tests/parser/usecols.py | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 8b80901774828..105194e504f45 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -358,6 +358,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4e954979f7d08..475eb73812666 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -944,7 +944,8 @@ def _validate_usecols_arg(usecols): if usecols is not None: usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('integer', 'string', 'unicode'): + if usecols_dtype not in ('empty', 'integer', + 'string', 'unicode'): raise ValueError(msg) return usecols diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 0d3ae95f0d1d4..8e34018df279b 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -354,3 +354,10 @@ def test_usecols_with_multibyte_unicode_characters(self): df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい']) tm.assert_frame_equal(df, expected) + + def test_empty_usecols(self): + # should not raise + data = 'a,b,c\n1,2,3\n4,5,6' + expected = DataFrame() + result = self.read_csv(StringIO(data), usecols=set([])) + tm.assert_frame_equal(result, expected) From 62b4327c9b60d890b1d404cddf3a4244254346a8 Mon Sep 17 00:00:00 2001 From: harshul1610 Date: Thu, 9 Jun 2016 15:15:01 -0400 Subject: [PATCH 024/359] =?UTF-8?q?DOC:=20typo=20fix=20and=20adding=20corr?= =?UTF-8?q?ect=20command=20for=20environment=20deactivation=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: harshul1610 Closes #13413 from harshul1610/master and squashes the following commits: 6108138 [harshul1610] DOC- typo fix and adding correct command for environment deactivation for windows and linux --- doc/source/contributing.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index a9b86925666b7..3d39656faafa4 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -188,7 +188,7 @@ To work in this environment, Windows users should ``activate`` it as follows:: activate pandas_dev -Mac OSX and Linux users should use:: +Mac OSX / Linux users should use:: source activate pandas_dev @@ -198,10 +198,14 @@ To view your environments:: conda info -e -To return to you home root environment:: +To return to your home root environment in Windows:: deactivate +To return to your home root environment in OSX / Linux:: + + source deactivate + See the full conda docs `here `__. At this point you can easily do an *in-place* install, as detailed in the next section. 
From 07761c519154a6ed8a9345512476152226499ba6 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 14 Jun 2016 09:48:59 -0400 Subject: [PATCH 025/359] CLN: refactor numeric index creation to all numeric sub-classes Propogate name attribute closes #12309 Author: Pietro Battiston Closes #13205 from toobaz/numindexname and squashes the following commits: 9d93fea [Pietro Battiston] TST: test "check_same" in assert_numpy_array_equal() bea8101 [Pietro Battiston] BUG: Make DateTimeIndex copy datetime64[ns] data on copy=True 3320727 [Pietro Battiston] DOC: What's new 757d105 [Pietro Battiston] TST: Use assert_numpy_array_equal 6d75e55 [Pietro Battiston] BUG: It makes sense to also catch ValueErrors b6c9233 [Pietro Battiston] BUG: Common NumericIndex.__new__, fixed name handling in indices --- doc/source/whatsnew/v0.18.2.txt | 3 +- pandas/indexes/base.py | 27 +++++ pandas/indexes/category.py | 3 + pandas/indexes/numeric.py | 110 +++++++++------------ pandas/tests/frame/test_block_internals.py | 2 + pandas/tests/indexes/common.py | 47 +++++++++ pandas/tests/indexes/test_base.py | 1 + pandas/tests/indexes/test_category.py | 14 +++ pandas/tests/indexes/test_numeric.py | 4 +- pandas/tests/test_testing.py | 11 +++ pandas/tseries/index.py | 18 ++-- pandas/tseries/period.py | 5 +- pandas/tseries/tdi.py | 5 +- pandas/tseries/tests/test_timedeltas.py | 2 +- pandas/util/testing.py | 12 ++- 15 files changed, 185 insertions(+), 79 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 105194e504f45..e469cbf79b31a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -348,7 +348,8 @@ Bug Fixes - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) - +- Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) +- Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 82f16becbd511..96472698ba9d9 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -376,6 +376,33 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): pass return Index(values, **attributes) + def _deepcopy_if_needed(self, orig, copy=False): + """ + .. versionadded:: 0.18.2 + + Make a copy of self if data coincides (in memory) with orig. + Subclasses should override this if self._base is not an ndarray. + + Parameters + ---------- + orig : ndarray + other ndarray to compare self._data against + copy : boolean, default False + when False, do not run any check, just return self + + Returns + ------- + A copy of self if needed, otherwise self : Index + """ + if copy: + # Retrieve the "base objects", i.e. 
the original memory allocations + orig = orig if orig.base is None else orig.base + new = self._data if self._data.base is None else self._data.base + if orig is new: + return self.copy(deep=True) + + return self + def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index e877e43bcc603..4c9ca43f7f25d 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -46,6 +46,9 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if fastpath: return cls._simple_new(data, name=name) + if name is None and hasattr(data, 'name'): + name = data.name + if isinstance(data, com.ABCCategorical): data = cls._create_categorical(cls, data, categories, ordered) elif isinstance(data, CategoricalIndex): diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 0deaf4da9b2bb..89fc05fdcc5f5 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -22,6 +22,28 @@ class NumericIndex(Index): """ _is_numeric_dtype = True + def __new__(cls, data=None, dtype=None, copy=False, name=None, + fastpath=False): + + if fastpath: + return cls._simple_new(data, name=name) + + # isscalar, generators handled in coerce_to_ndarray + data = cls._coerce_to_ndarray(data) + + if issubclass(data.dtype.type, compat.string_types): + cls._string_data_error(data) + + if copy or not is_dtype_equal(data.dtype, cls._default_dtype): + subarr = np.array(data, dtype=cls._default_dtype, copy=copy) + cls._assert_safe_casting(data, subarr) + else: + subarr = data + + if name is None and hasattr(data, 'name'): + name = data.name + return cls._simple_new(subarr, name=name) + def _maybe_cast_slice_bound(self, label, side, kind): """ This function should be overloaded in subclasses that allow non-trivial @@ -55,6 +77,15 @@ def _convert_tolerance(self, tolerance): raise ValueError('tolerance argument for %s must be numeric: %r' % (type(self).__name__, tolerance)) + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Subclasses need to override this only if the process of casting data + from some accepted dtype to the internal dtype(s) bears the risk of + truncation (e.g. float to int). + """ + pass + class Int64Index(NumericIndex): """ @@ -90,29 +121,7 @@ class Int64Index(NumericIndex): _engine_type = _index.Int64Engine - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False, **kwargs): - - if fastpath: - return cls._simple_new(data, name=name) - - # isscalar, generators handled in coerce_to_ndarray - data = cls._coerce_to_ndarray(data) - - if issubclass(data.dtype.type, compat.string_types): - cls._string_data_error(data) - - elif issubclass(data.dtype.type, np.integer): - dtype = np.int64 - subarr = np.array(data, dtype=dtype, copy=copy) - else: - subarr = np.array(data, dtype=np.int64, copy=copy) - if len(data) > 0: - if (subarr != data).any(): - raise TypeError('Unsafe NumPy casting to integer, you must' - ' explicitly cast') - - return cls._simple_new(subarr, name=name) + _default_dtype = np.int64 @property def inferred_type(self): @@ -155,17 +164,22 @@ def equals(self, other): if self.is_(other): return True - try: - return com.array_equivalent(com._values_from_object(self), - com._values_from_object(other)) - except TypeError: - # e.g. 
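A rough illustration of what the shared ``NumericIndex`` constructor and ``_assert_safe_casting`` above enforce::

    import pandas as pd

    pd.Int64Index([1, 2, 3])        # integer data is used as-is
    pd.Int64Index([1.0, 2.0, 3.0])  # floats that cast losslessly are accepted

    # Raises TypeError: Unsafe NumPy casting, you must explicitly cast
    pd.Int64Index([0.5, 1.5])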
fails in numpy 1.6 with DatetimeIndex #1681 - return False + return com.array_equivalent(com._values_from_object(self), + com._values_from_object(other)) def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None return Int64Index(joined, name=name) + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as ints. + """ + if not issubclass(data.dtype.type, np.integer): + if not np.array_equal(data, subarr): + raise TypeError('Unsafe NumPy casting, you must ' + 'explicitly cast') Int64Index._add_numeric_methods() Int64Index._add_logical_methods() @@ -200,39 +214,7 @@ class Float64Index(NumericIndex): _inner_indexer = _algos.inner_join_indexer_float64 _outer_indexer = _algos.outer_join_indexer_float64 - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False, **kwargs): - - if fastpath: - return cls._simple_new(data, name) - - data = cls._coerce_to_ndarray(data) - - if issubclass(data.dtype.type, compat.string_types): - cls._string_data_error(data) - - if dtype is None: - dtype = np.float64 - dtype = np.dtype(dtype) - - # allow integer / object dtypes to be passed, but coerce to float64 - if dtype.kind in ['i', 'O', 'f']: - dtype = np.float64 - - else: - raise TypeError("cannot support {0} dtype in " - "Float64Index".format(dtype)) - - try: - subarr = np.array(data, dtype=dtype, copy=copy) - except: - raise TypeError('Unsafe NumPy casting, you must explicitly cast') - - # coerce to float64 for storage - if subarr.dtype != np.float64: - subarr = subarr.astype(np.float64) - - return cls._simple_new(subarr, name) + _default_dtype = np.float64 @property def inferred_type(self): @@ -339,8 +321,7 @@ def equals(self, other): return False left, right = self._values, other._values return ((left == right) | (self._isnan & other._isnan)).all() - except TypeError: - # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 + except (TypeError, ValueError): return False def __contains__(self, other): @@ -392,6 +373,5 @@ def isin(self, values, level=None): return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) - Float64Index._add_numeric_methods() Float64Index._add_logical_methods_disabled() diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 0421cf2ba42d2..38163d89355e9 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -372,11 +372,13 @@ def test_consolidate_datetime64(self): ser_starting.index = ser_starting.values ser_starting = ser_starting.tz_localize('US/Eastern') ser_starting = ser_starting.tz_convert('UTC') + ser_starting.index.name = 'starting' ser_ending = df.ending ser_ending.index = ser_ending.values ser_ending = ser_ending.tz_localize('US/Eastern') ser_ending = ser_ending.tz_convert('UTC') + ser_ending.index.name = 'ending' df.starting = ser_starting.index df.ending = ser_ending.index diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e342eee2aabbb..d6f7493bb25f9 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -205,6 +205,53 @@ def test_hash_error(self): type(ind).__name__): hash(ind) + def test_copy_name(self): + # Check that "name" argument passed at initialization is honoured + # GH12309 + for name, index in compat.iteritems(self.indices): + if isinstance(index, MultiIndex): + continue + + first = index.__class__(index, copy=True, name='mario') + second = first.__class__(first, copy=False) + + # Even though "copy=False", we want a new object. + self.assertIsNot(first, second) + # Not using tm.assert_index_equal() since names differ: + self.assertTrue(index.equals(first)) + + self.assertEqual(first.name, 'mario') + self.assertEqual(second.name, 'mario') + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + if not isinstance(index, CategoricalIndex): # See GH13365 + s3 = s1 * s2 + self.assertEqual(s3.index.name, 'mario') + + def test_ensure_copied_data(self): + # Check the "copy" argument of each Index.__new__ is honoured + # GH12309 + for name, index in compat.iteritems(self.indices): + init_kwargs = {} + if isinstance(index, PeriodIndex): + # Needs "freq" specification: + init_kwargs['freq'] = index.freq + elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)): + # RangeIndex cannot be initialized from data + # MultiIndex and CategoricalIndex are tested separately + continue + + index_type = index.__class__ + result = index_type(index.values, copy=True, **init_kwargs) + tm.assert_index_equal(index, result) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='copy') + + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='same') + def test_copy_and_deepcopy(self): from copy import copy, deepcopy diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index aa007c039f8ee..d535eaa238567 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -172,6 +172,7 @@ def test_constructor_from_series(self): df['date'] = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] result = DatetimeIndex(df['date'], freq='MS') + expected.name = 'date' self.assert_index_equal(result, expected) self.assertEqual(df['date'].dtype, object) diff --git 
a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c64b1e9fc4af8..e066842c33126 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -507,6 +507,20 @@ def test_identical(self): self.assertTrue(ci1.identical(ci1.copy())) self.assertFalse(ci1.identical(ci2)) + def test_ensure_copied_data(self): + # Check the "copy" argument of each Index.__new__ is honoured + # GH12309 + # Must be tested separately from other indexes because + # self.value is not an ndarray + _base = lambda ar : ar if ar.base is None else ar.base + for index in self.indices.values(): + result = CategoricalIndex(index.values, copy=True) + tm.assert_index_equal(index, result) + self.assertIsNot(_base(index.values), _base(result.values)) + + result = CategoricalIndex(index.values, copy=False) + self.assertIs(_base(index.values), _base(result.values)) + def test_equals(self): ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 5eac0bc870756..90025fa014b78 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -169,8 +169,8 @@ def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) self.assertIsInstance(index, Float64Index) - self.assertTrue((index.values == np.array( - [1, 2, 3, 4, 5], dtype='float64')).all()) + expected = np.array([1, 2, 3, 4, 5], dtype='float64') + self.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) self.assertIsInstance(index, Float64Index) index = Float64Index([1., 2, 3, 4, 5]) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index c4e864a909c03..c242213ee226f 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -315,6 +315,17 @@ def test_numpy_array_equal_object_message(self): with assertRaisesRegexp(AssertionError, expected): assert_almost_equal(a, b) + def test_numpy_array_equal_copy_flag(self): + a = np.array([1, 2, 3]) + b = a.copy() + c = a.view() + expected = 'array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)' + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(a, b, check_same='same') + expected = 'array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)' + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(a, c, check_same='copy') + def test_assert_almost_equal_iterable_message(self): expected = """Iterable are different diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 83ab5d2a2bce4..af60a2d028c93 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -225,6 +225,15 @@ def __new__(cls, data=None, verify_integrity=True, normalize=False, closed=None, ambiguous='raise', dtype=None, **kwargs): + # This allows to later ensure that the 'copy' parameter is honored: + if isinstance(data, Index): + ref_to_data = data._data + else: + ref_to_data = data + + if name is None and hasattr(data, 'name'): + name = data.name + dayfirst = kwargs.pop('dayfirst', None) yearfirst = kwargs.pop('yearfirst', None) @@ -302,7 +311,7 @@ def __new__(cls, data=None, raise TypeError("Already tz-aware, use tz_convert " "to convert.") - return data + return data._deepcopy_if_needed(ref_to_data, copy) if issubclass(data.dtype.type, compat.string_types): data = tslib.parse_str_array_to_datetime(data, freq=freq, @@ -335,10 +344,7 @@ def __new__(cls, data=None, elif data.dtype == _INT64_DTYPE: if 
isinstance(data, Int64Index): raise TypeError('cannot convert Int64Index->DatetimeIndex') - if copy: - subarr = np.asarray(data, dtype=_NS_DTYPE) - else: - subarr = data.view(_NS_DTYPE) + subarr = data.view(_NS_DTYPE) else: if isinstance(data, (ABCSeries, Index)): values = data._values @@ -414,7 +420,7 @@ def __new__(cls, data=None, if inferred: subarr.offset = to_offset(inferred) - return subarr + return subarr._deepcopy_if_needed(ref_to_data, copy) @classmethod def _generate(cls, start, end, periods, name, offset, diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index c3deee5f6dab2..8a3ac1f080c90 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -182,6 +182,9 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, raise ValueError('Periods must be a number, got %s' % str(periods)) + if name is None and hasattr(data, 'name'): + name = data.name + if data is None: if ordinal is not None: data = np.asarray(ordinal, dtype=np.int64) @@ -190,7 +193,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, freq, kwargs) else: ordinal, freq = cls._from_arraylike(data, freq, tz) - data = np.array(ordinal, dtype=np.int64, copy=False) + data = np.array(ordinal, dtype=np.int64, copy=copy) return cls._simple_new(data, name=name, freq=freq) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 3e12cf14e7485..84f357481a28e 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -138,8 +138,9 @@ def __new__(cls, data=None, unit=None, if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: - data = data.copy() - return data + return data.copy() + else: + return data._shallow_copy() freq_infer = False if not isinstance(freq, DateOffset): diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 10276137b42a1..e515ba624d203 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1739,7 +1739,7 @@ def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' for kind in kinds: joined = index.join(index, how=kind) - self.assertIs(index, joined) + tm.assert_index_equal(index, joined) def test_factorize(self): idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d13873fcf2c84..8c4d2f838ee8d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1017,7 +1017,7 @@ def raise_assert_detail(obj, message, left, right): def assert_numpy_array_equal(left, right, strict_nan=False, check_dtype=True, err_msg=None, - obj='numpy array'): + obj='numpy array', check_same=None): """ Checks that 'np.ndarray' is equivalent Parameters @@ -1033,6 +1033,8 @@ def assert_numpy_array_equal(left, right, strict_nan=False, obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message + check_same : None|'copy'|'same', default None + Ensure "left" and "right refer/do not refer to the same memory area """ # instance validation @@ -1042,6 +1044,14 @@ def assert_numpy_array_equal(left, right, strict_nan=False, assertIsInstance(left, np.ndarray, '[ndarray] ') assertIsInstance(right, np.ndarray, '[ndarray] ') + def _get_base(obj): + return obj.base if getattr(obj, 'base', None) is not None else obj + + if check_same == 'same': + assertIs(_get_base(left), _get_base(right)) + elif check_same == 'copy': + assertIsNot(_get_base(left), _get_base(right)) + def _raise(left, 
right, err_msg): if err_msg is None: if left.shape != right.shape: From bd66592d7d1c10d88749c9fe42f770ded5d6a0d3 Mon Sep 17 00:00:00 2001 From: Adrien Emery Date: Tue, 14 Jun 2016 17:20:22 -0400 Subject: [PATCH 026/359] ENH: Add SemiMonthEnd and SemiMonthBegin offsets #1543 closes #1543 Author: Adrien Emery Closes #13315 from adrienemery/semi-monthly-offset and squashes the following commits: fe221b2 [Adrien Emery] ENH: Add SemiMonthEnd and SemiMonthBegin offsets #1543 --- asv_bench/benchmarks/timeseries.py | 60 ++++ doc/source/timeseries.rst | 4 + doc/source/whatsnew/v0.18.2.txt | 37 +++ pandas/tseries/offsets.py | 214 ++++++++++++- pandas/tseries/tests/test_frequencies.py | 63 ++++ pandas/tseries/tests/test_offsets.py | 372 ++++++++++++++++++++++- pandas/tseries/tests/test_timeseries.py | 13 +- 7 files changed, 750 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index bdf193cd1f3d3..2b0d098670858 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1155,3 +1155,63 @@ def setup(self): def time_timeseries_year_incr(self): (self.date + self.year) + + +class timeseries_semi_month_offset(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + # date is not on an offset which will be slowest case + self.date = dt.datetime(2011, 1, 2) + self.semi_month_end = pd.offsets.SemiMonthEnd() + self.semi_month_begin = pd.offsets.SemiMonthBegin() + + def time_semi_month_end_apply(self): + self.semi_month_end.apply(self.date) + + def time_semi_month_end_incr(self): + self.date + self.semi_month_end + + def time_semi_month_end_incr_n(self): + self.date + 10 * self.semi_month_end + + def time_semi_month_end_decr(self): + self.date - self.semi_month_end + + def time_semi_month_end_decr_n(self): + self.date - 10 * self.semi_month_end + + def time_semi_month_end_apply_index(self): + self.semi_month_end.apply_index(self.rng) + + def time_semi_month_end_incr_rng(self): + self.rng + self.semi_month_end + + def time_semi_month_end_decr_rng(self): + self.rng - self.semi_month_end + + def time_semi_month_begin_apply(self): + self.semi_month_begin.apply(self.date) + + def time_semi_month_begin_incr(self): + self.date + self.semi_month_begin + + def time_semi_month_begin_incr_n(self): + self.date + 10 * self.semi_month_begin + + def time_semi_month_begin_decr(self): + self.date - self.semi_month_begin + + def time_semi_month_begin_decr_n(self): + self.date - 10 * self.semi_month_begin + + def time_semi_month_begin_apply_index(self): + self.semi_month_begin.apply_index(self.rng) + + def time_semi_month_begin_incr_rng(self): + self.rng + self.semi_month_begin + + def time_semi_month_begin_decr_rng(self): + self.rng - self.semi_month_begin diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 62601821488d3..7e832af14c051 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -589,6 +589,8 @@ frequency increment. Specific offset logic like "month", "business day", or BMonthBegin, "business month begin" CBMonthEnd, "custom business month end" CBMonthBegin, "custom business month begin" + SemiMonthEnd, "15th (or other day_of_month) and calendar month end" + SemiMonthBegin, "15th (or other day_of_month) and calendar month begin" QuarterEnd, "calendar quarter end" QuarterBegin, "calendar quarter begin" BQuarterEnd, "business quarter end" @@ -967,9 +969,11 @@ frequencies. 
We will refer to these aliases as *offset aliases* "D", "calendar day frequency" "W", "weekly frequency" "M", "month end frequency" + "SM", "semi-month end frequency (15th and end of month)" "BM", "business month end frequency" "CBM", "custom business month end frequency" "MS", "month start frequency" + "SMS", "semi-month start frequency (1st and 15th)" "BMS", "business month start frequency" "CBMS", "custom business month start frequency" "Q", "quarter end frequency" diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index e469cbf79b31a..f1890fd3a23d1 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -51,6 +51,43 @@ New behaviour: In [2]: pd.read_csv(StringIO(data), names=names) +.. _whatsnew_0182.enhancements.semi_month_offsets: + +Semi-Month Offsets +^^^^^^^^^^^^^^^^^^ + +Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS'). +These provide date offsets anchored (by default) to the 15th and end of month, and 15th and 1st of month respectively. +(:issue:`1543`) + +.. ipython:: python + + from pandas.tseries.offsets import SemiMonthEnd, SemiMonthBegin + +SemiMonthEnd: + +.. ipython:: python + + Timestamp('2016-01-01') + SemiMonthEnd() + + pd.date_range('2015-01-01', freq='SM', periods=4) + +SemiMonthBegin: + +.. ipython:: python + + Timestamp('2016-01-01') + SemiMonthBegin() + + pd.date_range('2015-01-01', freq='SMS', periods=4) + +Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. + +.. ipython:: python + + pd.date_range('2015-01-01', freq='SMS-16', periods=4) + + pd.date_range('2015-01-01', freq='SM-14', periods=4) + .. _whatsnew_0182.enhancements.other: Other enhancements diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 7d3255add4f64..f4b75ddd72126 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -4,7 +4,8 @@ import numpy as np from pandas.tseries.tools import to_datetime, normalize_date -from pandas.core.common import ABCSeries, ABCDatetimeIndex, ABCPeriod +from pandas.core.common import (ABCSeries, ABCDatetimeIndex, ABCPeriod, + AbstractMethodError) # import after tools, dateutil check from dateutil.relativedelta import relativedelta, weekday @@ -18,6 +19,7 @@ __all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay', 'CBMonthEnd', 'CBMonthBegin', 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', 'BusinessHour', 'CustomBusinessHour', 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd', 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd', @@ -1160,6 +1162,214 @@ def onOffset(self, dt): _prefix = 'MS' +class SemiMonthOffset(DateOffset): + _adjust_dst = True + _default_day_of_month = 15 + _min_day_of_month = 2 + + def __init__(self, n=1, day_of_month=None, normalize=False, **kwds): + if day_of_month is None: + self.day_of_month = self._default_day_of_month + else: + self.day_of_month = int(day_of_month) + if not self._min_day_of_month <= self.day_of_month <= 27: + raise ValueError('day_of_month must be ' + '{}<=day_of_month<=27, got {}'.format( + self._min_day_of_month, self.day_of_month)) + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + self.kwds['day_of_month'] = self.day_of_month + + @classmethod + def _from_name(cls, suffix=None): + return cls(day_of_month=suffix) + + @property + def rule_code(self): + suffix = '-{}'.format(self.day_of_month) + return self._prefix + suffix + + @apply_wraps + def apply(self, 
other): + n = self.n + if not self.onOffset(other): + _, days_in_month = tslib.monthrange(other.year, other.month) + if 1 < other.day < self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n > 0: + # rollforward so subtract 1 + n -= 1 + elif self.day_of_month < other.day < days_in_month: + other += relativedelta(day=self.day_of_month) + if n < 0: + # rollforward in the negative direction so add 1 + n += 1 + elif n == 0: + n = 1 + + return self._apply(n, other) + + def _apply(self, n, other): + """Handle specific apply logic for child classes""" + raise AbstractMethodError(self) + + @apply_index_wraps + def apply_index(self, i): + # determine how many days away from the 1st of the month we are + days_from_start = i.to_perioddelta('M').asi8 + delta = Timedelta(days=self.day_of_month - 1).value + + # get boolean array for each element before the day_of_month + before_day_of_month = days_from_start < delta + + # get boolean array for each element after the day_of_month + after_day_of_month = days_from_start > delta + + # determine the correct n for each date in i + roll = self._get_roll(i, before_day_of_month, after_day_of_month) + + # isolate the time since it will be striped away one the next line + time = i.to_perioddelta('D') + + # apply the correct number of months + i = (i.to_period('M') + (roll // 2)).to_timestamp() + + # apply the correct day + i = self._apply_index_days(i, roll) + + return i + time + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + """Return an array with the correct n for each date in i. + + The roll array is based on the fact that i gets rolled back to + the first day of the month. + """ + raise AbstractMethodError(self) + + def _apply_index_days(self, i, roll): + """Apply the correct day for each date in i""" + raise AbstractMethodError(self) + + +class SemiMonthEnd(SemiMonthOffset): + """ + Two DateOffset's per month repeating on the last + day of the month and day_of_month. + + .. 
versionadded:: 0.18.2 + + Parameters + ---------- + n: int + normalize : bool, default False + day_of_month: int, {1, 3,...,27}, default 15 + """ + _prefix = 'SM' + _min_day_of_month = 1 + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + _, days_in_month = tslib.monthrange(dt.year, dt.month) + return dt.day in (self.day_of_month, days_in_month) + + def _apply(self, n, other): + # if other.day is not day_of_month move to day_of_month and update n + if other.day < self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n > 0: + n -= 1 + elif other.day > self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n == 0: + n = 1 + else: + n += 1 + + months = n // 2 + day = 31 if n % 2 else self.day_of_month + return other + relativedelta(months=months, day=day) + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + n = self.n + is_month_end = i.is_month_end + if n > 0: + roll_end = np.where(is_month_end, 1, 0) + roll_before = np.where(before_day_of_month, n, n + 1) + roll = roll_end + roll_before + elif n == 0: + roll_after = np.where(after_day_of_month, 2, 0) + roll_before = np.where(~after_day_of_month, 1, 0) + roll = roll_before + roll_after + else: + roll = np.where(after_day_of_month, n + 2, n + 1) + return roll + + def _apply_index_days(self, i, roll): + i += (roll % 2) * Timedelta(days=self.day_of_month).value + return i + Timedelta(days=-1) + + +class SemiMonthBegin(SemiMonthOffset): + """ + Two DateOffset's per month repeating on the first + day of the month and day_of_month. + + .. versionadded:: 0.18.2 + + Parameters + ---------- + n: int + normalize : bool, default False + day_of_month: int, {2, 3,...,27}, default 15 + """ + _prefix = 'SMS' + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.day in (1, self.day_of_month) + + def _apply(self, n, other): + # if other.day is not day_of_month move to day_of_month and update n + if other.day < self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n == 0: + n = -1 + else: + n -= 1 + elif other.day > self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n == 0: + n = 1 + elif n < 0: + n += 1 + + months = n // 2 + n % 2 + day = 1 if n % 2 else self.day_of_month + return other + relativedelta(months=months, day=day) + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + n = self.n + is_month_start = i.is_month_start + if n > 0: + roll = np.where(before_day_of_month, n, n + 1) + elif n == 0: + roll_start = np.where(is_month_start, 0, 1) + roll_after = np.where(after_day_of_month, 1, 0) + roll = roll_start + roll_after + else: + roll_after = np.where(after_day_of_month, n + 2, n + 1) + roll_start = np.where(is_month_start, -1, 0) + roll = roll_after + roll_start + return roll + + def _apply_index_days(self, i, roll): + return i + (roll % 2) * Timedelta(days=self.day_of_month - 1).value + + class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" @@ -2720,6 +2930,8 @@ def generate_range(start=None, end=None, periods=None, CustomBusinessHour, # 'CBH' MonthEnd, # 'M' MonthBegin, # 'MS' + SemiMonthEnd, # 'SM' + SemiMonthBegin, # 'SMS' Week, # 'W' Second, # 'S' Minute, # 'T' diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 528b9cc0b08a9..1f06b7ad4361b 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -52,6 +52,26 @@ def 
test_to_offset_multiple(): expected = offsets.Nano(2800) assert (result == expected) + freqstr = '2SM' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2) + assert (result == expected) + + freqstr = '2SM-16' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2, day_of_month=16) + assert (result == expected) + + freqstr = '2SMS-14' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2, day_of_month=14) + assert (result == expected) + + freqstr = '2SMS-15' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2) + assert (result == expected) + # malformed try: frequencies.to_offset('2h20m') @@ -70,6 +90,14 @@ def test_to_offset_negative(): result = frequencies.to_offset(freqstr) assert (result.n == -310) + freqstr = '-2SM' + result = frequencies.to_offset(freqstr) + assert (result.n == -2) + + freqstr = '-1SMS' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + def test_to_offset_leading_zero(): freqstr = '00H 00T 01S' @@ -137,6 +165,41 @@ def test_anchored_shortcuts(): expected = offsets.QuarterEnd(startingMonth=5) assert (result1 == expected) + result1 = frequencies.to_offset('SM') + result2 = frequencies.to_offset('SM-15') + expected = offsets.SemiMonthEnd(day_of_month=15) + assert (result1 == expected) + assert (result2 == expected) + + result = frequencies.to_offset('SM-1') + expected = offsets.SemiMonthEnd(day_of_month=1) + assert (result == expected) + + result = frequencies.to_offset('SM-27') + expected = offsets.SemiMonthEnd(day_of_month=27) + assert (result == expected) + + result = frequencies.to_offset('SMS-2') + expected = offsets.SemiMonthBegin(day_of_month=2) + assert (result == expected) + + result = frequencies.to_offset('SMS-27') + expected = offsets.SemiMonthBegin(day_of_month=27) + assert (result == expected) + + # ensure invalid cases fail as expected + invalid_anchors = ['SM-0', 'SM-28', 'SM-29', + 'SM-FOO', 'BSM', 'SM--1' + 'SMS-1', 'SMS-28', 'SMS-30', + 'SMS-BAR', 'BSMS', 'SMS--2'] + for invalid_anchor in invalid_anchors: + try: + frequencies.to_offset(invalid_anchor) + except ValueError: + pass + else: + raise AssertionError(invalid_anchor) + def test_get_rule_month(): result = frequencies._get_rule_month('W') diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index ec88acc421cdb..5965a661699a6 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -11,9 +11,9 @@ from pandas.compat.numpy import np_datetime64_compat from pandas.core.datetools import (bday, BDay, CDay, BQuarterEnd, BMonthEnd, BusinessHour, CustomBusinessHour, - CBMonthEnd, CBMonthBegin, - BYearEnd, MonthEnd, MonthBegin, BYearBegin, - QuarterBegin, + CBMonthEnd, CBMonthBegin, BYearEnd, + MonthEnd, MonthBegin, SemiMonthBegin, + SemiMonthEnd, BYearBegin, QuarterBegin, BQuarterBegin, BMonthBegin, DateOffset, Week, YearBegin, YearEnd, Hour, Minute, Second, Day, Micro, Milli, Nano, Easter, @@ -21,6 +21,7 @@ QuarterEnd, to_datetime, normalize_date, get_offset, get_standard_freq) +from pandas.core.series import Series from pandas.tseries.frequencies import (_offset_map, get_freq_code, _get_freq_str) from pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache @@ -182,6 +183,8 @@ def setUp(self): 'BusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), 'MonthEnd': Timestamp('2011-01-31 09:00:00'), + 'SemiMonthEnd': Timestamp('2011-01-15 09:00:00'), + 'SemiMonthBegin': Timestamp('2011-01-15 09:00:00'), 
'BusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), 'YearBegin': Timestamp('2012-01-01 09:00:00'), 'BYearBegin': Timestamp('2011-01-03 09:00:00'), @@ -311,9 +314,9 @@ def test_rollforward(self): expecteds = self.expecteds.copy() # result will not be changed if the target is on the offset - no_changes = ['Day', 'MonthBegin', 'YearBegin', 'Week', 'Hour', - 'Minute', 'Second', 'Milli', 'Micro', 'Nano', - 'DateOffset'] + no_changes = ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', + 'Week', 'Hour', 'Minute', 'Second', 'Milli', 'Micro', + 'Nano', 'DateOffset'] for n in no_changes: expecteds[n] = Timestamp('2011/01/01 09:00') @@ -328,6 +331,7 @@ def test_rollforward(self): normalized = {'Day': Timestamp('2011-01-02 00:00:00'), 'DateOffset': Timestamp('2011-01-02 00:00:00'), 'MonthBegin': Timestamp('2011-02-01 00:00:00'), + 'SemiMonthBegin': Timestamp('2011-01-15 00:00:00'), 'YearBegin': Timestamp('2012-01-01 00:00:00'), 'Week': Timestamp('2011-01-08 00:00:00'), 'Hour': Timestamp('2011-01-01 00:00:00'), @@ -358,6 +362,7 @@ def test_rollback(self): Timestamp('2010-12-01 09:00:00'), 'BusinessMonthBegin': Timestamp('2010-12-01 09:00:00'), 'MonthEnd': Timestamp('2010-12-31 09:00:00'), + 'SemiMonthEnd': Timestamp('2010-12-31 09:00:00'), 'BusinessMonthEnd': Timestamp('2010-12-31 09:00:00'), 'BYearBegin': Timestamp('2010-01-01 09:00:00'), 'YearEnd': Timestamp('2010-12-31 09:00:00'), @@ -375,8 +380,9 @@ def test_rollback(self): 'Easter': Timestamp('2010-04-04 09:00:00')} # result will not be changed if the target is on the offset - for n in ['Day', 'MonthBegin', 'YearBegin', 'Week', 'Hour', 'Minute', - 'Second', 'Milli', 'Micro', 'Nano', 'DateOffset']: + for n in ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', 'Week', + 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano', + 'DateOffset']: expecteds[n] = Timestamp('2011/01/01 09:00') # but be changed when normalize=True @@ -387,6 +393,7 @@ def test_rollback(self): normalized = {'Day': Timestamp('2010-12-31 00:00:00'), 'DateOffset': Timestamp('2010-12-31 00:00:00'), 'MonthBegin': Timestamp('2010-12-01 00:00:00'), + 'SemiMonthBegin': Timestamp('2010-12-15 00:00:00'), 'YearBegin': Timestamp('2010-01-01 00:00:00'), 'Week': Timestamp('2010-12-25 00:00:00'), 'Hour': Timestamp('2011-01-01 00:00:00'), @@ -2646,6 +2653,353 @@ def test_onOffset(self): assertOnOffset(offset, dt, expected) +class TestSemiMonthEnd(Base): + _offset = SemiMonthEnd + + def _get_tests(self): + tests = [] + + tests.append((SemiMonthEnd(), + {datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 31)})) + + tests.append((SemiMonthEnd(day_of_month=20), + {datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 20), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 20), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20)})) + + tests.append((SemiMonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 
15), + datetime(2008, 1, 16): datetime(2008, 1, 31), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 15)})) + + tests.append((SemiMonthEnd(0, day_of_month=16), + {datetime(2008, 1, 1): datetime(2008, 1, 16), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 16)})) + + tests.append((SemiMonthEnd(2), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 11, 30)})) + + tests.append((SemiMonthEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 30): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + tests.append((SemiMonthEnd(-1, day_of_month=4), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2007, 1, 4): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + tests.append((SemiMonthEnd(-2), + {datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 2, 15), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 14): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 15)})) + + return tests + + def test_offset_whole_year(self): + dates = (datetime(2007, 12, 31), + datetime(2008, 1, 15), + datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31)) + + for base, exp_date in zip(dates[:-1], dates[1:]): + assertEq(SemiMonthEnd(), base, exp_date) + + # ensure .apply_index works as expected + s = DatetimeIndex(dates[:-1]) + result = SemiMonthEnd().apply_index(s) + exp = DatetimeIndex(dates[1:]) + tm.assert_index_equal(result, exp) + + # ensure generating a range with DatetimeIndex gives same result + result = DatetimeIndex(start=dates[0], end=dates[-1], freq='SM') + exp = DatetimeIndex(dates) + tm.assert_index_equal(result, exp) + + def test_offset(self): + for offset, cases in self._get_tests(): + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_index(self): + for offset, 
cases in self._get_tests(): + s = DatetimeIndex(cases.keys()) + result = offset.apply_index(s) + exp = DatetimeIndex(cases.values()) + tm.assert_index_equal(result, exp) + + def test_onOffset(self): + + tests = [(datetime(2007, 12, 31), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 1), False), + (datetime(2008, 2, 29), True)] + + for dt, expected in tests: + assertOnOffset(SemiMonthEnd(), dt, expected) + + def test_vectorized_offset_addition(self): + for klass, assert_func in zip([Series, DatetimeIndex], + [self.assert_series_equal, + tm.assert_index_equal]): + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + + result = s + SemiMonthEnd() + result2 = SemiMonthEnd() + s + exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), + Timestamp('2000-02-29', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), + Timestamp('2000-02-01', tz='US/Central')], name='a') + result = s + SemiMonthEnd() + result2 = SemiMonthEnd() + s + exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + +class TestSemiMonthBegin(Base): + _offset = SemiMonthBegin + + def _get_tests(self): + tests = [] + + tests.append((SemiMonthBegin(), + {datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2007, 1, 1)})) + + tests.append((SemiMonthBegin(day_of_month=20), + {datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20)})) + + tests.append((SemiMonthBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 2): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2007, 1, 1)})) + + tests.append((SemiMonthBegin(0, day_of_month=16), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 5): datetime(2007, 1, 16), + datetime(2007, 1, 1): datetime(2007, 1, 1)})) + + tests.append((SemiMonthBegin(2), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 15): datetime(2007, 1, 15), + datetime(2007, 1, 1): 
datetime(2007, 2, 1), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 12, 1)})) + + tests.append((SemiMonthBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 6, 14): datetime(2008, 6, 1), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 15)})) + + tests.append((SemiMonthBegin(-1, day_of_month=4), + {datetime(2007, 1, 1): datetime(2006, 12, 4), + datetime(2007, 1, 4): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2006, 12, 2): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 4)})) + + tests.append((SemiMonthBegin(-2), + {datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 1), + datetime(2008, 6, 14): datetime(2008, 5, 15), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 15): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 1)})) + + return tests + + def test_offset_whole_year(self): + dates = (datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15)) + + for base, exp_date in zip(dates[:-1], dates[1:]): + assertEq(SemiMonthBegin(), base, exp_date) + + # ensure .apply_index works as expected + s = DatetimeIndex(dates[:-1]) + result = SemiMonthBegin().apply_index(s) + exp = DatetimeIndex(dates[1:]) + tm.assert_index_equal(result, exp) + + # ensure generating a range with DatetimeIndex gives same result + result = DatetimeIndex(start=dates[0], end=dates[-1], freq='SMS') + exp = DatetimeIndex(dates) + tm.assert_index_equal(result, exp) + + def test_offset(self): + for offset, cases in self._get_tests(): + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_index(self): + for offset, cases in self._get_tests(): + s = DatetimeIndex(cases.keys()) + result = offset.apply_index(s) + exp = DatetimeIndex(cases.values()) + tm.assert_index_equal(result, exp) + + def test_onOffset(self): + tests = [(datetime(2007, 12, 1), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 31), False), + (datetime(2008, 2, 15), True)] + + for dt, expected in tests: + assertOnOffset(SemiMonthBegin(), dt, expected) + + def test_vectorized_offset_addition(self): + for klass, assert_func in zip([Series, DatetimeIndex], + [self.assert_series_equal, + tm.assert_index_equal]): + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + SemiMonthBegin() + result2 = SemiMonthBegin() + s + exp = klass([Timestamp('2000-02-01 00:15:00', 
tz='US/Central'), + Timestamp('2000-03-01', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), + Timestamp('2000-02-01', tz='US/Central')], name='a') + result = s + SemiMonthBegin() + result2 = SemiMonthBegin() + s + exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + class TestBQuarterBegin(Base): _offset = BQuarterBegin @@ -4537,6 +4891,8 @@ def test_all_offset_classes(self): BMonthEnd: ['11/2/2012', '11/30/2012'], CBMonthBegin: ['11/2/2012', '12/3/2012'], CBMonthEnd: ['11/2/2012', '11/30/2012'], + SemiMonthBegin: ['11/2/2012', '11/15/2012'], + SemiMonthEnd: ['11/2/2012', '11/15/2012'], Week: ['11/2/2012', '11/9/2012'], YearBegin: ['11/2/2012', '1/1/2013'], YearEnd: ['11/2/2012', '12/31/2012'], diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f6d80f7ee410b..fcc544ec7f239 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3095,10 +3095,14 @@ def test_datetime64_with_DateOffset(self): exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) assert_func(result, exp) - s = klass([Timestamp('2000-01-05 00:15:00'), Timestamp( - '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( - '2000-03-31'), Timestamp('2000-02-29'), Timestamp( - '2000-12-31')]) + s = klass([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-03-31'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31'), + Timestamp('2000-05-15'), + Timestamp('2001-06-15')]) # DateOffset relativedelta fastpath relative_kwargs = [('years', 2), ('months', 5), ('days', 3), @@ -3115,6 +3119,7 @@ def test_datetime64_with_DateOffset(self): # assert these are equal on a piecewise basis offsets = ['YearBegin', ('YearBegin', {'month': 5}), 'YearEnd', ('YearEnd', {'month': 5}), 'MonthBegin', 'MonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', 'Week', ('Week', { 'weekday': 3 }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', From 112685fd1218fad5f8e6342adc43c20eb40a04cb Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 14 Jun 2016 17:24:48 -0400 Subject: [PATCH 027/359] ENH: PeriodIndex now accepts pd.NaT Related to #12759. Author: sinhrks Closes #13430 from sinhrks/period_nat and squashes the following commits: 5990de6 [sinhrks] ENH: PeriodIndex now accepts pd.NaT --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/src/period.pyx | 37 ++++++++++++-- pandas/tseries/period.py | 30 ++++------- pandas/tseries/tests/test_period.py | 78 +++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index f1890fd3a23d1..0d4f07d19f880 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -325,6 +325,7 @@ Other API changes - ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) - ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) - ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) +- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) .. 
_whatsnew_0182.deprecations: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 858aa58df8d7d..aca0d0dbc107b 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -24,6 +24,7 @@ cimport cython from datetime cimport * cimport util cimport lib +from lib cimport is_null_datetimelike import lib from pandas import tslib from tslib import Timedelta, Timestamp, iNaT, NaT @@ -458,13 +459,39 @@ def extract_ordinals(ndarray[object] values, freq): for i in range(n): p = values[i] - ordinals[i] = p.ordinal - if p.freqstr != freqstr: - msg = _DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr) - raise IncompatibleFrequency(msg) + + if is_null_datetimelike(p): + ordinals[i] = tslib.iNaT + else: + try: + ordinals[i] = p.ordinal + + if p.freqstr != freqstr: + msg = _DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr) + raise IncompatibleFrequency(msg) + + except AttributeError: + p = Period(p, freq=freq) + ordinals[i] = p.ordinal return ordinals + +def extract_freq(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + object p + + for i in range(n): + p = values[i] + try: + return p.freq + except AttributeError: + pass + + raise ValueError('freq not specified and cannot be inferred') + + cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) @@ -719,7 +746,7 @@ cdef class Period(object): converted = other.asfreq(freq) ordinal = converted.ordinal - elif lib.is_null_datetimelike(value) or value in tslib._nat_strings: + elif is_null_datetimelike(value) or value in tslib._nat_strings: ordinal = tslib.iNaT if freq is None: raise ValueError("If value is NaT, freq cannot be None " diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8a3ac1f080c90..750e7a5553ef6 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -40,14 +40,6 @@ def f(self): return property(f) -def _get_ordinals(data, freq): - f = lambda x: Period(x, freq=freq).ordinal - if isinstance(data[0], Period): - return period.extract_ordinals(data, freq) - else: - return lib.map_infer(data, f) - - def dt64arr_to_periodarr(data, freq, tz): if data.dtype != np.dtype('M8[ns]'): raise ValueError('Wrong dtype: %s' % data.dtype) @@ -235,14 +227,9 @@ def _from_arraylike(cls, data, freq, tz): except (TypeError, ValueError): data = com._ensure_object(data) - if freq is None and len(data) > 0: - freq = getattr(data[0], 'freq', None) - if freq is None: - raise ValueError('freq not specified and cannot be ' - 'inferred from first element') - - data = _get_ordinals(data, freq) + freq = period.extract_freq(data) + data = period.extract_ordinals(data, freq) else: if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: @@ -254,12 +241,15 @@ def _from_arraylike(cls, data, freq, tz): data = period.period_asfreq_arr(data.values, base1, base2, 1) else: - if freq is None and len(data) > 0: - freq = getattr(data[0], 'freq', None) + + if freq is None and com.is_object_dtype(data): + # must contain Period instance and thus extract ordinals + freq = period.extract_freq(data) + data = period.extract_ordinals(data, freq) if freq is None: - raise ValueError('freq not specified and cannot be ' - 'inferred from first element') + msg = 'freq not specified and cannot be inferred' + raise ValueError(msg) if data.dtype != np.int64: if np.issubdtype(data.dtype, np.datetime64): @@ -269,7 +259,7 @@ def _from_arraylike(cls, data, freq, tz): data = com._ensure_int64(data) except (TypeError, ValueError): data = com._ensure_object(data) - data = _get_ordinals(data, freq) + data 
= period.extract_ordinals(data, freq) return data, freq diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index de23306c80b71..807fb86b1b4da 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1742,6 +1742,84 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + def test_constructor_empty(self): + idx = pd.PeriodIndex([], freq='M') + tm.assertIsInstance(idx, PeriodIndex) + self.assertEqual(len(idx), 0) + self.assertEqual(idx.freq, 'M') + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + pd.PeriodIndex([]) + + def test_constructor_pi_nat(self): + idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='M')]) + exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='M')])) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='M')]) + exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(np.array([pd.NaT, pd.NaT, + Period('2011-01', freq='M'), + Period('2011-01', freq='M')])) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, '2011-01', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex([pd.NaT, pd.NaT]) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(np.array([pd.NaT, pd.NaT])) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(['NaT', 'NaT']) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(np.array(['NaT', 'NaT'])) + + def test_constructor_incompat_freq(self): + msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='D')]) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='D')])) + + # first element is pd.NaT + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex([pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='D')]) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='D')])) + + def test_constructor_mixed(self): + idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')]) + exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')]) + exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT, + '2012-01-01']) + exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D') + tm.assert_index_equal(idx, exp) + def test_constructor_simple_new(self): idx = period_range('2007-01', name='p', periods=2, freq='M') result = idx._simple_new(idx, 'p', freq=idx.freq) From 8f3229227856af4feedb5894da9ccd4168a4e2a0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Jun 2016 17:34:59 -0400 Subject: [PATCH 028/359] BLD: fix conda version --- ci/install_travis.sh | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 07d33f56beba8..8be117cbecc05 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -76,7 +76,9 @@ fi bash miniconda.sh -b -p $HOME/miniconda || exit 1 conda config --set always_yes yes --set changeps1 no || exit 1 -conda update -q conda || exit 1 + +# fix the conda version +conda install conda==4.0.8 conda config --add channels http://conda.anaconda.org/pandas || exit 1 conda config --set ssl_verify false || exit 1 From 9670b3139171b3b628cecb87a3d0b46cc0361eba Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Jun 2016 17:44:13 -0400 Subject: [PATCH 029/359] TST: skip mmap error comparison on windows --- pandas/io/tests/test_common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index b70fca3ed2d20..46c34abf5aeb7 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -1,7 +1,7 @@ """ Tests for the pandas.io.common functionalities """ -from pandas.compat import StringIO +import nose import mmap import os from os.path import isabs @@ -9,6 +9,7 @@ import pandas.util.testing as tm from pandas.io import common +from pandas.compat import is_platform_windows, StringIO from pandas import read_csv, concat @@ -97,6 +98,10 @@ def setUp(self): 'test_mmap.csv') def test_constructor_bad_file(self): + if is_platform_windows(): + raise nose.SkipTest("skipping construction error messages " + "tests on windows") + non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 From eefe71e27131bc4848e549e3688ef6700b57b73a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 14 Jun 2016 21:42:10 -0400 Subject: [PATCH 030/359] BLD: update appveyor script to use true/false --- appveyor.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 7941820204916..13a7ddbf0dfd7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -59,8 +59,9 @@ install: - cmd: rmdir C:\cygwin /s /q # install our build environment - - cmd: conda config --set show_channel_urls yes --set always_yes yes --set changeps1 no - - cmd: conda update -q conda + - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false + #- cmd: conda update -q conda + - cmd: conda install conda==4.0.8 - cmd: conda config --add channels http://conda.anaconda.org/pandas - cmd: conda config --set ssl_verify false From f98b4b541e7a9d1b0d8f6674a84dc10080c2568b Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 14 Jun 2016 21:46:29 -0400 Subject: [PATCH 031/359] PERF: float hash slow in py3 closes #13166 closes #13335 Author: Chris Closes #13436 from chris-b1/float-hash and squashes the following commits: 3aec078 [Chris] smaller benches, explanatory comment 339ad1a [Chris] PERF: float hash slow in py3 --- asv_bench/benchmarks/groupby.py | 137 ++++++++----------------------- asv_bench/benchmarks/indexing.py | 15 +++- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/src/klib/khash_python.h | 18 +++- 4 files changed, 66 insertions(+), 106 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 586bd00b091fe..0611a3564ff7a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self): #---------------------------------------------------------------------- # groupby with a variable value for ngroups -class 
groupby_ngroups_10000(object): +class groupby_ngroups_int_10000(object): goal_time = 0.2 + dtype = 'int' + ngroups = 10000 def setup(self): np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + size = self.ngroups * 2 + rng = np.arange(self.ngroups) + ts = rng.take(np.random.randint(0, self.ngroups, size=size)) + if self.dtype == 'int': + value = np.random.randint(0, size, size=size) + else: + value = np.concatenate([np.random.random(self.ngroups) * 0.1, + np.random.random(self.ngroups) * 10.0]) + + self.df = DataFrame({'timestamp': ts, + 'value': value}) def time_all(self): self.df.groupby('value')['timestamp'].all() @@ -482,109 +491,35 @@ def time_value_counts(self): def time_var(self): self.df.groupby('value')['timestamp'].var() - -class groupby_ngroups_100(object): +class groupby_ngroups_int_100(groupby_ngroups_int_10000): goal_time = 0.2 + dtype = 'int' + ngroups = 100 - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_all(self): - self.df.groupby('value')['timestamp'].all() - - def time_any(self): - self.df.groupby('value')['timestamp'].any() - - def time_count(self): - self.df.groupby('value')['timestamp'].count() - - def time_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() - - def time_cummax(self): - self.df.groupby('value')['timestamp'].cummax() - - def time_cummin(self): - self.df.groupby('value')['timestamp'].cummin() - - def time_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() - - def time_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() - - def time_describe(self): - self.df.groupby('value')['timestamp'].describe() - - def time_diff(self): - self.df.groupby('value')['timestamp'].diff() - - def time_first(self): - self.df.groupby('value')['timestamp'].first() - - def time_head(self): - self.df.groupby('value')['timestamp'].head() - - def time_last(self): - self.df.groupby('value')['timestamp'].last() - - def time_mad(self): - self.df.groupby('value')['timestamp'].mad() - - def time_max(self): - self.df.groupby('value')['timestamp'].max() - - def time_mean(self): - self.df.groupby('value')['timestamp'].mean() - - def time_median(self): - self.df.groupby('value')['timestamp'].median() - - def time_min(self): - self.df.groupby('value')['timestamp'].min() - - def time_nunique(self): - self.df.groupby('value')['timestamp'].nunique() - - def time_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() - - def time_prod(self): - self.df.groupby('value')['timestamp'].prod() - - def time_rank(self): - self.df.groupby('value')['timestamp'].rank() - - def time_sem(self): - self.df.groupby('value')['timestamp'].sem() - - def time_size(self): - self.df.groupby('value')['timestamp'].size() - - def time_skew(self): - self.df.groupby('value')['timestamp'].skew() - - def time_std(self): - self.df.groupby('value')['timestamp'].std() +class groupby_ngroups_float_100(groupby_ngroups_int_10000): + goal_time = 0.2 + dtype = 'float' + ngroups = 100 - def time_sum(self): - self.df.groupby('value')['timestamp'].sum() +class 
groupby_ngroups_float_10000(groupby_ngroups_int_10000): + goal_time = 0.2 + dtype = 'float' + ngroups = 10000 - def time_tail(self): - self.df.groupby('value')['timestamp'].tail() - def time_unique(self): - self.df.groupby('value')['timestamp'].unique() +class groupby_float32(object): + # GH 13335 + goal_time = 0.2 - def time_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() + def setup(self): + tmp1 = (np.random.random(10000) * 0.1).astype(np.float32) + tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) + tmp = np.concatenate((tmp1, tmp2)) + arr = np.repeat(tmp, 10) + self.df = DataFrame(dict(a=arr, b=arr)) - def time_var(self): - self.df.groupby('value')['timestamp'].var() + def time_groupby_sum(self): + self.df.groupby(['a'])['b'].sum() #---------------------------------------------------------------------- diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 32d80a7913234..53d37a8161f43 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -486,4 +486,17 @@ def setup(self): self.midx = self.midx.take(np.random.permutation(np.arange(100000))) def time_sort_level_zero(self): - self.midx.sortlevel(0) \ No newline at end of file + self.midx.sortlevel(0) + +class float_loc(object): + # GH 13166 + goal_time = 0.2 + + def setup(self): + a = np.arange(100000) + self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + + def time_float_loc(self): + self.ind.get_loc(0) + + diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 0d4f07d19f880..f5dbfd80de7cc 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -345,7 +345,7 @@ Performance Improvements - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - +- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h index cdd94b5d8522f..7684493d08855 100644 --- a/pandas/src/klib/khash_python.h +++ b/pandas/src/klib/khash_python.h @@ -2,9 +2,21 @@ #include "khash.h" -// kludge - -#define kh_float64_hash_func _Py_HashDouble +// Previously we were using the built in cpython hash function for doubles +// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021 +// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85 + +// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x)) +// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3). +// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t +// is 64 bits the truncation causes collission issues. Given all that, we use our own +// simple hash, viewing the double bytes as an int64 and using khash's default +// hash for 64 bit integers. 
+// GH 13436 +inline khint64_t asint64(double key) { + return *(khint64_t *)(&key); +} +#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ From f7528866e07048f051f46f621ca76947730c7d32 Mon Sep 17 00:00:00 2001 From: Geraint Duck Date: Wed, 15 Jun 2016 14:25:13 +0100 Subject: [PATCH 032/359] DOC: Corrected Series.str.extract documentation error (#13449) --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2f9f8ec936e78..ca8e701d0ce17 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -543,7 +543,7 @@ def str_extract(arr, pat, flags=0, expand=None): each group. Any capture group names in regular expression pat will be used for column names; otherwise capture group numbers will be used. The dtype of each result column is always object, even when - no match is found. If expand=True and pat has only one capture group, + no match is found. If expand=False and pat has only one capture group, then return a Series (if subject is a Series) or Index (if subject is an Index). From f67dd4bc2acf5ab7b871ced4d249503faf789afa Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 15 Jun 2016 17:37:19 -0500 Subject: [PATCH 033/359] BLD: use inline macro closes #13448 closes #13456 --- pandas/src/klib/khash_python.h | 2 +- pandas/tests/indexing/test_coercion.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h index 7684493d08855..a375a73b04c9e 100644 --- a/pandas/src/klib/khash_python.h +++ b/pandas/src/klib/khash_python.h @@ -13,7 +13,7 @@ // simple hash, viewing the double bytes as an int64 and using khash's default // hash for 64 bit integers. // GH 13436 -inline khint64_t asint64(double key) { +khint64_t PANDAS_INLINE asint64(double key) { return *(khint64_t *)(&key); } #define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 23600e1f4241c..97a5c48b878fe 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -461,6 +461,13 @@ def _assert_replace_conversion(self, from_key, to_key, how): result = s.replace(replacer) + # buggy on windows for bool/int64 + if (from_key == 'bool' and + to_key == 'int64' and + tm.is_platform_windows()): + raise nose.SkipTest("windows platform buggy: {0} -> {1}".format + (from_key, to_key)) + if ((from_key == 'float64' and to_key in ('bool', 'int64')) or @@ -471,7 +478,7 @@ def _assert_replace_conversion(self, from_key, to_key, how): to_key in ('bool')) or # TODO_GH12747 The result must be int? 
- (from_key == 'bool' and to_key in ('int64'))): + (from_key == 'bool' and to_key == 'int64')): # buggy on 32-bit if tm.is_platform_32bit(): From a965d855bed047353d1eba6c18ae9ffb7225490b Mon Sep 17 00:00:00 2001 From: Drewrey Lupton Date: Thu, 16 Jun 2016 08:18:19 -0400 Subject: [PATCH 034/359] ERR: fix error message for to_datetime Author: Drewrey Lupton Closes #13450 from drewrey/error_message and squashes the following commits: 988048d [Drewrey Lupton] ERR: fix error message for to_datetime --- pandas/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 6453e65ecdc81..7de62fbe71615 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2320,7 +2320,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError("time data %r does match format specified" % + raise ValueError("time data %r doesn't match format specified" % (val,)) else: return values From d814f433940031029f1ddf0d9abdecdf4ad31dac Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 16 Jun 2016 08:31:28 -0400 Subject: [PATCH 035/359] BUG: Fix csv.QUOTE_NONNUMERIC quoting in to_csv Closes #12922: "bug" traced to #12194 Author: gfyoung Closes #13418 from gfyoung/to-csv-quote-bugfix and squashes the following commits: 8e53112 [gfyoung] BUG: Fix quoting behaviour in to_csv for csv.QUOTE_NONNUMERIC --- doc/source/whatsnew/v0.18.2.txt | 2 + pandas/core/internals.py | 14 ++++ pandas/formats/format.py | 5 ++ pandas/tests/frame/test_to_csv.py | 109 ++++++++++++++++++++++-------- 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index f5dbfd80de7cc..b3ce9911d3f4d 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -388,6 +388,8 @@ Bug Fixes - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) + +- Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 97df81ad6be48..c931adc9a31df 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1529,6 +1529,20 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, if slicer is not None: values = values[:, slicer] + # see gh-13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == '.': + mask = isnull(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + + values[mask] = na_rep + return values + from pandas.formats.format import FloatArrayFormatter formatter = FloatArrayFormatter(values, na_rep=na_rep, float_format=float_format, diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 923ac25f0ebed..a8e184ce94c89 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1,4 +1,9 @@ # -*- coding: utf-8 -*- +""" +Internal module for formatting output data in csv, html, +and latex files. This module also applies to display formatting. 
+""" + from __future__ import print_function from distutils.version import LooseVersion # pylint: disable=W0141 diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index bacf604c491b1..c23702ef46ad2 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -824,35 +824,6 @@ def test_to_csv_float_format(self): index=['A', 'B'], columns=['X', 'Y', 'Z']) assert_frame_equal(rs, xp) - def test_to_csv_quoting(self): - df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) - - buf = StringIO() - df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC) - - result = buf.getvalue() - expected = ('"A","B"\n' - '1,"foo"\n' - '2,"bar"\n' - '3,"baz"\n') - - self.assertEqual(result, expected) - - # quoting windows line terminators, presents with encoding? - # #3503 - text = 'a,b,c\n1,"test \r\n",3\n' - df = pd.read_csv(StringIO(text)) - buf = StringIO() - df.to_csv(buf, encoding='utf-8', index=False) - self.assertEqual(buf.getvalue(), text) - - # testing if quoting parameter is passed through with multi-indexes - # related to issue #7791 - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - df = df.set_index(['a', 'b']) - expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' - self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected) - def test_to_csv_unicodewriter_quoting(self): df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) @@ -1131,3 +1102,83 @@ def test_to_csv_with_dst_transitions(self): df.to_pickle(path) result = pd.read_pickle(path) assert_frame_equal(result, df) + + def test_to_csv_quoting(self): + df = DataFrame({ + 'c_string': ['a', 'b,c'], + 'c_int': [42, np.nan], + 'c_float': [1.0, 3.2], + 'c_bool': [True, False], + }) + + expected = """\ +,c_bool,c_float,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,"b,c" +""" + result = df.to_csv() + self.assertEqual(result, expected) + + result = df.to_csv(quoting=None) + self.assertEqual(result, expected) + + result = df.to_csv(quoting=csv.QUOTE_MINIMAL) + self.assertEqual(result, expected) + + expected = """\ +"","c_bool","c_float","c_int","c_string" +"0","True","1.0","42.0","a" +"1","False","3.2","","b,c" +""" + result = df.to_csv(quoting=csv.QUOTE_ALL) + self.assertEqual(result, expected) + + # see gh-12922, gh-13259: make sure changes to + # the formatters do not break this behaviour + expected = """\ +"","c_bool","c_float","c_int","c_string" +0,True,1.0,42.0,"a" +1,False,3.2,"","b,c" +""" + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + self.assertEqual(result, expected) + + msg = "need to escape, but no escapechar set" + tm.assertRaisesRegexp(csv.Error, msg, df.to_csv, + quoting=csv.QUOTE_NONE) + tm.assertRaisesRegexp(csv.Error, msg, df.to_csv, + quoting=csv.QUOTE_NONE, + escapechar=None) + + expected = """\ +,c_bool,c_float,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,b!,c +""" + result = df.to_csv(quoting=csv.QUOTE_NONE, + escapechar='!') + self.assertEqual(result, expected) + + expected = """\ +,c_bool,c_ffloat,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,bf,c +""" + result = df.to_csv(quoting=csv.QUOTE_NONE, + escapechar='f') + self.assertEqual(result, expected) + + # see gh-3503: quoting Windows line terminators + # presents with encoding? 
+ text = 'a,b,c\n1,"test \r\n",3\n' + df = pd.read_csv(StringIO(text)) + buf = StringIO() + df.to_csv(buf, encoding='utf-8', index=False) + self.assertEqual(buf.getvalue(), text) + + # xref gh-7791: make sure the quoting parameter is passed through + # with multi-indexes + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + df = df.set_index(['a', 'b']) + expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' + self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected) From fca35fbe40e7749e8202c64292f10a24c5effa19 Mon Sep 17 00:00:00 2001 From: priyankjain Date: Thu, 16 Jun 2016 17:00:21 -0400 Subject: [PATCH 036/359] BUG: Fix for .str.replace with invalid input closes #13438 Author: priyankjain Closes #13460 from priyankjain/13438 and squashes the following commits: d5c3f1b [priyankjain] BUG: Fix for .str.replace with invalid input --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/core/strings.py | 6 +++++- pandas/tests/test_strings.py | 7 +++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b3ce9911d3f4d..fad56521bdfe8 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -391,7 +391,7 @@ Bug Fixes - Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) - +- Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ca8e701d0ce17..a3f687b7fd73c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -4,7 +4,7 @@ from pandas.core.common import (isnull, notnull, _values_from_object, is_bool_dtype, is_list_like, is_categorical_dtype, - is_object_dtype) + is_object_dtype, is_string_like) from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin @@ -309,6 +309,10 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): ------- replaced : Series/Index of objects """ + + # Check whether repl is valid (GH 13438) + if not is_string_like(repl): + raise TypeError("repl must be a string") use_re = not case or len(pat) > 1 or flags if use_re: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 73f9809a7f042..67d171bb8efda 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -430,6 +430,13 @@ def test_replace(self): result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE) tm.assert_series_equal(result, exp) + # GH 13438 + for klass in (Series, Index): + for repl in (None, 3, {'a': 'b'}): + for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']): + values = klass(data) + self.assertRaises(TypeError, values.str.replace, 'a', repl) + def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) From 6d8c04ce64d975c2fd8c902c0e5df343805bd112 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Jun 2016 20:06:10 -0400 Subject: [PATCH 037/359] ENH: add pd.asof_merge closes #1870 xref #2941 http://nbviewer.jupyter.org/gist/jreback/5f089d308750c89b2a7d7446b790c056 is a notebook of example usage and timings Author: Jeff Reback Closes 
#13358 from jreback/asof and squashes the following commits: 4592fa2 [Jeff Reback] TST: reorg tests/series/test_timeseries -> test_asof --- doc/source/api.rst | 3 + doc/source/merging.rst | 166 ++++-- doc/source/whatsnew/v0.18.2.txt | 94 ++- pandas/__init__.py | 3 +- pandas/algos.pyx | 40 +- pandas/core/frame.py | 6 + pandas/core/generic.py | 89 ++- pandas/core/groupby.py | 15 +- pandas/core/series.py | 49 +- pandas/hashtable.pyx | 3 +- pandas/indexes/category.py | 3 +- pandas/src/join.pyx | 148 ++++- pandas/tests/frame/test_asof.py | 72 +++ pandas/tests/series/test_asof.py | 158 +++++ pandas/tests/series/test_timeseries.py | 110 +--- pandas/tools/merge.py | 551 ++++++++++++++++-- .../tools/tests/data/allow_exact_matches.csv | 28 + .../allow_exact_matches_and_tolerance.csv | 28 + pandas/tools/tests/data/asof.csv | 28 + pandas/tools/tests/data/asof2.csv | 78 +++ pandas/tools/tests/{ => data}/cut_data.csv | 0 pandas/tools/tests/data/quotes.csv | 17 + pandas/tools/tests/data/quotes2.csv | 57 ++ pandas/tools/tests/data/tolerance.csv | 28 + pandas/tools/tests/data/trades.csv | 28 + pandas/tools/tests/data/trades2.csv | 78 +++ pandas/tools/tests/test_merge_asof.py | 352 +++++++++++ ...ordered_merge.py => test_merge_ordered.py} | 17 +- pandas/tools/tests/test_tile.py | 3 +- setup.py | 1 + 30 files changed, 1975 insertions(+), 278 deletions(-) create mode 100644 pandas/tests/frame/test_asof.py create mode 100644 pandas/tests/series/test_asof.py create mode 100644 pandas/tools/tests/data/allow_exact_matches.csv create mode 100644 pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv create mode 100644 pandas/tools/tests/data/asof.csv create mode 100644 pandas/tools/tests/data/asof2.csv rename pandas/tools/tests/{ => data}/cut_data.csv (100%) create mode 100644 pandas/tools/tests/data/quotes.csv create mode 100644 pandas/tools/tests/data/quotes2.csv create mode 100644 pandas/tools/tests/data/tolerance.csv create mode 100644 pandas/tools/tests/data/trades.csv create mode 100644 pandas/tools/tests/data/trades2.csv create mode 100644 pandas/tools/tests/test_merge_asof.py rename pandas/tools/tests/{test_ordered_merge.py => test_merge_ordered.py} (85%) diff --git a/doc/source/api.rst b/doc/source/api.rst index 0e893308dd935..0dde341d820e3 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -151,6 +151,8 @@ Data manipulations cut qcut merge + merge_ordered + merge_asof concat get_dummies factorize @@ -943,6 +945,7 @@ Time series-related :toctree: generated/ DataFrame.asfreq + DataFrame.asof DataFrame.shift DataFrame.first_valid_index DataFrame.last_valid_index diff --git a/doc/source/merging.rst b/doc/source/merging.rst index ba675d9aac830..74871fe68fc08 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -104,7 +104,7 @@ some configurable handling of "what to do with the other axes": - ``ignore_index`` : boolean, default False. If True, do not use the index values on the concatenation axis. The resulting axis will be labeled 0, ..., n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note + concatenation axis does not have meaningful indexing information. Note the index values on the other axes are still respected in the join. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. @@ -544,12 +544,12 @@ Here's a description of what each argument is for: can be avoided are somewhat pathological but this option is provided nonetheless. 
- ``indicator``: Add a column to the output DataFrame called ``_merge`` - with information on the source of each row. ``_merge`` is Categorical-type - and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame, and ``both`` if the - observation's merge key is found in both. - + with information on the source of each row. ``_merge`` is Categorical-type + and takes on a value of ``left_only`` for observations whose merge key + only appears in ``'left'`` DataFrame, ``right_only`` for observations whose + merge key only appears in ``'right'`` DataFrame, and ``both`` if the + observation's merge key is found in both. + .. versionadded:: 0.17.0 @@ -718,7 +718,7 @@ The merge indicator df2 = DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) merge(df1, df2, on='col1', how='outer', indicator=True) -The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. +The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python @@ -1055,34 +1055,6 @@ them together on their indexes. The same is true for ``Panel.join``. labels=['left', 'right', 'right2'], vertical=False); plt.close('all'); -.. _merging.ordered_merge: - -Merging Ordered Data -~~~~~~~~~~~~~~~~~~~~ - -New in v0.8.0 is the ordered_merge function for combining time series and other -ordered data. In particular it has an optional ``fill_method`` keyword to -fill/interpolate missing data: - -.. ipython:: python - - left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) - - right = DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) - - result = ordered_merge(left, right, fill_method='ffill', left_by='s') - -.. ipython:: python - :suppress: - - @savefig merging_ordered_merge.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=True); - plt.close('all'); - .. _merging.combine_first.update: Merging together values within Series or DataFrame columns @@ -1132,4 +1104,124 @@ values inplace: @savefig merging_update.png p.plot([df1_copy, df2], df1, labels=['df1', 'df2'], vertical=False); - plt.close('all'); \ No newline at end of file + plt.close('all'); + +.. _merging.time_series: + +Timeseries friendly merging +--------------------------- + +.. _merging.merge_ordered: + +Merging Ordered Data +~~~~~~~~~~~~~~~~~~~~ + +The ``pd.merge_ordered()`` function allows combining time series and other +ordered data. In particular it has an optional ``fill_method`` keyword to +fill/interpolate missing data: + +.. ipython:: python + + left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], + 'lv': [1, 2, 3, 4], + 's': ['a', 'b', 'c', 'd']}) + + right = DataFrame({'k': ['K1', 'K2', 'K4'], + 'rv': [1, 2, 3]}) + + result = pd.merge_ordered(left, right, fill_method='ffill', left_by='s') + +.. ipython:: python + :suppress: + + @savefig merging_ordered_merge.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=True); + plt.close('all'); + +.. _merging.merge_asof: + +Merging AsOf +~~~~~~~~~~~~ + +.. versionadded:: 0.18.2 + +An ``pd.merge_asof()`` this is similar to an ordered left-join except that we +match on nearest key rather than equal keys. 
+
+For each row in the ``left`` DataFrame, we select the last row in the ``right``
+DataFrame whose ``on`` key is less than the left's key. Both DataFrames must
+be sorted by the key.
+
+Optionally an asof merge can perform a group-wise merge. This matches the ``by`` key equally,
+in addition to the nearest match on the ``on`` key.
+
+For example, we might have ``trades`` and ``quotes`` and we want to ``asof`` merge them.
+
+.. ipython:: python
+
+   trades = pd.DataFrame({
+       'time': pd.to_datetime(['20160525 13:30:00.023',
+                               '20160525 13:30:00.038',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.048']),
+       'ticker': ['MSFT', 'MSFT',
+                  'GOOG', 'GOOG', 'AAPL'],
+       'price': [51.95, 51.95,
+                 720.77, 720.92, 98.00],
+       'quantity': [75, 155,
+                    100, 100, 100]},
+       columns=['time', 'ticker', 'price', 'quantity'])
+
+   quotes = pd.DataFrame({
+       'time': pd.to_datetime(['20160525 13:30:00.023',
+                               '20160525 13:30:00.023',
+                               '20160525 13:30:00.030',
+                               '20160525 13:30:00.041',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.049',
+                               '20160525 13:30:00.072',
+                               '20160525 13:30:00.075']),
+       'ticker': ['GOOG', 'MSFT', 'MSFT',
+                  'MSFT', 'GOOG', 'AAPL', 'GOOG',
+                  'MSFT'],
+       'bid': [720.50, 51.95, 51.97, 51.99,
+               720.50, 97.99, 720.50, 52.01],
+       'ask': [720.93, 51.96, 51.98, 52.00,
+               720.93, 98.01, 720.88, 52.03]},
+       columns=['time', 'ticker', 'bid', 'ask'])
+
+.. ipython:: python
+
+   trades
+   quotes
+
+By default we are taking the asof of the quotes.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker')
+
+We only asof within ``2ms`` between the quote time and the trade time.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker',
+                 tolerance=pd.Timedelta('2ms'))
+
+We only asof within ``10ms`` between the quote time and the trade time and we exclude exact matches on time.
+Note that though we exclude the exact matches (of the quotes), prior quotes DO propagate to that point
+in time.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker',
+                 tolerance=pd.Timedelta('10ms'),
+                 allow_exact_matches=False)
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index fad56521bdfe8..6bc152aad6b01 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -19,6 +19,97 @@ Highlights include:
 New features
 ~~~~~~~~~~~~
+.. _whatsnew_0182.enhancements.asof_merge:
+
+``pd.merge_asof()`` for asof-style time-series joining
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A long-time requested feature has been added through the :func:`merge_asof` function, to
+support asof-style joining of time series (:issue:`1870`). Full documentation is
+:ref:`here `
+
+The :func:`merge_asof` function performs an asof merge, which is similar to a left-join
+except that we match on nearest key rather than equal keys.
+
+.. ipython:: python
+
+   left = pd.DataFrame({'a': [1, 5, 10],
+                        'left_val': ['a', 'b', 'c']})
+   right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+                         'right_val': [1, 2, 3, 6, 7]})
+
+   left
+   right
+
+We typically want to match exactly when possible, and use the most
+recent value otherwise.
+
+.. ipython:: python
+
+   pd.merge_asof(left, right, on='a')
+
+We can also match rows ONLY with prior data, and not an exact match.
+
+.. ipython:: python
+
+   pd.merge_asof(left, right, on='a', allow_exact_matches=False)
+
+
+In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. 
+This also illustrates using the ``by`` parameter to group data before merging. + +.. ipython:: python + + trades = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', + 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100]}, + columns=['time', 'ticker', 'price', 'quantity']) + + quotes = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.023', + '20160525 13:30:00.030', + '20160525 13:30:00.041', + '20160525 13:30:00.048', + '20160525 13:30:00.049', + '20160525 13:30:00.072', + '20160525 13:30:00.075']), + 'ticker': ['GOOG', 'MSFT', 'MSFT', + 'MSFT', 'GOOG', 'AAPL', 'GOOG', + 'MSFT'], + 'bid': [720.50, 51.95, 51.97, 51.99, + 720.50, 97.99, 720.50, 52.01], + 'ask': [720.93, 51.96, 51.98, 52.00, + 720.93, 98.01, 720.88, 52.03]}, + columns=['time', 'ticker', 'bid', 'ask']) + +.. ipython:: python + + trades + quotes + +An asof merge joins on the ``on``, typically a datetimelike field, which is ordered, and +in this case we are using a grouper in the ``by`` field. This is like a left-outer join, except +that forward filling happens automatically taking the most recent non-NaN value. + +.. ipython:: python + + pd.merge_asof(trades, quotes, + on='time', + by='ticker') + +This returns a merged DataFrame with the entries in the same order as the original left +passed DataFrame (``trades`` in this case). With the fields of the ``quotes`` merged. + .. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: ``pd.read_csv`` has improved support for duplicate column names @@ -124,8 +215,8 @@ Other enhancements idx.where([True, False, True]) - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) +- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) @@ -335,6 +426,7 @@ Deprecations - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) +- top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) .. 
_whatsnew_0182.performance: diff --git a/pandas/__init__.py b/pandas/__init__.py index 53642fdcfeb31..350898c9925e7 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -43,7 +43,8 @@ from pandas.io.api import * from pandas.computation.api import * -from pandas.tools.merge import merge, concat, ordered_merge +from pandas.tools.merge import (merge, concat, ordered_merge, + merge_ordered, merge_asof) from pandas.tools.pivot import pivot_table, crosstab from pandas.tools.plotting import scatter_matrix, plot_params from pandas.tools.tile import cut, qcut diff --git a/pandas/algos.pyx b/pandas/algos.pyx index f1fd0204e2fd2..8e659a8566adb 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1,3 +1,5 @@ +# cython: profile=False + from numpy cimport * cimport numpy as np import numpy as np @@ -982,21 +984,35 @@ def is_lexsorted(list list_of_arrays): @cython.boundscheck(False) -def groupby_indices(ndarray values): +def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts): + """ + turn group_labels output into a combined indexer maping the labels to + indexers + + Parameters + ---------- + ids: dict + mapping of label -> group indexer + labels: ndarray + labels for positions + counts: ndarray + group counts + + Returns + ------- + list of ndarrays of indices + + """ cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels, counts, arr, seen + Py_ssize_t i, n = len(labels) + ndarray[int64_t] arr, seen int64_t loc - dict ids = {} - object val int64_t k + dict result = {} - ids, labels, counts = group_labels(values) seen = np.zeros_like(counts) - # try not to get in trouble here... cdef int64_t **vecs = malloc(len(ids) * sizeof(int64_t*)) - result = {} for i from 0 <= i < len(counts): arr = np.empty(counts[i], dtype=np.int64) result[ids[i]] = arr @@ -1014,7 +1030,6 @@ def groupby_indices(ndarray values): seen[k] = loc + 1 free(vecs) - return result @cython.wraparound(False) @@ -1023,8 +1038,15 @@ def group_labels(ndarray[object] values): """ Compute label vector from input values and associated useful data + Parameters + ---------- + values: object ndarray + Returns ------- + tuple of (reverse mappings of label -> group indexer, + factorized labels ndarray, + group counts ndarray) """ cdef: Py_ssize_t i, n = len(values) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69def7502a6f7..b4b35953b4282 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -153,6 +153,12 @@ merged : DataFrame The output type will the be same as 'left', if it is a subclass of DataFrame. 
+
+See also
+--------
+merge_ordered
+merge_asof
+
 """
 
 # -----------------------------------------------------------------------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0852c5a293f4e..348281d1a7e30 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -13,7 +13,7 @@
     InvalidIndexError)
 import pandas.core.indexing as indexing
 from pandas.tseries.index import DatetimeIndex
-from pandas.tseries.period import PeriodIndex
+from pandas.tseries.period import PeriodIndex, Period
 from pandas.core.internals import BlockManager
 import pandas.core.algorithms as algos
 import pandas.core.common as com
@@ -3629,6 +3629,93 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
             res = res.T
         return res
 
+    # ----------------------------------------------------------------------
+    # Timeseries methods
+
+    def asof(self, where, subset=None):
+        """
+        The last row without any NaN is taken (or the last row without
+        NaN considering only the subset of columns in the case of a DataFrame)
+
+        .. versionadded:: 0.18.2 For DataFrame
+
+        If there is no good value, NaN is returned.
+
+        Parameters
+        ----------
+        where : date or array of dates
+        subset : string or list of strings, default None
+           if not None use these columns for NaN propagation
+
+        Notes
+        -----
+        Dates are assumed to be sorted; raises if this is not the case.
+
+        Returns
+        -------
+        where is scalar
+
+          - value or NaN if input is Series
+          - Series if input is DataFrame
+
+        where is Index: same shape object as input
+
+        See Also
+        --------
+        merge_asof
+
+        """
+
+        if isinstance(where, compat.string_types):
+            from pandas import to_datetime
+            where = to_datetime(where)
+
+        if not self.index.is_monotonic:
+            raise ValueError("asof requires a sorted index")
+
+        if isinstance(self, ABCSeries):
+            if subset is not None:
+                raise ValueError("subset is not valid for Series")
+            nulls = self.isnull()
+        elif self.ndim > 2:
+            raise NotImplementedError("asof is not implemented "
+                                      "for {type}".format(type=type(self)))
+        else:
+            if subset is None:
+                subset = self.columns
+            if not is_list_like(subset):
+                subset = [subset]
+            nulls = self[subset].isnull().any(1)
+
+        if not is_list_like(where):
+            start = self.index[0]
+            if isinstance(self.index, PeriodIndex):
+                where = Period(where, freq=self.index.freq).ordinal
+                start = start.ordinal
+
+            if where < start:
+                return np.nan
+
+            loc = self.index.searchsorted(where, side='right')
+            if loc > 0:
+                loc -= 1
+            while nulls[loc] and loc > 0:
+                loc -= 1
+            return self.iloc[loc]
+
+        if not isinstance(where, Index):
+            where = Index(where)
+
+        locs = self.index.asof_locs(where, ~(nulls.values))
+
+        # mask the missing
+        missing = locs == -1
+        data = self.take(locs, is_copy=False)
+        data.index = where
+        data.loc[missing] = np.nan
+        return data
+
     # ----------------------------------------------------------------------
     # Action Methods
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index bea62e98e4a2a..cc639b562dab8 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -4329,8 +4329,19 @@ def _reorder_by_uniques(uniques, labels):
 
 
 def _groupby_indices(values):
-    return _algos.groupby_indices(_values_from_object(
-        com._ensure_object(values)))
+
+    if is_categorical_dtype(values):
+
+        # we have a categorical, so we can do quite a
+        # bit better than factorizing again
+        reverse = dict(enumerate(values.categories))
+        codes = values.codes.astype('int64')
+        _, counts = _hash.value_count_scalar64(codes, False)
+    else:
+        reverse, codes, counts = 
_algos.group_labels( + _values_from_object(com._ensure_object(values))) + + return _algos.groupby_indices(reverse, codes, counts) def numpy_groupby(data, labels, axis=0): diff --git a/pandas/core/series.py b/pandas/core/series.py index 43b4ba3a51212..cf1639bacc3be 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -36,7 +36,7 @@ CombinedDatetimelikeProperties) from pandas.tseries.index import DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex -from pandas.tseries.period import PeriodIndex, Period +from pandas.tseries.period import PeriodIndex from pandas import compat from pandas.util.terminal import get_terminal_size from pandas.compat import zip, u, OrderedDict, StringIO @@ -46,7 +46,6 @@ import pandas.core.algorithms as algos import pandas.core.common as com -import pandas.core.datetools as datetools import pandas.core.nanops as nanops import pandas.formats.format as fmt from pandas.util.decorators import Appender, deprecate_kwarg, Substitution @@ -2601,52 +2600,6 @@ def last_valid_index(self): # ---------------------------------------------------------------------- # Time series-oriented methods - def asof(self, where): - """ - Return last good (non-NaN) value in Series if value is NaN for - requested date. - - If there is no good value, NaN is returned. - - Parameters - ---------- - where : date or array of dates - - Notes - ----- - Dates are assumed to be sorted - - Returns - ------- - value or NaN - """ - if isinstance(where, compat.string_types): - where = datetools.to_datetime(where) - - values = self._values - - if not hasattr(where, '__iter__'): - start = self.index[0] - if isinstance(self.index, PeriodIndex): - where = Period(where, freq=self.index.freq).ordinal - start = start.ordinal - - if where < start: - return np.nan - loc = self.index.searchsorted(where, side='right') - if loc > 0: - loc -= 1 - while isnull(values[loc]) and loc > 0: - loc -= 1 - return values[loc] - - if not isinstance(where, Index): - where = Index(where) - - locs = self.index.asof_locs(where, notnull(values)) - new_values = algos.take_1d(values, locs) - return self._constructor(new_values, index=where).__finalize__(self) - def to_timestamp(self, freq=None, how='start', copy=True): """ Cast to datetimeindex of timestamps, at *beginning* of period diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f718c1ab0b8da..e1c3733a0449d 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1075,7 +1075,8 @@ def mode_int64(int64_t[:] values): @cython.wraparound(False) @cython.boundscheck(False) -def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): +def duplicated_int64(ndarray[int64_t, ndim=1] values, + object keep='first'): cdef: int ret = 0, k int64_t value diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 4c9ca43f7f25d..3b7c660f5faa1 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -281,7 +281,8 @@ def is_unique(self): @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 - return duplicated_int64(self.codes.astype('i8'), keep) + codes = self.codes.astype('i8') + return duplicated_int64(codes, keep) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 8a9cf01375a68..a81ac0aa35d4e 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -125,6 +125,153 @@ def left_outer_join(ndarray[int64_t] left, 
ndarray[int64_t] right, +def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups, sort=True, + bint allow_exact_matches=1, + left_distance=None, + right_distance=None, + tolerance=None): + + cdef: + Py_ssize_t i, j, k, count = 0 + Py_ssize_t loc, left_pos, right_pos, position + Py_ssize_t offset + ndarray[int64_t] left_count, right_count + ndarray left_sorter, right_sorter, rev + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc, tol, left_val, right_val, diff, indexer + ndarray[int64_t] ld, rd + bint has_tol = 0 + + # if we are using tolerance, set our objects + if left_distance is not None and right_distance is not None and tolerance is not None: + has_tol = 1 + ld = left_distance + rd = right_distance + tol = tolerance + + # NA group in location 0 + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + if right_count[i] > 0: + count += left_count[i] * right_count[i] + else: + count += left_count[i] + + # group 0 is the NA group + left_pos = 0 + right_pos = 0 + position = 0 + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + indexer = position + j + left_indexer[indexer] = left_pos + j + + # take the most recent value + # if we are not the first + if right_pos: + + if has_tol: + + left_val = ld[left_pos + j] + right_val = rd[right_pos - 1] + diff = left_val - right_val + + # do we allow exact matches + if allow_exact_matches and diff > tol: + right_indexer[indexer] = -1 + continue + elif not allow_exact_matches: + if diff >= tol: + right_indexer[indexer] = -1 + continue + + right_indexer[indexer] = right_pos - 1 + else: + right_indexer[indexer] = -1 + position += lc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + + indexer = offset + k + left_indexer[indexer] = left_pos + j + + if has_tol: + + left_val = ld[left_pos + j] + right_val = rd[right_pos + k] + diff = left_val - right_val + + # do we allow exact matches + if allow_exact_matches and diff > tol: + right_indexer[indexer] = -1 + continue + + # we don't allow exact matches + elif not allow_exact_matches: + if diff >= tol or not right_pos: + right_indexer[indexer] = -1 + else: + right_indexer[indexer] = right_pos - 1 + continue + + else: + + # do we allow exact matches + if not allow_exact_matches: + + if right_pos: + right_indexer[indexer] = right_pos - 1 + else: + right_indexer[indexer] = -1 + continue + + right_indexer[indexer] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + left_indexer = _get_result_indexer(left_sorter, left_indexer) + right_indexer = _get_result_indexer(right_sorter, right_indexer) + + if not sort: # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + if left_sorter.dtype != np.int_: + left_sorter = left_sorter.astype(np.int_) + + rev = np.empty(len(left), dtype=np.int_) + rev.put(left_sorter, np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + + 
if rev.dtype != np.int_: + rev = rev.astype(np.int_) + right_indexer = right_indexer.take(rev) + left_indexer = left_indexer.take(rev) + + return left_indexer, right_indexer + + def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: @@ -246,4 +393,3 @@ def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, last_obs[gid] = val return result - diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py new file mode 100644 index 0000000000000..6c15c75cb5427 --- /dev/null +++ b/pandas/tests/frame/test_asof.py @@ -0,0 +1,72 @@ +# coding=utf-8 + +import nose + +import numpy as np +from pandas import DataFrame, date_range + +from pandas.util.testing import assert_frame_equal +import pandas.util.testing as tm + +from .common import TestData + + +class TestFrameAsof(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.N = N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, + index=rng) + + def test_basic(self): + + df = self.df.copy() + df.ix[15:30, 'A'] = np.nan + dates = date_range('1/1/1990', periods=self.N * 3, + freq='25s') + + result = df.asof(dates) + self.assertTrue(result.notnull().all(1).all()) + lb = df.index[14] + ub = df.index[30] + + dates = list(dates) + result = df.asof(dates) + self.assertTrue(result.notnull().all(1).all()) + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == 14).all(1).all()) + + def test_subset(self): + + N = 10 + rng = date_range('1/1/1990', periods=N, freq='53s') + df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, + index=rng) + df.ix[4:8, 'A'] = np.nan + dates = date_range('1/1/1990', periods=N * 3, + freq='25s') + + # with a subset of A should be the same + result = df.asof(dates, subset='A') + expected = df.asof(dates) + assert_frame_equal(result, expected) + + # same with A/B + result = df.asof(dates, subset=['A', 'B']) + expected = df.asof(dates) + assert_frame_equal(result, expected) + + # B gives self.df.asof + result = df.asof(dates, subset='B') + expected = df.resample('25s', closed='right').ffill().reindex(dates) + expected.iloc[20:] = 9 + + assert_frame_equal(result, expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py new file mode 100644 index 0000000000000..e2092feab9004 --- /dev/null +++ b/pandas/tests/series/test_asof.py @@ -0,0 +1,158 @@ +# coding=utf-8 + +import nose + +import numpy as np + +from pandas import (offsets, Series, notnull, + isnull, date_range, Timestamp) + +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesAsof(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic(self): + + # array or list or dates + N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='25s') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + val = result[result.index[result.index >= ub][0]] + 
self.assertEqual(ts[ub], val) + + def test_scalar(self): + + N = 30 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.arange(N), index=rng) + ts[5:10] = np.NaN + ts[15:20] = np.NaN + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + result = ts.asof(ts.index[3]) + self.assertEqual(result, ts[3]) + + # no as of value + d = ts.index[0] - offsets.BDay() + self.assertTrue(np.isnan(ts.asof(d))) + + def test_with_nan(self): + # basic asof test + rng = date_range('1/1/2000', '1/2/2000', freq='4h') + s = Series(np.arange(len(rng)), index=rng) + r = s.resample('2h').mean() + + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + r.iloc[3:5] = np.nan + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + r.iloc[-3:] = np.nan + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + def test_periodindex(self): + from pandas import period_range, PeriodIndex + # array or list or dates + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='37min') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + pix = PeriodIndex(result.index.values, freq='H') + mask = (pix >= lb) & (pix < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + ts[5:10] = np.nan + ts[15:20] = np.nan + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + self.assertEqual(ts.asof(ts.index[3]), ts[3]) + + # no as of value + d = ts.index[0].to_timestamp() - offsets.BDay() + self.assertTrue(isnull(ts.asof(d))) + + def test_errors(self): + + s = Series([1, 2, 3], + index=[Timestamp('20130101'), + Timestamp('20130103'), + Timestamp('20130102')]) + + # non-monotonic + self.assertFalse(s.index.is_monotonic) + with self.assertRaises(ValueError): + s.asof(s.index[0]) + + # subset with Series + N = 10 + rng = date_range('1/1/1990', periods=N, freq='53s') + s = Series(np.random.randn(N), index=rng) + with self.assertRaises(ValueError): + s.asof(s.index[0], subset='foo') + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 13b95ea97eedf..19acf54c7a3cb 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -3,10 +3,9 @@ from datetime import datetime -from numpy import nan import numpy as np -from pandas import Index, Series, notnull, date_range +from pandas import Index, Series, date_range from pandas.tseries.index import DatetimeIndex from 
pandas.tseries.tdi import TimedeltaIndex @@ -179,51 +178,6 @@ def test_truncate(self): before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset) - def test_asof(self): - # array or list or dates - N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') - ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') - - result = ts.asof(dates) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - result = ts.asof(list(dates)) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - mask = (result.index >= lb) & (result.index < ub) - rs = result[mask] - self.assertTrue((rs == ts[lb]).all()) - - val = result[result.index[result.index >= ub][0]] - self.assertEqual(ts[ub], val) - - self.ts[5:10] = np.NaN - self.ts[15:20] = np.NaN - - val1 = self.ts.asof(self.ts.index[7]) - val2 = self.ts.asof(self.ts.index[19]) - - self.assertEqual(val1, self.ts[4]) - self.assertEqual(val2, self.ts[14]) - - # accepts strings - val1 = self.ts.asof(str(self.ts.index[7])) - self.assertEqual(val1, self.ts[4]) - - # in there - self.assertEqual(self.ts.asof(self.ts.index[3]), self.ts[3]) - - # no as of value - d = self.ts.index[0] - datetools.bday - self.assertTrue(np.isnan(self.ts.asof(d))) - def test_getitem_setitem_datetimeindex(self): from pandas import date_range @@ -424,68 +378,6 @@ def test_getitem_setitem_periodindex(self): result[4:8] = ts[4:8] assert_series_equal(result, ts) - def test_asof_periodindex(self): - from pandas import period_range, PeriodIndex - # array or list or dates - N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') - ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='37min') - - result = ts.asof(dates) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - result = ts.asof(list(dates)) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - pix = PeriodIndex(result.index.values, freq='H') - mask = (pix >= lb) & (pix < ub) - rs = result[mask] - self.assertTrue((rs == ts[lb]).all()) - - ts[5:10] = np.NaN - ts[15:20] = np.NaN - - val1 = ts.asof(ts.index[7]) - val2 = ts.asof(ts.index[19]) - - self.assertEqual(val1, ts[4]) - self.assertEqual(val2, ts[14]) - - # accepts strings - val1 = ts.asof(str(ts.index[7])) - self.assertEqual(val1, ts[4]) - - # in there - self.assertEqual(ts.asof(ts.index[3]), ts[3]) - - # no as of value - d = ts.index[0].to_timestamp() - datetools.bday - self.assertTrue(np.isnan(ts.asof(d))) - - def test_asof_more(self): - from pandas import date_range - - s = Series([nan, nan, 1, 2, nan, nan, 3, 4, 5], - index=date_range('1/1/2000', periods=9)) - - dates = s.index[[4, 5, 6, 2, 1]] - - result = s.asof(dates) - expected = Series([2, 2, 3, 1, np.nan], index=dates) - - assert_series_equal(result, expected) - - s = Series([1.5, 2.5, 1, 2, nan, nan, 3, 4, 5], - index=date_range('1/1/2000', periods=9)) - result = s.asof(s.index[0]) - self.assertEqual(result, s[0]) - def test_asfreq(self): ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime( 2009, 11, 30), datetime(2009, 12, 31)]) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 182c0637ae29c..f963a271a767e 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -2,23 +2,30 @@ SQL-style merge routines """ +import copy import warnings import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import 
pandas.compat as compat
 
-from pandas.core.categorical import Categorical
-from pandas.core.frame import DataFrame, _merge_doc
+from pandas import (Categorical, DataFrame, Series,
+                    Index, MultiIndex, Timedelta)
+from pandas.core.frame import _merge_doc
 from pandas.core.generic import NDFrame
-from pandas.core.series import Series
-from pandas.core.index import (Index, MultiIndex, _get_combined_index,
+from pandas.core.index import (_get_combined_index,
                                _ensure_index, _get_consensus_names,
                                _all_indexes_same)
 from pandas.core.internals import (items_overlap_with_suffix,
                                    concatenate_block_managers)
 from pandas.util.decorators import Appender, Substitution
 
-from pandas.core.common import ABCSeries
+from pandas.core.common import (ABCSeries, is_dtype_equal,
+                                is_datetime64_dtype,
+                                is_int64_dtype,
+                                is_integer,
+                                is_bool,
+                                is_list_like,
+                                needs_i8_conversion)
 
 import pandas.core.algorithms as algos
 import pandas.core.common as com
@@ -47,9 +54,100 @@ class MergeError(ValueError):
     pass
 
 
-def ordered_merge(left, right, on=None, left_by=None, right_by=None,
+def _groupby_and_merge(by, on, left, right, _merge_pieces,
+                       check_duplicates=True):
+    """
+    groupby & merge; we are always performing a left-by type operation
+
+    Parameters
+    ----------
+    by: field to group
+    on: duplicates field
+    left: left frame
+    right: right frame
+    _merge_pieces: function for merging
+    check_duplicates: boolean, default True
+        should we check & clean duplicates
+    """
+
+    pieces = []
+    if not isinstance(by, (list, tuple)):
+        by = [by]
+
+    lby = left.groupby(by, sort=False)
+
+    # if we can groupby the rhs
+    # then we can get vastly better perf
+    try:
+
+        # we will check & remove duplicates if indicated
+        if check_duplicates:
+            if on is None:
+                on = []
+            elif not isinstance(on, (list, tuple)):
+                on = [on]
+
+            if right.duplicated(by + on).any():
+                right = right.drop_duplicates(by + on, keep='last')
+        rby = right.groupby(by, sort=False)
+    except KeyError:
+        rby = None
+
+    for key, lhs in lby:
+
+        if rby is None:
+            rhs = right
+        else:
+            try:
+                rhs = right.take(rby.indices[key])
+            except KeyError:
+                # key doesn't exist in right
+                lcols = lhs.columns.tolist()
+                cols = lcols + [r for r in right.columns
+                                if r not in set(lcols)]
+                merged = lhs.reindex(columns=cols)
+                merged.index = range(len(merged))
+                pieces.append(merged)
+                continue
+
+        merged = _merge_pieces(lhs, rhs)
+
+        # make sure join keys are in the merged
+        # TODO, should _merge_pieces do this?
+        for k in by:
+            try:
+                if k in merged:
+                    merged[k] = key
+            except:
+                pass
+
+        pieces.append(merged)
+
+    # preserve the original order
+    # if we have a missing piece this can be reset
+    result = concat(pieces, ignore_index=True)
+    result = result.reindex(columns=pieces[0].columns, copy=False)
+    return result, lby
+
+
+def ordered_merge(left, right, on=None, left_on=None, right_on=None,
+                  left_by=None, right_by=None,
                   fill_method=None, suffixes=('_x', '_y')):
+
+    warnings.warn("ordered_merge is deprecated and replaced by merge_ordered",
+                  FutureWarning, stacklevel=2)
+    return merge_ordered(left, right, on=on,
+                         left_on=left_on, right_on=right_on,
+                         left_by=left_by, right_by=right_by,
+                         fill_method=fill_method, suffixes=suffixes)
+
+
+def merge_ordered(left, right, on=None,
+                  left_on=None, right_on=None,
+                  left_by=None, right_by=None,
+                  fill_method=None, suffixes=('_x', '_y'),
+                  how='outer'):
     """Perform merge with optional filling/interpolation designed for ordered
     data like time series data. 
Optionally perform group-wise merge (see examples) @@ -58,8 +156,6 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, ---------- left : DataFrame right : DataFrame - fill_method : {'ffill', None}, default None - Interpolation method for data on : label or list Field names to join on. Must be found in both DataFrames. left_on : label or list, or array-like @@ -75,9 +171,18 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, right_by : column name or list of column names Group right DataFrame by group columns and merge piece by piece with left DataFrame + fill_method : {'ffill', None}, default None + Interpolation method for data suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively + how : {'left', 'right', 'outer', 'inner'}, default 'outer' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join) + + .. versionadded 0.18.2 Examples -------- @@ -110,46 +215,243 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, merged : DataFrame The output type will the be same as 'left', if it is a subclass of DataFrame. + + See also + -------- + merge + merge_asof + """ def _merger(x, y): + # perform the ordered merge operation op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, - # left_index=left_index, right_index=right_index, - suffixes=suffixes, fill_method=fill_method) + suffixes=suffixes, fill_method=fill_method, + how=how) return op.get_result() if left_by is not None and right_by is not None: raise ValueError('Can only group either left or right frames') elif left_by is not None: - if not isinstance(left_by, (list, tuple)): - left_by = [left_by] - pieces = [] - for key, xpiece in left.groupby(left_by): - merged = _merger(xpiece, right) - for k in left_by: - # May have passed ndarray - try: - if k in merged: - merged[k] = key - except: - pass - pieces.append(merged) - return concat(pieces, ignore_index=True) + result, _ = _groupby_and_merge(left_by, on, left, right, + lambda x, y: _merger(x, y), + check_duplicates=False) elif right_by is not None: - if not isinstance(right_by, (list, tuple)): - right_by = [right_by] - pieces = [] - for key, ypiece in right.groupby(right_by): - merged = _merger(left, ypiece) - for k in right_by: - try: - if k in merged: - merged[k] = key - except: - pass - pieces.append(merged) - return concat(pieces, ignore_index=True) + result, _ = _groupby_and_merge(right_by, on, right, left, + lambda x, y: _merger(y, x), + check_duplicates=False) else: - return _merger(left, right) + result = _merger(left, right) + return result + + +def merge_asof(left, right, on=None, + left_on=None, right_on=None, + by=None, + suffixes=('_x', '_y'), + tolerance=None, + allow_exact_matches=True, + check_duplicates=True): + """Perform an asof merge. This is similar to a left-join except that we + match on nearest key rather than equal keys. + + For each row in the left DataFrame, we select the last row in the right + DataFrame whose 'on' key is less than or equal to the left's key. Both + DataFrames must be sorted by the key. + + Optionally perform group-wise merge. This searches for the nearest match + on the 'on' key within the same group according to 'by'. + + .. 
versionadded:: 0.18.2
+
+    Parameters
+    ----------
+    left : DataFrame
+    right : DataFrame
+    on : label or list
+        Field names to join on. Must be found in both DataFrames.
+        The data MUST be ordered. Furthermore, this must be a numeric column,
+        typically a datetimelike or integer. On or left_on/right_on
+        must be given.
+    left_on : label or list, or array-like
+        Field names to join on in left DataFrame. Can be a vector or list of
+        vectors of the length of the DataFrame to use a particular vector as
+        the join key instead of columns
+    right_on : label or list, or array-like
+        Field names to join on in right DataFrame or vector/list of vectors per
+        left_on docs
+    by : column name or list of column names
+        Group both the left and right DataFrames by the group columns; perform
+        the merge operation on these pieces and recombine.
+    suffixes : 2-length sequence (tuple, list, ...)
+        Suffix to apply to overlapping column names in the left and right
+        side, respectively
+    tolerance : integer or Timedelta, optional, default None
+        select asof tolerance within this range; must be compatible
+        with the merge index.
+    allow_exact_matches : boolean, default True
+
+        - If True, allow matching the same 'on' value
+          (i.e. less-than-or-equal-to)
+        - If False, don't match the same 'on' value
+          (i.e., strictly less-than)
+
+    check_duplicates : boolean, default True
+
+        - If True, check and remove duplicates for the right
+          DataFrame, on the [by, on] combination, keeping the last value.
+        - If False, no check for duplicates. If you *know* that
+          you don't have duplicates, then turning off the check for duplicates
+          can be more performant.
+
+    Returns
+    -------
+    merged : DataFrame
+
+    Examples
+    --------
+    >>> left
+        a left_val
+    0   1        a
+    1   5        b
+    2  10        c
+
+    >>> right
+       a  right_val
+    0  1          1
+    1  2          2
+    2  3          3
+    3  6          6
+    4  7          7
+
+    >>> pd.merge_asof(left, right, on='a')
+        a left_val  right_val
+    0   1        a          1
+    1   5        b          3
+    2  10        c          7
+
+    >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)
+        a left_val  right_val
+    0   1        a        NaN
+    1   5        b        3.0
+    2  10        c        7.0
+
+    For this example, we can achieve a similar result through pd.merge_ordered,
+    though it's not nearly as performant.
+
+
+    >>> (pd.merge_ordered(left, right, on='a')
+    ...  .ffill()
+    ...  .drop_duplicates(['left_val'])
+    ... )
+        a left_val  right_val
+    0   1        a        1.0
+    3   5        b        3.0
+    6  10        c        7.0
+
+    Here is a real-world time-series example
+
+    >>> quotes
+                         time ticker     bid     ask
+    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
+    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
+    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
+    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
+    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
+    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
+    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
+    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03
+
+    >>> trades
+                         time ticker   price  quantity
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100
+
+    # by default we are taking the asof of the quotes
+    >>> pd.merge_asof(trades, quotes,
+    ...               on='time',
+    ...               by='ticker')
+                         time ticker   price  quantity     bid     ask
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
+
+    # we only asof within 2ms between the quote time and the trade time
+    >>> pd.merge_asof(trades, quotes,
+    ...               on='time',
+    ...               by='ticker',
+    ...               tolerance=pd.Timedelta('2ms'))
+                         time ticker   price  quantity     bid     ask
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
+
+    # we only asof within 10ms between the quote time and the trade time
+    # and we exclude exact matches on time. However *prior* data will
+    # propagate forward
+    >>> pd.merge_asof(trades, quotes,
+    ...               on='time',
+    ...               by='ticker',
+    ...               tolerance=pd.Timedelta('10ms'),
+    ...               allow_exact_matches=False)
+                         time ticker   price  quantity     bid     ask
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
+
+    See also
+    --------
+    merge
+    merge_ordered
+
+    """
+    def _merger(x, y):
+        # perform the ordered merge operation
+        op = _AsOfMerge(x, y,
+                        on=on, left_on=left_on, right_on=right_on,
+                        by=by, suffixes=suffixes,
+                        how='asof', tolerance=tolerance,
+                        allow_exact_matches=allow_exact_matches)
+        return op.get_result()
+
+    if by is not None:
+        result, groupby = _groupby_and_merge(by, on, left, right,
+                                             lambda x, y: _merger(x, y),
+                                             check_duplicates=check_duplicates)
+
+        # we want to preserve the original order
+        # we had grouped, so need to reverse this
+        # if we DO have duplicates, then
+        # we cannot guarantee order
+
+        sorter = np.concatenate([groupby.indices[g] for g, _ in groupby])
+        if len(result) != len(sorter):
+            if check_duplicates:
+                raise AssertionError("invalid reverse grouping")
+            return result
+
+        rev = np.empty(len(sorter), dtype=np.int_)
+        rev.put(sorter, np.arange(len(sorter)))
+        return result.take(rev).reset_index(drop=True)
+
+    if check_duplicates:
+        if on is None:
+            on = []
+        elif not isinstance(on, (list, tuple)):
+            on = [on]
+
+        if right.duplicated(on).any():
+            right = right.drop_duplicates(on, keep='last')
+
+    return _merger(left, right)


 # TODO: transformations??
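(A minimal NumPy sketch of the order-restoring step used by ``merge_asof`` above: ``sorter`` records which original
left-frame row each grouped-result row came from, and ``rev`` is its inverse permutation. The array values below are
toy data for illustration only, not taken from the patch.)

    import numpy as np

    # `sorter` says which original left-frame row each grouped-result row came from
    # (a permutation of 0..n-1, as produced by concatenating groupby.indices).
    sorter = np.array([2, 0, 3, 1])

    # Invert the permutation: rev[sorter[i]] = i, so taking the grouped result
    # in `rev` order restores the original left-frame row order.
    rev = np.empty(len(sorter), dtype=np.int_)
    rev.put(sorter, np.arange(len(sorter)))

    # sanity check: composing the permutation with its inverse is the identity
    assert (sorter[rev] == np.arange(len(sorter))).all()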
@@ -159,6 +461,7 @@ class _MergeOperation(object): Perform a database (SQL) merge operation between two DataFrame objects using either columns as keys or their row indexes """ + _merge_type = 'merge' def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, @@ -206,6 +509,8 @@ def __init__(self, left, right, how='inner', on=None, msg = msg.format(left.columns.nlevels, right.columns.nlevels) warnings.warn(msg, UserWarning) + self._validate_specification() + # note this function has side effects (self.left_join_keys, self.right_join_keys, @@ -233,7 +538,7 @@ def get_result(self): concat_axis=0, copy=self.copy) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method='merge') + result = typ(result_data).__finalize__(self, method=self._merge_type) if self.indicator: result = self._indicator_post_merge(result) @@ -304,8 +609,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if left_has_missing: take_right = self.right_join_keys[i] - if not com.is_dtype_equal(result[name].dtype, - self.left[name].dtype): + if not is_dtype_equal(result[name].dtype, + self.left[name].dtype): take_left = self.left[name]._values elif name in self.right: @@ -316,8 +621,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if right_has_missing: take_left = self.left_join_keys[i] - if not com.is_dtype_equal(result[name].dtype, - self.right[name].dtype): + if not is_dtype_equal(result[name].dtype, + self.right[name].dtype): take_right = self.right[name]._values elif left_indexer is not None \ @@ -355,6 +660,13 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: result.insert(i, name or 'key_%d' % i, key_col) + def _get_join_indexers(self): + """ return the join indexers """ + return _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, + how=self.how) + def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = self.right._data.axes[self.axis] @@ -373,9 +685,8 @@ def _get_join_info(self): sort=self.sort) else: (left_indexer, - right_indexer) = _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, how=self.how) + right_indexer) = self._get_join_indexers() + if self.right_index: if len(self.left) > 0: join_index = self.left.index.take(left_indexer) @@ -429,8 +740,6 @@ def _get_merge_keys(self): ------- left_keys, right_keys """ - self._validate_specification() - left_keys = [] right_keys = [] join_names = [] @@ -549,7 +858,8 @@ def _validate_specification(self): raise ValueError("len(right_on) must equal len(left_on)") -def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): +def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', + **kwargs): """ Parameters @@ -579,26 +889,27 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): lkey, rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False - kwargs = {'sort': sort} if how == 'left' else {} + kwargs = copy.copy(kwargs) + if how == 'left': + kwargs['sort'] = sort join_func = _join_functions[how] + return join_func(lkey, rkey, count, **kwargs) class _OrderedMerge(_MergeOperation): + _merge_type = 'ordered_merge' - def __init__(self, left, right, on=None, by=None, left_on=None, - right_on=None, axis=1, left_index=False, right_index=False, + def __init__(self, left, right, on=None, left_on=None, + right_on=None, axis=1, suffixes=('_x', '_y'), copy=True, - fill_method=None): + 
fill_method=None, how='outer'): self.fill_method = fill_method - _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, right_on=right_on, axis=axis, - left_index=left_index, - right_index=right_index, - how='outer', suffixes=suffixes, - sort=True # sorts when factorizing + how=how, suffixes=suffixes, + sort=True # factorize sorts ) def get_result(self): @@ -629,13 +940,133 @@ def get_result(self): concat_axis=0, copy=self.copy) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method='ordered_merge') + result = typ(result_data).__finalize__(self, method=self._merge_type) self._maybe_add_join_keys(result, left_indexer, right_indexer) return result +class _AsOfMerge(_OrderedMerge): + _merge_type = 'asof_merge' + + def __init__(self, left, right, on=None, by=None, left_on=None, + right_on=None, axis=1, + suffixes=('_x', '_y'), copy=True, + fill_method=None, + how='asof', tolerance=None, + allow_exact_matches=True): + + self.by = by + self.tolerance = tolerance + self.allow_exact_matches = allow_exact_matches + + _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, axis=axis, + how=how, suffixes=suffixes, + fill_method=fill_method) + + def _validate_specification(self): + super(_AsOfMerge, self)._validate_specification() + + # we only allow on to be a single item for on + if len(self.left_on) != 1: + raise MergeError("can only asof on a key for left") + + if len(self.right_on) != 1: + raise MergeError("can only asof on a key for right") + + # add by to our key-list so we can have it in the + # output as a key + if self.by is not None: + if not is_list_like(self.by): + self.by = [self.by] + + self.left_on = self.by + list(self.left_on) + self.right_on = self.by + list(self.right_on) + + @property + def _asof_key(self): + """ This is our asof key, the 'on' """ + return self.left_on[-1] + + def _get_merge_keys(self): + + # note this function has side effects + (left_join_keys, + right_join_keys, + join_names) = super(_AsOfMerge, self)._get_merge_keys() + + # validate index types are the same + for lk, rk in zip(left_join_keys, right_join_keys): + if not is_dtype_equal(lk.dtype, rk.dtype): + raise MergeError("incompatible merge keys, " + "must be the same type") + + # validate tolerance; must be a Timedelta if we have a DTI + if self.tolerance is not None: + + lt = left_join_keys[self.left_on.index(self._asof_key)] + msg = "incompatible tolerance, must be compat " \ + "with type {0}".format(type(lt)) + + if is_datetime64_dtype(lt): + if not isinstance(self.tolerance, Timedelta): + raise MergeError(msg) + if self.tolerance < Timedelta(0): + raise MergeError("tolerance must be positive") + + elif is_int64_dtype(lt): + if not is_integer(self.tolerance): + raise MergeError(msg) + if self.tolerance < 0: + raise MergeError("tolerance must be positive") + + else: + raise MergeError(msg) + + # validate allow_exact_matches + if not is_bool(self.allow_exact_matches): + raise MergeError("allow_exact_matches must be boolean, " + "passed {0}".format(self.allow_exact_matches)) + + return left_join_keys, right_join_keys, join_names + + def _get_join_indexers(self): + """ return the join indexers """ + + # we required sortedness in the join keys + msg = " keys must be sorted" + for lk in self.left_join_keys: + if not Index(lk).is_monotonic: + raise ValueError('left' + msg) + for rk in self.right_join_keys: + if not Index(rk).is_monotonic: + raise ValueError('right' + msg) + + kwargs = {} + + # tolerance + t = self.tolerance + if t is not 
None: + lt = self.left_join_keys[self.left_on.index(self._asof_key)] + rt = self.right_join_keys[self.right_on.index(self._asof_key)] + if needs_i8_conversion(lt): + lt = lt.view('i8') + t = t.value + rt = rt.view('i8') + kwargs['left_distance'] = lt + kwargs['right_distance'] = rt + kwargs['tolerance'] = t + + return _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, + how=self.how, + allow_exact_matches=self.allow_exact_matches, + **kwargs) + + def _get_multiindex_indexer(join_keys, index, sort): from functools import partial @@ -717,6 +1148,7 @@ def _right_outer_join(x, y, max_groups): 'left': _algos.left_outer_join, 'right': _right_outer_join, 'outer': _algos.full_outer_join, + 'asof': _algos.left_outer_asof_join, } @@ -724,6 +1156,7 @@ def _factorize_keys(lk, rk, sort=True): if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values + if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk): klass = _hash.Int64Factorizer lk = com._ensure_int64(com._values_from_object(lk)) diff --git a/pandas/tools/tests/data/allow_exact_matches.csv b/pandas/tools/tests/data/allow_exact_matches.csv new file mode 100644 index 0000000000000..0446fb744c540 --- /dev/null +++ b/pandas/tools/tests/data/allow_exact_matches.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff --git a/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv b/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv new file mode 100644 index 0000000000000..0446fb744c540 --- /dev/null +++ b/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 
13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff --git a/pandas/tools/tests/data/asof.csv b/pandas/tools/tests/data/asof.csv new file mode 100644 index 0000000000000..d7d061bc46ccc --- /dev/null +++ b/pandas/tools/tests/data/asof.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff --git a/pandas/tools/tests/data/asof2.csv b/pandas/tools/tests/data/asof2.csv new file mode 100644 index 0000000000000..2c9c0392dd617 --- /dev/null +++ b/pandas/tools/tests/data/asof2.csv @@ -0,0 +1,78 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 
13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.084,AAPL,98.64,40,NASDAQ,98.55,98.56 +20160525 13:30:00.084,AAPL,98.55,149,EDGX,98.55,98.56 +20160525 13:30:00.086,AAPL,98.56,500,ARCA,98.55,98.63 +20160525 13:30:00.104,AAPL,98.63,647,EDGX,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,300,EDGX,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,1,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,62,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,10,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.105,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.105,AAPL,98.63,700,ARCA,98.62,98.63 +20160525 13:30:00.106,AAPL,98.63,61,EDGX,98.62,98.63 +20160525 13:30:00.107,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.107,AAPL,98.63,53,ARCA,98.62,98.63 +20160525 13:30:00.108,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.108,AAPL,98.63,839,ARCA,98.62,98.63 +20160525 13:30:00.115,AAPL,98.63,5,EDGX,98.62,98.63 +20160525 13:30:00.118,AAPL,98.63,295,EDGX,98.62,98.63 +20160525 13:30:00.118,AAPL,98.63,5,EDGX,98.62,98.63 +20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 +20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 +20160525 13:30:00.128,MSFT,51.92,100,ARCA,51.92,51.95 +20160525 13:30:00.129,AAPL,98.62,100,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,10,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,59,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,31,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,69,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,12,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,12,EDGX,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 +20160525 13:30:00.130,MSFT,51.95,317,ARCA,51.93,51.95 +20160525 13:30:00.130,MSFT,51.95,283,ARCA,51.93,51.95 +20160525 13:30:00.135,MSFT,51.93,100,EDGX,51.92,51.95 +20160525 13:30:00.135,AAPL,98.62,100,ARCA,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,12,NASDAQ,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,88,NASDAQ,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,162,NASDAQ,98.61,98.62 +20160525 13:30:00.144,AAPL,98.61,100,BATS,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,61,ARCA,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,25,ARCA,98.61,98.62 +20160525 
13:30:00.144,AAPL,98.62,14,ARCA,98.61,98.62 +20160525 13:30:00.145,AAPL,98.62,12,ARCA,98.6,98.63 +20160525 13:30:00.145,AAPL,98.62,100,ARCA,98.6,98.63 +20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 +20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 diff --git a/pandas/tools/tests/cut_data.csv b/pandas/tools/tests/data/cut_data.csv similarity index 100% rename from pandas/tools/tests/cut_data.csv rename to pandas/tools/tests/data/cut_data.csv diff --git a/pandas/tools/tests/data/quotes.csv b/pandas/tools/tests/data/quotes.csv new file mode 100644 index 0000000000000..3f31d2cfffe1b --- /dev/null +++ b/pandas/tools/tests/data/quotes.csv @@ -0,0 +1,17 @@ +time,ticker,bid,ask +20160525 13:30:00.023,GOOG,720.50,720.93 +20160525 13:30:00.023,MSFT,51.95,51.95 +20160525 13:30:00.041,MSFT,51.95,51.95 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.072,GOOG,720.50,720.88 +20160525 13:30:00.075,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.92,51.95 diff --git a/pandas/tools/tests/data/quotes2.csv b/pandas/tools/tests/data/quotes2.csv new file mode 100644 index 0000000000000..7ade1e7faf1ae --- /dev/null +++ b/pandas/tools/tests/data/quotes2.csv @@ -0,0 +1,57 @@ +time,ticker,bid,ask +20160525 13:30:00.023,GOOG,720.50,720.93 +20160525 13:30:00.023,MSFT,51.95,51.95 +20160525 13:30:00.041,MSFT,51.95,51.95 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.072,GOOG,720.50,720.88 +20160525 13:30:00.075,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.92,51.95 +20160525 13:30:00.079,MSFT,51.92,51.95 +20160525 13:30:00.080,AAPL,98.55,98.56 +20160525 13:30:00.084,AAPL,98.55,98.56 +20160525 13:30:00.086,AAPL,98.55,98.63 +20160525 13:30:00.088,AAPL,98.65,98.63 +20160525 13:30:00.089,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.62,98.63 +20160525 13:30:00.105,AAPL,98.62,98.63 +20160525 13:30:00.107,AAPL,98.62,98.63 +20160525 13:30:00.115,AAPL,98.62,98.63 +20160525 13:30:00.115,AAPL,98.62,98.63 +20160525 13:30:00.118,AAPL,98.62,98.63 +20160525 13:30:00.128,AAPL,98.62,98.63 +20160525 13:30:00.128,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.61,98.63 +20160525 13:30:00.130,MSFT,51.93,51.95 +20160525 13:30:00.130,MSFT,51.93,51.95 +20160525 13:30:00.130,AAPL,98.61,98.63 +20160525 13:30:00.131,AAPL,98.61,98.62 +20160525 13:30:00.131,AAPL,98.61,98.62 +20160525 13:30:00.135,MSFT,51.92,51.95 +20160525 13:30:00.135,AAPL,98.61,98.62 +20160525 13:30:00.136,AAPL,98.61,98.62 +20160525 13:30:00.136,AAPL,98.61,98.62 +20160525 13:30:00.144,AAPL,98.61,98.62 
+20160525 13:30:00.144,AAPL,98.61,98.62 +20160525 13:30:00.145,AAPL,98.61,98.62 +20160525 13:30:00.145,AAPL,98.61,98.63 +20160525 13:30:00.145,AAPL,98.61,98.63 +20160525 13:30:00.145,AAPL,98.60,98.63 +20160525 13:30:00.145,AAPL,98.61,98.63 +20160525 13:30:00.145,AAPL,98.60,98.63 diff --git a/pandas/tools/tests/data/tolerance.csv b/pandas/tools/tests/data/tolerance.csv new file mode 100644 index 0000000000000..d7d061bc46ccc --- /dev/null +++ b/pandas/tools/tests/data/tolerance.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff --git a/pandas/tools/tests/data/trades.csv b/pandas/tools/tests/data/trades.csv new file mode 100644 index 0000000000000..b26a4ce714255 --- /dev/null +++ b/pandas/tools/tests/data/trades.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter +20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ +20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ +20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ +20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.076,AAPL,98.5600,1000,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.076,AAPL,98.5600,300,ARCA +20160525 13:30:00.076,AAPL,98.5600,400,ARCA +20160525 13:30:00.076,AAPL,98.5600,600,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ +20160525 
13:30:00.078,MSFT,51.9500,100,NASDAQ diff --git a/pandas/tools/tests/data/trades2.csv b/pandas/tools/tests/data/trades2.csv new file mode 100644 index 0000000000000..64021faa68ce3 --- /dev/null +++ b/pandas/tools/tests/data/trades2.csv @@ -0,0 +1,78 @@ +time,ticker,price,quantity,marketCenter +20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ +20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ +20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ +20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.076,AAPL,98.5600,1000,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.076,AAPL,98.5600,300,ARCA +20160525 13:30:00.076,AAPL,98.5600,400,ARCA +20160525 13:30:00.076,AAPL,98.5600,600,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ +20160525 13:30:00.084,AAPL,98.6400,40,NASDAQ +20160525 13:30:00.084,AAPL,98.5500,149,EDGX +20160525 13:30:00.086,AAPL,98.5600,500,ARCA +20160525 13:30:00.104,AAPL,98.6300,647,EDGX +20160525 13:30:00.104,AAPL,98.6300,300,EDGX +20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,1,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,62,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,10,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,100,ARCA +20160525 13:30:00.105,AAPL,98.6300,100,ARCA +20160525 13:30:00.105,AAPL,98.6300,700,ARCA +20160525 13:30:00.106,AAPL,98.6300,61,EDGX +20160525 13:30:00.107,AAPL,98.6300,100,ARCA +20160525 13:30:00.107,AAPL,98.6300,53,ARCA +20160525 13:30:00.108,AAPL,98.6300,100,ARCA +20160525 13:30:00.108,AAPL,98.6300,839,ARCA +20160525 13:30:00.115,AAPL,98.6300,5,EDGX +20160525 13:30:00.118,AAPL,98.6300,295,EDGX +20160525 13:30:00.118,AAPL,98.6300,5,EDGX +20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ +20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ +20160525 13:30:00.128,MSFT,51.9200,100,ARCA +20160525 13:30:00.129,AAPL,98.6200,100,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,10,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,59,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,31,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,69,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,12,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,12,EDGX +20160525 13:30:00.129,AAPL,98.6200,100,ARCA +20160525 13:30:00.129,AAPL,98.6200,100,ARCA +20160525 13:30:00.130,MSFT,51.9500,317,ARCA +20160525 13:30:00.130,MSFT,51.9500,283,ARCA +20160525 13:30:00.135,MSFT,51.9300,100,EDGX +20160525 13:30:00.135,AAPL,98.6200,100,ARCA +20160525 13:30:00.144,AAPL,98.6200,12,NASDAQ +20160525 13:30:00.144,AAPL,98.6200,88,NASDAQ +20160525 13:30:00.144,AAPL,98.6200,162,NASDAQ +20160525 13:30:00.144,AAPL,98.6100,100,BATS +20160525 13:30:00.144,AAPL,98.6200,61,ARCA +20160525 
13:30:00.144,AAPL,98.6200,25,ARCA +20160525 13:30:00.144,AAPL,98.6200,14,ARCA +20160525 13:30:00.145,AAPL,98.6200,12,ARCA +20160525 13:30:00.145,AAPL,98.6200,100,ARCA +20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ +20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py new file mode 100644 index 0000000000000..5d78ccf199ed3 --- /dev/null +++ b/pandas/tools/tests/test_merge_asof.py @@ -0,0 +1,352 @@ +import nose +import os + +import numpy as np +import pandas as pd +from pandas import (merge_asof, read_csv, + to_datetime, Timedelta) +from pandas.tools.merge import MergeError +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + + +class TestAsOfMerge(tm.TestCase): + _multiprocess_can_split_ = True + + def read_data(self, name, dedupe=False): + path = os.path.join(tm.get_data_path(), name) + x = read_csv(path) + if dedupe: + x = (x.drop_duplicates(['time', 'ticker'], keep='last') + .reset_index(drop=True) + ) + x.time = to_datetime(x.time) + return x + + def setUp(self): + + self.trades = self.read_data('trades.csv') + self.quotes = self.read_data('quotes.csv', dedupe=True) + self.asof = self.read_data('asof.csv') + self.tolerance = self.read_data('tolerance.csv') + self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.allow_exact_matches_and_tolerance = self.read_data( + 'allow_exact_matches_and_tolerance.csv') + + def test_examples1(self): + """ doc-string examples """ + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + pd.merge_asof(left, right, on='a') + + def test_examples2(self): + """ doc-string examples """ + + trades = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', + 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100]}, + columns=['time', 'ticker', 'price', 'quantity']) + + quotes = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.023', + '20160525 13:30:00.030', + '20160525 13:30:00.041', + '20160525 13:30:00.048', + '20160525 13:30:00.049', + '20160525 13:30:00.072', + '20160525 13:30:00.075']), + 'ticker': ['GOOG', 'MSFT', 'MSFT', + 'MSFT', 'GOOG', 'AAPL', 'GOOG', + 'MSFT'], + 'bid': [720.50, 51.95, 51.97, 51.99, + 720.50, 97.99, 720.50, 52.01], + 'ask': [720.93, 51.96, 51.98, 52.00, + 720.93, 98.01, 720.88, 52.03]}, + columns=['time', 'ticker', 'bid', 'ask']) + + pd.merge_asof(trades, quotes, + on='time', + by='ticker') + + pd.merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=pd.Timedelta('2ms')) + + pd.merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=pd.Timedelta('10ms'), + allow_exact_matches=False) + + def test_basic(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, + on='time', + by='ticker') + assert_frame_equal(result, expected) + + def test_basic_categorical(self): + + expected = self.asof + trades = self.trades.copy() + trades.ticker = trades.ticker.astype('category') + quotes = self.quotes.copy() + quotes.ticker = quotes.ticker.astype('category') + + result = merge_asof(trades, quotes, + on='time', + by='ticker') + assert_frame_equal(result, expected) + + def 
test_missing_right_by(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes + + q = quotes[quotes.ticker != 'MSFT'] + result = merge_asof(trades, q, + on='time', + by='ticker') + expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan + assert_frame_equal(result, expected) + + def test_basic2(self): + + expected = self.read_data('asof2.csv') + trades = self.read_data('trades2.csv') + quotes = self.read_data('quotes2.csv', dedupe=True) + + result = merge_asof(trades, quotes, + on='time', + by='ticker') + assert_frame_equal(result, expected) + + def test_basic_no_by(self): + f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \ + .reset_index(drop=True) + + # just use a single ticker + expected = f(self.asof) + trades = f(self.trades) + quotes = f(self.quotes) + + result = merge_asof(trades, quotes, + on='time') + assert_frame_equal(result, expected) + + def test_valid_join_keys(self): + + trades = self.trades + quotes = self.quotes + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_on='time', + right_on='bid', + by='ticker') + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on=['time', 'ticker'], + by='ticker') + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + by='ticker') + + def test_with_duplicates(self): + + q = pd.concat([self.quotes, self.quotes]).sort_values( + ['time', 'ticker']).reset_index(drop=True) + result = merge_asof(self.trades, q, + on='time', + by='ticker') + expected = self.read_data('asof.csv') + assert_frame_equal(result, expected) + + result = merge_asof(self.trades, q, + on='time', + by='ticker', + check_duplicates=False) + expected = self.read_data('asof.csv') + expected = pd.concat([expected, expected]).sort_values( + ['time', 'ticker']).reset_index(drop=True) + + # the results are not ordered in a meaningful way + # nor are the exact matches duplicated, so comparisons + # are pretty tricky here, however the uniques are the same + + def aligner(x, ticker): + return (x[x.ticker == ticker] + .sort_values(['time', 'ticker', 'quantity', 'price', + 'marketCenter', 'bid', 'ask']) + .drop_duplicates(keep='last') + .reset_index(drop=True) + ) + + for ticker in expected.ticker.unique(): + r = aligner(result, ticker) + e = aligner(expected, ticker) + assert_frame_equal(r, e) + + def test_with_duplicates_no_on(self): + + df1 = pd.DataFrame({'key': [1, 1, 3], + 'left_val': [1, 2, 3]}) + df2 = pd.DataFrame({'key': [1, 3, 3], + 'right_val': [1, 2, 3]}) + result = merge_asof(df1, df2, on='key', check_duplicates=False) + expected = pd.DataFrame({'key': [1, 1, 3, 3], + 'left_val': [1, 2, 3, 3], + 'right_val': [1, 1, 2, 3]}) + assert_frame_equal(result, expected) + + df1 = pd.DataFrame({'key': [1, 1, 3], + 'left_val': [1, 2, 3]}) + df2 = pd.DataFrame({'key': [1, 2, 2], + 'right_val': [1, 2, 3]}) + result = merge_asof(df1, df2, on='key') + expected = pd.DataFrame({'key': [1, 1, 3], + 'left_val': [1, 2, 3], + 'right_val': [1, 1, 3]}) + assert_frame_equal(result, expected) + + def test_valid_allow_exact_matches(self): + + trades = self.trades + quotes = self.quotes + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on='time', + by='ticker', + allow_exact_matches='foo') + + def test_valid_tolerance(self): + + trades = self.trades + quotes = self.quotes + + # dti + merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=Timedelta('1s')) + + # integer + merge_asof(trades.reset_index(), quotes.reset_index(), + on='index', + by='ticker', + tolerance=1) + + 
# incompat + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=1) + + # invalid + with self.assertRaises(MergeError): + merge_asof(trades.reset_index(), quotes.reset_index(), + on='index', + by='ticker', + tolerance=1.0) + + # invalid negative + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=-Timedelta('1s')) + + with self.assertRaises(MergeError): + merge_asof(trades.reset_index(), quotes.reset_index(), + on='index', + by='ticker', + tolerance=-1) + + def test_non_sorted(self): + + trades = self.trades.sort_values('time', ascending=False) + quotes = self.quotes.sort_values('time', ascending=False) + + # we require that we are already sorted on time & quotes + self.assertFalse(trades.time.is_monotonic) + self.assertFalse(quotes.time.is_monotonic) + with self.assertRaises(ValueError): + merge_asof(trades, quotes, + on='time', + by='ticker') + + trades = self.trades.sort_values('time') + self.assertTrue(trades.time.is_monotonic) + self.assertFalse(quotes.time.is_monotonic) + with self.assertRaises(ValueError): + merge_asof(trades, quotes, + on='time', + by='ticker') + + quotes = self.quotes.sort_values('time') + self.assertTrue(trades.time.is_monotonic) + self.assertTrue(quotes.time.is_monotonic) + + # ok, though has dupes + merge_asof(trades, self.quotes, + on='time', + by='ticker') + + def test_tolerance(self): + + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=Timedelta('1day')) + expected = self.tolerance + assert_frame_equal(result, expected) + + def test_allow_exact_matches(self): + + result = merge_asof(self.trades, self.quotes, + on='time', + by='ticker', + allow_exact_matches=False) + expected = self.allow_exact_matches + assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance(self): + + result = merge_asof(self.trades, self.quotes, + on='time', + by='ticker', + tolerance=Timedelta('100ms'), + allow_exact_matches=False) + expected = self.allow_exact_matches_and_tolerance + assert_frame_equal(result, expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_ordered_merge.py b/pandas/tools/tests/test_merge_ordered.py similarity index 85% rename from pandas/tools/tests/test_ordered_merge.py rename to pandas/tools/tests/test_merge_ordered.py index 53f00d9761f32..0511a0ca6d1cf 100644 --- a/pandas/tools/tests/test_ordered_merge.py +++ b/pandas/tools/tests/test_merge_ordered.py @@ -1,7 +1,7 @@ import nose import pandas as pd -from pandas import DataFrame, ordered_merge +from pandas import DataFrame, merge_ordered from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -17,10 +17,15 @@ def setUp(self): self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3., 4]}) + def test_deprecation(self): + + with tm.assert_produces_warning(FutureWarning): + pd.ordered_merge(self.left, self.right, on='key') + # GH #813 def test_basic(self): - result = ordered_merge(self.left, self.right, on='key') + result = merge_ordered(self.left, self.right, on='key') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], 'lvalue': [1, nan, 2, nan, 3, nan], 'rvalue': [nan, 1, 2, 3, nan, 4]}) @@ -28,7 +33,7 @@ def test_basic(self): assert_frame_equal(result, expected) def test_ffill(self): - result = ordered_merge( + result = merge_ordered( 
self.left, self.right, on='key', fill_method='ffill') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], 'lvalue': [1., 1, 2, 2, 3, 3.], @@ -42,7 +47,7 @@ def test_multigroup(self): left['group'] = ['a'] * 3 + ['b'] * 3 # right['group'] = ['a'] * 4 + ['b'] * 4 - result = ordered_merge(left, self.right, on='key', left_by='group', + result = merge_ordered(left, self.right, on='key', left_by='group', fill_method='ffill') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, 'lvalue': [1., 1, 2, 2, 3, 3.] * 2, @@ -51,11 +56,11 @@ def test_multigroup(self): assert_frame_equal(result, expected.ix[:, result.columns]) - result2 = ordered_merge(self.right, left, on='key', right_by='group', + result2 = merge_ordered(self.right, left, on='key', right_by='group', fill_method='ffill') assert_frame_equal(result, result2.ix[:, result.columns]) - result = ordered_merge(left, self.right, on='key', left_by='group') + result = merge_ordered(left, self.right, on='key', left_by='group') self.assertTrue(result['group'].notnull().all()) def test_merge_type(self): diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 0b91fd1ef1c02..bb5429b5e8836 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -216,8 +216,7 @@ def test_label_formatting(self): def test_qcut_binning_issues(self): # #1978, 1979 - path = os.path.join(curpath(), 'cut_data.csv') - + path = os.path.join(tm.get_data_path(), 'cut_data.csv') arr = np.loadtxt(path) result = qcut(arr, 20) diff --git a/setup.py b/setup.py index 1d189364239a9..adea92896d382 100755 --- a/setup.py +++ b/setup.py @@ -591,6 +591,7 @@ def pxd(name): 'tests/data/*.xlsx', 'tests/data/*.xlsm', 'tests/data/*.table', + 'tests/tools/data/*.csv', 'tests/parser/data/*.csv', 'tests/parser/data/*.gz', 'tests/parser/data/*.bz2', From b06bc7ab10552079b5300f53ae9458b5b1583402 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Jun 2016 14:29:03 -0400 Subject: [PATCH 038/359] BLD: revert to using an always current conda closes #13444 BLD: have a correct channel priorty --- appveyor.yml | 10 +++++++--- ci/install_travis.sh | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 13a7ddbf0dfd7..0af538aed40dc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -60,11 +60,15 @@ install: # install our build environment - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false - #- cmd: conda update -q conda - - cmd: conda install conda==4.0.8 - - cmd: conda config --add channels http://conda.anaconda.org/pandas + - cmd: conda update -q conda - cmd: conda config --set ssl_verify false + # add the pandas channel *before* defaults to have defaults take priority + - cmd: conda config --add channels pandas + - cmd: conda config --remove channels defaults + - cmd: conda config --add channels defaults + - cmd: conda install anaconda-client + # this is now the downloaded conda... 
- cmd: conda info -a diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 8be117cbecc05..b490699460622 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -68,6 +68,7 @@ python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" [ "$python_major_version" == "2" ] && python_major_version="" # install miniconda +echo "install miniconda" if [ "${TRAVIS_OS_NAME}" == "osx" ]; then wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 else @@ -75,12 +76,18 @@ else fi bash miniconda.sh -b -p $HOME/miniconda || exit 1 -conda config --set always_yes yes --set changeps1 no || exit 1 - -# fix the conda version -conda install conda==4.0.8 -conda config --add channels http://conda.anaconda.org/pandas || exit 1 +echo "update conda" conda config --set ssl_verify false || exit 1 +conda config --set always_yes true --set changeps1 false || exit 1 +conda update -q conda + +# add the pandas channel *before* defaults to have defaults take priority +echo "add channels" +conda config --add channels pandas || exit 1 +conda config --remove channels defaults || exit 1 +conda config --add channels defaults || exit 1 + +conda install anaconda-client # Useful for debugging any issues with conda conda info -a || exit 1 From 013c2ce760083e5454552ad44bba8d2f44da2caa Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Jun 2016 08:05:54 -0400 Subject: [PATCH 039/359] DOC: follow on merge_asof closes #13358 --- doc/source/merging.rst | 132 ++++++++++++++------------------ doc/source/whatsnew/v0.18.2.txt | 5 +- pandas/tools/merge.py | 19 +++-- 3 files changed, 73 insertions(+), 83 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 74871fe68fc08..c629c4e5ea7f7 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -78,7 +78,7 @@ some configurable handling of "what to do with the other axes": :: pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False) + keys=None, levels=None, names=None, verify_integrity=False) - ``objs``: a sequence or mapping of Series, DataFrame, or Panel objects. If a dict is passed, the sorted keys will be used as the `keys` argument, unless @@ -510,48 +510,45 @@ standard database join operations between DataFrame objects: :: - merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False) - -Here's a description of what each argument is for: - - - ``left``: A DataFrame object - - ``right``: Another DataFrame object - - ``on``: Columns (names) to join on. Must be found in both the left and - right DataFrame objects. If not passed and ``left_index`` and - ``right_index`` are ``False``, the intersection of the columns in the - DataFrames will be inferred to be the join keys - - ``left_on``: Columns from the left DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame - - ``right_on``: Columns from the right DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame - - ``left_index``: If ``True``, use the index (row labels) from the left - DataFrame as its join key(s). 
In the case of a DataFrame with a MultiIndex - (hierarchical), the number of levels must match the number of join keys - from the right DataFrame - - ``right_index``: Same usage as ``left_index`` for the right DataFrame - - ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults - to ``inner``. See below for more detailed description of each method - - ``sort``: Sort the result DataFrame by the join keys in lexicographical - order. Defaults to ``True``, setting to ``False`` will improve performance - substantially in many cases - - ``suffixes``: A tuple of string suffixes to apply to overlapping - columns. Defaults to ``('_x', '_y')``. - - ``copy``: Always copy data (default ``True``) from the passed DataFrame - objects, even when reindexing is not necessary. Cannot be avoided in many - cases but may improve performance / memory usage. The cases where copying - can be avoided are somewhat pathological but this option is provided - nonetheless. - - ``indicator``: Add a column to the output DataFrame called ``_merge`` - with information on the source of each row. ``_merge`` is Categorical-type - and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame, and ``both`` if the - observation's merge key is found in both. - - .. versionadded:: 0.17.0 - + pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True, indicator=False) + +- ``left``: A DataFrame object +- ``right``: Another DataFrame object +- ``on``: Columns (names) to join on. Must be found in both the left and + right DataFrame objects. If not passed and ``left_index`` and + ``right_index`` are ``False``, the intersection of the columns in the + DataFrames will be inferred to be the join keys +- ``left_on``: Columns from the left DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame +- ``right_on``: Columns from the right DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame +- ``left_index``: If ``True``, use the index (row labels) from the left + DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex + (hierarchical), the number of levels must match the number of join keys + from the right DataFrame +- ``right_index``: Same usage as ``left_index`` for the right DataFrame +- ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults + to ``inner``. See below for more detailed description of each method +- ``sort``: Sort the result DataFrame by the join keys in lexicographical + order. Defaults to ``True``, setting to ``False`` will improve performance + substantially in many cases +- ``suffixes``: A tuple of string suffixes to apply to overlapping + columns. Defaults to ``('_x', '_y')``. +- ``copy``: Always copy data (default ``True``) from the passed DataFrame + objects, even when reindexing is not necessary. Cannot be avoided in many + cases but may improve performance / memory usage. The cases where copying + can be avoided are somewhat pathological but this option is provided + nonetheless. +- ``indicator``: Add a column to the output DataFrame called ``_merge`` + with information on the source of each row. 
``_merge`` is Categorical-type + and takes on a value of ``left_only`` for observations whose merge key + only appears in ``'left'`` DataFrame, ``right_only`` for observations whose + merge key only appears in ``'right'`` DataFrame, and ``both`` if the + observation's merge key is found in both. + + .. versionadded:: 0.17.0 The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be @@ -573,11 +570,11 @@ terminology used to describe join operations between two SQL-table like structures (DataFrame objects). There are several cases to consider which are very important to understand: - - **one-to-one** joins: for example when joining two DataFrame objects on - their indexes (which must contain unique values) - - **many-to-one** joins: for example when joining an index (unique) to one or - more columns in a DataFrame - - **many-to-many** joins: joining columns on columns. +- **one-to-one** joins: for example when joining two DataFrame objects on + their indexes (which must contain unique values) +- **many-to-one** joins: for example when joining an index (unique) to one or + more columns in a DataFrame +- **many-to-many** joins: joining columns on columns. .. note:: @@ -714,15 +711,15 @@ The merge indicator .. ipython:: python - df1 = DataFrame({'col1':[0,1], 'col_left':['a','b']}) - df2 = DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) - merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({'col1': [0, 1], 'col_left':['a', 'b']}) + df2 = pd.DataFrame({'col1': [1, 2, 2],'col_right':[2, 2, 2]}) + pd.merge(df1, df2, on='col1', how='outer', indicator=True) The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python - merge(df1, df2, on='col1', how='outer', indicator='indicator_column') + pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') .. _merging.join.index: @@ -924,7 +921,7 @@ a level name of the multi-indexed frame. left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, - index=Index(['K0', 'K1', 'K2'], name='key')) + index=pd.Index(['K0', 'K1', 'K2'], name='key')) index = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), ('K2', 'Y2'), ('K2', 'Y3')], @@ -1116,28 +1113,20 @@ Timeseries friendly merging Merging Ordered Data ~~~~~~~~~~~~~~~~~~~~ -The ``pd.merge_ordered()`` function allows combining time series and other +A :func:`pd.merge_ordered` function allows combining time series and other ordered data. In particular it has an optional ``fill_method`` keyword to fill/interpolate missing data: .. ipython:: python - left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) - - right = DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) + left = pd.DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], + 'lv': [1, 2, 3, 4], + 's': ['a', 'b', 'c', 'd']}) - result = pd.merge_ordered(left, right, fill_method='ffill', left_by='s') - -.. ipython:: python - :suppress: + right = pd.DataFrame({'k': ['K1', 'K2', 'K4'], + 'rv': [1, 2, 3]}) - @savefig merging_ordered_merge.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=True); - plt.close('all'); + pd.merge_ordered(left, right, fill_method='ffill', left_by='s') .. _merging.merge_asof: @@ -1146,12 +1135,7 @@ Merging AsOf .. 
versionadded:: 0.18.2

-An ``pd.merge_asof()`` this is similar to an ordered left-join except that we
-match on nearest key rather than equal keys.
-
-For each row in the ``left`` DataFrame, we select the last row in the ``right``
-DataFrame whose ``on`` key is less than the left's key. Both DataFrames must
-be sorted by the key.
+A :func:`pd.merge_asof` is similar to an ordered left-join except that we match on nearest key rather than equal keys. For each row in the ``left`` DataFrame, we select the last row in the ``right`` DataFrame whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key.

 Optionally an asof merge can perform a group-wise merge. This matches the
 ``by`` key equally, in addition to the nearest match on the ``on`` key.
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 6bc152aad6b01..db2bccf6ac349 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -9,6 +9,7 @@ We recommend that all users upgrade to this version.

 Highlights include:

+- ``pd.merge_asof()`` for asof-style time-series joining, see :ref:`here `

 .. contents:: What's new in v0.18.2
     :local:
@@ -28,7 +29,7 @@ A long-time requested feature has been added through the :func:`merge_asof` func
 support asof style joining of time-series. (:issue:`1870`). Full documentation is
 :ref:`here `

-The :func:`merge_asof`` performs an asof merge, which is similar to a left-join
+The :func:`merge_asof` performs an asof merge, which is similar to a left-join
 except that we match on nearest key rather than equal keys.

 .. ipython:: python

@@ -108,7 +109,7 @@ that forward filling happens automatically taking the most recent non-NaN value.
        by='ticker')

 This returns a merged DataFrame with the entries in the same order as the original left
-passed DataFrame (``trades`` in this case). With the fields of the ``quotes`` merged.
+passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged.

 .. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support:

diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index f963a271a767e..a8f5af8dbddba 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -243,6 +243,8 @@ def _merger(x, y):
     result = _merger(left, right)
     return result

+ordered_merge.__doc__ = merge_ordered.__doc__
+

 def merge_asof(left, right, on=None,
                left_on=None, right_on=None,
@@ -335,7 +337,7 @@ def merge_asof(left, right, on=None,
     1   5        b       3.0
     2  10        c       7.0

-    For this example, we can achieve a similar result thru pd.merge_ordered,
+    For this example, we can achieve a similar result through ``pd.merge_ordered()``,
     though it's not nearly as performant.


@@ -348,7 +350,7 @@ def merge_asof(left, right, on=None,
     3   5        b       3.0
     6  10        c       7.0

-    Here is a real-worth times-series example
+    Here is a real-world time-series example

    >>> quotes
       time ticker bid ask
@@ -369,7 +371,8 @@ def merge_asof(left, right, on=None,
    3 2016-05-25 13:30:00.048 GOOG 720.92 100
    4 2016-05-25 13:30:00.048 AAPL 98.00 100

-    # by default we are taking the asof of the quotes
+    By default we are taking the asof of the quotes
+
    >>> pd.merge_asof(trades, quotes,
    ...                       on='time',
    ...                       by='ticker')
@@ -380,7 +383,8 @@ def merge_asof(left, right, on=None,
    3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
    4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN

-    # we only asof within 2ms betwen the quote time and the trade time
+    We only asof within 2ms between the quote time and the trade time
+
    >>> pd.merge_asof(trades, quotes,
    ...                       on='time',
    ...                       
by='ticker',
@@ -392,9 +396,10 @@ def merge_asof(left, right, on=None,
    3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
    4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN

-    # we only asof within 10ms betwen the quote time and the trade time
-    # and we exclude exact matches on time. However *prior* data will
-    # propogate forward
+    We only asof within 10ms between the quote time and the trade time
+    and we exclude exact matches on time. However *prior* data will
+    propagate forward
+
    >>> pd.merge_asof(trades, quotes,
    ...                       on='time',
    ...                       by='ticker',
From 883df65b095709e2146b1f36c9697b92eb913731 Mon Sep 17 00:00:00 2001
From: gfyoung 
Date: Fri, 17 Jun 2016 12:28:09 -0400
Subject: [PATCH 040/359] BUG: Fix inconsistent C engine quoting behaviour

1) Add significant testing to quoting in read_csv

2) Fix bug in C engine in which a NULL `quotechar` would raise even though `quoting=csv.QUOTE_NONE`.

3) Fix bug in C engine in which `quoting=csv.QUOTE_NONNUMERIC` wouldn't cause non-quoted fields to be cast to `float`.

Author: gfyoung 

Closes #13411 from gfyoung/quoting-read-csv-tests and squashes the following commits:

0e791a5 [gfyoung] BUG: Fix inconsistent C engine quoting behaviour
---
 doc/source/io.rst                      |   5 +-
 doc/source/merging.rst                 |   4 +-
 doc/source/whatsnew/v0.18.2.txt        |  24 +++--
 pandas/io/parsers.py                   |   3 +-
 pandas/io/tests/parser/quoting.py      | 140 +++++++++++++++++++++++++
 pandas/io/tests/parser/test_parsers.py |   3 +-
 pandas/parser.pyx                      |  42 ++++++--
 pandas/tools/merge.py                  |   5 +-
 8 files changed, 196 insertions(+), 30 deletions(-)
 create mode 100644 pandas/io/tests/parser/quoting.py

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 61625104f5c1d..b011072d8c3fb 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -287,11 +287,10 @@ lineterminator : str (length 1), default ``None``
 quotechar : str (length 1)
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
-quoting : int or ``csv.QUOTE_*`` instance, default ``None``
+quoting : int or ``csv.QUOTE_*`` instance, default ``0``
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or
-   ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL``
-   behavior.
+   ``QUOTE_NONE`` (3).
 doublequote : boolean, default ``True``
    When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
    indicate whether or not to interpret two consecutive ``quotechar`` elements
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index c629c4e5ea7f7..b69d0d8ba3015 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -1113,7 +1113,7 @@ Timeseries friendly merging
 Merging Ordered Data
 ~~~~~~~~~~~~~~~~~~~~

-A :func:`pd.merge_ordered` function allows combining time series and other
+A :func:`merge_ordered` function allows combining time series and other
 ordered data. In particular it has an optional ``fill_method`` keyword to
 fill/interpolate missing data:

@@ -1135,7 +1135,7 @@ Merging AsOf

 .. versionadded:: 0.18.2

-A :func:`pd.merge_asof` is similar to an ordered left-join except that we match on nearest key rather than equal keys. 
For each row in the ``left`` DataFrame, we select the last row in the ``right`` DataFrame whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. Optionally an asof merge can perform a group-wise merge. This matches the ``by`` key equally, in addition to the nearest match on the ``on`` key. diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index db2bccf6ac349..c0251f7170534 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -9,7 +9,7 @@ We recommend that all users upgrade to this version. Highlights include: - - ``pd.merge_asof()`` for asof-style time-series joining, see :ref:`here ` +- :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` .. contents:: What's new in v0.18.2 :local: @@ -22,8 +22,8 @@ New features .. _whatsnew_0182.enhancements.asof_merge: -``pd.merge_asof()`` for asof-style time-series joining -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:func:`merge_asof` for asof-style time-series joining +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to support asof style joining of time-series. (:issue:`1870`). Full documentation is @@ -113,10 +113,10 @@ passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` me .. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: -``pd.read_csv`` has improved support for duplicate column names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:func:`read_csv` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:ref:`Duplicate column names ` are now supported in ``pd.read_csv()`` whether +:ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) .. ipython :: python @@ -187,7 +187,7 @@ Other enhancements - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) -- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) ` (:issue:`10008`, :issue:`13156`) +- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) .. ipython:: python @@ -406,6 +406,8 @@ New Behavior: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) +Furthermore: + - Passing duplicated ``percentiles`` will now raise a ``ValueError``. 
- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) @@ -462,7 +464,7 @@ Bug Fixes - Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) -- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) +- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) @@ -473,9 +475,9 @@ Bug Fixes - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) -- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`) +- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) @@ -493,6 +495,8 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) +- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) +- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 475eb73812666..9baff67845dac 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -202,10 +202,9 @@ quotechar : str (length 1), optional The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. -quoting : int or csv.QUOTE_* instance, default None +quoting : int or csv.QUOTE_* instance, default 0 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). - Default (None) results in QUOTE_MINIMAL behavior. 
doublequote : boolean, default ``True`` When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate whether or not to interpret two consecutive quotechar elements INSIDE a diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py new file mode 100644 index 0000000000000..d0f1493be0621 --- /dev/null +++ b/pandas/io/tests/parser/quoting.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +""" +Tests that quoting specifications are properly handled +during parsing for all of the parsers defined in parsers.py +""" + +import csv +import pandas.util.testing as tm + +from pandas import DataFrame +from pandas.compat import StringIO + + +class QuotingTests(object): + + def test_bad_quote_char(self): + data = '1,2,3' + + # Python 2.x: "...must be an 1-character..." + # Python 3.x: "...must be a 1-character..." + msg = '"quotechar" must be a(n)? 1-character string' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar='foo') + + msg = 'quotechar must be set if quoting enabled' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar=None, + quoting=csv.QUOTE_MINIMAL) + + msg = '"quotechar" must be string, not int' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar=2) + + def test_bad_quoting(self): + data = '1,2,3' + + msg = '"quoting" must be an integer' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quoting='foo') + + # quoting must in the range [0, 3] + msg = 'bad "quoting" value' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quoting=5) + + def test_quote_char_basic(self): + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, 'cat']], + columns=['a', 'b', 'c']) + result = self.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + + def test_quote_char_various(self): + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, 'cat']], + columns=['a', 'b', 'c']) + quote_chars = ['~', '*', '%', '$', '@', 'P'] + + for quote_char in quote_chars: + new_data = data.replace('"', quote_char) + result = self.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + + def test_null_quote_char(self): + data = 'a,b,c\n1,2,3' + + # sanity checks + msg = 'quotechar must be set if quoting enabled' + + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar=None, + quoting=csv.QUOTE_MINIMAL) + + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar='', + quoting=csv.QUOTE_MINIMAL) + + # no errors should be raised if quoting is None + expected = DataFrame([[1, 2, 3]], + columns=['a', 'b', 'c']) + + result = self.read_csv(StringIO(data), quotechar=None, + quoting=csv.QUOTE_NONE) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), quotechar='', + quoting=csv.QUOTE_NONE) + tm.assert_frame_equal(result, expected) + + def test_quoting_various(self): + data = '1,2,"foo"' + cols = ['a', 'b', 'c'] + + # QUOTE_MINIMAL and QUOTE_ALL apply only to + # the CSV writer, so they should have no + # special effect for the CSV reader + expected = DataFrame([[1, 2, 'foo']], columns=cols) + + # test default (afterwards, arguments are all explicit) + result = self.read_csv(StringIO(data), names=cols) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_MINIMAL, names=cols) + tm.assert_frame_equal(result, expected) + + result = 
self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_ALL, names=cols) + tm.assert_frame_equal(result, expected) + + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone + expected = DataFrame([[1, 2, '"foo"']], columns=cols) + result = self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_NONE, names=cols) + tm.assert_frame_equal(result, expected) + + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols) + result = self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_NONNUMERIC, + names=cols) + tm.assert_frame_equal(result, expected) + + def test_double_quote(self): + data = 'a,b\n3,"4 "" 5"' + + expected = DataFrame([[3, '4 " 5']], + columns=['a', 'b']) + result = self.read_csv(StringIO(data), quotechar='"', + doublequote=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[3, '4 " 5"']], + columns=['a', 'b']) + result = self.read_csv(StringIO(data), quotechar='"', + doublequote=False) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index fda7b28769647..21f903342a611 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -11,6 +11,7 @@ from .common import ParserTests from .header import HeaderTests from .comment import CommentTests +from .quoting import QuotingTests from .usecols import UsecolsTests from .skiprows import SkipRowsTests from .index_col import IndexColTests @@ -28,7 +29,7 @@ class BaseParser(CommentTests, CompressionTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, - UsecolsTests): + UsecolsTests, QuotingTests): def read_csv(self, *args, **kwargs): raise NotImplementedError diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 063b2158d999a..3928bc8472113 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -7,6 +7,7 @@ from libc.string cimport strncpy, strlen, strcmp, strcasecmp cimport libc.stdio as stdio import warnings +from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, PyBytes_Check, PyUnicode_Check, PyUnicode_AsUTF8String) @@ -283,6 +284,7 @@ cdef class TextReader: object compression object mangle_dupe_cols object tupleize_cols + list dtype_cast_order set noconvert, usecols def __cinit__(self, source, @@ -393,8 +395,13 @@ cdef class TextReader: raise ValueError('Only length-1 escapes supported') self.parser.escapechar = ord(escapechar) - self.parser.quotechar = ord(quotechar) - self.parser.quoting = quoting + self._set_quoting(quotechar, quoting) + + # TODO: endianness just a placeholder? 
+ if quoting == QUOTE_NONNUMERIC: + self.dtype_cast_order = [' 1: @@ -548,6 +555,29 @@ cdef class TextReader: def set_error_bad_lines(self, int status): self.parser.error_bad_lines = status + def _set_quoting(self, quote_char, quoting): + if not isinstance(quoting, int): + raise TypeError('"quoting" must be an integer') + + if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: + raise TypeError('bad "quoting" value') + + if not isinstance(quote_char, (str, bytes)) and quote_char is not None: + dtype = type(quote_char).__name__ + raise TypeError('"quotechar" must be string, ' + 'not {dtype}'.format(dtype=dtype)) + + if quote_char is None or quote_char == '': + if quoting != QUOTE_NONE: + raise TypeError("quotechar must be set if quoting enabled") + self.parser.quoting = quoting + self.parser.quotechar = -1 + elif len(quote_char) > 1: # 0-len case handled earlier + raise TypeError('"quotechar" must be a 1-character string') + else: + self.parser.quoting = quoting + self.parser.quotechar = ord(quote_char) + cdef _make_skiprow_set(self): if isinstance(self.skiprows, (int, np.integer)): parser_set_skipfirstnrows(self.parser, self.skiprows) @@ -1066,7 +1096,7 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) else: col_res = None - for dt in dtype_cast_order: + for dt in self.dtype_cast_order: try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_flist) @@ -1847,12 +1877,6 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: return table -# if at first you don't succeed... - -# TODO: endianness just a placeholder? -cdef list dtype_cast_order = ['>> (pd.merge_ordered(left, right, on='a') ... .ffill() From 9d33c7be38fd09fa493c68ab81c50ee7a681de34 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Nimmi Date: Fri, 17 Jun 2016 18:00:14 -0400 Subject: [PATCH 041/359] BUG: fix to_datetime to handle int16 and int8 closes #13451 Author: Ravi Kumar Nimmi Closes #13464 from ravinimmi/bugfix and squashes the following commits: dc4944d [Ravi Kumar Nimmi] BUG: fix to_datetime to handle int16 and int8 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/tseries/tests/test_timeseries.py | 27 +++++++++++++++++++++++++ pandas/tseries/tools.py | 7 ++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index c0251f7170534..8a14765aa6df2 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -504,6 +504,7 @@ Bug Fixes - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) +- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, `int16`` dtypes (:issue:`13451`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index fcc544ec7f239..b0caa1f6a77cb 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2563,6 +2563,33 @@ def test_dataframe(self): with self.assertRaises(ValueError): to_datetime(df2) + def test_dataframe_dtypes(self): + # #13451 + df = DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]}) + + # int16 + result = to_datetime(df.astype('int16')) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # mixed dtypes + df['month'] = df['month'].astype('int8') + df['day'] = df['day'].astype('int8') + result = to_datetime(df) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # float + df = DataFrame({'year': [2000, 2001], + 'month': [1.5, 1], + 'day': [1, 1]}) + with self.assertRaises(ValueError): + to_datetime(df) + class TestDatetimeIndex(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index d5e87d1df2462..efb8590dfccf4 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -508,7 +508,12 @@ def f(value): def coerce(values): # we allow coercion to if errors allows - return to_numeric(values, errors=errors) + values = to_numeric(values, errors=errors) + + # prevent overflow in case of int8 or int16 + if com.is_integer_dtype(values): + values = values.astype('int64', copy=False) + return values values = (coerce(arg[unit_rev['year']]) * 10000 + coerce(arg[unit_rev['month']]) * 100 + From e24ab24df2e092c90ef2f0b49bca9016139ec50c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Jun 2016 21:13:11 -0400 Subject: [PATCH 042/359] BLD: update appveyor build to use numpy 1.11 for 3.5 build BLD: correctly install tools.tests.data closes #13472 --- appveyor.yml | 3 ++- setup.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 0af538aed40dc..c424420dda666 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,8 @@ environment: PYTHON_VERSION: "3.5" PYTHON_ARCH: "64" CONDA_PY: "35" - CONDA_NPY: "110" + CONDA_NPY: "111" + # We always use a 64-bit machine, but can build x86 distributions # with the PYTHON_ARCH variable (which is used by CMD_IN_ENV). 
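A small sketch of the behaviour targeted by the ``pd.to_datetime()`` overflow fix a few hunks above (GH 13451), mirroring the new ``test_dataframe_dtypes`` case:

    import pandas as pd

    df = pd.DataFrame({'year': [2015, 2016],
                       'month': [2, 3],
                       'day': [4, 5]}).astype('int16')

    # year*10000 + month*100 + day is assembled internally; upcasting the
    # coerced columns to int64 first keeps that arithmetic from wrapping
    # around in the narrow integer dtype.
    print(pd.to_datetime(df))  # 2015-02-04 and 2016-03-05

Without the upcast, the intermediate ``2015 * 10000`` does not fit in ``int16``, so the assembled dates would come out wrong.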
diff --git a/setup.py b/setup.py index adea92896d382..8f8865ecc3b7a 100755 --- a/setup.py +++ b/setup.py @@ -591,7 +591,6 @@ def pxd(name): 'tests/data/*.xlsx', 'tests/data/*.xlsm', 'tests/data/*.table', - 'tests/tools/data/*.csv', 'tests/parser/data/*.csv', 'tests/parser/data/*.gz', 'tests/parser/data/*.bz2', @@ -602,7 +601,7 @@ def pxd(name): 'tests/data/*.html', 'tests/data/html_encoding/*.html', 'tests/json/data/*.json'], - 'pandas.tools': ['tests/*.csv'], + 'pandas.tools': ['tests/data/*.csv'], 'pandas.tests': ['data/*.pickle', 'data/*.csv'], 'pandas.tests.formats': ['data/*.csv'], From 35bb1a1c2d915d862ca0daadbe1d32180a998ccf Mon Sep 17 00:00:00 2001 From: cmazzullo Date: Sat, 18 Jun 2016 11:22:02 -0400 Subject: [PATCH 043/359] BUG: df.pivot_table: margins_name ignored when aggfunc is a list closes #13354 Author: cmazzullo Closes #13435 from cmazzullo/pivot_table and squashes the following commits: 6017e25 [cmazzullo] Fixed up whitespace for pep8 b4b09bf [cmazzullo] Compared whole dataframes in test_pivot_table_margins_name_with_aggfunc_list 98ab8be [cmazzullo] Added doublequotes to variable names 669931f [cmazzullo] Added unit test for pivot_table margins_name when aggfunc is a list 0dcb78f [cmazzullo] Switched documentation to v0.18.2 d6a6036 [cmazzullo] BUG: df.pivot_table: margins_name is ignored when there aggfunc is list #13354 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/tools/pivot.py | 2 +- pandas/tools/tests/test_pivot.py | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 8a14765aa6df2..575a1f3ddcfc2 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -497,6 +497,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) +- Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index a4e6cc404a457..e1405bc9e6add 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -86,7 +86,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, - margins=margins) + margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 82feaae13f771..7ec4018d301af 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -779,6 +779,28 @@ def test_pivot_table_with_iterator_values(self): ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) + def test_pivot_table_margins_name_with_aggfunc_list(self): + # GH 13354 + margins_name = 'Weekly' + costs = pd.DataFrame( + {'item': ['bacon', 'cheese', 'bacon', 'cheese'], + 'cost': [2.5, 4.5, 3.2, 3.3], + 'day': ['M', 'M', 'T', 'T']} + ) + table = costs.pivot_table( + index="item", columns="day", margins=True, + margins_name=margins_name, aggfunc=[np.mean, max] + ) + ix = pd.Index( + ['bacon', 'cheese', 
margins_name], dtype='object', name='item' + ) + tups = [('mean', 'cost', 'M'), ('mean', 'cost', 'T'), + ('mean', 'cost', margins_name), ('max', 'cost', 'M'), + ('max', 'cost', 'T'), ('max', 'cost', margins_name)] + cols = pd.MultiIndex.from_tuples(tups, names=[None, None, 'day']) + expected = pd.DataFrame(table.values, index=ix, columns=cols) + tm.assert_frame_equal(table, expected) + class TestCrosstab(tm.TestCase): From 20dd17a70063387ccca5f7aca3dbefc904d99457 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 18 Jun 2016 23:25:48 +0200 Subject: [PATCH 044/359] DOC/API: Styler documentation changes Closes https://github.com/pydata/pandas/issues/13222 - Update docstring and notebook on output shapes matching - Change build process to use nbconvert to execute the notebook and then convert using NBConvert. Second commit puts some more restrictions on the output of a user's function passed to `.apply`. Author: Tom Augspurger Closes #13225 from TomAugspurger/styler-doc and squashes the following commits: a7ff39c [Tom Augspurger] format error messages 3ac4895 [Tom Augspurger] TST: Additional Styler init and repr tests 9093df8 [Tom Augspurger] DOC: Styler documentation changes --- doc/make.py | 93 +- doc/source/contributing.rst | 4 +- doc/source/html-styling.ipynb | 18044 +-------------------------- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/formats/style.py | 56 +- pandas/tests/formats/test_style.py | 44 +- 6 files changed, 266 insertions(+), 17976 deletions(-) diff --git a/doc/make.py b/doc/make.py index c09514d758833..05bf618ee677e 100755 --- a/doc/make.py +++ b/doc/make.py @@ -18,13 +18,16 @@ """ from __future__ import print_function -import glob +import io +import glob # noqa import os import shutil import sys -import sphinx +from contextlib import contextmanager + +import sphinx # noqa import argparse -import jinja2 +import jinja2 # noqa os.environ['PYTHONPATH'] = '..' @@ -102,10 +105,90 @@ def clean(): shutil.rmtree('source/generated') +@contextmanager +def cleanup_nb(nb): + try: + yield + finally: + try: + os.remove(nb + '.executed') + except OSError: + pass + + +def execute_nb(src, dst, allow_errors=False, timeout=1000, kernel_name=''): + """ + Execute notebook in `src` and write the output to `dst` + + Parameters + ---------- + src, dst: str + path to notebook + allow_errors: bool + timeout: int + kernel_name: str + defualts to value set in notebook metadata + + Returns + ------- + dst: str + """ + import nbformat + from nbconvert.preprocessors import ExecutePreprocessor + + with io.open(src, encoding='utf-8') as f: + nb = nbformat.read(f, as_version=4) + + ep = ExecutePreprocessor(allow_errors=allow_errors, + timeout=timeout, + kernel_name=kernel_name) + ep.preprocess(nb, resources={}) + + with io.open(dst, 'wt', encoding='utf-8') as f: + nbformat.write(nb, f) + return dst + + +def convert_nb(src, dst, to='html', template_file='basic'): + """ + Convert a notebook `src`. + + Parameters + ---------- + src, dst: str + filepaths + to: {'rst', 'html'} + format to export to + template_file: str + name of template file to use. 
Default 'basic' + """ + from nbconvert import HTMLExporter, RSTExporter + + dispatch = {'rst': RSTExporter, 'html': HTMLExporter} + exporter = dispatch[to.lower()](template_file=template_file) + + (body, resources) = exporter.from_filename(src) + with io.open(dst, 'wt', encoding='utf-8') as f: + f.write(body) + return dst + + def html(): check_build() - os.system('jupyter nbconvert --to=html --template=basic ' - '--output=source/html-styling.html source/html-styling.ipynb') + + notebooks = [ + 'source/html-styling.ipynb', + ] + + for nb in notebooks: + with cleanup_nb(nb): + try: + print("Converting %s" % nb) + executed = execute_nb(nb, nb + '.executed', allow_errors=True) + convert_nb(executed, nb.rstrip('.ipynb') + '.html') + except ImportError: + pass + if os.system('sphinx-build -P -b html -d build/doctrees ' 'source build/html'): raise SystemExit("Building HTML failed.") diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 3d39656faafa4..6e0c747cd06fc 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -360,7 +360,7 @@ code. It is easiest to :ref:`create a development environment `, then install:: - conda install -n pandas_dev sphinx ipython + conda install -n pandas_dev sphinx ipython nbconvert nbformat Furthermore, it is recommended to have all `optional dependencies `_ @@ -369,6 +369,8 @@ messages when building the docs. This happens because all the code in the docume is executed during the doc build, and so code examples using optional dependencies will generate errors. Run ``pd.show_versions()`` to get an overview of the installed version of all dependencies. +`nbconvert `_ and `nbformat `_ are required to build the Jupyter notebooks +included in the documentation. .. warning:: diff --git a/doc/source/html-styling.ipynb b/doc/source/html-styling.ipynb index 77813a03c704a..8668ee3de7470 100644 --- a/doc/source/html-styling.ipynb +++ b/doc/source/html-styling.ipynb @@ -47,14 +47,16 @@ "`Styler.apply` passes each column or row into your DataFrame one-at-a-time or the entire table at once, depending on the `axis` keyword argument.\n", "For columnwise use `axis=0`, rowwise use `axis=1`, and for the entire table at once use `axis=None`.\n", "\n", - "The result of the function application, a CSS attribute-value pair, is stored in an internal dictionary on your ``Styler`` object.\n", + "For `Styler.applymap` your function should take a scalar and return a single string with the CSS attribute-value pair.\n", + "\n", + "For `Styler.apply` your function should take a Series or DataFrame (depending on the axis parameter), and return a Series or DataFrame with an identical shape where each value is a string with a CSS attribute-value pair.\n", "\n", "Let's see some examples." 
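A compact, stand-alone sketch of the shape rules stated above, reusing the notebook's ``color_negative_red``/``highlight_max`` style functions (assuming a pandas version with the ``Styler`` API; the random data is illustrative only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'))

    # applymap: scalar in -> a single 'attribute: value' CSS string out.
    def color_negative_red(val):
        return 'color: red' if val < 0 else 'color: black'

    # apply (axis=0): Series in -> same-length sequence of CSS strings out.
    def highlight_max(s):
        is_max = s == s.max()
        return ['background-color: yellow' if m else '' for m in is_max]

    html = (df.style
              .applymap(color_negative_red)
              .apply(highlight_max)
              .render())  # the rendered HTML shown in the notebook output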
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": false }, @@ -79,293 +81,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style" ] @@ -381,31 +101,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " ' \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "s = df.style.applymap(color_negative_red)\n", "s" @@ -1052,7 +170,9 @@ "source": [ "Notice the similarity with the standard `df.applymap`, which operates on DataFrames elementwise. We want you to be able to resuse your existing knowledge of how to interact with DataFrames.\n", "\n", - "Notice also that our function returned a string containing the CSS attribute and value, separated by a colon just like in a `\n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.apply(highlight_max)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case the input is a `Series`, one column at a time.\n", + "Notice that the output shape of `highlight_max` matches the input shape, an array with `len(s)` items." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1412,693 +228,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.\\\n", " applymap(color_negative_red).\\\n", @@ -2121,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "collapsed": true }, @@ -2141,301 +275,20 @@ " index=data.index, columns=data.columns)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When using ``Styler.apply(func, axis=None)``, the function must return a DataFrame with the same index and column labels." + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.apply(highlight_max, color='darkorange', axis=None)" ] @@ -2451,7 +304,9 @@ "- `Styler.applymap(func)` for elementwise styles\n", "- `Styler.apply(func, axis=0)` for columnwise styles\n", "- `Styler.apply(func, axis=1)` for rowwise styles\n", - "- `Styler.apply(func, axis=None)` for tablewise styles" + "- `Styler.apply(func, axis=None)` for tablewise styles\n", + "\n", + "And crucially the input and output shapes of `func` must match. If `x` is the input then ``func(x).shape == x.shape``." ] }, { @@ -2479,311 +334,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.apply(highlight_max, subset=['B', 'C', 'D'])" ] @@ -2797,341 +352,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.applymap(color_negative_red,\n", " subset=pd.IndexSlice[2:5, ['B', 'D']])" @@ -3162,293 +387,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 100.00%\n", - " \n", - " \n", - " 132.92%\n", - " \n", - " \n", - " nan%\n", - " \n", - " \n", - " -31.63%\n", - " \n", - " \n", - " -99.08%\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 200.00%\n", - " \n", - " \n", - " -107.08%\n", - " \n", - " \n", - " -143.87%\n", - " \n", - " \n", - " 56.44%\n", - " \n", - " \n", - " 29.57%\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 300.00%\n", - " \n", - " \n", - " -162.64%\n", - " \n", - " \n", - " 21.96%\n", - " \n", - " \n", - " 67.88%\n", - " \n", - " \n", - " 188.93%\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 400.00%\n", - " \n", - " \n", - " 96.15%\n", - " \n", - " \n", - " 10.40%\n", - " \n", - " \n", - " -48.12%\n", - " \n", - " \n", - " 85.02%\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 500.00%\n", - " \n", - " \n", - " 145.34%\n", - " \n", - " \n", - " 105.77%\n", - " \n", - " \n", - " 16.56%\n", - " \n", - " \n", - " 51.50%\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 600.00%\n", - " \n", - " \n", - " -133.69%\n", - " \n", - " \n", - " 56.29%\n", - " \n", - " \n", - " 139.29%\n", - " \n", - " \n", - " -6.33%\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 700.00%\n", - " \n", - " \n", - " 12.17%\n", - " \n", - " \n", - " 120.76%\n", - " \n", - " \n", - " -0.20%\n", - " \n", - " \n", - " 162.78%\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 800.00%\n", - " \n", - " \n", - " 35.45%\n", - " \n", - " \n", - " 103.75%\n", - " \n", - " \n", - " -38.57%\n", - " \n", - " \n", - " 51.98%\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 900.00%\n", - " \n", - " \n", - " 168.66%\n", - " \n", - " \n", - " -132.60%\n", - " \n", - " \n", - " 142.90%\n", - " \n", - " \n", - " -208.94%\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 1000.00%\n", - " \n", - " \n", - " -12.98%\n", - " \n", - " \n", - " 63.15%\n", - " \n", - " \n", - " -58.65%\n", - " \n", - " \n", - " 29.07%\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.format(\"{:.2%}\")" ] @@ -3462,293 +405,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1000\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.32\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -100\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " +0.56\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -200\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " +0.68\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 1000\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.48\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1000\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " +0.17\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -100\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " +1.39\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0000\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0000\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.39\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 2000\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " +1.43\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -000\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.format({'B': \"{:0<4.0f}\", 'D': '{:+.2f}'})" ] @@ -3762,293 +423,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " ±1.33\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " ±1.07\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " ±1.63\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " ±0.96\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " ±1.45\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " ±1.34\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " ±0.12\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " ±0.35\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " ±1.69\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " ±0.13\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})" ] @@ -4069,299 +448,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.highlight_null(null_color='red')" ] @@ -4375,593 +466,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import seaborn as sns\n", "\n", @@ -4980,333 +489,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Uses the full color range\n", "df.loc[:4].style.background_gradient(cmap='viridis')" @@ -5314,383 +501,11 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Compreess the color range\n", "(df.loc[:4]\n", @@ -5708,489 +523,11 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.bar(subset=['A', 'B'], color='#d65f5f')" ] @@ -6204,646 +541,22 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.highlight_max(axis=0)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.highlight_min(axis=0)" ] @@ -6857,793 +570,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.set_properties(**{'background-color': 'black',\n", " 'color': 'lawngreen',\n", @@ -7666,593 +597,11 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df2 = -df\n", "style1 = df.style.applymap(color_negative_red)\n", @@ -8261,593 +610,11 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " -1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " 0.31628\n", - " \n", - " \n", - " 0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " -2\n", - " \n", - " \n", - " 1.07082\n", - " \n", - " \n", - " 1.43871\n", - " \n", - " \n", - " -0.564417\n", - " \n", - " \n", - " -0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " -3\n", - " \n", - " \n", - " 1.6264\n", - " \n", - " \n", - " -0.219565\n", - " \n", - " \n", - " -0.678805\n", - " \n", - " \n", - " -1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " -4\n", - " \n", - " \n", - " -0.961538\n", - " \n", - " \n", - " -0.104011\n", - " \n", - " \n", - " 0.481165\n", - " \n", - " \n", - " -0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " -5\n", - " \n", - " \n", - " -1.45342\n", - " \n", - " \n", - " -1.05774\n", - " \n", - " \n", - " -0.165562\n", - " \n", - " \n", - " -0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " -6\n", - " \n", - " \n", - " 1.33694\n", - " \n", - " \n", - " -0.562861\n", - " \n", - " \n", - " -1.39285\n", - " \n", - " \n", - " 0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " -7\n", - " \n", - " \n", - " -0.121668\n", - " \n", - " \n", - " -1.2076\n", - " \n", - " \n", - " 0.00204021\n", - " \n", - " \n", - " -1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " -8\n", - " \n", - " \n", - " -0.354493\n", - " \n", - " \n", - " -1.03753\n", - " \n", - " \n", - " 0.385684\n", - " \n", - " \n", - " -0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " -9\n", - " \n", - " \n", - " -1.68658\n", - " \n", - " \n", - " 1.32596\n", - " \n", - " \n", - " -1.42898\n", - " \n", - " \n", - " 2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " -10\n", - " \n", - " \n", - " 0.12982\n", - " \n", - " \n", - " -0.631523\n", - " \n", - " \n", - " 0.586538\n", - " \n", - " \n", - " -0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "style2 = df2.style\n", "style2.use(style1.export())\n", @@ -8898,693 +665,11 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.3\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.32\n", - " \n", - " \n", - " -0.99\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 0.3\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 0.22\n", - " \n", - " \n", - " 0.68\n", - " \n", - " \n", - " 1.9\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.96\n", - " \n", - " \n", - " 0.1\n", - " \n", - " \n", - " -0.48\n", - " \n", - " \n", - " 0.85\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " 0.17\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -0.063\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.12\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " -0.002\n", - " \n", - " \n", - " 1.6\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.35\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " -0.39\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.7\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -2.1\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.13\n", - " \n", - " \n", - " 0.63\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 0.29\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "with pd.option_context('display.precision', 2):\n", " html = (df.style\n", @@ -9602,693 +687,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.3\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.32\n", - " \n", - " \n", - " -0.99\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 0.3\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 0.22\n", - " \n", - " \n", - " 0.68\n", - " \n", - " \n", - " 1.9\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.96\n", - " \n", - " \n", - " 0.1\n", - " \n", - " \n", - " -0.48\n", - " \n", - " \n", - " 0.85\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " 0.17\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -0.063\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.12\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " -0.002\n", - " \n", - " \n", - " 1.6\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.35\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " -0.39\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.7\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -2.1\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.13\n", - " \n", - " \n", - " 0.63\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 0.29\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style\\\n", " .applymap(color_negative_red)\\\n", @@ -10319,595 +722,11 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Colormaps, with a caption.
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.set_caption('Colormaps, with a caption.')\\\n", " .background_gradient(cmap=cm)" @@ -10931,315 +750,11 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Hover to highlight.
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from IPython.display import HTML\n", "\n", @@ -11312,592 +827,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from IPython.html import widgets\n", "@widgets.interact\n", @@ -11910,7 +844,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": { "collapsed": false }, @@ -11931,6821 +865,11 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
-    [deleted output caption: "Hover to magify"]
\n", - " \n", - " 0\n", - " \n", - " 1\n", - " \n", - " 2\n", - " \n", - " 3\n", - " \n", - " 4\n", - " \n", - " 5\n", - " \n", - " 6\n", - " \n", - " 7\n", - " \n", - " 8\n", - " \n", - " 9\n", - " \n", - " 10\n", - " \n", - " 11\n", - " \n", - " 12\n", - " \n", - " 13\n", - " \n", - " 14\n", - " \n", - " 15\n", - " \n", - " 16\n", - " \n", - " 17\n", - " \n", - " 18\n", - " \n", - " 19\n", - " \n", - " 20\n", - " \n", - " 21\n", - " \n", - " 22\n", - " \n", - " 23\n", - " \n", - " 24\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 0.23\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " -0.84\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " -0.96\n", - " \n", - " \n", - " -0.22\n", - " \n", - " \n", - " -0.62\n", - " \n", - " \n", - " 1.8\n", - " \n", - " \n", - " -2.1\n", - " \n", - " \n", - " 0.87\n", - " \n", - " \n", - " -0.92\n", - " \n", - " \n", - " -0.23\n", - " \n", - " \n", - " 2.2\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 0.076\n", - " \n", - " \n", - " -1.2\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " -0.42\n", - " \n", - " \n", - " 2.3\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " 2.8\n", - " \n", - " \n", - " 0.68\n", - " \n", - " \n", - " -1.6\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " 1.6\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 0.0037\n", - " \n", - " \n", - " -2.5\n", - " \n", - " \n", - " 3.4\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " 1.3\n", - " \n", - " \n", - " -0.52\n", - " \n", - " \n", - " -0.015\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -1.9\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -0.68\n", - " \n", - " \n", - " -0.81\n", - " \n", - " \n", - " 0.35\n", - " \n", - " \n", - " -0.055\n", - " \n", - " \n", - " 1.8\n", - " \n", - " \n", - " -2.8\n", - " \n", - " \n", - " 2.3\n", - " \n", - " \n", - " 0.78\n", - " \n", - " \n", - " 0.44\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " -0.65\n", - " \n", - " \n", - " 3.2\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 0.52\n", - " \n", - " \n", - " 2.2\n", - " \n", - " \n", - " -0.37\n", - " \n", - " \n", - " -3\n", - " \n", - " \n", - " 3.7\n", - " \n", - " \n", - " -1.9\n", - " \n", - " \n", - " 2.5\n", - " \n", - " \n", - " 0.21\n", - " \n", - " \n", - " -0.24\n", - " \n", - " \n", - " -0.1\n", - " \n", - " \n", - " -0.78\n", - " \n", - " \n", - " -3\n", - " \n", - " \n", - " -0.82\n", - " \n", - " \n", - " -0.21\n", - " \n", - " \n", - " -0.23\n", - " \n", - " \n", - " 0.86\n", - " \n", - " \n", - " -0.68\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -4.9\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " 0.61\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 3.7\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " 0.43\n", - " \n", - " \n", - " 4.2\n", - " \n", - " \n", - " -0.43\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " 4.2\n", - " \n", - " \n", - " -2.1\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " 0.12\n", - " \n", - " \n", - " 0.6\n", - " \n", - " \n", - " -0.89\n", - " \n", - " \n", - " 0.27\n", - " \n", - " \n", - " -3.7\n", - " \n", - " \n", - " -2.7\n", - " \n", - " \n", - " -0.31\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 0.91\n", - " \n", - " \n", - " -5.8\n", - " \n", - " \n", - " 2.8\n", - " \n", - " \n", - " 2.1\n", - " \n", - " \n", - " 0.28\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " 4.5\n", - " \n", - " \n", - " -1.9\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " 5.2\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " -3.8\n", - " \n", - " \n", - " 4.7\n", - " \n", - " \n", - " -0.72\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " -0.18\n", - " \n", - " \n", - " 0.83\n", - " \n", - " \n", - " -0.22\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -4.3\n", - " \n", - " \n", - " -2.9\n", - " \n", - " \n", - " -0.97\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 2.2\n", - " \n", - " \n", - " -6.3\n", - " \n", - " \n", - " 3.3\n", - " \n", - " \n", - " 2.5\n", - " \n", - " \n", - " 2.1\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " -0.84\n", - " \n", - " \n", - " 4.2\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " -2\n", - " \n", - " \n", - " 5.3\n", - " \n", - " \n", - " -0.99\n", - " \n", - " \n", - " -4.1\n", - " \n", - " \n", - " 3.9\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -0.94\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " 0.087\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " -0.11\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " -0.85\n", - " \n", - " \n", - " -2.1\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " 0.8\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " -6.5\n", - " \n", - " \n", - " 2.8\n", - " \n", - " \n", - " 2.1\n", - " \n", - " \n", - " 3.8\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " -0.74\n", - " \n", - " \n", - " 5.4\n", - " \n", - " \n", - " -2.1\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " 4.2\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " 3.8\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " -1.2\n", - " \n", - " \n", - " 0.34\n", - " \n", - " \n", - " 0.57\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 0.54\n", - " \n", - " \n", - " -4.4\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " -4\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -0.2\n", - " \n", - " \n", - " -4.7\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " -8.5\n", - " \n", - " \n", - " 3.3\n", - " \n", - " \n", - " 2.5\n", - " \n", - " \n", - " 5.8\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " -0.44\n", - " \n", - " \n", - " 4.7\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " -0.21\n", - " \n", - " \n", - " 5.9\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 5.5\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " 0.18\n", - " \n", - " \n", - " 0.11\n", - " \n", - " \n", - " 0.036\n", - " \n", - " \n", - " -6\n", - " \n", - " \n", - " -0.45\n", - " \n", - " \n", - " -6.2\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " 0.71\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " 0.67\n", - " \n", - " \n", - " -7.3\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " 3.4\n", - " \n", - " \n", - " 6.7\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 0.92\n", - " \n", - " \n", - " 5.8\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " -0.65\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " -1.8\n", - " \n", - " \n", - " 5.6\n", - " \n", - " \n", - " -3.5\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 0.82\n", - " \n", - " \n", - " -2.4\n", - " \n", - " \n", - " -0.4\n", - " \n", - " \n", - " -6.1\n", - " \n", - " \n", - " -0.52\n", - " \n", - " \n", - " -6.6\n", - " \n", - " \n", - " -3.5\n", - " \n", - " \n", - " -0.043\n", - " \n", - " \n", - " -4.6\n", - " \n", - " \n", - " 0.51\n", - " \n", - " \n", - " -5.8\n", - " \n", - " \n", - " 3.2\n", - " \n", - " \n", - " 2.4\n", - " \n", - " \n", - " 5.1\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 0.38\n", - " \n", - " \n", - " 5.5\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " -0.8\n", - " \n", - " \n", - " 7.1\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -0.44\n", - " \n", - " \n", - " 5.3\n", - " \n", - " \n", - " -2\n", - " \n", - " \n", - " -0.33\n", - " \n", - " \n", - " -0.8\n", - " \n", - " \n", - " 0.26\n", - " \n", - " \n", - " -3.4\n", - " \n", - " \n", - " -0.82\n", - " \n", - " \n", - " -6.1\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -8.5\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " 0.41\n", - " \n", - " \n", - " -4.7\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " -6.9\n", - " \n", - " \n", - " 2.1\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " 5.2\n", - " \n", - "
\n", - " 10\n", - " \n", - " \n", - " 2.1\n", - " \n", - " \n", - " 5.8\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " -0.98\n", - " \n", - " \n", - " 7.8\n", - " \n", - " \n", - " -2.5\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 5.6\n", - " \n", - " \n", - " -2.2\n", - " \n", - " \n", - " -0.71\n", - " \n", - " \n", - " -0.46\n", - " \n", - " \n", - " 1.8\n", - " \n", - " \n", - " -2.8\n", - " \n", - " \n", - " 0.48\n", - " \n", - " \n", - " -6\n", - " \n", - " \n", - " -3.4\n", - " \n", - " \n", - " -7.8\n", - " \n", - " \n", - " -5.5\n", - " \n", - " \n", - " -0.7\n", - " \n", - " \n", - " -4.6\n", - " \n", - " \n", - " -0.52\n", - " \n", - " \n", - " -7.7\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 5.8\n", - " \n", - "
\n", - " 11\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " 4.5\n", - " \n", - " \n", - " -2.2\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " 5.9\n", - " \n", - " \n", - " -0.49\n", - " \n", - " \n", - " 0.017\n", - " \n", - " \n", - " 5.8\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " -0.6\n", - " \n", - " \n", - " 0.49\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.5\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " -5.9\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " -8.2\n", - " \n", - " \n", - " -3.4\n", - " \n", - " \n", - " -2.2\n", - " \n", - " \n", - " -4.3\n", - " \n", - " \n", - " -1.2\n", - " \n", - " \n", - " -7.9\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " 5.3\n", - " \n", - " \n", - " 5.8\n", - " \n", - "
\n", - " 12\n", - " \n", - " \n", - " 3.2\n", - " \n", - " \n", - " 4.2\n", - " \n", - " \n", - " -3.1\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " 5.9\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " 0.33\n", - " \n", - " \n", - " 6.7\n", - " \n", - " \n", - " -2.8\n", - " \n", - " \n", - " -0.2\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " 2.6\n", - " \n", - " \n", - " -1.5\n", - " \n", - " \n", - " 0.75\n", - " \n", - " \n", - " -5.3\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " -7.6\n", - " \n", - " \n", - " -2.9\n", - " \n", - " \n", - " -2.2\n", - " \n", - " \n", - " -4.8\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -9\n", - " \n", - " \n", - " 2.1\n", - " \n", - " \n", - " 6.4\n", - " \n", - " \n", - " 5.6\n", - " \n", - "
\n", - " 13\n", - " \n", - " \n", - " 2.3\n", - " \n", - " \n", - " 4.5\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " -2\n", - " \n", - " \n", - " 6.8\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " -2.2\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -0.8\n", - " \n", - " \n", - " 0.71\n", - " \n", - " \n", - " 2.3\n", - " \n", - " \n", - " -0.16\n", - " \n", - " \n", - " -0.46\n", - " \n", - " \n", - " -5.1\n", - " \n", - " \n", - " -3.8\n", - " \n", - " \n", - " -7.6\n", - " \n", - " \n", - " -4\n", - " \n", - " \n", - " 0.33\n", - " \n", - " \n", - " -3.7\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " -8.7\n", - " \n", - " \n", - " 2.5\n", - " \n", - " \n", - " 5.9\n", - " \n", - " \n", - " 6.7\n", - " \n", - "
\n", - " 14\n", - " \n", - " \n", - " 3.8\n", - " \n", - " \n", - " 4.3\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 6.2\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " -1.5\n", - " \n", - " \n", - " 5.6\n", - " \n", - " \n", - " -2.9\n", - " \n", - " \n", - " -0.33\n", - " \n", - " \n", - " -0.97\n", - " \n", - " \n", - " 1.7\n", - " \n", - " \n", - " 3.6\n", - " \n", - " \n", - " 0.29\n", - " \n", - " \n", - " -4.2\n", - " \n", - " \n", - " -4.1\n", - " \n", - " \n", - " -6.7\n", - " \n", - " \n", - " -4.5\n", - " \n", - " \n", - " -2.2\n", - " \n", - " \n", - " -2.4\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " -9.4\n", - " \n", - " \n", - " 3.4\n", - " \n", - " \n", - " 6.1\n", - " \n", - " \n", - " 7.5\n", - " \n", - "
\n", - " 15\n", - " \n", - " \n", - " 5.6\n", - " \n", - " \n", - " 5.3\n", - " \n", - " \n", - " -4\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " 5.9\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " 5.7\n", - " \n", - " \n", - " -3.1\n", - " \n", - " \n", - " -0.33\n", - " \n", - " \n", - " -1.2\n", - " \n", - " \n", - " 2.2\n", - " \n", - " \n", - " 4.2\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " -4.3\n", - " \n", - " \n", - " -5.7\n", - " \n", - " \n", - " -4.4\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " -1.2\n", - " \n", - " \n", - " -11\n", - " \n", - " \n", - " 2.6\n", - " \n", - " \n", - " 6.7\n", - " \n", - " \n", - " 5.9\n", - " \n", - "
\n", - " 16\n", - " \n", - " \n", - " 4.1\n", - " \n", - " \n", - " 4.3\n", - " \n", - " \n", - " -2.4\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -2.5\n", - " \n", - " \n", - " -0.47\n", - " \n", - " \n", - " 5.3\n", - " \n", - " \n", - " -4.8\n", - " \n", - " \n", - " 1.6\n", - " \n", - " \n", - " 0.23\n", - " \n", - " \n", - " 0.099\n", - " \n", - " \n", - " 5.8\n", - " \n", - " \n", - " 1.8\n", - " \n", - " \n", - " -3.1\n", - " \n", - " \n", - " -3.9\n", - " \n", - " \n", - " -5.5\n", - " \n", - " \n", - " -3\n", - " \n", - " \n", - " -2.1\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -0.56\n", - " \n", - " \n", - " -13\n", - " \n", - " \n", - " 2.1\n", - " \n", - " \n", - " 6.2\n", - " \n", - " \n", - " 4.9\n", - " \n", - "
\n", - " 17\n", - " \n", - " \n", - " 5.6\n", - " \n", - " \n", - " 4.6\n", - " \n", - " \n", - " -3.5\n", - " \n", - " \n", - " -3.8\n", - " \n", - " \n", - " 6.6\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -0.75\n", - " \n", - " \n", - " 6.6\n", - " \n", - " \n", - " -4.8\n", - " \n", - " \n", - " 3.6\n", - " \n", - " \n", - " -0.29\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 5.8\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " -2.3\n", - " \n", - " \n", - " -5\n", - " \n", - " \n", - " -3.2\n", - " \n", - " \n", - " -3.1\n", - " \n", - " \n", - " -2.4\n", - " \n", - " \n", - " 0.84\n", - " \n", - " \n", - " -13\n", - " \n", - " \n", - " 3.6\n", - " \n", - " \n", - " 7.4\n", - " \n", - " \n", - " 4.7\n", - " \n", - "
\n", - " 18\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " 5.8\n", - " \n", - " \n", - " -2.8\n", - " \n", - " \n", - " -4.2\n", - " \n", - " \n", - " 7.1\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " -1.2\n", - " \n", - " \n", - " 7.9\n", - " \n", - " \n", - " -4.9\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -0.63\n", - " \n", - " \n", - " 0.35\n", - " \n", - " \n", - " 7.5\n", - " \n", - " \n", - " 0.87\n", - " \n", - " \n", - " -1.5\n", - " \n", - " \n", - " -2.1\n", - " \n", - " \n", - " -4.2\n", - " \n", - " \n", - " -2.5\n", - " \n", - " \n", - " -2.5\n", - " \n", - " \n", - " -2.9\n", - " \n", - " \n", - " 1.9\n", - " \n", - " \n", - " -9.7\n", - " \n", - " \n", - " 3.4\n", - " \n", - " \n", - " 7.1\n", - " \n", - " \n", - " 4.4\n", - " \n", - "
\n", - " 19\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 6.2\n", - " \n", - " \n", - " -4.1\n", - " \n", - " \n", - " -4.1\n", - " \n", - " \n", - " 7.2\n", - " \n", - " \n", - " -4.1\n", - " \n", - " \n", - " -1.5\n", - " \n", - " \n", - " 6.5\n", - " \n", - " \n", - " -5.2\n", - " \n", - " \n", - " -0.24\n", - " \n", - " \n", - " 0.0072\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " 6.4\n", - " \n", - " \n", - " -2\n", - " \n", - " \n", - " -2.6\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " -5.2\n", - " \n", - " \n", - " -3.3\n", - " \n", - " \n", - " -2.9\n", - " \n", - " \n", - " -1.7\n", - " \n", - " \n", - " 1.6\n", - " \n", - " \n", - " -11\n", - " \n", - " \n", - " 2.8\n", - " \n", - " \n", - " 7.5\n", - " \n", - " \n", - " 3.9\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "np.random.seed(25)\n", "cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)\n", @@ -18816,7 +940,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.5.1" } }, "nbformat": 4, diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 575a1f3ddcfc2..c8a8b8eb0547b 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -233,6 +233,7 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) - Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) +- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) .. _whatsnew_0182.api.tolist: diff --git a/pandas/formats/style.py b/pandas/formats/style.py index f66ac7485c76e..477ecccc03f4f 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -133,7 +133,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self._todo = [] if not isinstance(data, (pd.Series, pd.DataFrame)): - raise TypeError + raise TypeError("``data`` must be a Series or DataFrame") if data.ndim == 1: data = data.to_frame() if not data.index.is_unique or not data.columns.is_unique: @@ -427,11 +427,30 @@ def _compute(self): def _apply(self, func, axis=0, subset=None, **kwargs): subset = slice(None) if subset is None else subset subset = _non_reducing_slice(subset) + data = self.data.loc[subset] if axis is not None: - result = self.data.loc[subset].apply(func, axis=axis, **kwargs) + result = data.apply(func, axis=axis, **kwargs) else: - # like tee - result = func(self.data.loc[subset], **kwargs) + result = func(data, **kwargs) + if not isinstance(result, pd.DataFrame): + raise TypeError( + "Function {!r} must return a DataFrame when " + "passed to `Styler.apply` with axis=None".format(func)) + if not (result.index.equals(data.index) and + result.columns.equals(data.columns)): + msg = ('Result of {!r} must have identical index and columns ' + 'as the input'.format(func)) + raise ValueError(msg) + + result_shape = result.shape + expected_shape = self.data.loc[subset].shape + if result_shape != expected_shape: + msg = ("Function {!r} returned the wrong shape.\n" + "Result has shape: {}\n" + "Expected shape: {}".format(func, + result.shape, + expected_shape)) + raise ValueError(msg) self._update_ctx(result) return self @@ -444,15 +463,19 @@ def apply(self, func, axis=0, subset=None, **kwargs): Parameters ---------- - func: function - axis: int, str or None + func : function + ``func`` should take a Series or DataFrame (depending + on ``axis``), and return an object with the same shape. + Must return a DataFrame with identical index and + column labels when ``axis=None`` + axis : int, str or None apply to each column (``axis=0`` or ``'index'``) or to each row (``axis=1`` or ``'columns'``) or - to the entire DataFrame at once with ``axis=None``. 
- subset: IndexSlice + to the entire DataFrame at once with ``axis=None`` + subset : IndexSlice a valid indexer to limit ``data`` to *before* applying the function. Consider using a pandas.IndexSlice - kwargs: dict + kwargs : dict pass along to ``func`` Returns @@ -461,9 +484,22 @@ def apply(self, func, axis=0, subset=None, **kwargs): Notes ----- + The output shape of ``func`` should match the input, i.e. if + ``x`` is the input row, column, or table (depending on ``axis``), + then ``func(x.shape) == x.shape`` should be true. + This is similar to ``DataFrame.apply``, except that ``axis=None`` applies the function to the entire DataFrame at once, rather than column-wise or row-wise. + + Examples + -------- + >>> def highlight_max(x): + ... return ['background-color: yellow' if v == x.max() else '' + for v in x] + ... + >>> df = pd.DataFrame(np.random.randn(5, 2)) + >>> df.style.apply(highlight_max) """ self._todo.append((lambda instance: getattr(instance, '_apply'), (func, axis, subset), kwargs)) @@ -488,6 +524,7 @@ def applymap(self, func, subset=None, **kwargs): Parameters ---------- func : function + ``func`` should take a scalar and return a scalar subset : IndexSlice a valid indexer to limit ``data`` to *before* applying the function. Consider using a pandas.IndexSlice @@ -742,6 +779,7 @@ def set_properties(self, subset=None, **kwargs): -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) >>> df.style.set_properties(color="white", align="right") + >>> df.style.set_properties(**{'background-color': 'yellow'}) """ values = ';'.join('{p}: {v}'.format(p=p, v=v) for p, v in kwargs.items()) diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py index 5a79e3f6897f0..9a34f545bd119 100644 --- a/pandas/tests/formats/test_style.py +++ b/pandas/tests/formats/test_style.py @@ -46,6 +46,17 @@ def h(x, foo='bar'): 'c': pd.Categorical(['a', 'b'])}) ] + def test_init_non_pandas(self): + with tm.assertRaises(TypeError): + Styler([1, 2, 3]) + + def test_init_series(self): + result = Styler(pd.Series([1, 2])) + self.assertEqual(result.data.ndim, 2) + + def test_repr_html_ok(self): + self.styler._repr_html_() + def test_update_ctx(self): self.styler._update_ctx(self.attrs) expected = {(0, 0): ['color: red'], @@ -102,7 +113,7 @@ def test_clear(self): def test_render(self): df = pd.DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) - s = Styler(df, uuid='AB').apply(style).apply(style, axis=1) + s = Styler(df, uuid='AB').apply(style) s.render() # it worked? 
@@ -539,6 +550,37 @@ def test_display_dict(self): self.assertEqual(ctx['body'][0][1]['display_value'], '0.1') self.assertEqual(ctx['body'][0][3]['display_value'], 'AAA') + def test_bad_apply_shape(self): + df = pd.DataFrame([[1, 2], [3, 4]]) + with tm.assertRaises(ValueError): + df.style._apply(lambda x: 'x', subset=pd.IndexSlice[[0, 1], :]) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: [''], subset=pd.IndexSlice[[0, 1], :]) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: ['', '', '', '']) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: ['', '', ''], subset=1) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: ['', '', ''], axis=1) + + def test_apply_bad_return(self): + def f(x): + return '' + df = pd.DataFrame([[1, 2], [3, 4]]) + with tm.assertRaises(TypeError): + df.style._apply(f, axis=None) + + def test_apply_bad_labels(self): + def f(x): + return pd.DataFrame(index=[1, 2], columns=['a', 'b']) + df = pd.DataFrame([[1, 2], [3, 4]]) + with tm.assertRaises(ValueError): + df.style._apply(f, axis=None) + @tm.mplskip class TestStylerMatplotlibDep(TestCase): From b01e07e3395f7cee5a9922af979ea41aa40ff892 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 19 Jun 2016 09:31:22 -0500 Subject: [PATCH 045/359] DOC: add nbformat for notebook conversion The build [here](https://travis- ci.org/pydata/pandas/jobs/138633677#L1866) didn't succeed. Had `nbconvert` in the debs, but not `nbformat`. Author: Tom Augspurger Closes #13487 from TomAugspurger/travis-doc-html-fu and squashes the following commits: 9c74612 [Tom Augspurger] DOC: add nbformat for notebook conversion --- ci/requirements-2.7_DOC_BUILD.run | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index 854776762fdb5..507ce9ea5aac5 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,6 +1,7 @@ ipython sphinx nbconvert +nbformat matplotlib scipy lxml From 3b75e03e1bf9798c56f1d20cbc5a5c6c0d560979 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 Jun 2016 09:34:09 -0500 Subject: [PATCH 046/359] DOC: specify correct kernelspec for converting notebooks (#13491) --- doc/make.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/make.py b/doc/make.py index 05bf618ee677e..8e7b1d95dbafb 100755 --- a/doc/make.py +++ b/doc/make.py @@ -116,6 +116,11 @@ def cleanup_nb(nb): pass +def get_kernel(): + """Find the kernel name for your python version""" + return 'python%s' % sys.version_info.major + + def execute_nb(src, dst, allow_errors=False, timeout=1000, kernel_name=''): """ Execute notebook in `src` and write the output to `dst` @@ -184,10 +189,12 @@ def html(): with cleanup_nb(nb): try: print("Converting %s" % nb) - executed = execute_nb(nb, nb + '.executed', allow_errors=True) + kernel_name = get_kernel() + executed = execute_nb(nb, nb + '.executed', allow_errors=True, + kernel_name=kernel_name) convert_nb(executed, nb.rstrip('.ipynb') + '.html') - except ImportError: - pass + except (ImportError, IndexError): + print("Failed to convert %s" % nb) if os.system('sphinx-build -P -b html -d build/doctrees ' 'source build/html'): @@ -199,6 +206,7 @@ def html(): except: pass + def zip_html(): try: print("\nZipping up HTML docs...") From 0f351dc475eaa5fdddf1ecd24e70ac0114e58e9b Mon Sep 17 00:00:00 2001 From: priyankjain Date: Tue, 21 Jun 2016 06:06:02 -0400 Subject: [PATCH 047/359] BUG: Rolling negative window issue fix #13383 closes 
#13383 Author: priyankjain Closes #13441 from priyankjain/novice-bug-fixes and squashes the following commits: 26c9b2d [priyankjain] BUG: Rolling negative window issue fix #13383 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/window.py | 4 ++++ pandas/tests/test_window.py | 14 ++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index c8a8b8eb0547b..91425aff6fe7a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -480,6 +480,7 @@ Bug Fixes - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) - Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) +- Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) diff --git a/pandas/core/window.py b/pandas/core/window.py index fbc56335aabd9..1e34d18fe3e54 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -321,6 +321,8 @@ def validate(self): if isinstance(window, (list, tuple, np.ndarray)): pass elif com.is_integer(window): + if window < 0: + raise ValueError("window must be non-negative") try: import scipy.signal as sig except ImportError: @@ -850,6 +852,8 @@ def validate(self): super(Rolling, self).validate() if not com.is_integer(self.window): raise ValueError("window must be an integer") + elif self.window < 0: + raise ValueError("window must be non-negative") @Substitution(name='rolling') @Appender(SelectionMixin._see_also_template) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 2ec419221c6d8..3693ebdb12e2f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -331,6 +331,11 @@ def test_constructor(self): c(window=2, min_periods=1, center=True) c(window=2, min_periods=1, center=False) + # GH 13383 + c(0) + with self.assertRaises(ValueError): + c(-1) + # not valid for w in [2., 'foo', np.array([2])]: with self.assertRaises(ValueError): @@ -340,6 +345,15 @@ def test_constructor(self): with self.assertRaises(ValueError): c(window=2, min_periods=1, center=w) + def test_constructor_with_win_type(self): + # GH 13383 + tm._skip_if_no_scipy() + for o in [self.series, self.frame]: + c = o.rolling + c(0, win_type='boxcar') + with self.assertRaises(ValueError): + c(-1, win_type='boxcar') + def test_numpy_compat(self): # see gh-12811 r = rwindow.Rolling(Series([2, 4, 6]), window=2) From 1a12eadaf92eb1ee05446d24e5d7a12cde971ce3 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Nimmi Date: Tue, 21 Jun 2016 08:40:46 -0400 Subject: [PATCH 048/359] BUG: is_normalized returned False for local tz closes #13459 Author: Ravi Kumar Nimmi Closes #13484 from ravinimmi/bugfix and squashes the following commits: 4d48367 [Ravi Kumar Nimmi] BUG: is_normalized returned False for local tz --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/tests/test_pytables.py | 36 ++++---------------- pandas/tseries/tests/test_timezones.py | 22 +++++++++++- pandas/tslib.pyx | 4 +-- pandas/util/testing.py | 47 ++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 34 
deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 91425aff6fe7a..c207946d0bf19 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -485,6 +485,7 @@ Bug Fixes - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) +- Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) - Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9c13162bd774c..ab5362da21a7d 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -34,7 +34,8 @@ assert_panel_equal, assert_frame_equal, assert_series_equal, - assert_produces_warning) + assert_produces_warning, + set_timezone) from pandas import concat, Timestamp from pandas import compat from pandas.compat import range, lrange, u @@ -5309,14 +5310,6 @@ def test_store_timezone(self): # issue storing datetime.date with a timezone as it resets when read # back in a new timezone - import platform - if platform.system() == "Windows": - raise nose.SkipTest("timezone setting not supported on windows") - - import datetime - import time - import os - # original method with ensure_clean_store(self.path) as store: @@ -5327,34 +5320,17 @@ def test_store_timezone(self): assert_frame_equal(result, df) # with tz setting - orig_tz = os.environ.get('TZ') - - def setTZ(tz): - if tz is None: - try: - del os.environ['TZ'] - except: - pass - else: - os.environ['TZ'] = tz - time.tzset() - - try: - - with ensure_clean_store(self.path) as store: + with ensure_clean_store(self.path) as store: - setTZ('EST5EDT') + with set_timezone('EST5EDT'): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) store['obj1'] = df - setTZ('CST6CDT') + with set_timezone('CST6CDT'): result = store['obj1'] - assert_frame_equal(result, df) - - finally: - setTZ(orig_tz) + assert_frame_equal(result, df) def test_legacy_datetimetz_object(self): # legacy from < 0.17.0 diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index afe9d0652db19..d68ff793c9b6a 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -18,7 +18,7 @@ import pandas.util.testing as tm from pandas.types.api import DatetimeTZDtype -from pandas.util.testing import assert_frame_equal +from pandas.util.testing import assert_frame_equal, set_timezone from pandas.compat import lrange, zip try: @@ -1398,6 +1398,26 @@ def test_normalize_tz(self): self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) + def test_normalize_tz_local(self): + # GH 13459 + from dateutil.tz import tzlocal + + timezones = ['US/Pacific', 'US/Eastern', 'UTC', 'Asia/Kolkata', + 'Asia/Shanghai', 'Australia/Canberra'] + + for timezone in timezones: + with set_timezone(timezone): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz=tzlocal()) + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz=tzlocal()) + self.assert_index_equal(result, expected) + + 
self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + def test_tzaware_offset(self): dates = date_range('2012-11-01', periods=3, tz='US/Pacific') offset = dates + offsets.Hour(5) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 7de62fbe71615..8837881af0b6c 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -4810,12 +4810,10 @@ def dates_normalized(ndarray[int64_t] stamps, tz=None): elif _is_tzlocal(tz): for i in range(n): pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) - if (dts.min + dts.sec + dts.us) > 0: - return False dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) dt = dt + tz.utcoffset(dt) - if dt.hour > 0: + if (dt.hour + dt.minute + dt.second + dt.microsecond) > 0: return False else: trans, deltas, typ = _get_dst_info(tz) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 8c4d2f838ee8d..2961b2fb2241f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2667,3 +2667,50 @@ def patch(ob, attr, value): delattr(ob, attr) else: setattr(ob, attr, old) + + +@contextmanager +def set_timezone(tz): + """Context manager for temporarily setting a timezone. + + Parameters + ---------- + tz : str + A string representing a valid timezone. + + Examples + -------- + + >>> from datetime import datetime + >>> from dateutil.tz import tzlocal + >>> tzlocal().tzname(datetime.now()) + 'IST' + + >>> with set_timezone('US/Eastern'): + ... tzlocal().tzname(datetime.now()) + ... + 'EDT' + """ + if is_platform_windows(): + import nose + raise nose.SkipTest("timezone setting not supported on windows") + + import os + import time + + def setTZ(tz): + if tz is None: + try: + del os.environ['TZ'] + except: + pass + else: + os.environ['TZ'] = tz + time.tzset() + + orig_tz = os.environ.get('TZ') + setTZ(tz) + try: + yield + finally: + setTZ(orig_tz) From 41f12161e24c44e4ff8088ff614bafe055015651 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 22 Jun 2016 06:05:01 -0400 Subject: [PATCH 049/359] TST: Fix MMapWrapper init test for Windows Turns out Windows errors differently when an invalid `fileno` is passed into the `mmap` constructor, so there's no need to skip the test (xref: 9670b31). 
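A rough sketch of the platform-dependent expectation (mirroring the updated test in the diff below; the import locations and exact error strings are assumptions and may vary by OS version and locale):

```python
import mmap

import pandas.util.testing as tm
from pandas.compat import StringIO, is_platform_windows
from pandas.io import common

non_file = StringIO('I am not a file')
non_file.fileno = lambda: -1  # hand MMapWrapper an invalid file descriptor

if is_platform_windows():
    # Windows rejects the bad descriptor with an OSError
    err, msg = OSError, "The parameter is incorrect"
else:
    # POSIX platforms raise mmap.error (EINVAL); the message text is locale-dependent
    err, msg = mmap.error, "Invalid argument"

tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file)
```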
Author: gfyoung Closes #13494 from gfyoung/mmap-wrapper-init-test-fix and squashes the following commits: db7c6b1 [gfyoung] TST: Fix MMapWrapper init test for Windows --- pandas/io/tests/test_common.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 46c34abf5aeb7..cf5ec7d911051 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -1,7 +1,6 @@ """ Tests for the pandas.io.common functionalities """ -import nose import mmap import os from os.path import isabs @@ -98,15 +97,18 @@ def setUp(self): 'test_mmap.csv') def test_constructor_bad_file(self): - if is_platform_windows(): - raise nose.SkipTest("skipping construction error messages " - "tests on windows") - non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 - msg = "Invalid argument" - tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file) + # the error raised is different on Windows + if is_platform_windows(): + msg = "The parameter is incorrect" + err = OSError + else: + msg = "Invalid argument" + err = mmap.error + + tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file) target = open(self.mmap_file, 'r') target.close() From 30d710f4c8a07cb7ea3bc91f6eb05c4bbdfa2f24 Mon Sep 17 00:00:00 2001 From: Matthieu Brucher Date: Wed, 22 Jun 2016 06:20:25 -0400 Subject: [PATCH 050/359] BUG: windows with TemporaryFile an read_csv #13398 dcloses #13398 Author: Matthieu Brucher Closes #13481 from mbrucher/issue-13398 and squashes the following commits: 8b52631 [Matthieu Brucher] Yet another small update for more general regex 0d54151 [Matthieu Brucher] Simplified 5871625 [Matthieu Brucher] Grammar aa3f0aa [Matthieu Brucher] lint change 1c33fb5 [Matthieu Brucher] Simplified test and added what's new note. d8ceb57 [Matthieu Brucher] lint changes fd20aaf [Matthieu Brucher] Moved the test to the Python parser test file 98e476e [Matthieu Brucher] Using same way of referencing as just above, consistency. 
119fb65 [Matthieu Brucher] Added reference to original issue in the test + test the result itself (assuming that previous test is OK) 5af8465 [Matthieu Brucher] Adding a test with Python engine d8decae [Matthieu Brucher] #13398 Change the way of reading back to readline (consistent with the test before entering the function) --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 2 +- pandas/io/tests/parser/python_parser_only.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index c207946d0bf19..40fec4d071f16 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -496,6 +496,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a tempfile.TemporaryFile on Windows with Python 3 (:issue:`13398`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9baff67845dac..dc9455289b757 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1868,7 +1868,7 @@ class MyDialect(csv.Dialect): else: def _read(): - line = next(f) + line = f.readline() pat = re.compile(sep) yield pat.split(line.strip()) for line in f: diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index a08cb36c13f80..6f0ea75c4da93 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -171,3 +171,17 @@ def test_read_table_buglet_4x_multiindex(self): columns=list('abcABC'), index=list('abc')) actual = self.read_table(StringIO(data), sep='\s+') tm.assert_frame_equal(actual, expected) + + def test_temporary_file(self): + # GH13398 + data1 = "0 0" + + from tempfile import TemporaryFile + new_file = TemporaryFile("w+") + new_file.write(data1) + new_file.flush() + new_file.seek(0) + + result = self.read_csv(new_file, sep=r"\s*", header=None) + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) From 48f0c3a6a413dbb7bac825c37c0ef21580435b14 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 23 Jun 2016 08:28:02 -0500 Subject: [PATCH 051/359] DOC/BLD: Travis doc build notebook dependencies (#13493) Added notebook deps for travis doc build. 
--- ci/requirements-2.7_DOC_BUILD.run | 2 ++ doc/make.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index 507ce9ea5aac5..b87a41df4191d 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,7 +1,9 @@ ipython +ipykernel sphinx nbconvert nbformat +notebook matplotlib scipy lxml diff --git a/doc/make.py b/doc/make.py index 8e7b1d95dbafb..d46be2611ce3d 100755 --- a/doc/make.py +++ b/doc/make.py @@ -193,7 +193,8 @@ def html(): executed = execute_nb(nb, nb + '.executed', allow_errors=True, kernel_name=kernel_name) convert_nb(executed, nb.rstrip('.ipynb') + '.html') - except (ImportError, IndexError): + except (ImportError, IndexError) as e: + print(e) print("Failed to convert %s" % nb) if os.system('sphinx-build -P -b html -d build/doctrees ' From 01e3de4a2cb730afa8fca2d4dbc00b331c95c1c2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jun 2016 17:37:45 +0200 Subject: [PATCH 052/359] DOC: fix accessor docs for sphinx > 1.3 (GH12161) (#13499) --- doc/source/conf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 87510d13ee484..6ceeee4ad6afb 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -318,6 +318,7 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) +import sphinx from sphinx.util import rpartition from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter from sphinx.ext.autosummary import Autosummary @@ -365,7 +366,10 @@ def resolve_name(self, modname, parents, path, base): if not modname: modname = self.env.temp_data.get('autodoc:module') if not modname: - modname = self.env.temp_data.get('py:module') + if sphinx.__version__ > '1.3': + modname = self.env.ref_context.get('py:module') + else: + modname = self.env.temp_data.get('py:module') # ... else, it stays None, which means invalid return modname, parents + [base] From f42283d2f66fe0fca821b9dee6c1b42ee0726e20 Mon Sep 17 00:00:00 2001 From: Neil Parley Date: Thu, 23 Jun 2016 19:59:10 -0400 Subject: [PATCH 053/359] COMPAT: mmap error is not always returned in English Fixes a build error from https://github.com/pydata/pandas/pull/12946 caused by mmap error being returned in Italian when `LOCALE_OVERRIDE="it_IT.UTF-8"`. The test fails with: `AssertionError: "Invalid argument" does not match "[Errno 22] Argomento non valido"` ```python msg = "Invalid argument" tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file) ``` i.e. message is not being matched. Change to match the errno instead as that's the same across languages. 
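A locale-independent version of that check can match on the errno instead, which is never translated (this sketch mirrors the one-line change in the diff below):

```python
# the errno prefix ("[Errno 22]") is the same in every locale,
# so match on it rather than on the translated message text
msg = "[Errno 22]"
err = mmap.error
tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file)
```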
Author: Neil Parley Closes #13507 from nparley/mmap-test-fix and squashes the following commits: 160af24 [Neil Parley] mmap error is not always returned in English --- pandas/io/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index cf5ec7d911051..5740944558a5d 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -105,7 +105,7 @@ def test_constructor_bad_file(self): msg = "The parameter is incorrect" err = OSError else: - msg = "Invalid argument" + msg = "[Errno 22]" err = mmap.error tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file) From ab116a7139c2c377555650b001b461a2b1eaf15c Mon Sep 17 00:00:00 2001 From: Neil Parley Date: Thu, 23 Jun 2016 20:00:13 -0400 Subject: [PATCH 054/359] BUG: Travis building on container-based infrastructure closes #10598 Author: Neil Parley Closes #12946 from nparley/travis and squashes the following commits: --- .travis.yml | 199 +++++++++++++++++++++++++--------- ci/before_install_travis.sh | 2 - ci/check_cache.sh | 25 +++++ ci/install-2.7_NUMPY_DEV.sh | 2 - ci/install-3.5_NUMPY_DEV.sh | 2 - ci/install_travis.sh | 105 ++++++++---------- ci/prep_ccache.sh | 52 --------- ci/prep_cython_cache.sh | 42 +++++++ ci/submit_ccache.sh | 39 ------- ci/submit_cython_cache.sh | 23 ++++ pandas/util/print_versions.py | 2 + 11 files changed, 283 insertions(+), 210 deletions(-) create mode 100755 ci/check_cache.sh delete mode 100755 ci/prep_ccache.sh create mode 100755 ci/prep_cython_cache.sh delete mode 100755 ci/submit_ccache.sh create mode 100755 ci/submit_cython_cache.sh diff --git a/.travis.yml b/.travis.yml index 5a16c1a6c25e7..b909a1f980d6d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,24 @@ - +sudo: false language: python -env: +# To turn off cached miniconda, cython files and compiler cache comment out the +# USE_CACHE=true line for the build in the matrix below. 
To delete caches go to +# https://travis-ci.org/OWNER/REPOSITORY/caches or run +# travis cache --delete inside the project directory from the travis command line client +# The cash directories will be deleted if anything in ci/ changes in a commit +cache: + directories: + - $HOME/miniconda # miniconda cache + - $HOME/.cache # cython cache + - $HOME/.ccache # compiler cache +env: global: # scatterci API key #- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ=" # ironcache API key - - secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" - - secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" + #- secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" + #- secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" # pandas-docs-bot GH - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" @@ -29,72 +39,129 @@ matrix: - BUILD_TYPE=conda - JOB_TAG=_OSX - TRAVIS_PYTHON_VERSION=3.5 + - CACHE_NAME="35_osx" + - USE_CACHE=true - python: 2.7 env: - - JOB_NAME: "27_slow_nnet_LOCALE" - - NOSE_ARGS="slow and not network and not disabled" - - LOCALE_OVERRIDE="zh_CN.GB18030" - - FULL_DEPS=true - - JOB_TAG=_LOCALE + - JOB_NAME: "27_slow_nnet_LOCALE" + - NOSE_ARGS="slow and not network and not disabled" + - LOCALE_OVERRIDE="zh_CN.UTF-8" + - FULL_DEPS=true + - JOB_TAG=_LOCALE + - CACHE_NAME="27_slow_nnet_LOCALE" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-zh-hans - python: 2.7 env: - - JOB_NAME: "27_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD_GUI=gtk2 - - LINT=true + - JOB_NAME: "27_nslow" + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - LINT=true + - CACHE_NAME="27_nslow" + - USE_CACHE=true + addons: + apt: + packages: + - python-gtk2 - python: 3.4 env: - - JOB_NAME: "34_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel + - JOB_NAME: "34_nslow" + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - CACHE_NAME="34_nslow" + - USE_CACHE=true + addons: + apt: + packages: + - xsel - python: 3.5 env: - - JOB_NAME: "35_nslow" - - NOSE_ARGS="not slow and not network and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - COVERAGE=true + - JOB_NAME: "35_nslow" + - NOSE_ARGS="not slow and not network and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - COVERAGE=true + - CACHE_NAME="35_nslow" +# - USE_CACHE=true # Don't use cache for 35_nslow + addons: + apt: + packages: + - xsel +# In allow_failures - python: 2.7 env: - - JOB_NAME: "27_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true + - JOB_NAME: "27_slow" + - JOB_TAG=_SLOW + - NOSE_ARGS="slow and not network and not disabled" + - FULL_DEPS=true + - CACHE_NAME="27_slow" + - 
USE_CACHE=true +# In allow_failures - python: 3.4 env: - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel + - JOB_NAME: "34_slow" + - JOB_TAG=_SLOW + - NOSE_ARGS="slow and not network and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - CACHE_NAME="34_slow" + - USE_CACHE=true + addons: + apt: + packages: + - xsel +# In allow_failures - python: 2.7 env: - - JOB_NAME: "27_build_test_conda" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - BUILD_TEST=true + - JOB_NAME: "27_build_test_conda" + - JOB_TAG=_BUILD_TEST + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - BUILD_TEST=true + - CACHE_NAME="27_build_test_conda" + - USE_CACHE=true +# In allow_failures - python: 3.5 env: - - JOB_NAME: "35_numpy_dev" - - JOB_TAG=_NUMPY_DEV - - NOSE_ARGS="not slow and not network and not disabled" - - PANDAS_TESTING_MODE="deprecate" + - JOB_NAME: "35_numpy_dev" + - JOB_TAG=_NUMPY_DEV + - NOSE_ARGS="not slow and not network and not disabled" + - PANDAS_TESTING_MODE="deprecate" + - CACHE_NAME="35_numpy_dev" + - USE_CACHE=true + addons: + apt: + packages: + - libatlas-base-dev + - gfortran +# In allow_failures - python: 2.7 env: - - JOB_NAME: "27_nslow_nnet_COMPAT" - - NOSE_ARGS="not slow and not network and not disabled" - - LOCALE_OVERRIDE="it_IT.UTF-8" - - INSTALL_TEST=true - - JOB_TAG=_COMPAT + - JOB_NAME: "27_nslow_nnet_COMPAT" + - NOSE_ARGS="not slow and not network and not disabled" + - LOCALE_OVERRIDE="it_IT.UTF-8" + - INSTALL_TEST=true + - JOB_TAG=_COMPAT + - CACHE_NAME="27_nslow_nnet_COMPAT" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-it +# In allow_failures - python: 2.7 env: - - JOB_NAME: "doc_build" - - FULL_DEPS=true - - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests - - JOB_TAG=_DOC_BUILD + - JOB_NAME: "doc_build" + - FULL_DEPS=true + - DOC_BUILD=true + - JOB_TAG=_DOC_BUILD + - CACHE_NAME="doc_build" + - USE_CACHE=true allow_failures: - python: 2.7 env: @@ -102,6 +169,8 @@ matrix: - JOB_TAG=_SLOW - NOSE_ARGS="slow and not network and not disabled" - FULL_DEPS=true + - CACHE_NAME="27_slow" + - USE_CACHE=true - python: 3.4 env: - JOB_NAME: "34_slow" @@ -109,6 +178,12 @@ matrix: - NOSE_ARGS="slow and not network and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel + - CACHE_NAME="34_slow" + - USE_CACHE=true + addons: + apt: + packages: + - xsel - python: 2.7 env: - JOB_NAME: "27_build_test_conda" @@ -116,12 +191,21 @@ matrix: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - BUILD_TEST=true + - CACHE_NAME="27_build_test_conda" + - USE_CACHE=true - python: 3.5 env: - JOB_NAME: "35_numpy_dev" - JOB_TAG=_NUMPY_DEV - NOSE_ARGS="not slow and not network and not disabled" - PANDAS_TESTING_MODE="deprecate" + - CACHE_NAME="35_numpy_dev" + - USE_CACHE=true + addons: + apt: + packages: + - libatlas-base-dev + - gfortran - python: 2.7 env: - JOB_NAME: "27_nslow_nnet_COMPAT" @@ -129,12 +213,20 @@ matrix: - LOCALE_OVERRIDE="it_IT.UTF-8" - INSTALL_TEST=true - JOB_TAG=_COMPAT + - CACHE_NAME="27_nslow_nnet_COMPAT" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-it - python: 2.7 env: - JOB_NAME: "doc_build" - FULL_DEPS=true - DOC_BUILD=true - JOB_TAG=_DOC_BUILD + - CACHE_NAME="doc_build" + - USE_CACHE=true before_install: - echo "before_install" @@ -153,9 +245,10 @@ before_install: install: - echo "install start" - - ci/prep_ccache.sh + - ci/check_cache.sh + - ci/prep_cython_cache.sh - 
ci/install_travis.sh - - ci/submit_ccache.sh + - ci/submit_cython_cache.sh - echo "install done" before_script: @@ -175,6 +268,6 @@ after_success: after_script: - echo "after_script start" - ci/install_test.sh - - source activate pandas && ci/print_versions.py + - source activate pandas && python -c "import pandas; pandas.show_versions();" - ci/print_skipped.py /tmp/nosetests.xml - echo "after_script done" diff --git a/ci/before_install_travis.sh b/ci/before_install_travis.sh index 76775ecbc78f0..f90427f97d3b7 100755 --- a/ci/before_install_travis.sh +++ b/ci/before_install_travis.sh @@ -9,8 +9,6 @@ echo "inside $0" # overview if [ "${TRAVIS_OS_NAME}" == "linux" ]; then - sudo apt-get update $APT_ARGS # run apt-get update for all versions - sh -e /etc/init.d/xvfb start fi diff --git a/ci/check_cache.sh b/ci/check_cache.sh new file mode 100755 index 0000000000000..cd7a6e8f6b6f9 --- /dev/null +++ b/ci/check_cache.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +if [ "$TRAVIS_PULL_REQUEST" == "false" ] +then + echo "Not a PR: checking for changes in ci/ from last 2 commits" + git diff HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l) +else + echo "PR: checking for changes in ci/ from last 2 commits" + git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD + git diff PR_HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) +fi + +MINICONDA_DIR="$HOME/miniconda/" +CACHE_DIR="$HOME/.cache/" +CCACHE_DIR="$HOME/.ccache/" + +if [ $ci_changes -ne 0 ] +then + echo "Files have changed in ci/ deleting all caches" + rm -rf "$MINICONDA_DIR" + rm -rf "$CACHE_DIR" + rm -rf "$CCACHE_DIR" +fi \ No newline at end of file diff --git a/ci/install-2.7_NUMPY_DEV.sh b/ci/install-2.7_NUMPY_DEV.sh index 00b6255daf70f..22ac8f6547879 100644 --- a/ci/install-2.7_NUMPY_DEV.sh +++ b/ci/install-2.7_NUMPY_DEV.sh @@ -12,8 +12,6 @@ pip uninstall numpy -y # these wheels don't play nice with the conda libgfortran / openblas # time conda install -n pandas libgfortran openblas || exit 1 -time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran - # install numpy wheel from master pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy diff --git a/ci/install-3.5_NUMPY_DEV.sh b/ci/install-3.5_NUMPY_DEV.sh index ecb07ca23c667..946ec43ad9f1a 100644 --- a/ci/install-3.5_NUMPY_DEV.sh +++ b/ci/install-3.5_NUMPY_DEV.sh @@ -12,8 +12,6 @@ pip uninstall numpy -y # these wheels don't play nice with the conda libgfortran / openblas # time conda install -n pandas libgfortran openblas || exit 1 -time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran - # install numpy wheel from master pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy scipy diff --git a/ci/install_travis.sh b/ci/install_travis.sh index b490699460622..3d9651d4f579b 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -28,73 +28,68 @@ function edit_init() edit_init -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" - home_dir=$(pwd) echo "home_dir: [$home_dir]" -if [ -n "$LOCALE_OVERRIDE" ]; then - # make sure the locale is available - # probably useless, since you would need to relogin - time sudo locale-gen "$LOCALE_OVERRIDE" - - # Need to enable for locale testing. The location of the locale file(s) is - # distro specific. 
For example, on Arch Linux all of the locales are in a - # commented file--/etc/locale.gen--that must be commented in to be used - # whereas Ubuntu looks in /var/lib/locales/supported.d/* and generates locales - # based on what's in the files in that folder - time echo 'it_CH.UTF-8 UTF-8' | sudo tee -a /var/lib/locales/supported.d/it - time sudo locale-gen +python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" +[ "$python_major_version" == "2" ] && python_major_version="" -fi +MINICONDA_DIR="$HOME/miniconda" -# install gui for clipboard testing -if [ -n "$CLIPBOARD_GUI" ]; then - echo "Using CLIPBOARD_GUI: $CLIPBOARD_GUI" - [ -n "$python_major_version" ] && py="py" - python_cb_gui_pkg=python${python_major_version}-${py}${CLIPBOARD_GUI} - time sudo apt-get $APT_ARGS install $python_cb_gui_pkg -fi +if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE" ]; then + echo "Miniconda install already present from cache: $MINICONDA_DIR" + conda config --set always_yes yes --set changeps1 no || exit 1 + echo "update conda" + conda update -q conda || exit 1 -# install a clipboard if $CLIPBOARD is not empty -if [ -n "$CLIPBOARD" ]; then - echo "Using clipboard: $CLIPBOARD" - time sudo apt-get $APT_ARGS install $CLIPBOARD -fi + # Useful for debugging any issues with conda + conda info -a || exit 1 -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" + # set the compiler cache to work + if [ "${TRAVIS_OS_NAME}" == "linux" ]; then + echo "Using ccache" + export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH + gcc=$(which gcc) + echo "gcc: $gcc" + ccache=$(which ccache) + echo "ccache: $ccache" + export CC='ccache gcc' + fi -# install miniconda -echo "install miniconda" -if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 else - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 -fi -bash miniconda.sh -b -p $HOME/miniconda || exit 1 + echo "Using clean Miniconda install" + echo "Not using ccache" + rm -rf "$MINICONDA_DIR" + # install miniconda + if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + else + wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + fi + bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 -echo "update conda" -conda config --set ssl_verify false || exit 1 -conda config --set always_yes true --set changeps1 false || exit 1 -conda update -q conda + echo "update conda" + conda config --set ssl_verify false || exit 1 + conda config --set always_yes true --set changeps1 false || exit 1 + conda update -q conda -# add the pandas channel *before* defaults to have defaults take priority -echo "add channels" -conda config --add channels pandas || exit 1 -conda config --remove channels defaults || exit 1 -conda config --add channels defaults || exit 1 + # add the pandas channel *before* defaults to have defaults take priority + echo "add channels" + conda config --add channels pandas || exit 1 + conda config --remove channels defaults || exit 1 + conda config --add channels defaults || exit 1 -conda install anaconda-client + conda install anaconda-client -# Useful for debugging any issues with conda -conda info -a || exit 1 + # Useful for debugging any issues with conda + conda info -a || exit 1 + time conda create -n pandas 
python=$TRAVIS_PYTHON_VERSION nose coverage flake8 || exit 1 + +fi # build deps REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.build" -time conda create -n pandas python=$TRAVIS_PYTHON_VERSION nose coverage flake8 || exit 1 # may have additional installation instructions for this build INSTALL="ci/install-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.sh" @@ -107,16 +102,6 @@ time conda install -n pandas --file=${REQ} || exit 1 source activate pandas -# set the compiler cache to work -if [[ "$IRON_TOKEN" && "${TRAVIS_OS_NAME}" == "linux" ]]; then - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - ccache=$(which ccache) - echo "ccache: $ccache" - export CC='ccache gcc' -fi - if [ "$BUILD_TEST" ]; then # build testing diff --git a/ci/prep_ccache.sh b/ci/prep_ccache.sh deleted file mode 100755 index 7e586cc4d3085..0000000000000 --- a/ci/prep_ccache.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -if [ "${TRAVIS_OS_NAME}" != "linux" ]; then - echo "not using ccache on non-linux" - exit 0 -fi - -if [ "$IRON_TOKEN" ]; then - - home_dir=$(pwd) - - # install the compiler cache - sudo apt-get $APT_ARGS install ccache p7zip-full - # iron_cache, pending py3 fixes upstream - pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache - - python ci/ironcache/get.py - ccache -C - - clear_cache=0 - if [ -f ~/ccache.7z ]; then - echo "Cache retrieved" - clear_cache=1 - cd $HOME - 7za e $HOME/ccache.7z - # ls -l $HOME - cd / - tar xvf $HOME/ccache - rm -rf $HOME/ccache.7z - rm -rf $HOME/ccache - - fi - - # did the last commit change cython files? - cd $home_dir - - retval=$(git diff HEAD~3 --numstat | grep -P "pyx|pxd"|wc -l) - echo "number of cython files changed: $retval" - - if [ $clear_cache -eq 1 ] && [ $retval -eq 0 ] - then - # nope, reuse cython files - echo "Will reuse cached cython file" - touch "$TRAVIS_BUILD_DIR"/pandas/*.c - touch "$TRAVIS_BUILD_DIR"/pandas/src/*.c - touch "$TRAVIS_BUILD_DIR"/pandas/*.cpp - else - echo "Rebuilding cythonized files" - fi -fi - -exit 0 diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh new file mode 100755 index 0000000000000..162f7a1034be6 --- /dev/null +++ b/ci/prep_cython_cache.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +ls "$HOME/.cache/" +CACHE_File="$HOME/.cache/cython_files.tar" + +clear_cache=0 +home_dir=$(pwd) + +if [ -f "$CACHE_File" ] && [ "$USE_CACHE" ]; then + + echo "Cache available" + clear_cache=1 + # did the last commit change cython files? 
+ # go back 2 commits + if [ "$TRAVIS_PULL_REQUEST" == "false" ] + then + echo "Not a PR: checking for cython files changes from last 2 commits" + git diff HEAD~2 --numstat | grep -E "pyx|pxd" + retval=$(git diff HEAD~2 --numstat | grep -E "pyx|pxd"| wc -l) + else + echo "PR: checking for any cython file changes from last 5 commits" + git diff PR_HEAD~5 --numstat | grep -E "pyx|pxd" + retval=$(git diff PR_HEAD~5 --numstat | grep -E "pyx|pxd"| wc -l) + fi + echo "number of cython files changed: $retval" +fi + +if [ $clear_cache -eq 1 ] && [ $retval -eq 0 ] && [ "$USE_CACHE" ] +then + # nope, reuse cython files + echo "Will reuse cached cython file" + cd / + tar xvmf $CACHE_File + cd $home_dir +else + echo "Rebuilding cythonized files" + echo "Use cache = $USE_CACHE" + echo "Clear cache = $clear_cache" +fi + + +exit 0 diff --git a/ci/submit_ccache.sh b/ci/submit_ccache.sh deleted file mode 100755 index 7630bb7cc2760..0000000000000 --- a/ci/submit_ccache.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -if [ "${TRAVIS_OS_NAME}" != "linux" ]; then - echo "not using ccache on non-linux" - exit 0 -fi - -if [ "$IRON_TOKEN" ]; then - - home_dir=$(pwd) - ccache -s - - MISSES=$(ccache -s | grep "cache miss" | grep -Po "\d+") - echo "MISSES: $MISSES" - - if [ x"$MISSES" == x"0" ]; then - echo "No cache misses detected, skipping upload" - exit 0 - fi - - # install the compiler cache - sudo apt-get $APT_ARGS install ccache p7zip-full - # iron_cache, pending py3 fixes upstream - pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache - - rm -rf $HOME/ccache.7z - - tar cf - $HOME/.ccache \ - "$TRAVIS_BUILD_DIR"/pandas/{index,algos,lib,tslib,parser,hashtable}.c \ - "$TRAVIS_BUILD_DIR"/pandas/src/{sparse,testing}.c \ - "$TRAVIS_BUILD_DIR"/pandas/msgpack.cpp \ - | 7za a -si $HOME/ccache.7z - - split -b 500000 -d $HOME/ccache.7z $HOME/ccache. 
- - python ci/ironcache/put.py -fi - -exit 0 diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh new file mode 100755 index 0000000000000..3d41d652960c9 --- /dev/null +++ b/ci/submit_cython_cache.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +CACHE_File="$HOME/.cache/cython_files.tar" +rm -rf $CACHE_File + +home_dir=$(pwd) + +pyx_files=`find ${TRAVIS_BUILD_DIR} -name "*.pyx"` +echo "pyx files:" +echo $pyx_files + +tar cf ${CACHE_File} --files-from /dev/null + +for i in ${pyx_files} +do + f=${i%.pyx} + ls $f.{c,cpp} | tar rf ${CACHE_File} -T - +done + +echo "Cython files in cache tar:" +tar tvf ${CACHE_File} + +exit 0 diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index e74568f39418c..70df1df336704 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -4,6 +4,7 @@ import struct import subprocess import codecs +import locale import importlib @@ -47,6 +48,7 @@ def get_sys_info(): ("byteorder", "%s" % sys.byteorder), ("LC_ALL", "%s" % os.environ.get('LC_ALL', "None")), ("LANG", "%s" % os.environ.get('LANG', "None")), + ("LOCALE", "%s.%s" % locale.getlocale()), ]) except: From bf4786a15f51d14ccf1318e44ae080868d8595ee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jun 2016 14:04:12 +0200 Subject: [PATCH 055/359] DOC: various doc build fixes (#13502) * DOC: fix indentation error in nth docstring * DOC: suppress warning for deprecated matplotlib style * DOC: fix code block directive * DOC: fix duplicate explicit target name * DOC: fix malformed hyperlink target * DOC: use valid header symbols * DOC: fix interlinking to scipy * DOC: fix link to python docs on range --- doc/source/advanced.rst | 2 +- doc/source/conf.py | 2 +- doc/source/io.rst | 2 +- doc/source/options.rst | 1 + doc/source/r_interface.rst | 4 ++-- doc/source/whatsnew/v0.18.0.txt | 6 +++--- doc/source/whatsnew/v0.18.1.txt | 2 +- pandas/core/groupby.py | 3 ++- 8 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index e50e792201d26..0c843dd39b56f 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -729,7 +729,7 @@ Int64Index and RangeIndex Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects. ``RangeIndex`` is a sub-class of ``Int64Index`` added in version 0.18.0, now providing the default index for all ``NDFrame`` objects. -``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analagous to python :ref:`range types `. +``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analagous to python `range types `__. .. _indexing.float64index: diff --git a/doc/source/conf.py b/doc/source/conf.py index 6ceeee4ad6afb..99126527759f6 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -292,7 +292,7 @@ 'matplotlib': ('http://matplotlib.org/', None), 'python': ('http://docs.python.org/3', None), 'numpy': ('http://docs.scipy.org/doc/numpy', None), - 'scipy': ('http://docs.scipy.org/doc/scipy', None), + 'scipy': ('http://docs.scipy.org/doc/scipy/reference', None), 'py': ('http://pylib.readthedocs.org/en/latest/', None) } import glob diff --git a/doc/source/io.rst b/doc/source/io.rst index b011072d8c3fb..e9bd029b30537 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4789,7 +4789,7 @@ Reading Space on disk (in bytes) -.. code-block:: +.. 
code-block:: none 25843712 Apr 8 14:11 test.sql 24007368 Apr 8 14:11 test_fixed.hdf diff --git a/doc/source/options.rst b/doc/source/options.rst index d761d827006be..25f03df4040a3 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -71,6 +71,7 @@ with no argument ``describe_option`` will print out the descriptions for all ava .. ipython:: python :suppress: + :okwarning: pd.reset_option("all") diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index efe403c85f330..7e72231c21b15 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -17,7 +17,7 @@ rpy2 / R interface In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be removed in a future version**. Similar functionality can be accessed - through the `rpy2 `_ project. + through the `rpy2 `__ project. See the :ref:`updating ` section for a guide to port your code from the ``pandas.rpy`` to ``rpy2`` functions. @@ -73,7 +73,7 @@ The ``convert_to_r_matrix`` function can be replaced by the normal comparison to the ones in pandas, please report this at the `issue tracker `_. -See also the documentation of the `rpy2 `_ project. +See also the documentation of the `rpy2 `__ project. R interface with rpy2 diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 93c76bc80684f..7418cd0e6baa3 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -764,7 +764,7 @@ yields a ``Resampler``. r Downsampling -'''''''''''' +"""""""""""" You can then use this object to perform operations. These are downsampling operations (going from a higher frequency to a lower one). @@ -796,7 +796,7 @@ These accessors can of course, be combined r[['A','B']].agg(['mean','sum']) Upsampling -'''''''''' +"""""""""" .. currentmodule:: pandas.tseries.resample @@ -842,7 +842,7 @@ New API In the new API, you can either downsample OR upsample. The prior implementation would allow you to pass an aggregator function (like ``mean``) even though you were upsampling, providing a bit of confusion. Previous API will work but with deprecations -'''''''''''''''''''''''''''''''''''''''''''' +"""""""""""""""""""""""""""""""""""""""""""" .. warning:: diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 51982c42499ff..ba14ac51012c7 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -374,7 +374,7 @@ New Behavior: df.groupby('c', sort=False).nth(1) -.. _whatsnew_0181.numpy_compatibility +.. 
_whatsnew_0181.numpy_compatibility: numpy function compatibility ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index cc639b562dab8..f6915e962c049 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1197,7 +1197,8 @@ def nth(self, n, dropna=None): 1 4 5 6 - # NaNs denote group exhausted when using dropna + NaNs denote group exhausted when using dropna + >>> g.nth(1, dropna='any') B A From 1a9abc44bbfd65675fd99701fe33aad8805ab147 Mon Sep 17 00:00:00 2001 From: Neil Parley Date: Mon, 27 Jun 2016 09:03:34 -0400 Subject: [PATCH 056/359] Force rebuilding of python files in PR Force the rebuilding of cython files in PR as can't rely on the git history Author: Neil Parley Closes #13515 from nparley/no-pr-cache and squashes the following commits: 75e71e7 [Neil Parley] Undo test commit 7fcccc0 [Neil Parley] Testing commit 3cc7afe [Neil Parley] Force rebuilding of python files in PR --- ci/prep_cython_cache.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh index 162f7a1034be6..3295bd15d016c 100755 --- a/ci/prep_cython_cache.sh +++ b/ci/prep_cython_cache.sh @@ -21,6 +21,8 @@ if [ -f "$CACHE_File" ] && [ "$USE_CACHE" ]; then echo "PR: checking for any cython file changes from last 5 commits" git diff PR_HEAD~5 --numstat | grep -E "pyx|pxd" retval=$(git diff PR_HEAD~5 --numstat | grep -E "pyx|pxd"| wc -l) + echo "Forcing cython rebuild due to possibility of history rewritting in PR" + retval=-1 fi echo "number of cython files changed: $retval" fi From 9e73c714433fe071a4441bf1e22dd6c01bd01717 Mon Sep 17 00:00:00 2001 From: Amol Agrawal Date: Wed, 29 Jun 2016 03:48:02 +0530 Subject: [PATCH 057/359] DOC: Added additional example for groupby by indexer. (#13276) --- doc/source/groupby.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 02309fe5d6509..c9095c3ae1a60 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1015,6 +1015,23 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on df df.groupby(df.sum(), axis=1).sum() +Groupby by Indexer to 'resample' data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Resampling produces new hypothetical samples(resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. + +In order to resample to work on indices that are non-datetimelike , the following procedure can be utilized. + +In the following examples, **df.index // 5** returns a binary array which is used to determine what get's selected for the groupby operation. + +.. note:: The below example shows how we can downsample by consolidation of samples into fewer samples. Here by using **df.index // 5**, we are aggregating the samples in bins. By applying **std()** function, we aggregate the information contained in many samples into a small subset of values which is their standard deviation thereby reducing the number of samples. + +.. 
ipython:: python + + df = pd.DataFrame(np.random.randn(10,2)) + df + df.index // 5 + df.groupby(df.index // 5).std() Returning a Series to propagate names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 30a313350edde27fae4cf5824b1f23ebbe529306 Mon Sep 17 00:00:00 2001 From: Chris Warth Date: Wed, 29 Jun 2016 03:15:56 -0700 Subject: [PATCH 058/359] update visualization projects (#13528) --- doc/source/ecosystem.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 0d7315d20eac3..51e00d2e01fd0 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -77,8 +77,16 @@ more advanced types of plots then those offered by pandas. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The `Vincent `__ project leverages `Vega `__ -(that in turn, leverages `d3 `__) to create plots . It has great support -for pandas data objects. +(that in turn, leverages `d3 `__) to create +plots. Although functional, as of Summer 2016 the Vincent project has not been updated +in over two years and is `unlikely to receive further updates `__. + +`IPython Vega `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Like Vincent, the `IPython Vega `__ project leverages `Vega +`__ to create plots, but primarily +targets the IPython Notebook environment. `Plotly `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 3944a369265f27268d1b3867a161e97f9c63cd62 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 29 Jun 2016 14:16:22 +0200 Subject: [PATCH 059/359] BUG: date_range closed keyword with timezone aware start/end (GH12684) (#13510) --- doc/source/whatsnew/v0.18.1.txt | 2 +- pandas/tseries/index.py | 7 ++++++ pandas/tseries/tests/test_daterange.py | 35 ++++++++++++++++++-------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index ba14ac51012c7..7f74d8a769e4b 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -658,7 +658,7 @@ Bug Fixes - Bug in ``CategoricalIndex.get_loc`` returns different result from regular ``Index`` (:issue:`12531`) - Bug in ``PeriodIndex.resample`` where name not propagated (:issue:`12769`) - +- Bug in ``date_range`` ``closed`` keyword and timezones (:issue:`12684`). 
- Bug in ``pd.concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`) - Bug in ``pd.concat`` did not handle empty ``Series`` properly (:issue:`11082`) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index af60a2d028c93..77500081be62c 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -541,6 +541,13 @@ def _generate(cls, start, end, periods, name, offset, ambiguous=ambiguous) index = index.view(_NS_DTYPE) + # index is localized datetime64 array -> have to convert + # start/end as well to compare + if start is not None: + start = start.tz_localize(tz).asm8 + if end is not None: + end = end.tz_localize(tz).asm8 + if not left_closed and len(index) and index[0] == start: index = index[1:] if not right_closed and len(index) and index[-1] == end: diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 6ad33b6b973de..854b60c17853b 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -485,7 +485,7 @@ def test_range_closed(self): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) - for freq in ["3D", "2M", "7W", "3H", "A"]: + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: closed = date_range(begin, end, closed=None, freq=freq) left = date_range(begin, end, closed="left", freq=freq) right = date_range(begin, end, closed="right", freq=freq) @@ -501,11 +501,11 @@ def test_range_closed(self): self.assert_index_equal(expected_right, right) def test_range_closed_with_tz_aware_start_end(self): - # GH12409 + # GH12409, GH12684 begin = Timestamp('2011/1/1', tz='US/Eastern') end = Timestamp('2014/1/1', tz='US/Eastern') - for freq in ["3D", "2M", "7W", "3H", "A"]: + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: closed = date_range(begin, end, closed=None, freq=freq) left = date_range(begin, end, closed="left", freq=freq) right = date_range(begin, end, closed="right", freq=freq) @@ -520,15 +520,28 @@ def test_range_closed_with_tz_aware_start_end(self): self.assert_index_equal(expected_left, left) self.assert_index_equal(expected_right, right) - # test with default frequency, UTC - begin = Timestamp('2011/1/1', tz='UTC') - end = Timestamp('2014/1/1', tz='UTC') + begin = Timestamp('2011/1/1') + end = Timestamp('2014/1/1') + begintz = Timestamp('2011/1/1', tz='US/Eastern') + endtz = Timestamp('2014/1/1', tz='US/Eastern') + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq, + tz='US/Eastern') + left = date_range(begin, end, closed="left", freq=freq, + tz='US/Eastern') + right = date_range(begin, end, closed="right", freq=freq, + tz='US/Eastern') + expected_left = left + expected_right = right - intervals = ['left', 'right', None] - for i in intervals: - result = date_range(start=begin, end=end, closed=i) - self.assertEqual(result[0], begin) - self.assertEqual(result[-1], end) + if endtz == closed[-1]: + expected_left = closed[:-1] + if begintz == closed[0]: + expected_right = closed[1:] + + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) def test_range_closed_boundary(self): # GH 11804 From e8e0aae92d8c93558c06c58817bc9055d9f0a229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 30 Jun 2016 18:59:11 -0400 Subject: [PATCH 060/359] DOC: Update documentation for rename (#13533) --- doc/source/basics.rst | 7 +++++-- pandas/core/generic.py | 9 +++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) 
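The tz-aware ``closed`` tests added above in PATCH 059 boil down to a simple behavior: when start/end are timezone-aware (or a ``tz`` is passed), ``closed`` should drop the matching endpoint exactly as it does for naive dates. A minimal usage sketch, assuming a pandas build that includes the GH12684 fix and pytz installed:

    import pandas as pd

    begin = pd.Timestamp('2011-01-01', tz='US/Eastern')
    end = pd.Timestamp('2011-01-05', tz='US/Eastern')

    pd.date_range(begin, end, freq='D', closed=None)     # both endpoints included
    pd.date_range(begin, end, freq='D', closed='left')   # right endpoint dropped
    pd.date_range(begin, end, freq='D', closed='right')  # left endpoint dropped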
diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 917d2f2bb8b04..8145e9536a82a 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1159,14 +1159,17 @@ mapping (a dict or Series) or an arbitrary function. s.rename(str.upper) If you pass a function, it must return a value when called with any of the -labels (and must produce a set of unique values). But if you pass a dict or -Series, it need only contain a subset of the labels as keys: +labels (and must produce a set of unique values). A dict or +Series can also be used: .. ipython:: python df.rename(columns={'one' : 'foo', 'two' : 'bar'}, index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) +If the mapping doesn't include a column/index label, it isn't renamed. Also +extra labels in the mapping don't throw an error. + The :meth:`~DataFrame.rename` method also provides an ``inplace`` named parameter that is by default ``False`` and copies the underlying data. Pass ``inplace=True`` to rename the data in place. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 348281d1a7e30..5ce9161fdffb0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -555,8 +555,8 @@ def swaplevel(self, i=-2, j=-1, axis=0): _shared_docs['rename'] = """ Alter axes input function or functions. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left - as-is. Alternatively, change ``Series.name`` with a scalar - value (Series only). + as-is. Extra labels listed don't throw an error. Alternatively, change + ``Series.name`` with a scalar value (Series only). Parameters ---------- @@ -611,6 +611,11 @@ def swaplevel(self, i=-2, j=-1, axis=0): 0 1 4 1 2 5 2 3 6 + >>> df.rename(index=str, columns={"A": "a", "C": "c"}) + a B + 0 1 4 + 1 2 5 + 2 3 6 """ @Appender(_shared_docs['rename'] % dict(axes='axes keywords for this' From a01644c07f376b4e2eb48d8dfca7222473e3088d Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Fri, 1 Jul 2016 10:18:37 -0400 Subject: [PATCH 061/359] BUG: Can't store callables using __setitem__ (#13516) * BUG: Can't store callables using __setitem__ * Use an internal method instead of a new parameter * Add a docstring, fix pep8 complaints * Move whatsnew entry to API section --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 102 +++++++++++++++------------ pandas/core/series.py | 2 +- pandas/tests/frame/test_indexing.py | 10 +++ pandas/tests/series/test_indexing.py | 10 +++ 6 files changed, 79 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 40fec4d071f16..eae03b2a86661 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -421,6 +421,7 @@ Other API changes - ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) - ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) +- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) .. _whatsnew_0182.deprecations: @@ -482,7 +483,6 @@ Bug Fixes - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. 
(:issue:`13231`) - Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) - - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) - Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4b35953b4282..e804271d8afa9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2395,7 +2395,7 @@ def _setitem_frame(self, key, value): self._check_inplace_setting(value) self._check_setitem_copy() - self.where(-key, value, inplace=True) + self._where(-key, value, inplace=True) def _ensure_valid_index(self, value): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5ce9161fdffb0..6b25cf6ed71a1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4424,54 +4424,14 @@ def _align_series(self, other, join='outer', axis=None, level=None, right = right.fillna(fill_value, method=method, limit=limit) return left.__finalize__(self), right.__finalize__(other) - _shared_docs['where'] = (""" - Return an object of same shape as self and whose corresponding - entries are from self where cond is %(cond)s and otherwise are from - other. - - Parameters - ---------- - cond : boolean %(klass)s, array or callable - If cond is callable, it is computed on the %(klass)s and - should return boolean %(klass)s or array. - The callable must not change input %(klass)s - (though pandas doesn't check it). - - .. versionadded:: 0.18.1 - - A callable can be used as cond. - - other : scalar, %(klass)s, or callable - If other is callable, it is computed on the %(klass)s and - should return scalar or %(klass)s. - The callable must not change input %(klass)s - (though pandas doesn't check it). - - .. versionadded:: 0.18.1 - - A callable can be used as other. - - inplace : boolean, default False - Whether to perform the operation in place on the data - axis : alignment axis if needed, default None - level : alignment level if needed, default None - try_cast : boolean, default False - try to cast the result back to the input type (if possible), - raise_on_error : boolean, default True - Whether to raise on invalid data types (e.g. trying to where on - strings) - - Returns - ------- - wh : same type as caller - """) - - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True")) - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - try_cast=False, raise_on_error=True): + def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): + """ + Equivalent to public method `where`, except that `other` is not + applied as a function even if callable. Used in __setitem__. + """ cond = com._apply_if_callable(cond, self) - other = com._apply_if_callable(other, self) if isinstance(cond, NDFrame): cond, _ = cond.align(self, join='right', broadcast_axis=1) @@ -4627,6 +4587,56 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, return self._constructor(new_data).__finalize__(self) + _shared_docs['where'] = (""" + Return an object of same shape as self and whose corresponding + entries are from self where cond is %(cond)s and otherwise are from + other. 
+ + Parameters + ---------- + cond : boolean %(klass)s, array or callable + If cond is callable, it is computed on the %(klass)s and + should return boolean %(klass)s or array. + The callable must not change input %(klass)s + (though pandas doesn't check it). + + .. versionadded:: 0.18.1 + + A callable can be used as cond. + + other : scalar, %(klass)s, or callable + If other is callable, it is computed on the %(klass)s and + should return scalar or %(klass)s. + The callable must not change input %(klass)s + (though pandas doesn't check it). + + .. versionadded:: 0.18.1 + + A callable can be used as other. + + inplace : boolean, default False + Whether to perform the operation in place on the data + axis : alignment axis if needed, default None + level : alignment level if needed, default None + try_cast : boolean, default False + try to cast the result back to the input type (if possible), + raise_on_error : boolean, default True + Whether to raise on invalid data types (e.g. trying to where on + strings) + + Returns + ------- + wh : same type as caller + """) + + @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True")) + def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): + + other = com._apply_if_callable(other, self) + return self._where(cond, other, inplace, axis, level, try_cast, + raise_on_error) + @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False")) def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index cf1639bacc3be..e2726bef0bd03 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -738,7 +738,7 @@ def setitem(key, value): if is_bool_indexer(key): key = check_bool_indexer(self.index, key) try: - self.where(~key, value, inplace=True) + self._where(~key, value, inplace=True) return except InvalidIndexError: pass diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 78354f32acbda..d7fed8131a4f4 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -207,6 +207,16 @@ def test_setitem_callable(self): exp = pd.DataFrame({'A': [11, 12, 13, 14], 'B': [5, 6, 7, 8]}) tm.assert_frame_equal(df, exp) + def test_setitem_other_callable(self): + # GH 13299 + inc = lambda x: x + 1 + + df = pd.DataFrame([[-1, 1], [1, -1]]) + df[df > 0] = inc + + expected = pd.DataFrame([[-1, inc], [inc, -1]]) + tm.assert_frame_equal(df, expected) + def test_getitem_boolean(self): # boolean indexing d = self.tsframe.index[10] diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index d01ac3e1aef42..15ca238ee32a0 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -441,6 +441,16 @@ def test_setitem_callable(self): s[lambda x: 'A'] = -1 tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) + def test_setitem_other_callable(self): + # GH 13299 + inc = lambda x: x + 1 + + s = pd.Series([1, 2, -1, 4]) + s[s < 0] = inc + + expected = pd.Series([1, 2, inc, 4]) + tm.assert_series_equal(s, expected) + def test_slice(self): numSlice = self.series[10:20] numSliceEnd = self.series[-10:] From ffb582c433dcb48cd06d219c21c68847cf19de32 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 3 Jul 2016 19:22:24 -0400 Subject: [PATCH 062/359] Removed unnecessary params in cum_func Picks up from #13167 by properly removing the parameters 
and ensuring that `numpy` compatibility has been maintained. The current test suite does a good job of checking that already, so no tests were added. Closes #13541. Author: gfyoung Closes #13550 from gfyoung/cum-func-cleanup and squashes the following commits: 6ba7a77 [gfyoung] Removed unnecessary params in cum_func --- pandas/compat/numpy/function.py | 20 ++++++++++++++++++-- pandas/core/generic.py | 4 ++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 274761f5d0b9c..15bf6d31b7109 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,7 @@ from numpy import ndarray from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) -from pandas.core.common import is_integer, UnsupportedFunctionCall +from pandas.core.common import is_bool, is_integer, UnsupportedFunctionCall from pandas.compat import OrderedDict @@ -148,10 +148,26 @@ def validate_clip_with_axis(axis, args, kwargs): CUM_FUNC_DEFAULTS = OrderedDict() CUM_FUNC_DEFAULTS['dtype'] = None CUM_FUNC_DEFAULTS['out'] = None -validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='kwargs') +validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='both', + max_fname_arg_count=1) validate_cumsum = CompatValidator(CUM_FUNC_DEFAULTS, fname='cumsum', method='both', max_fname_arg_count=1) + +def validate_cum_func_with_skipna(skipna, args, kwargs, name): + """ + If this function is called via the 'numpy' library, the third + parameter in its signature is 'dtype', which takes either a + 'numpy' dtype or 'None', so check if the 'skipna' parameter is + a boolean or not + """ + if not is_bool(skipna): + args = (skipna,) + args + skipna = True + + validate_cum_func(args, kwargs, fname=name) + return skipna + LOGICAL_FUNC_DEFAULTS = dict(out=None) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b25cf6ed71a1..cc5c45158bf4f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5484,8 +5484,8 @@ def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, axis_descr=axis_descr) @Appender("Return cumulative {0} over requested axis.".format(name) + _cnum_doc) - def cum_func(self, axis=None, dtype=None, out=None, skipna=True, **kwargs): - nv.validate_cum_func(tuple(), kwargs, fname=name) + def cum_func(self, axis=None, skipna=True, *args, **kwargs): + skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) if axis is None: axis = self._stat_axis_number else: From 449e8246a6f9dc6239be5659bbda840d8de7306a Mon Sep 17 00:00:00 2001 From: Neil Parley Date: Sun, 3 Jul 2016 19:24:13 -0400 Subject: [PATCH 063/359] Cython cache diff compare As talked about in #13425 with @gfyoung and @jreback this PR does cython caching by comparing the pyx files and not relying on the git history. 
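In other words, each cached ``.pyx`` file is compared byte-for-byte against the current source, and any difference (or a change in the number of ``.pyx`` files) forces a rebuild. A rough sketch of that logic in Python, for illustration only; the function name and the assumption that the cache mirrors the source tree layout are illustrative, and the actual change is the bash below in ci/prep_cython_cache.sh:

    import filecmp
    from pathlib import Path

    def cython_cache_is_stale(src_dir, cache_dir):
        src = sorted(Path(src_dir).rglob('*.pyx'))
        cached = sorted(Path(cache_dir).rglob('*.pyx'))
        if len(src) != len(cached):
            # different number of pyx files -> rebuild
            return True
        for s in src:
            c = Path(cache_dir) / s.relative_to(src_dir)
            if not c.exists() or not filecmp.cmp(str(s), str(c), shallow=False):
                # missing from the cache or contents differ -> rebuild
                return True
        return False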
Author: Neil Parley Closes #13526 from nparley/pyx-diff and squashes the following commits: 8b4ad0b [Neil Parley] Remove test commit 2dc2ce2 [Neil Parley] Merge branch 'upstream' into pyx-diff 81cbb7b [Neil Parley] Reset pyx changes 79fe0f0 [Neil Parley] Remove do_not_merge dca6e61 [Neil Parley] Test commit 30bd9da [Neil Parley] Check for the case where less cython file has been deleted aaf4dd2 [Neil Parley] Merge branch 'upstream' into pyx-diff 6cc3d6e [Neil Parley] Diff python pyx files as a test if they have changed --- ci/prep_cython_cache.sh | 66 ++++++++++++++++++++++++++++----------- ci/submit_cython_cache.sh | 12 +++++-- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh index 3295bd15d016c..6f16dce2fb431 100755 --- a/ci/prep_cython_cache.sh +++ b/ci/prep_cython_cache.sh @@ -1,43 +1,73 @@ #!/bin/bash ls "$HOME/.cache/" + +PYX_CACHE_DIR="$HOME/.cache/pyxfiles" +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx"` +pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx"` + CACHE_File="$HOME/.cache/cython_files.tar" +# Clear the cython cache 0 = NO, 1 = YES clear_cache=0 + +pyx_files=`echo "$pyx_file_list" | wc -l` +pyx_cache_files=`echo "$pyx_cache_file_list" | wc -l` + +if [[ pyx_files -ne pyx_cache_files ]] +then + echo "Different number of pyx files" + clear_cache=1 +fi + home_dir=$(pwd) -if [ -f "$CACHE_File" ] && [ "$USE_CACHE" ]; then +if [ -f "$CACHE_File" ] && [ "$USE_CACHE" ] && [ -d "$PYX_CACHE_DIR" ]; then + + echo "Cache available - checking pyx diff" + + for i in ${pyx_file_list} + do + diff=`diff -u $i $PYX_CACHE_DIR${i}` + if [[ $? -eq 2 ]] + then + echo "${i##*/} can't be diffed; probably not in cache" + clear_cache=1 + fi + if [[ ! -z $diff ]] + then + echo "${i##*/} has changed:" + echo $diff + clear_cache=1 + fi + done - echo "Cache available" - clear_cache=1 - # did the last commit change cython files? 
- # go back 2 commits if [ "$TRAVIS_PULL_REQUEST" == "false" ] then - echo "Not a PR: checking for cython files changes from last 2 commits" - git diff HEAD~2 --numstat | grep -E "pyx|pxd" - retval=$(git diff HEAD~2 --numstat | grep -E "pyx|pxd"| wc -l) + echo "Not a PR" + # Uncomment next 2 lines to turn off cython caching not in a PR + # echo "Non PR cython caching is disabled" + # clear_cache=1 else - echo "PR: checking for any cython file changes from last 5 commits" - git diff PR_HEAD~5 --numstat | grep -E "pyx|pxd" - retval=$(git diff PR_HEAD~5 --numstat | grep -E "pyx|pxd"| wc -l) - echo "Forcing cython rebuild due to possibility of history rewritting in PR" - retval=-1 + echo "In a PR" + # Uncomment next 2 lines to turn off cython caching in a PR + # echo "PR cython caching is disabled" + # clear_cache=1 fi - echo "number of cython files changed: $retval" + fi -if [ $clear_cache -eq 1 ] && [ $retval -eq 0 ] && [ "$USE_CACHE" ] +if [ $clear_cache -eq 0 ] && [ "$USE_CACHE" ] then - # nope, reuse cython files + # No and use_cache is set echo "Will reuse cached cython file" cd / tar xvmf $CACHE_File cd $home_dir else echo "Rebuilding cythonized files" - echo "Use cache = $USE_CACHE" - echo "Clear cache = $clear_cache" + echo "Use cache (Blank if not set) = $USE_CACHE" + echo "Clear cache (1=YES) = $clear_cache" fi diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh index 3d41d652960c9..4f60df0ccb2d8 100755 --- a/ci/submit_cython_cache.sh +++ b/ci/submit_cython_cache.sh @@ -1,17 +1,23 @@ #!/bin/bash CACHE_File="$HOME/.cache/cython_files.tar" +PYX_CACHE_DIR="$HOME/.cache/pyxfiles" +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx"` + rm -rf $CACHE_File +rm -rf $PYX_CACHE_DIR home_dir=$(pwd) -pyx_files=`find ${TRAVIS_BUILD_DIR} -name "*.pyx"` +mkdir $PYX_CACHE_DIR +rsync -Rv $pyx_file_list $PYX_CACHE_DIR + echo "pyx files:" -echo $pyx_files +echo $pyx_file_list tar cf ${CACHE_File} --files-from /dev/null -for i in ${pyx_files} +for i in ${pyx_file_list} do f=${i%.pyx} ls $f.{c,cpp} | tar rf ${CACHE_File} -T - From 8f8d75d315f5a61bbf2060ff821bff9b282a1582 Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Sun, 3 Jul 2016 19:25:54 -0400 Subject: [PATCH 064/359] BUG: Fix groupby with "as_index" for categorical multi #13204 closes #13204 Fixes a bug that returns all nan's for groupby(as_index=False) with multiple column groupers containing a categorical one (#13204). Also: fixes an internal bug in the string representation of `Grouping`. 
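The new test below captures the fixed behavior; as a quick usage sketch (assuming a pandas build that includes this fix):

    import pandas as pd

    df = pd.DataFrame({'cat': pd.Categorical([1, 2, 2], categories=[1, 2, 3]),
                       'A': [10, 11, 11],
                       'B': [101, 102, 103]})

    # Previously this returned all-NaN values for 'B'; with the fix, 'B' is
    # summed correctly and the unused category 3 is expanded as NaN rows.
    df.groupby(['cat', 'A'], as_index=False).sum()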
Author: Piotr Jucha Closes #13394 from pijucha/groupbycat13204 and squashes the following commits: 374402c [Piotr Jucha] BUG: Fix groupby with as_index for categorical multi groupers #13204 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/groupby.py | 38 +++++++++++++++++++++--- pandas/tests/test_groupby.py | 51 +++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index eae03b2a86661..be1f745537d05 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -527,3 +527,4 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) +- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f6915e962c049..04e4db9d1fdc6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2250,7 +2250,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = to_timedelta(self.grouper) def __repr__(self): - return 'Grouping(%s)' % self.name + return 'Grouping({0})'.format(self.name) def __iter__(self): return iter(self.indices) @@ -3741,9 +3741,39 @@ def _reindex_output(self, result): return result levels_list = [ping.group_index for ping in groupings] - index = MultiIndex.from_product(levels_list, names=self.grouper.names) - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d).sortlevel(axis=self.axis) + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return result.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `result`. An idea is to do: + # result = result.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `result`, and then reset the in-axis grouper columns. 
+ + # Select in-axis groupers + in_axis_grps = [(i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis] + g_nums, g_names = zip(*in_axis_grps) + + result = result.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + result = result.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + result = result.reset_index(level=g_nums) + + return result.reset_index(drop=True) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6659e6b106a67..bc25525f936ac 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6304,6 +6304,47 @@ def test_groupby_categorical_two_columns(self): nan, nan, nan, nan, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) + def test_groupby_multi_categorical_as_index(self): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') + result = df.groupby(['cat', s], as_index=False).sum() + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # is original index dropped? + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + result = df.groupby(['cat', 'A'], as_index=False).sum() + tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. 
@@ -6431,6 +6472,16 @@ def test_numpy_compat(self): tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, getattr(g, func), foo=1) + def test_grouping_string_repr(self): + # GH 13394 + mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) + df = DataFrame([[1, 2, 3]], columns=mi) + gr = df.groupby(df[('A', 'a')]) + + result = gr.grouper.groupings[0].__repr__() + expected = "Grouping(('A', 'a'))" + tm.assert_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From 7c9ba14c6ddd91916ffd7fc5548731265533e88d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Jul 2016 12:37:52 +0200 Subject: [PATCH 065/359] DOC: update sphinx requirements for doc building (#13532) --- doc/source/contributing.rst | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 6e0c747cd06fc..51fa2a9de953b 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -351,31 +351,33 @@ How to build the *pandas* documentation Requirements ~~~~~~~~~~~~ -To build the *pandas* docs there are some extra requirements: you will need to +First, you need to have a development environment to be able to build pandas +(see the docs on :ref:`creating a development environment above `). +Further, to build the docs, there are some extra requirements: you will need to have ``sphinx`` and ``ipython`` installed. `numpydoc `_ is used to parse the docstrings that follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of numpydoc is included in the *pandas* source code. +`nbconvert `_ and +`nbformat `_ are required to build +the Jupyter notebooks included in the documentation. -It is easiest to :ref:`create a development environment `, then install:: +If you have a conda environment named ``pandas_dev``, you can install the extra +requirements with:: conda install -n pandas_dev sphinx ipython nbconvert nbformat -Furthermore, it is recommended to have all `optional dependencies -`_ +Furthermore, it is recommended to have all :ref:`optional dependencies `. installed. This is not strictly necessary, but be aware that you will see some error messages when building the docs. This happens because all the code in the documentation is executed during the doc build, and so code examples using optional dependencies will generate errors. Run ``pd.show_versions()`` to get an overview of the installed version of all dependencies. -`nbconvert `_ and `nbformat `_ are required to build the Jupyter notebooks -included in the documentation. .. warning:: - You need to have ``sphinx`` version 1.2.2 or newer, but older than version 1.3. - Versions before 1.1.3 should also work. + You need to have ``sphinx`` version >= 1.3.2. Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ From f20b41e73fdc84089289cd3999762892f85ec4b6 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Tue, 5 Jul 2016 01:01:16 +0900 Subject: [PATCH 066/359] TST: Move `test_crosstab_margins` to `TestPivotTable` (#13553) This test case assert `pivot_table` method. So it should be defined on `TestPivotTable`. 
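The test being moved (see the diff below) calls ``pivot_table`` with categorical groupers and ``margins=True``; roughly, the behavior it checks is the following (a sketch, assuming a pandas build of this era):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': np.arange(8),
                       'y': np.arange(8) // 4,
                       'z': np.arange(8) % 2})
    df.y = df.y.astype('category')
    df.z = df.z.astype('category')

    # margins=True adds an 'All' row and column; the result should match the
    # plain integer (non-categorical) grouping case.
    df.pivot_table('x', 'y', 'z', margins=True)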
--- pandas/tools/tests/test_pivot.py | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 7ec4018d301af..cda2343fbb842 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -801,6 +801,26 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): expected = pd.DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) + def test_categorical_margins(self): + # GH 10989 + df = pd.DataFrame({'x': np.arange(8), + 'y': np.arange(8) // 4, + 'z': np.arange(8) % 2}) + + expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected.index = Index([0, 1, 'All'], name='y') + expected.columns = Index([0, 1, 'All'], name='z') + + data = df.copy() + table = data.pivot_table('x', 'y', 'z', margins=True) + tm.assert_frame_equal(table, expected) + + data = df.copy() + data.y = data.y.astype('category') + data.z = data.z.astype('category') + table = data.pivot_table('x', 'y', 'z', margins=True) + tm.assert_frame_equal(table, expected) + class TestCrosstab(tm.TestCase): @@ -919,26 +939,6 @@ def test_crosstab_dropna(self): names=['b', 'c']) tm.assert_index_equal(res.columns, m) - def test_categorical_margins(self): - # GH 10989 - df = pd.DataFrame({'x': np.arange(8), - 'y': np.arange(8) // 4, - 'z': np.arange(8) % 2}) - - expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) - expected.index = Index([0, 1, 'All'], name='y') - expected.columns = Index([0, 1, 'All'], name='z') - - data = df.copy() - table = data.pivot_table('x', 'y', 'z', margins=True) - tm.assert_frame_equal(table, expected) - - data = df.copy() - data.y = data.y.astype('category') - data.z = data.z.astype('category') - table = data.pivot_table('x', 'y', 'z', margins=True) - tm.assert_frame_equal(table, expected) - def test_crosstab_no_overlap(self): # GS 10291 From 66253672592c694a5c00d30cfb2bae28ea25ae2a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Jul 2016 10:37:17 +0200 Subject: [PATCH 067/359] TST: confirm bug in partial string multi-index slicing is fixed (GH12685) (#13559) --- pandas/tests/indexes/test_multi.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index bec52f5f47b09..fb5576bed90b4 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -6,7 +6,7 @@ import re import warnings -from pandas import (date_range, MultiIndex, Index, CategoricalIndex, +from pandas import (DataFrame, date_range, MultiIndex, Index, CategoricalIndex, compat) from pandas.core.common import PerformanceWarning from pandas.indexes.base import InvalidIndexError @@ -2201,6 +2201,15 @@ def test_partial_string_timestamp_multiindex(self): with assertRaises(KeyError): df_swap.loc['2016-01-01'] + # GH12685 (partial string with daily resolution or below) + dr = date_range('2013-01-01', periods=100, freq='D') + ix = MultiIndex.from_product([dr, ['a', 'b']]) + df = DataFrame(np.random.randn(200, 1), columns=['A'], index=ix) + + result = df.loc[idx['2013-03':'2013-03', :], :] + expected = df.iloc[118:180] + tm.assert_frame_equal(result, expected) + def test_rangeindex_fallback_coercion_bug(self): # GH 12893 foo = pd.DataFrame(np.arange(100).reshape((10, 10))) From 4aff800e321bd9abac2dbefa3ffcc808c3f525ee Mon Sep 17 00:00:00 2001 From: "Francis T. 
O'Donovan" Date: Tue, 5 Jul 2016 04:38:26 -0400 Subject: [PATCH 068/359] DOC: Update old Google Code and SourceForge links (#13534) --- LICENSES/ULTRAJSON_LICENSE | 4 ++-- README.md | 4 ++-- doc/source/conf.py | 2 +- doc/source/ecosystem.rst | 4 ++-- doc/source/index.rst.template | 2 +- doc/source/install.rst | 6 +++--- doc/source/io.rst | 2 +- doc/source/r_interface.rst | 2 +- doc/source/release.rst | 2 +- doc/source/whatsnew/v0.13.0.txt | 2 +- doc/source/whatsnew/v0.16.0.txt | 4 ++-- doc/source/whatsnew/v0.8.0.txt | 2 +- doc/sphinxext/numpydoc/phantom_import.py | 2 +- pandas/io/auth.py | 3 ++- pandas/io/ga.py | 2 +- pandas/io/stata.py | 2 +- pandas/src/klib/khash.h | 2 +- pandas/src/ujson/lib/ultrajson.h | 2 +- pandas/src/ujson/lib/ultrajsondec.c | 2 +- pandas/src/ujson/lib/ultrajsonenc.c | 2 +- pandas/src/ujson/python/JSONtoObj.c | 2 +- pandas/src/ujson/python/objToJSON.c | 2 +- pandas/src/ujson/python/py_defines.h | 2 +- pandas/src/ujson/python/ujson.c | 2 +- pandas/src/ujson/python/version.h | 2 +- pandas/stats/fama_macbeth.py | 3 ++- pandas/stats/ols.py | 3 ++- pandas/stats/plm.py | 6 ++++-- pandas/stats/var.py | 3 ++- pandas/util/decorators.py | 2 +- 30 files changed, 43 insertions(+), 37 deletions(-) diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index defca46e7f820..3b2886eb9cfae 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -25,10 +25,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. - * Copyright (c) 1994 Sun Microsystems, Inc. \ No newline at end of file + * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/README.md b/README.md index 40b8e1e0a1272..e1149ac10795e 100644 --- a/README.md +++ b/README.md @@ -170,8 +170,8 @@ conda install pandas - [SciPy](http://www.scipy.org): miscellaneous statistical functions - [PyTables](http://www.pytables.org): necessary for HDF5-based storage - [SQLAlchemy](http://www.sqlalchemy.org): for SQL database support. Version 0.8.1 or higher recommended. -- [matplotlib](http://matplotlib.sourceforge.net/): for plotting -- [statsmodels](http://statsmodels.sourceforge.net/) +- [matplotlib](http://matplotlib.org/): for plotting +- [statsmodels](http://www.statsmodels.org/) - Needed for parts of `pandas.stats` - For Excel I/O: - [xlrd/xlwt](http://www.python-excel.org/) diff --git a/doc/source/conf.py b/doc/source/conf.py index 99126527759f6..a1b71f0279c7a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -288,7 +288,7 @@ # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { - 'statsmodels': ('http://statsmodels.sourceforge.net/devel/', None), + 'statsmodels': ('http://www.statsmodels.org/devel/', None), 'matplotlib': ('http://matplotlib.org/', None), 'python': ('http://docs.python.org/3', None), 'numpy': ('http://docs.scipy.org/doc/numpy', None), diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 51e00d2e01fd0..8fafe8ec9eaa2 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -24,7 +24,7 @@ substantial projects that you feel should be on this list, please let us know. Statistics and Machine Learning ------------------------------- -`Statsmodels `__ +`Statsmodels `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Statsmodels is the prominent python "statistics and econometrics library" and it has @@ -123,7 +123,7 @@ compatible with non-HTML IPython output formats.) qgrid is "an interactive grid for sorting and filtering DataFrames in IPython Notebook" built with SlickGrid. -`Spyder `__ +`Spyder `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Spyder is a cross-platform Qt-based open-source Python IDE with diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 6011c22e9cc2e..1996ad75ea92a 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -92,7 +92,7 @@ Some other notes specialized tool. - pandas is a dependency of `statsmodels - `__, making it an important part of the + `__, making it an important part of the statistical computing ecosystem in Python. - pandas has been used extensively in production in financial applications. diff --git a/doc/source/install.rst b/doc/source/install.rst index 0abaa70586d5a..b43d2b8aac517 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -252,7 +252,7 @@ Optional Dependencies - `pymysql `__: for MySQL. - `SQLite `__: for SQLite, this is included in Python's standard library by default. -* `matplotlib `__: for plotting +* `matplotlib `__: for plotting * `openpyxl `__, `xlrd/xlwt `__: Needed for Excel I/O * `XlsxWriter `__: Alternative Excel writer * `Jinja2 `__: Template engine for conditional HTML formatting. @@ -264,9 +264,9 @@ Optional Dependencies `__, `pygtk `__, `xsel `__, or `xclip - `__: necessary to use + `__: necessary to use :func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* Google's `python-gflags `__ , +* Google's `python-gflags <`__ , `oauth2client `__ , `httplib2 `__ and `google-api-python-client `__ diff --git a/doc/source/io.rst b/doc/source/io.rst index e9bd029b30537..da0444a8b8df9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4277,7 +4277,7 @@ to existing tables. You will need to install some additional dependencies: -- Google's `python-gflags `__ +- Google's `python-gflags `__ - `httplib2 `__ - `google-api-python-client `__ diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index 7e72231c21b15..f3df1ebdf25cb 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -73,7 +73,7 @@ The ``convert_to_r_matrix`` function can be replaced by the normal comparison to the ones in pandas, please report this at the `issue tracker `_. -See also the documentation of the `rpy2 `__ project. +See also the documentation of the `rpy2 `__ project. 
R interface with rpy2 diff --git a/doc/source/release.rst b/doc/source/release.rst index 37778c46a8ec0..df76c90d0f5e6 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -630,7 +630,7 @@ Highlights include: modules are deprecated. We refer users to external packages like `seaborn `_, `pandas-qt `_ and - `rpy2 `_ for similar or equivalent + `rpy2 `_ for similar or equivalent functionality, see :ref:`here ` See the :ref:`v0.16.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt index e8f2f54b873d6..0944d849cfafd 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -825,7 +825,7 @@ Experimental # Your Google BigQuery Project ID # To find this, see your dashboard: - # https://code.google.com/apis/console/b/0/?noredirect + # https://console.developers.google.com/iam-admin/projects?authuser=0 projectid = xxxxxxxxx; df = gbq.read_gbq(query, project_id = projectid) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 68a558a2b7fd0..4255f4839bca0 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -19,7 +19,7 @@ Highlights include: modules are deprecated. We refer users to external packages like `seaborn `_, `pandas-qt `_ and - `rpy2 `_ for similar or equivalent + `rpy2 `_ for similar or equivalent functionality, see :ref:`here ` Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -508,7 +508,7 @@ Deprecations We refer users to the external package `pandas-qt `_. (:issue:`9615`) - The ``pandas.rpy`` interface is deprecated and will be removed in a future version. - Similar functionaility can be accessed thru the `rpy2 `_ project (:issue:`9602`) + Similar functionaility can be accessed thru the `rpy2 `_ project (:issue:`9602`) - Adding ``DatetimeIndex/PeriodIndex`` to another ``DatetimeIndex/PeriodIndex`` is being deprecated as a set-operation. This will be changed to a ``TypeError`` in a future version. ``.union()`` should be used for the union set operation. (:issue:`9094`) - Subtracting ``DatetimeIndex/PeriodIndex`` from another ``DatetimeIndex/PeriodIndex`` is being deprecated as a set-operation. This will be changed to an actual numeric subtraction yielding a ``TimeDeltaIndex`` in a future version. ``.difference()`` should be used for the differencing set operation. (:issue:`9094`) diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt index a76c4e487d5d8..0d2cfeb2d7cfc 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.txt @@ -241,7 +241,7 @@ matplotlib knows how to handle ``datetime.datetime`` but not Timestamp objects. While I recommend that you plot time series using ``TimeSeries.plot``, you can either use ``to_pydatetime`` or register a converter for the Timestamp type. See `matplotlib documentation -`__ for more on this. +`__ for more on this. .. warning:: diff --git a/doc/sphinxext/numpydoc/phantom_import.py b/doc/sphinxext/numpydoc/phantom_import.py index 9a60b4a35b18f..4b4fec863a0e3 100755 --- a/doc/sphinxext/numpydoc/phantom_import.py +++ b/doc/sphinxext/numpydoc/phantom_import.py @@ -11,7 +11,7 @@ can be used to get the current docstrings from a Pydocweb instance without needing to rebuild the documented module. -.. [1] http://code.google.com/p/pydocweb +.. 
[1] https://github.com/pv/pydocweb """ from __future__ import division, absolute_import, print_function diff --git a/pandas/io/auth.py b/pandas/io/auth.py index b20b7c8ff1b04..e42df6a7309b7 100644 --- a/pandas/io/auth.py +++ b/pandas/io/auth.py @@ -30,7 +30,8 @@ class AuthenticationConfigError(ValueError): %s -with information from the APIs Console . +with information from the APIs Console +. """ DOC_URL = ('https://developers.google.com/api-client-library/python/guide/' diff --git a/pandas/io/ga.py b/pandas/io/ga.py index 6dd0bb7472c37..45424e78ddbe7 100644 --- a/pandas/io/ga.py +++ b/pandas/io/ga.py @@ -1,5 +1,5 @@ """ -1. Goto https://code.google.com/apis/console +1. Goto https://console.developers.google.com/iam-admin/projects 2. Create new project 3. Goto APIs and register for OAuth2.0 for installed applications 4. Download JSON secret file and move into same directory as this file diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ae7200cf6fb2e..c7390cf240f8a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -7,7 +7,7 @@ a once again improved version. You can find more information on http://presbrey.mit.edu/PyDTA and -http://statsmodels.sourceforge.net/devel/ +http://www.statsmodels.org/devel/ """ import numpy as np diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h index 0f1a17c6333f4..dc004a0e1770b 100644 --- a/pandas/src/klib/khash.h +++ b/pandas/src/klib/khash.h @@ -52,7 +52,7 @@ int main() { * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - http://code.google.com/p/ulib/ + - https://github.com/stefanocasazza/ULib - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h index f83f74a0fe0da..c37fe8c8e6c38 100644 --- a/pandas/src/ujson/lib/ultrajson.h +++ b/pandas/src/ujson/lib/ultrajson.h @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c index 9a4d5972b101b..5496068832f2e 100644 --- a/pandas/src/ujson/lib/ultrajsondec.c +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c index 5e2a226ae8d63..2adf3cb707bdb 100644 --- a/pandas/src/ujson/lib/ultrajsonenc.c +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c index 9c1b4febd9895..e4d02db4cb60a 100644 --- a/pandas/src/ujson/python/JSONtoObj.c +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 46ae623ae88a7..925c18cd23d8f 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h index 7a5083e131512..723eaed336f6b 100644 --- a/pandas/src/ujson/python/py_defines.h +++ b/pandas/src/ujson/python/py_defines.h @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c index 2eb8a80c0325c..48ea92ed3bc8c 100644 --- a/pandas/src/ujson/python/ujson.c +++ b/pandas/src/ujson/python/ujson.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h index 0ccfbfe74521c..2d4fd137edefe 100644 --- a/pandas/src/ujson/python/version.h +++ b/pandas/src/ujson/python/version.h @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py index caad53df2c7fe..f7d50e8e72a5c 100644 --- a/pandas/stats/fama_macbeth.py +++ b/pandas/stats/fama_macbeth.py @@ -37,7 +37,8 @@ def __init__(self, y, x, intercept=True, nw_lags=None, import warnings warnings.warn("The pandas.stats.fama_macbeth module is deprecated and will be " "removed in a future version. 
We refer to external packages " - "like statsmodels, see here: http://statsmodels.sourceforge.net/stable/index.html", + "like statsmodels, see here: " + "http://www.statsmodels.org/stable/index.html", FutureWarning, stacklevel=4) if dropped_dummies is None: diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py index e2375ea180ed2..678689f2d2b30 100644 --- a/pandas/stats/ols.py +++ b/pandas/stats/ols.py @@ -51,7 +51,8 @@ def __init__(self, y, x, intercept=True, weights=None, nw_lags=None, import warnings warnings.warn("The pandas.stats.ols module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/regression.html", FutureWarning, stacklevel=4) try: diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index dca1977fb19bd..baa30cde9344e 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -39,7 +39,8 @@ def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, import warnings warnings.warn("The pandas.stats.plm module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/mixed_linear.html", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/mixed_linear.html", FutureWarning, stacklevel=4) self._x_orig = x self._y_orig = y @@ -743,7 +744,8 @@ def __init__(self, y, x, window_type='full_sample', window=None, import warnings warnings.warn("The pandas.stats.plm module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/mixed_linear.html", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/mixed_linear.html", FutureWarning, stacklevel=4) for attr in self.ATTRIBUTES: diff --git a/pandas/stats/var.py b/pandas/stats/var.py index cc78ca2886fb3..db4028d60f5c8 100644 --- a/pandas/stats/var.py +++ b/pandas/stats/var.py @@ -31,7 +31,8 @@ def __init__(self, data, p=1, intercept=True): import warnings warnings.warn("The pandas.stats.var module is deprecated and will be " "removed in a future version. 
We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/vector_ar.html#var", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/vector_ar.html#var", FutureWarning, stacklevel=4) try: diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py index 58cd0c13d8ec7..e1888a3ffd62a 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -94,7 +94,7 @@ def wrapper(*args, **kwargs): # Substitution and Appender are derived from matplotlib.docstring (1.1.0) -# module http://matplotlib.sourceforge.net/users/license.html +# module http://matplotlib.org/users/license.html class Substitution(object): From b4b1e962215be1502cbf4ed70d222c0fa0d6acc3 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Tue, 5 Jul 2016 17:39:35 +0900 Subject: [PATCH 069/359] TST: Add tests for single group (#13561) --- pandas/tests/test_groupby.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index bc25525f936ac..10362cbb24888 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2085,8 +2085,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - empty_not_as = DataFrame(columns=df.columns, index=pd.Index( - [], dtype=df.index.dtype)) + empty_not_as = DataFrame(columns=df.columns, + index=pd.Index([], dtype=df.index.dtype)) empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) assert_frame_equal(empty_not_as, g_not_as.head(0)) @@ -4549,6 +4549,15 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + def test_groupby_with_single_column(self): + df = pd.DataFrame({'a': list('abssbab')}) + tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) + # GH 13530 + exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a')) + tm.assert_frame_equal(df.groupby('a').count(), exp) + tm.assert_frame_equal(df.groupby('a').sum(), exp) + tm.assert_frame_equal(df.groupby('a').nth(1), exp) + def test_groupby_with_small_elem(self): # GH 8542 # length=2 @@ -4989,8 +4998,8 @@ def test_cumcount_empty(self): ge = DataFrame().groupby(level=0) se = Series().groupby(level=0) - e = Series(dtype='int64' - ) # edge case, as this is usually considered float + # edge case, as this is usually considered float + e = Series(dtype='int64') assert_series_equal(e, ge.cumcount()) assert_series_equal(e, se.cumcount()) From 6647687ff7f203eba0f4d376956308dc97e78015 Mon Sep 17 00:00:00 2001 From: Paul Mestemaker Date: Tue, 5 Jul 2016 06:40:28 -0400 Subject: [PATCH 070/359] Update gotchas.rst Removed redundant words Author: Paul Mestemaker Closes #13560 from PaulMest/documentation-wording and squashes the following commits: 4fa6c93 [Paul Mestemaker] Update gotchas.rst --- doc/source/gotchas.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index c79b902d559d5..99d7486cde2d0 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -173,7 +173,7 @@ dtype in order to store the NAs. 
These are summarized by this table: ``integer``, cast to ``float64`` ``boolean``, cast to ``object`` -While this may seem like a heavy trade-off, in practice I have found very few +While this may seem like a heavy trade-off, I have found very few cases where this is an issue in practice. Some explanation for the motivation here in the next section. From 2134b6307928b526776473d641f88d5d15a6d025 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 5 Jul 2016 06:41:52 -0400 Subject: [PATCH 071/359] BUG: categorical unpickle to use _coerce_indexer_dtype follow up for #13080 to use `_coerce_indexer_dtype`. Author: sinhrks Closes #13426 from sinhrks/cat_pickle and squashes the following commits: e8702f8 [sinhrks] BUG: categorical unpickle to use _coerce_indexer_dtype --- pandas/core/categorical.py | 5 +++-- .../0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle | Bin 0 -> 191074 bytes .../0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle | Bin 0 -> 127687 bytes .../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin 0 -> 127220 bytes .../io/tests/generate_legacy_storage_files.py | 8 +++++++- pandas/io/tests/test_pickle.py | 16 ++++++++++++---- 6 files changed, 22 insertions(+), 7 deletions(-) create mode 100644 pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index fa3d13c174245..6dba41a746e19 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -999,11 +999,12 @@ def __setstate__(self, state): raise Exception('invalid pickle state') # Provide compatibility with pre-0.15.0 Categoricals. 
- if '_codes' not in state and 'labels' in state: - state['_codes'] = state.pop('labels').astype(np.int8) if '_categories' not in state and '_levels' in state: state['_categories'] = self._validate_categories(state.pop( '_levels')) + if '_codes' not in state and 'labels' in state: + state['_codes'] = _coerce_indexer_dtype(state.pop('labels'), + state['_categories']) # 0.16.0 ordered change if '_ordered' not in state: diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle b/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle new file mode 100644 index 0000000000000000000000000000000000000000..917ad2b0ff1a3534bd34a71568b241121a769cf8 GIT binary patch literal 191074 zcmd4a1(;UlzBp{88M?crySuw{0O=YSns;VshVHfy8xc`V5JXWF5kW#hQ9w{JK|m2i zNm0Ic4fm*c_C9BS=lcJz@B8iRntN8fYd!0U^~5{4XC#de-BB_!#jw$Xqb7#J8_7yW zCLcT?E-I9K(CBfa#!f0fXiQ90`BCFW#tj<~6Ek2^M8BA*L1RY8#>I>q6c-UnF=*_7 z(SrxXmXC{#iWwFaTR!1J5lzEah#NL4s#S2=h)~KlN#;i0F>rq8P^yw`?r0M+_tv!| z2FFbr8x;{s9htOU%{EEe+z~z@P02RNBk!o&zJ2@FH^Y(eqfL_dl95SA$A&)|{$Jd@ zxPc2JJ0?l8`}FaYhqEL}l6~@+$iTsAldlcj^vJ4^lGLq`NZ21kvI8xRr77*27LHc7${mMJp%hzWyZLz&|v z?-XMrTR9N`P^mvlqx64s;eXyJaXW5c_^M@YOJCXl<}%7A$v^Act@Oo* z%0;HESiVx#@|8m6<3km~@16k@qhdo9ql-tT92jIxY^YM7Q04egm6DOEM-7eX7v2{e zsye7&zk%b1g69XV`J_$5*;+|E)Yyh5r+rW`+D zqX&$N3e{{sZeV26QNu#DqJwg(9UrPwGP-DVknMHD*ioBM*e-bYsmb;K%)17` zLnoOVw8IXG?XY1&JDhs^yQ@LC9rlR}U%gSdETaF){Kmlrk|vaDrhk@Z+`!vTZ4#WC zEaB8#|IEn5h8s69s!yot?YC(5FA65H920N(Pycc2=Ktani68mrr4q|JoRs!`LM=-6 z2+}%cbX2Hi!f$aC#)Mi$#mx&UQ0v>CuZ^38^KwXHUbg+W@3cSLz^(gVXdoR8Z<(5z z8YWcLcHukyRch^nJB3TF;jJR+aC?4r437S1Mbara^*H04&k$CDqFLis7M3#(h8GWk=;zQk{!!53R$>@sF^`gUVtVew4u9DGJqyK6~ zJ)>(xx4G4Pq7xcUiXkJ%42TQ$N@zR1<3oKC+fLtb?d})O=f7TA{r{UZCZ_41H97H> zHQ=_oG0>IT{_W*=>nZJ43!^hc|1~}t7yakRWYE}2ag!tZjg1J6FBusT*`-3W0kJ{1 z6`Ig`!`7QO+a!rh)~rj{(8RdTp-JHmas#v+^U!7;gi0Ihp;P8~h!!>JF{uhU* zI$XI%wd()kaJ<7+s{j4Ne_niAR6=1-zb#K^{IBygJ~T5Z>{&r!j}8iZc6?~g?S(xz zT-c$%7k1LP*wDO$lAa$QT98=M3&SP7D4eRlhQ5n~R7K2SAsA!qpQyZp9#vqHLpG1&&CNqn)otzwUM7{R zSFTd=?=QuQx5oKf5=JRQ#*7KQ+M=P4+rhDYp;5Mi%M-!^ezyUF#PyIGI zbnI5WIey!nKl_jG93T2T$bc_`3^){Iz=`-f+&C8K*KPCR4BhUdajG7sE>; z;ZFFM+Y0X2|KAi`eCSe8*DnWk{Y+5Tuf&J0Cf4=v=rsHU*TQxE+^xDE88Kwo$dUaL zruU)ie`a@d%^txs4^;nui%i}xsL40}T)6pt+`#C0(LrVZBR+JqWb~=XEb4f$IHaP- z1T(N(bv@>e@UJEN#3T*&{A~~ylPuVOtI~(dD{<8_CV9f{|7hVd+)xe{*xTq*<|Ii9 zZ0NHha8#pgLwHFuSk#P36KwuDZ@4wQjEM+#rilKtt;M7b zuJV7gxkdk@O~s^h$8fU>8br8m{EIddlm7Oz{@!e2GK7yMw49iX;ookn1Hp35tvWC~ zDEEbt{~s3VaK)85oRIjKEJ4b`#WnmN#gsL?{~yH^ACoQESw21{d%}k*4o|4-mHNcw z2ye&7i!JcqMO`fnq6mw^=d21}5@bk7Y`mNFV zZ55-KkB29De}6d?PuTq*y&S?5u>Z3!hq(5MHKas174b18gS7mM8d55}|MnUZ)hDKO z!pVPc$z_5);S5TZ&~erb+SOn2pLd*pb(Gexzp6XS|KOAVc$R9npAn4H5}u>_zj%%s z&P|!{92JAB{T2W7bNn~;q~`6Z2(J2%bk@54j9aA=t|zr`r88W=>jY&u^$;+VRB zeI`82kEs{lyw&~(Bn&feJGcJr=Qc<@xADI?x8d#QHcC9V$-g+a@$Kg}Njx|FRt&y; zi;9nF8r~JWxtj%NMpqmjyvV{sr~Wgx$AZ-Qn*`!;joDIHAVyp!JT6k7@Pi z@$k4a;dtv?$Ai`qKHlcf<4M~j%r;`$-Z~!C%JA`ae;!ZPCV9f=+uu4Kl=ir&;QN;7 z86y3@p6A8%O56yIn;6qOyb+$`-FDqR;eElq!*7D1?(_{d z6D}403M8gq;-|ykg2eQ{_33a891xsz+f@bzd%{;qoiM9v_*ZMW^;`I8LL{2FE|O^C zY%9@!q;b%{zIx)-Ts#;D-xmLFJK>w2n8CqCdjv~@Lx#nK|1cmbyxk*cZn42%3Ji`J z65bBJ_lb=@5Fax%I63%xgSaY{W1_{Vuppkzl{jDjX!^n85TY> zJPsHhCO&3F!ZlL$8yPh|YGiE8$na)35C2?9iWwCgZ*xbxOeKO$>^dWOSBS*l{+VC@ zZ@&lsPiLnY-8U1%qw$0Y|Jgx8WMwvHXAb6MF6L$)=4C$SX8{&uAr@v47Ui8R#^Nl& z@JdiZlwxU?VOf@Ac~)TJtG|-%%B;ewtj6lB!Nj3PE!(wOhjm$x_4$u42a`0izcHJz zDVwo5TQG5rpq1^`Y{Rx}$M)>N#4fy(?au7Nu8d?ic4rUX#h&cN-t5D^?8p8bz=0gZ z!HnV%4rMflaX3eCBu8;H$8an|jA1O}IF92vffG52lR1S`8P92)&KaD^S)9!|oXdHf 
z&jnn_MO@4!T*_r!&K10y_wZic$NRaG4{#M9csuPjC&NkJTYkrrJjL&MnrC>H zKk!GM<4-)#3;dZE`3ryLC0^zgUgb4j=Wo2h-}wh`GI2-~tm_4ll)-vl5WzxD5W#Xv z5W(_S5Wxai5W#{<5W#|15WyN(5NR1KI0X?bDFqQM9|aLC9tDy3CqBVCO%TCKO%TD# zOc23hOc23hOc22$Oc22`Oc1#ktg8eOtfT}Htfd4Itdj&0ER+NhER_TiERF95lqH| z2&NQ41k>aof{Abt!K62cU`i20FzpN?m|O-COeTW}rW8RW4&j1HT@b+}E{I?X7ep{| z3nG}Y1rbcnf(T}1K{RAA^9rIdgV|LO!IUb9U`iE4FqH}-m`eo_%%Fk@=1xHbGo~Pd zxl$0p3@M0UZWKf?BMKsz`UDY7cY+9}Iza>zn;?S8Oc23LCWv4H6GSj=2_l%R1QEQV${+hX)Z1#DfTi;z0ys@F0SrcMzjFhGQ9G3}YF`aU9PHoXAO>%qg78cuwPV z&frYW;%v_0T+ZWsF5p5g;$kl0QZD0iuHfCghxhV6-p`eMfUEc*AL7G&gpcwuKF-yA zf@}CB*YYW@-j97Jj1j6fj{ybf8u#w;Lp6sU-&C8@iMRQDzEW6f8!1Q&OdmQiGTQhN77sG zr=(2A|I>erp22l9G7~d13$rpCvoi;CG8c0*5A!k~^Roa8vJeZi2#fMg7GrUiU`du@ zX_jGGmScHVU`1A9WmaKTR%3P6U`^IyZPsC3)?V$^He++PU`w`QYqnuq zwqtvCU`KXhXLey%MzR~bvj^{DPxfMO_F-T4V}B0dKn~(yMsWy-GMd9UoFh1rqd1yl zIF=#CFqUx~$MKxNiJZjAoWiM$=QK{|49?^%&gLA>+Q~g}?F= zFY^ko@*1!6H{Rgy{DU`{_z#foNT&a1GA4Y{96pr7c1osVYNlZX(=r{?GXpa+6Eial zvoagAGY4}r7jrWY^D-avvj7XS5DT*ii}FquV{w*XNtR-1mSI_zV|i9!MOI>ER$*0E zV|CVGP1a&<)?r=NV|_MYLpEY#HepjXV{^7(OSWQbwqaYgV|#XBM|NUoc41dWvKzZI z;fus@{k+R|PxfMO_F-T4V}B0dKn~(yMsWy-GMd9UoFh1rqd1ylIF=#CFqUx~$MKxN ziJZjAoWiM$=QK{|49?^%&gLA>+Q~g}?F=FY^ko@*1!6H{Rgy z{DU`{@aIcsuPjC&NkJTYkrrJjL&M znrC>HKk!GM<4-)#3;dZE`3ryLC0^zgUgb4j=Wo2h-}wh`GD(VC{r(+H%4AH=6imrf zOwBZmU|ObQdS+loW@2V$VOC~icIIGC=3;KXFuiRXENKl37g z;jg^J%e=y?yvFPNjW_r^|KLp~NvZ#5QYK?^reI2@Vrr&g1k*Ad(=!7zG7~d13$rpC zvoi;CG8c0*5A!k~^Roa8vJeZi2#fMg7GrUiU`du@X_jGGmScHVU`1A9WmaKTR%3P6 zU`^IyZPsC3)?V$^He++PU`w`QYqnuqwqtvCU`KXhXLey%MzR~bGvVLU z1pUA5p6tcm?8Cn7$Nn6^fgHrajN%XuWi*FzI7e_KM{zXAa4bWNVJzb~j^jCj6FG^K zIfYXh&uN^_8Jx*koXt6$%Xys71zgBQT+Ah0%4J;66}+4G@Lt}>`?-=2a1|fqLwuNz z@KHX-$GMtMa1Ed2T0X^fe45X2J)h-se4a1xMZUxh+{l-?iJSQfxA0ZI#;tsvZ*Uvm zaL;RSZ@Gy_?C_m*f9_MHLoL}$+ zzvNf^n&0qSe#etM#qW8VXLy!B@JF8GPdv{H{FxW|3xDM$Ugi~ELhq%*?{9%*O1@!JN#++|0wg%*XsJ zz=ABq!Ysm~ypzRPoF!P2rC6F}SeE5jo)uV;l~|coSe4aSoi$jKwOE^VSeNx!pAFcM zjo6q?*p$uKoGsXrt=O7v*p}_so*meco!FUO*p-p&#_mk`@7o0ZzwMsv#op}0zU;^T z9KeAb#KDZ>5DsNDhjBPZa3n`@G{Zu-pBj7k`HhdALK)Pn2+#LKE}tnnon>IpX6FT z#dUm|&u~4T<#T+VFYraa#0}iYm$`|X`3kr2Rldfpe4TG_8{gzx+|IYTgYR%B-{mgu z=6l@3z1+wBe4iii01xs*e#Arkn4j=4kMJlz$3qHvJo4z37fJRo3jO5vK3pi4coFE z+p_~ZvJ*SA3%l~xf2%C~@BeIfXAj=Rp6tcm?8Cn7$Nn6^fgHrajN%XuWi*FzI7e_K zM{zXAa4bWNVJzb~j^jCj6FG^KIfYXh&uN^_8Jx*koXt6$%Xys71zgBQT+Ah0%4J;6 z6}+4G@Lt}>`?-=2a1|fqLwuNz@KHX-$GMtMa1Ed2T0X^fe45X2J)h-se4a1xMZUxh z+{l-?iJSQfxA0ZI#;tsvZ*Uvma zL;RSZ@Gy_?C_m*f9_MHLoL}$+zvNf^n&0qSe#etM#qW8VXLy!B@JF8GPdv{H{FxW| z3xDM$Ugi~ElQJ2TGX+yJ6;m?}Bbb)yn4TG!k(rp8S(ugC zn4LM8lew6id6<{^n4bk$kcC*7MOc(~vKWiA1WU3MOS25ivK-5^0xPl-E3*o#vKp(i z25YhwYqJjPvL5TR0UNRr8?y!bkZS zALnX5!8Lr6Yxxw{@o7H8^?a7k@p-<$7x@x5a3f#lCT`{{+`?D+8n^OwzQJvLlW%c5 z-{ua!!<~GWySSV0aS!)$ANTWpe!v4f$Pf7u5AkDu!oxhmqx_V|c$}Z{bAG`S{E}bs zYktFT`5jO46u;+bp5a;kz#n;zKk+;-@Mm7+FZ`94c$rstmDhNkzwrit=O4VuBoX?5 zCS@`vX9}idDyC){MldbYF+DRdBQr5GvoI^OF*|cGCv!13^Dr;-F+U5iAPccDi?Ar~ zWHAhGQ9G3}YF`aU9PHoXAO>%qg78cuwPV&frYW;%v_0T+ZWsF5p5g;$kl0 zQZD0iuHfCghxhV6-p`eMfUEc*AL7G&gpcwuKF-yAf@}CB*YYW@-j97Jj1j6fj{ybf8u#w z;Lp6sU-&C8@iMRQDzEW6f8!1Q&OdmQNz&^7nUu+xoGF-+shFB+7{Rnm$Mnp=jLgK$ z%)+e9#_Y_&oXo}C%)`9Q$NVh7f-J;F#F^pv#$8kI-a3Uvh zGN*7V<2jAfIfFAfi?cb0b2*Rmxqu6~h>N*|OSz28xq^4|9^T9Qct2P20j}bMe25S8 z5kAVt_&8Vd39jLjT+64pj!*L$uIICSj?eQ2zQ~ujfgAZUH*qsx;TFEi*SM9h^9^p} zn|zDg`8Id(9q#12+{N8|k9)Y6`?#O)^8+5>L4L@Oc!(eK6CUOf9_6Pz#^d~qpYscz z;FtW0U-KJ&%kOxSr}#Zj^9;}O2mZ)&{E6pzfj{#if8npZ#LK+GtGveR{EavGJOAKK zCj8$&g@*ulq`Nf^NXleP&J;|^R7}k@j9^-(V|r#_MrLAWW?@!lV|M0XPUd26=3!pu zV}2H3K^9_R7GY7|$zm+d5-iD5EX^`3%W^Ew3arRVtjsE`%4)368m!4$tj#*C%X+NO 
z25iViY|JKX%4TfN7Hr8@Y|S=o%XVzf4(!NI?949g%1Cx&clO|2?8#p2%|7hQe(cWy z9LPZ&%qR}wP)2hYhjRo+aui2%497CW7{)S=<2arZIFXY$nNv8G@tnr#oWYr##o3(0 zxtz!OT)>4~#Kl~~rCi44T*13}5AWrDyq_!i09WxrKE#Ll2p{ERe4MNK1lRCMuH{o) z$EW!W*YjCE$LIM1U*t>Nz>R#Fo4A>;a0_4MYuw7$`3AS~O}@qLe49J?4tMfh?&5B~ z$35K3ecaFY`2i2`AV1_sJj9Rr2@mrKkMdI<<8gk*&-n#U@JoKhulWtX<##;EQ~aK% zd4^~C1ApW>{>1aVz@K@MzwlRH;$>dpRbJzD{>B^poqzBqlcYEQXHq6(a;9KPrebQQ zVFc4M9n&)dGcpr1GYhja8?!S9b21lmGY|7JAM>*S3$hRkvj~gwP8MTvmS9PiVriCP zS(amYR$xU|Vr5ogRaRql)?iK6Vr|x8UDjiLHef?GVq-R8Q#NCBwqQ%PVr#ZxTef3+ zc3?+#VrOp%} z#xRy~9LMpTz=@p1$(+KejOR2?=M2u|EY9W}&gDGL=K?O|A};0h8VP1%gi*@7+Eimlm(ZP||P*?}F|iJjSnT^Y%4?9Lv%i#^$kz1fF- z*^m7>fCD**gBisk9Li`8<8Y4PNRHxYj^S8_7{ge`aU92U0w;13Cvys?GM>{ooijL- zvpAb`IG6J{p9{E-aRE;d(yH=lDEd;EQ~T8@Q1#a}zi76>j0He2rWAI^W;Zc6dV?55!_&LAe34Y11_%*-bxBQML zd5YijG|%uXf8dWi$Deqf7x*(T@)!QfOT5f0yvl35&fj>0zw-~?WRi^feM zDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41`X9G55BQ|CeHf1w5XA8DuE4F4E zwq-lEX9spwJUT_$J@tcD~IWe1|*vE_ZP^-{T(cC-^15;@A9!-|{=2%ko{DD969Dm|@Uf|EX$Y1y? zFYz+3@G7tII)CF0{?0#mlSwk^|CyA@n4Bq?lBt-QX&Aw@Ovm)hz>Lhq%*?{9%*O1@ z!JN#++|0wg%*XsJz=ABq!Ysm~ypzRPoF!P2rC6F}SeE5jo)uV;l~|coSe4aSoi$jK zwOE^VSeNx!pAFcMjo6q?*p$uKoGsXrt=O7v*p}_so*meco!FUO*p-p&#_sIFyV#Sx z*qeRWm;KnE12~X_IG9l!!l8`jFb?Mkj^rqg<`|A;h%t<19LI4yCvYMsaWbcHD&skg z(>a4PIg7J7hjTfP^SOWvxrmFogiE=M%ejJg^B&&I`*=TB@&T^mgM5e&^ASGE$M`r` z^9ioulU&QExQ->#3_&fjLO(w~#|7TJr zV{)coN~U6JreOrrG9A-112ZxcGcyabG8?lq2XitPb2AU~G9UA^01L7Z3$qA|@=g|G zah707mSSm^VOf@Ac~)RWR$^sVVO3URb=F`_)?#heVO`c^eKuf2HezEoVN*6^bGBeh zwqk3xVOzFidv;()c4B9CVOK`78@say?_y8(VsG|gU-n~v4&Xoz;$TK`2!}G7!#JEH zIFh3{nqxSYA;vJ4aU93-oWO~k#L1k(sf_0|PUj5HC%t!brALHX(%_q2qPjW4v;yOOfXSklv@;N@w7x*Gy z;s$Qy%iP4xe1%*1DqrJPzRowejc@WTZs*(F!FRZm?{XJ+^F8k2Uhd<5zRwSMfCu>@ zKjI;N%ujfjM|hN<@)(cvGk(r5c!FQ@D}K#y_$|NVNuJ{OJk2va%OCh7&+#Xo=LP=E zi~NPZ@)9re3a|1Suk$zF;P3o{H<=`h{+~&ijLDgTDVd6?nT8Qe%XCc749v((%*-sz z%52Qe9L&jF%*{N^%Y4kw0xZZvEX*P-$~#$%#aV(SS&F4uhGkifOmgh=Uo$AsotR4&!i+;7E?*XpZ4nh8V+G#&I0Sa{?!F5+`#C zr!tbQGcY4FF*CC;E3+{>b1)}!F*oxtFY_@!3$P#yu`r9U zDDPx37H0{TWGR+r8J1-^mS+W4WF=N+6;@?6R%Z>?WG&Wa9oA(%)@K7YWFt0a6EHH{atP?&Uu2=llGC z2Y8Sl@*^JN$NYqcd4xy#DUb0uKjY{8f+zSTzv9>YhTrl#p5!Th&(l1^v;2WS@*IES zd0ybpyvSepD=+afukb3b@j8Fw4gStQc#}!8>HnFO$(Woen3AcOnrRrpv`okJ%)pGy z#LUdXtjxyj%)y+@#oWxpyv)b^EWm;+#KJ7XqP&yESezwTlBHOhWmuNwSe_MFk(F4P zRalkPSe-RkleJizby%16Sf35pkd4@wP1uyp*qklclC9X9ZP=FW*q$BOk)7C?UD%b8 z?8ffw!MoU#z1W+5*q8m-p946MgE*K`9KxZD<}eQD2#(|^j^-GSWr#71WgN$GJST7> zCvh^Ta4O?Djng@UGdYX1IfrvOkMp^J3%Q7kxr9r(jLW%#ck>?J%lmjgSMmX_;)8sM z5AzW|%E$OPSMv$3;gej;r?`$!^BJz^vwV)v^98=hm$-o&`7$?gGhg8rzRK6Qm9O&+ zZsVJLi`)4&ckmtVER$*0EV|CVGP1a&<)?r=NV|_MYLpEY# zHepjXV{^7(OSWQbwqaYgV|#XBM|NUoc41dWvKzaz2k&A}_F`}LVPE!Re-7Y44&q=& zaR`Spn!`ApBRGFYn|1T*(KxiVyN3KFmk>C?DhFT+JuAhEH-WpW-?`&1blt z&+<7w&lmV2U*ZOC8n5#=-r(>2gEyHZhyI^QnT*Mqf+?AbshNfmOv`jk z&kW4SOw7zI%*t%c&K%6iT+Gcp%*%Yt&jKvSLM+T8EXq4sjKx`kC0UB4S%zgR?oIFqwDn{zmq^EjUixR8sum`k{n%eb5?csK9iy}Xb2b0r_( zDn7`E_%I*gqkN2yb2Xpf8a~Oje2VM%G@s#mKFjC$JYV38e2E*lkuP%-H}e&4;j4U& zTlqTQ;5NR=x44~ea|hqyPQJ@s+|Bp6hkLn?`}saU-~k@whx~|#_%T1>VIJX8e#&D! z&d>Nczu*ae$*=e|zu~w1jwgAF-}5xj@GO7ek37epc%B#dGcWQN{>n?d%qzUgYrM|i zc!R(558h;wocezzWilpb3Z`T#re+#OFfG$DJu@&PGchx>Fe|e$J9986b1^sbFfa2l zKMSxR3$ZYZuqf|jF&1YDmSicGW*L@cIhJPyR%9hsW))UtHCAU0)?_W#W*ydLJ=SLf zHe@3Y}ihk1lY`6-X_I6vd({DLR=CBNd={D$B1JD%hze$Uf9 z!?XN>Kk^)Z;(1=+&%DTA_$x2*GOzF|ukkv6;|>1KKX{W#a_Rq>l*yQ!DVUO}n3`!A z!L&@r^vuAF%*4#h!mP~3?99QO%*EWy!@SJL{4BtNEX2Yr!lJyB#aNsrSdyh!nq^p) zXFuiRXENKl37g;jg^J%e=y? 
zyvFPNjW_r^|KLp~$*uoqQYK?^reI2@Vrr&g1k*Ad(=!7zG7~d13$rpCvoi;CG8c0* z5A!k~^Roa8vJeZi2#fMg7GrUiU`du@X_jGGmScHVU`1A9WmaKTR%3P6U`^IyZPsC3 z)?V$^He++PU`w`QYqnuqwqtvCU`KXhXLey%MzR~bvj^{DPxfMO_F-T4 zV}B0dKn~(yMsWy-GMd9UoFh1rqd1ylIF=#CFqUx~$MKxNiJZjAoWiM$=QK{|49?^% z&gLA>+Q~g}?F=FY^ko@*1!6H{Rgy{DU`{B#-`|NtukvnSv>q zim91~5lqW;OwSC=$V|-4EX>Mm%+4Il$z06MJj}~{%+CTW$U-d4A}q=~S&YS5f+bms zrCEk$S&rpdffZSam05*VS&h|MgEd);wONOCS&#MEfDPG*joE}v*^JHEf-TvKt=Wcc z*^cemfgRb2o!Nz58Od(!&K|srJ=u%B*@u1EkNr7-138F;8O0$S%4iPbaE{hGQ9G3}YF`aU9PHoXAO>%qg78cuwPV z&frYW;%v_0T+ZWsF5p5g;$kl0QZD0iuHfCghxhXT;nE(0014JajrMKZwr$(CZQHhO z+qP}nwr#uL%$qZt`8OxCB5G9;^<`AlWKQ8!PUCdW;7rcqY|i0a&f|P8;6g6qVlLrQ zF5_~p;7YFIYOdj0uH$-c;6`rZW^UnDZsT_D;7;!1Ztme;?&E$Q;6WbZVIJX89^-MI z;7Ok1X`bO(p5u95;6+~IWnSS`UgLG%;7#7*ZQkKs-s62f;6py*V?N{)#nep0v`okJ z%)pGy#LUdXtjxyj%)y+@#oWxpyv)b^EWm;+#KJ7XqAbSZEWwg2#nLRpvMk5)tiXz_ z#LBF~s;tK9tihVB#oDaHx~#|gY`}(W#KvsGrfkOMY{8an#nx=Ywrt1t?7)uf#Ln!( zuI$F{?7^Pw#op}0zU;^T9KeAb#K9cGp&Z8H9Kn$s#nBwYu^h+ooWO~k#L1k(shq~? zoWYr##o3(0xtz!OT)>4~#Kl~~rCi44T)~xG#noKHwOq&b+`x_8#Le8ot=z`#+`*mP z#ogS)z1+wBJivoI#KSzoqddmrJi(JZ#nU{)vpmQ1yugdR#LK+GtGveRyuq8i#oN5Y zyS&Hye87i%#K(NXr+miee8HD|#n*hpw|vL<{J@X=#LxV~ul&aE{K236#ozqHzx>Al zarB=78Hj-ygh3gM!5M-f8H%A9hG7|w;TeGu8Hte@g;5!e(HVm=8H=$QhjAH?@tJ@L znTUy*gh`o<$(e#FnTn~IhH06O>6w8UnTeU1g;|-6*_nemnTxrZhk2Qg`B{JkS%`&M zghg45#aV(SS&F4uhGkifOmghGRL7<2iv7If;`wg;P0=(>a4PIg7J7hjTfP^SOWvxrmFogiE=M%ejIpxr(c~ zhHJTw>$!m&xrv*(g z=Xrq_d5M>Kg;#lv*Lj0Cd5gDshj)38_xXSi`G}ACgira5&-sEc`HHXkhHv?f@A-ir z`H7$Tgr zGYX?J8ly7?V=@+FGY;c29^*3s6EYDKGYOM28Iv;wQ!*7(GY!)+9n&)dGcpr1GYhja z8?!S9b21lmGY|7JAM>*S3$hRkvj~f_7>lz6OR^M8vkc3!9Luu;E3y(RvkI%S8mqGg zYqAz=vkvRB9_zCK8?q4_vk9BB8Jn{OTe1~fvklv_9ow@5JF*iyvkSYj8@sayd$JdM zvk&{SANz9v2XYVxa|nlW7>9ENM{*QLa}39F9LIA4Cvp-ea|)+&8mDsxXL1&2a}MWn z9_Mob7jh97a|xGn8JBYfS8^3sa}C#W9oKUMH*ym8n5#PZ}Jvz^A7Lw9`Ex3AMz0& z^9i5w8K3h7U-A`S^9|qf9pCc>9|OeGe+FbA24)Zj zWiSS32!>=RhGrOsWjKas1V&^eMrIU7Wi&=-48~+E#%3JGWjw}b0w!c4CT0>QWilpb z3Z`T#re+$ZWjdy324-X?W@Z*9LixF&Ji5RQ5?-N9LsSW&k3B!Nu10noXTmO&KaD^S)9!|oXdHf&jnn_MO@4! zT*_r!&J|qARb0(AT+4M_&kfwjP29{a+{$g-&K=yzUEIw*+{=C3&jUQjLp;nQJj!D{ z&J#SzQ#{QxJj-)D&kMZBOT5f0yvl35&KtbRTfEIXyvuvM&j)i zSA5Mke9L!y&ky{_PyEa;{K{|q&L8~AU;NEK{L6m~kRZVSpZ_o*12HgzFermDI72Wb zLoqbNFf79{JR>k7BQY|gFe;-lI%6;Fe|e$J9986b1^sbFfa2lKMSxR3$ZYZuqcbMI7_f3OR+S|uq?~5 zJS(swE3q=GuqvyuI%}{dYq2)#urBMdJ{zzh8?iB)uqm6dIa{zLTd_6Uur1rMJv*=? 
zJFzpnuq(TCi2XQcma43gyI7e_KM{zXAa4g4hJST7>Cvh^T za4M&9I%jYuXK^;?a4zR@J{NEy7jZF{a4DB@IahEcS8+Aha4pwyJvVS8H*qt!a4WZQ zJ9ls=cX2oOa4+|9KM(LA5AiUM@F zV|*rHLMCEjCSg)0V{)coN~U6JreRv9V|r#_MrLAWW?@!lV|M0XPUd26=3!puV}2H3 zK^9_R7GY5qV{w*XNtR-1mSI_zV|i9!MOI>ER$*0EV|CVGP1a&<)?r=NV|_MYLpEY# zHepjXV{^7(OSWQbwqaYgV|#XBM|NUoc41d`V|VsoPxfMO_F-T4V}B0dKn~(y4&hJ^ z<8Y4PNRHxYj^S92<9JTsL{8#lPT^Ee<8;p8OwQtL&f#3n<9sgQLN4NBF5yxx<8rRx zO0ME+uHjm)<9cr3MsDI}ZsAsL<96=gPVVAv?%`hU<9;6CK_22^9^p|Q<8hwgNuJ_q zp5a-Z<9S}-MPA}%Ug1?<<8|KPP2S>d-r-%|<9$BhLq6hTKH*b7<8!{?OTOZ3zTsQG z<9mMKM}FdGe&JVs<9GhxPyXU>{^4K#V}L~Z&wvcXzzo8m494IL!H^8a&Lhq z%*?{9%*O1@!JN#++|0wg%*XsJz=ABq!Ysm~EXLw2!ICV+(k#QWEXVS!z>2KI%B;ew ztj6lB!J4ea+N{I6tjGFnz=mwZ#%#i-Y{uqn!Io^r)@;MJY{&NOz>e(1&g{aj?8ffw z!Jh2J-t5D^?8p8bz=0gZ!5qS&9LC`s!I2!r(Hz6E9LMpTz=@p1$(+KeoW|*#!I_-J z*_^|G!IfOa)m+21T*vj?z>VC*&D_GR+{W$P!JXX2-Q2^y z+{gVqz=J%*!#u*HJjUZZ!IM12(>%koJje6Az>B=Z%e=y?yvFOi!JE9r+q}cOyvO@| zz=wRq$9%%4e8%T|!Iyl+*L=gbe8>0vz>oaI&-}u#{KoJ6!Jqua-~7YB{Ko)^^`8M5 zh=Cb|K^cs}8G<1hilG^XVHu9$8G#WQiIEwFQ5lWV8G|tyi?JDpaT$;CnScqIh>4km zNtukvnSv>qim91~X_=1cnSmLZiJ6&&S(%O5nS(i*i@BMHd6|#-S%3vuh=o~%MOlo+ zS%M{5ilteGWm%5pS%DQh8VP1%gi*@7+E zimlm(ZP||P*?}F|iJjSnUD=J@*@HdVi@n*0ec6xwIe-H>h=VzVLphAYIf5fOilaG( zV>yoFIe`;7iIX{nQ#p;(IfFAfi?cb0b2*Rmxqu6~h>N*|OSz28xq>UXimSPXYq^f= zxq%zGiJQ5FTe*$fxq~~oi@Ujpd%2JMd4LCbh=+NEM|q6Ld4eZ-il=#oXL*k2d4U&s ziI;hWS9y)sd4o53i??})cX^NZ`G61kh>!V%Px*|``GPO`im&;GZ~2bz`GFt#iJ$p} zU-^yS`GY_Ci@*7YfBBCAlITAJG7tkZ2!k>hgEIs}G898I48t-U!!rUSG7=**3ZpU_ zqca9$G8SVq4&yQ&<1+yhG7%Fq36nAzlQRWVG8I!Z4bw6m(=!7zG7~d13$rpCvoi;C zG8c0*5A!k~^Roa8vJeZi2#c~9i?akvvJ^|R49l_{%d-M2vJxw^3ahdjtFs1cvKDKz z4(qZW>$3qHvJo4z37fJRo3jO5vK3pi4coFE+p_~ZvJ*SA3%jx#yR!#-vKM=^5Bsto z`*Q#Xau5e|2#0bQhjRo+aui2%499XD$8!QFauO$V3a4@!r*j5pau#QE4(DU6 z2#@j@kMjgi@)S?=4A1f$&+`H=@)9re3a|1Suk!|P@)mFN4)5|F@ACm4@(~~N37_&A zpYsJ@@)ck64d3z|-}3`M@)JMv3%~Lkzw-xw@)v*e5C8HX10>ad24o-xW)KEtFa~D` zhGZy)W*CNLIEH5gMr0&LW)wzcG)89(#$+tUW*o+4JjQ1NCS)QeW)dc4GA3sVrerFn zW*VktI;Lj^W@IL2W)@~;HfCoI=43ABW*+8cKIUfu7Gxn7W)T);F&1YDmSicGW*L@c zIhJPyR%9hsW))UtHCAU0)?_W#W*ydLJ=SLfHe@3<{6&lIiBYQUgRZS<`rJ$HD2cp z-sCOb<{jSUJ>KU7KI9`l<`X{UGd|}FzT_*u<{Q4{JHF=!e&i>9<`;hDH-6_2{^T$I z<{$p$KL$vq{|v}L49p-5%3uu65Ddvs49zeM%W&*mcF53%Rf4u|(6CACP959+CqRJa zL7Jy16<|n}|1=y{v17acx{$oof69bUHl%r!5taLA|F6CI-&H{SKn%d zG|R9o%dtEwup%q5GOMsEtFbz3uqJD#;r?upt|LMGrO=WyRkcauqS)5H~X+J`>{U)0*Ks{Ja3eQy zGq-Rnw{bgna3^@Fs8ZHt+B*@9{n#@F5@ZF`w`$pYb_g@FidIHQ(?p-|;;^@FPF*Gr#aFzwtYN z@F#!qH~;W2|1rRS{%7$Y24o-xW)KEtFa~D`hGZy)W*CNLIEH5gMr0&LW)wzcG)89( z#$+tUW*o+4JjQ1NCS)QeW)dc4GA3sVrerFnW*VktI;Lj^W@IL2W)@~;HvaDsE{AtLmw1_1c$L?9oi})sw|JX(c$fEhpAYzu zkNB8R_>|B1oG@KzxbPf_?Q0}AmIPJhyKHW z48*_;!k`Ss;0(c#48_n4!>|m;@QlESjKs){!l;bK=#0UbjK$cD!?=vc_)NfrOvJ=Y z!lX>ba4+1 zY{k}W!?tY4_Uyop?8MIO!mjMb?(D&y?8V;f!@lgt{v5!89K^vK!l4|-;T*w{9L3Qb z!?7I4@tnYkoW#kT!l|6b>72otoWfJjBC1!lOLK<2=EWJjK&I!?Qfc^Sr=| zyu{1A!mGT->%766yv5tR!@Io4`+UHMe8k6m!l!)3=X}AJe8ty%!?%3L_x!+*{KU`v z!ms?s@BG1^{Ken=!@vB;0D<(M0U3ya8H7O@jKLX#AsLFH8HQmQj^P=B5gCb*8HG_9 zjnNr{F&T@o8HaHhkMWs+37LqAnS@E1jLDgTDVd6?nTBbZj_H|!8JUThnT1)IjoF!l zIhl*OnTL6qkNH`E1zCuNS%gJdjKx`kC0UB4S%zgjng@UGdYX1IfrvOkMp^J z3%Q7kxr9r(jLW%#E4hlRxrS@Gj_bLB8@Y*_xrJM~joZ0{JGqOyxrckXkNbIm2YHBx zd4xxKjK_I`CwYped4^|sj^}xS7kP=7d4*Sbjn{dDH+hS(_ANh%&`GsHkjo1rpG9KeI0TVJ26Eg{uG8vOI1yeE= zQ!@?IG9A-112ZxcGcyabG8?lq2XitPb2AU~G9UA^01L7Z3$qA|vKWiA1WU3MOS25i zvK-5^0xPl-E3*o#vKp(i25YhwYqJjPvL5TR0UNRr8?yXLAncavtY%0T*%+7jp@hav7I%1y^zvS91;5avj%m12=LLH**WO zavQgE2X}H8cXJQ-av%5e01xsI5Az6*@)(cv1W)o5PxB1V@*L0e0x$9sFY^ko@*1!6 z25<5fZ}SfC@*eN=0Uz=aAM**H@)@7=1z++NU-J#$@*Usv13&T;Kl2N}@*BVN2Y>Px 
zfAbIj@*e{P(SHVHAO>a-24ye?X9$L5D28SjhGjU0X9PxMBt~WwMrAZcXAH(JXAb6MF6L$)=4C$S zX8{&uAr@v47G*IOX9<>MDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41`X9G55 zBQ|CeHf1w5XA8DuE4F4Ewq-lEX9sp49jL!s2$V5!cBuvU=OwJTc$y7|uG)&8MOwSC= z$V|-4EX>Mm%+4Il$z06MJj}~{%+CTW$U-d4A}q>cEY1=v$xM$W7eLE!@g&+|C``$z9yd zJ>1KE+|L6%$U{8LBRtAuJkAq5$x}SdGd#<4JkJZf$Vb5JG{$# zyw3-G$VYt4Cw$6he9jkq$ya>MH+;)?e9sU3$WQ#tFZ{}H{LUZz$zS}8n2?E> zm`RwF$(Woen3AcOnrWDp>6o4wn30*7nOT^X*_fRR?oIFqwDn{zmq^EjUixR8sum`k{n%eb5?xRR^5nrpb0 z>$sj9xRINH=XjnM zc#)TQnOAs~*La;bc$2qyn|FAZ_jsQV_>hnIm{0hW&-k1#_>!;qns4})@A#e{_>rIZ znP2#o-}s$B_>;f*n}7J1{}>>+{xcv0F))KLD1$LLLog&mF*L(4EWbQGcY4FF*CC;E3+{> zb1)}!F*oxtFY_@!3$P#yu`r9UD2uT;ORywMu{6uDEX%PxE3hIfu`;W$Dyy+NYp^D3 zu{P_lF6*&A8?Yf8u`!#lDVwo5Td*Ztu{GPUE!(j@JFp`=u`|1{E4#5fd$1>au{Zm$ zFZ;1S2XG(@iy=9F7NR^AMha`@iCw9 zDWCB2>oY324Y|aVNeER zaE4$=hGJ-jVOWM^ct&7EMq*?}VN^zAbjDyz#$s&7VO+*zd?sK*CSqbHVNxbza;9KP zrebQQVOpkRdS+loW@2V$VOC~icIIGC=3;K84j-r{ZE;a%S2eLmnrKH_6O;Zr{2bH3n9zT#`X;ak4rdw$?Ye&T0-;a7g+cmCi{ z{^D=`;a~n^fROsnfDFXI48ouc#^4OWkPOAp48yPt$MB56h>XO@jKZjl#^{W}n2g2P zjKjE$$M{UZgiOT5Ov0p0#^g-FluX6cOvAKH$Mnp=jLgK$%)+e9#_Y_&oXo}C%)`9Q z$NVh7f-JNj_kzF?82_>#_sIFp6tcm?8Cn7$Nn6^fgHra z9KxX-#^D^nksQU*9K*33$MKxNiJZjAoWiM`#_62FnViMhoWr@C$N5~qgJnVE%InT^?*gE^UtxtWJ~nUDEdfCX8Ig;|6}S&YS5f+bmsrCEk$S&rpdffZSa zm05*VS&h|MgEd);wONOCS&#MEfDPG*joE}v*^JHEf-TvKt=Wcc*^cemfgRb2o!Nz5 z*^S-VgFV@cz1fF-*^m7>fCD**gE@plIgGpufB5Cby^gEAO{GXz626hku%!!jJhGXf(r5+gGTqcR$!GX`Ta7GpCG<1!xOGXWDa z5fd{BlQJ2TGX+yJ6;m?}(=r{?GXpa+6EialvoagAGY4}r7jrWY^D-avvj7XS5DT*i zi?SGtvjj`B6ic%V%d#BHvjQu!5-YO`tFjuavj%Ij7HhK(>#`o}vjH2j5gW4!o3a_3 zvjtnS65D)VRkMbCg^8`=w6i@RE&+;74 z^8zpO5-;-#uksqN^9FD77H{(o@A4k+^8p|75g+pjpYj=>^95h>6<_lW-|`*b^8-Kf z6F>6{zw#Tu^9O(O7k~2)|MDLLgwcNnWFQ7+5C&y124@I{WGIGa7=~pyhGzsuWF$sr z6h>t? zWG&Wa9oA(%)@K7YWFt0a6E?yQj^_kUZs!i}!9`5Bn?&kp> z49QRo%`gnha174~jL1lg%qWb?XpGJnjLBGx%{Yw9c#O{kOvpq`%p^?8WK7N! 
zOvzMC%`{BQbWG0-%*ag4%q+~xY|PFa%*kBL%{%qg78X`Id(oXJ_7%{iRQd7RG$T*yUS%q3jPWn9h`T**~j%{5%hbzIL4+{jJb z%q`r?ZQRZs+{sl%p*L?V?53iJjqi$%`-g9b3D%ryvR$u%qzUg zYrM`IyvbX<%{#oyd%VvFe8@+9%qM)xXMD~Ve92dQ%{P3@cYMze{K!xI%rE@PZ~V?5 z{K;SZ%|HChe+&>#{~3^h7??pAl))IBAsCXO7@A=imf;wl5g3t?7@1KRmC+cTF&LAv z7@Khzm+=^%37C+Hn3zeJl*yQ!DVUO}n3`#rmg$(D8JLlon3-9amD!k`Ihd2Vn45W+ zm-(2V1z3=USeQjvl*L$_C0LTBSej*6mgQKU6k7BQY|gFe;-lI%6;Fe|e$J9986b1^sbFfa2lKMSxR3$ZYZuqcbMI7_f3OR+S|uq?~5JS(sw zE3q=GuqvyuI%}{dYq2)#urBMdJ{zzh8?iB)uqm6dIa{zLTd_6Uur1rMJv*=?JFzpn zuq(TCi2XQcma43gyI7e_KM{zXAa4g4hJST7>Cvh^Ta4M&9 zI%jYuXK^;?a4zR@J{NEy7jZF{a4DB@IahEcS8+Aha4pwyJvVS8H*qt!a4WZQJ9ls= zcX2oOa4+|9KM(LA5AiUM@FV|*rH zLMCEjCSg)0V{)coN~U6JreRv9V|r#_MrLAWW?@!lV|M0XPUd26=3!puV}2H3K^9_R z7GY5qV{w*XNtR-1mSI_zV|i9!MOI>ER$*0EV|CVGP1a&<)?r=NV|_MYLpEY#HepjX zV{^7(OSWQbwqaYgV|#XBM|NUoc41d`V|VsoPxfMO_F-T4V}B0dKn~(y4&hJ^<8Y4P zNRHxYj^S92<9JTsL{8#lPT^Ee<8;p8OwQtL&f#3n<9sgQLN4NBF5yxx<8rRxO0ME+ zuHjm)<9cr3MsDI}ZsAsL<96=gPVVAv?%`hU<9;6CK_22^9^p|Q<8hwgNuJ_qp5a-Z z<9S}-MPA}%Ug1?<<8|KPP2S>d-r-%|<9$BhLq6hTKH*b7<8!{?OTOZ3zTsQG<9mMK zM}FdGe&JVs<9GhxPyXU>{^4K#V}OYI&wvcXzzo8m494IL!H^8a&Lhq%*?{9 z%*O1@!JN#++|0wg%*XsJz=ABq!Ysm~EXLw2!ICV+(k#QWEXVS!z>2KI%B;ewtj6lB z!J4ea+N{I6tjGFnz=mwZ#%#i-Y{uqn!Io^r)@;MJY{&NOz>e(1&g{aj?8ffw!Jh2J z-t5D^?8p8bz=0gZ!5qS&9LC`s!I2!r(Hz6E9LMpTz=@p1$(+KeoW|*#!I_-J*_^|< zoX7cGz=d4I#azOrT*l>G!IfOa)m+21T*vj?z>VC*&D_GR+{W$P!JXX2-Q2^y+{gVq zz=J%*!#u*HJjUZZ!IM12(>%koJje6Az>B=Z%e=y?yvFOi!JE9r+q}cOyvO@|z=wRq z$9%%4e8%T|!Iyl+*L=gbe8>0vz>oaI&-}u#{KoJ6!Jqua-~7YB{Ko*1^q&D4h=Cb| zK^cs}8G<1hilG^XVHu9$8G#WQiIEwFQ5lWV8G|tyi?JDpaT$;CnScqIh>4kmNtukv znSv>qim91~X_=1cnSmLZiJ6&&S(%O5nS(i*i@BMHd6|#-S%3vuh=o~%MOlo+S%M{5 zilteGWm%5pS%DQh8VP1%gi*@7+Eimlm( zZP||P*?}F|iJjSnUD=J@*@HdVi@n*0ec6xwIe-H>h=VzVLphAYIf5fOilaG(V>yoF zIe`;7iIX{nQ#p;(IfFAfi?cb0b2*Rmxqu6~h>N*|OSz28xq>UXimSPXYq^f=xq%zG ziJQ5FTe*$fxq~~oi@Ujpd%2JMd4LCbh=+NEM|q6Ld4eZ-il=#oXL*k2d4U&siI;hW zS9y)sd4o53i??})cX^NZ`G61kh>!V%Px*|``GPO`im&;GZ~2bz`GFt#iJ$p}U-^yS z`GY_Ci@*7YfBBCABI`c`G7tkZ2!k>hgEIs}G898I48t-U!!rUSG7=**3ZpU_qca9$ zG8SVq4&yQ&<1+yhG7%Fq36nAzlQRWVG8I!Z4bw6m(=!7zG7~d13$rpCvoi;CG8c0* z5A!k~^Roa8vJeZi2#c~9i?akvvJ^|R49l_{%d-M2vJxw^3ahdjtFs1cvKDKz4(qZW z>$3qHvJo4z37fJRo3jO5vK3pi4cq>|cJ4ads;Y4mHPRqmBHbn3-Q5y`bjPMQ-6bW6 zDBUHXlz`G5(x8HLw~C!$-Zk0pR{4H@=RD`!=ed8~^~ZYGh&lG0Bj*}(@6BH9%{~lg zU-n~v4&Xoz;$RNpP!8j8j^Id+;%JWHSiZt>9M1`y$Vq&alR1U2aVlTuG`_(%Ih`{& zle0LRb2yjtIG+o+kc+sOOSqKFxSVfs1y^zv-{w19%{5%hbzIMPxq%zGiSO}!Zsrzl zMm%+4Il$y|JaxtWJ~nUDEdfCX8Ig;|6}S&YS5f+bmsrCEk$ zS&rpdffZSam05*VS&h|MgEd);wONOCS&#MkA{($F8?iB8ViPvy%WTHxY{8an#nx=Y zwrt1t?7)uf#Ln!(t_)*0c4rUvWH0t+ABM9p`>{UQm9KLe-{6~^&KaD^S)9!|oXdHf&jnn_MO@4!T*_r!&bPRN zE4hkq^Bu0{8m{F!uIIblz>VC*_xL_Ha|^d}8@F=@cXAhZa}W1&ANTVB5AqN{;9(x& zQ6A%Qp5RHI;%R=!kN7dq@GL*!IiBYQUgRZS<`rJ$HD2cp-sCOb<{jSUJ>KU7e#+1I zIUn*1e#x)+HNWAv{EpxA2mZ*P_%k2zF@NE&{Eff!5B|w0DXjmD%4m$v7>vnSjLkTV z%V!yn@tJ@LnTUy*gh`o<$(e%BF(sd8DyC){re!*&X9i|uCT3<9W@R>JXAb6MF22Cr z%)`9Q$NVh7f-Jp(ta2&^T0w;13U*%*@;cJ}A*Ex-E@J&wV z49?^%&gLA>MsDJJe4m@S zg4kmNtukvnS#$TC7)+1 zre+$ZWjdy324-X?W@Z*a4+X+e$I#df?x70e$8+AEx+UU{DD96C;rSw ze9T|?D}Uqf{DXfo%JbHLMrAZcXAH(mS-jD>a3eSIJ-*M)+`_Hg#_im}o!rIU+{3-x$NfCOgFM6! 
zc$i0cl*f3SCwP*lc$y#bBYw;?Jj+jbj^}xS7kP=7d4*Sbjn{dDH+hS(>w%wPB`f8+1`gMTtgD(gR^G8&^Z24gZ7 zV>1rp@>#}Xd?sK*CSqbHVNxbza;D&OOv&e&im91~X_=1cnSmLZiJ6&&S(%O5nS(i* zi!U%Y^Dr;-F+U5iAPccDi?Aq*u{cYxBulY0%djlVu{##2Cu|8j912$wMHs(ug!lrzg&DfkR*pjW-nr+yY?bx0j*pZ#snO)eGVeH24?7^Pw z#op}0aQ0?yD9LMpTz=@p1S2>we_!_73bxz|O ze3R2TgEKjcvpI)zIgj(XfD5^Zi@AhLxs1#C7FTd3SMhDW!_{2FwOq&be3u)zk(>A) z-{)p-;Z|%766yv5tR!@Io4`+UGp`58awLw><8`4zwBH~g00@q7NjANdo1 z<|97lFZ`9i@pt~gKN%&p^`B80jnNr{F&T@o8HaKCEaNdg6EGnYF)@=cDU&fdQ}8*a zLhq%*?{9%*O1@!JN#+7nqxQn3wsOp9NTug;dpRbJzD-r!B%;%(mHUEbq; zKH#VPjGyx%zu=erieK{^e#`IpJ%8Yj{E0vF5g+px{>tC@JOALHjFQIs&!~*X=#0Ub zjK$cD!?=8w@fe>8n2?E>m`RwF$(Woe_#9L6d8T4&reRv9V|r#_MrLAWW?@!lV|M0X zPUhkZ%*{N^%Y4kw0xZZvEX*P-%3>_e5-iD5EX^`3%W^Ew3arRVtjsE`%4)368m!4$ ztj#*C%X+NO7ukRf*@%t#5}U9oUuH8lXA8DuE4F4Ewq-lEX9spfCD**gE@plIgGC-GHI<`llhseGN& z_y*tPbk5*R&f;v&;atw+d@kTZF5+S?;ZiQ+a=yhCT**~@o9}Qn*KjS@aXsJV25#gg zzQ^~unOnG(+qj)OxRblMn|rvI`?#M6c#wzq0T1&CkMbCg^8`=w6i@R*e#DP?hG+Q+ z&+$Aj@FFkqGOzF|ukku>@Fs8ZHt+B*@9{n#@Kb)q&-svF@JoKhulWtX<#+s^Kk!HX z#GmFe|e$J9986bMXb{W*+8cKIUfu7Gxn7W)T);F&1YD zmSicGW*L@cIhJPyR%9hsW))UtHCAU0)?_W#W*ydLJ=W)oY`}(W#KwGyP1uw#vl*MS z1zWNeTeA(@vK`yA13R)4JF^SBGK}5Wojur-z1W+57|y=z$Nn6^fgHra9KxX-#^D^n zksQU*9K*4Eh2uD$6F8BR_$nuJ3SZ+?zRqcUgKu&=XK*HGaW?00F6VJR7jPjLaWR*0 zDVK3M-{K0c1KE+|L6% z$V2>qhk1lYd5p(-f+u;3r}-g2;>SG0v;2hTc%Bz{k(YRxS9q1zc%3(Rlec)AcX*fg zc%KjWDL>=qe8?~OCBNd={D$B1JATg}_#=Pf&wRwk{Dr^rH~!8)_$Q;Jv;H$GqcJ*T zFeYO$HsdfZpJhD8X96Z+?l6U_&-yW4^>DY|59}jLq4CE!m2#*@kV|j_uij9odPU*@ayh#%}D+ z9_-0p?9Dz5XJ7VXe-7Y44&q=A;ZP3aaE{40 z=QO^-H#wa%IFqwDn{zmq^EjUixR8sum`k{n%eb6xaRpa$72oDNT+KCH%XM7Oce#NZ zxry)beQxF!Zsj&^=ML`VF7D1Y{k}W!?tY4_Uyop?8MIO!mbQsH+E+a_GB;iW*>&LFZ;1S2XG(gB(jLK+? z&KQizSd7g$jLT;kkMWs+37LqAnS@E1jLDgT&oL#RXDX&<8m47Bre_9bWF}^27G`BO zW@irOWG=qI+|0wg%*XsJz=ABq!Ysm~EXLw2!ICV+(k#QWEXVS!z>2KI%B;ewtj6lB z!J4ea+N{I6tjGF%kqy|8jo6qku?d^8n5#PZ}Jvz^A7Lw9`Ex3KjmlqoDca0zvNf^n&0qSe#h_m z1ApXC{F#sVn7{B>{>I<=2mfT0j8UGvo?%o*V|2z~OvYkt#$jAO%Xo~>1Wd?8Ow1%q z%4AH=6nu^;`8-oGHPbLH(=k0WFe5WDGqW%&voSk!Feh{I1?FZR=4C$SX8{&uAr@v4 z7G*IOX9<>MDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41`=ZkE>hHS*fe2Goi zlrOUxo3jO5vK3pi4coFE+p_~ZvJ*SA3%fFm-PoNy*pt23n|&D0zU;^T9KeAb#K9cG zp&Z8H9Kn$s#nBwYv3!N&IGz(Yk(2l;Cvys4<5a%RX?%lkayn;lCTDRr=Ws6PaXuGt zAs2BmmvAYUaXH`O3a;cTzRh>Inrpb0>$slpasxMV6W`13bt>{D6mfghzRd$9aM$d5Wj`AwS~BJj1j6gy(pk7kH7Ec$rstmDhNkH+Yk` zc$;^4m-l#|5BMoR4&!i+;7E?*XpZ4nzQS=F&k3B!Nqm)) zIfburDqrU`zQH#+oijL-vpAb`IG6J{p9{Ee(1&g{aj3}ZKTXAkydFZO01hO;mGu|EfJAO~?U zhj1u|aX3eCBu8;H$8aoP;W&=x1Wx26zRJm*!q+&JuX7sT;G3Mz8Jx*koXt6$%Xys7 z1zgBQT+Ah0%4J;6x442Uxr%S|9j@jYuH`ze=eyj%joifd_&zst3%7C`w{r(~au;`V z5BG8(_wxV`@(@4ZVIJX89^-MI;7Ok1X@1C$_%YA$EI;8np63N#Y#BGavCWf8nqEjlc5`{>dm= ztpAM4XpGJnjLBGx%{Yw9XBm(2nScqIh>4kmNtukvnS#$TC7)+1re+$ZWjdy324-X? 
zW@Z*a4+X+e$I#df?x70e$8+AEx+UU{DD96C;rSwe9T|?D}Uqf{DXfo zN;d01qcR$!GX`Ta7GpCG6w8U znTeU1g;|-6*_nemnTszlH}fzr^D#dQupkSuFpID#i?KLMup~>dG|R9o%dtEwup%q5 zGOMsEtFbz3uqJD#;sxWCJ#2BR1wsY{I5|na$XoE!dK+*qUwFmhIS{9oUhb z*qL3}m0|40?(D&y?8V;f!*KRxKlbMU4&)#X<`53$Fb?Mkj^rqg<`|CUD;&r1oWO~k z#8)|)Q}`OE@^wz*8+?<~IfFAfi?cb0b2*Rmxqu6~h>N*|OSz28`4(4jC0FrnzQfgA z!?j$;^?a8bxRIOq9^dC?ZsAsL<96=gPVVAv?%`hU<9;6CK_216?z#sV&f94}T<}dt}zwvke!9N)#yY-(@8I92ygE1M4u^ESP`7GlxJ`*q@ z`?c#*yl($?jbn7}*{?v+@ZQfvi4xW|^pdG|R9o%dtEwup%q5GOMsEtFbz3uqJD#;sx zWCJ#2BR1wsY{I5|na$XoE!dK+*qUwFmhIS{9oUhb*qL3}m0|40?(D&y?8V;f!*KRx zKlbMU4&)#X<`53$Fb?Mkj^rqg<`|CUD;&r1oWO~k#8)|)Q}`OE@^wz*8+?<~IfFAf zi?cb0b2*Rmxqu6~h>N*|OSz28`4(4jC0FrnzQfgA!?j$;^?a8bxRIOq9^dC?ZsAsL z<96=gPVVAv?%`hU<9;6CK_216?z#sV&f94}T<}dt} zzwvke!9N+f@IUiRL_}0ZV|2z~OvYkt#$jAO%Xo~>1Wd?8Ow1%q%4AH=6nu^;`8-oG zHPbLH(=k0WFe5WDGqW%&voSk!Feh{I1?FZR=4C$SX8{&uAr@v47G*IOX9<>MDVAm# zmSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41`=ZkE>hHS*fe2GoilrOUxo3jO5vK3pi z4coFE+p_~ZvJ*SA3xnJ5;1$Mh4DP{$R}c1NF9!GS!7I3L4_?8Yd+-Ww)q__*_U8Z& z?yD9LMpTz=@p1S2>we_!_73bxz|Oe3R2TgEKjcvpI)z zIgj(XfD5^Zi@AhLxs1#C7FTd3SMhDW!_{2FwOq&be3u)zk(>A)-{)p-;Z|%766yv5tR!@Io4`+UGp`58awLw><8`4zwBH~g00@q7NjANdo1<|97lFZ`9i@pt~g zKN%%z#60s1qcR$!GX`Ta7GpCG6w8UnTeU1g;|-6*_nemnTszlH}fzr^D#dQupkSuFpID#i?KLMup~>dG|R9o%dtEw zup%q5GOMsEtFbz3uqJD#;sxWCJ#2BR1wsY{I5|na$XoE!dK+*qUwFmhIS{ z9oUhb*qL3}m0|40?(D&y?8V;f!*KRxKlbMU4&)#X<`53$Fb?Mkj^rqg<`|CUD;&r1 zoWO~k#8)|)Q}`OE@^wz*8+?<~IfFAfi?cb0b2*Rmxqu6~h>N*|OSz28`4(4jC0Frn zzQfgA!?j$;^?a8bxRIOq9^dC?ZsAsL<96=gPVVAv?%`hU<9;6CK_216?z#sV&f94}T<}dt}zwvke!9N)#n)RPi8I92ygE1M4u^ESP`7Glx zJ`*q@6EQK9Fe#HUIaBaCrsVTX#nep0v`okJ%)pGy#LUdXtjxyj%)y+@#TS^Hd6<{^ zn4bk$kcC*7MOc)@SezwTlBHOhWmuNwSe_MFk(F4PRalkPSe-RkleJizby%16Sf4Ml z0UNRr8}lVLVN<@$W^B$DY{^z^%{FYyc5KfM?8r{+%r5N8Fm_{i_Fzx;VsG|gIQz06 z`*Q#Xau5e|2#0bQhjRo+aui2%49D^nj^lVv;6zU1tDMX!e2r82I;ZgszRBsF!I_-J z*_^|miz~R2tN1qG;cBkoTCU@IzRL~V$W45Y?{hP^a4WZQ zJ9ls=cX2oOa4+|9KM(LA5Ag#Y<`Ev{F&^g$p5!T>=7;=84j-r{ZE;a%S2eLi5ny5q*T3+oyBHXVA>p7XCK?fpU#|NoQrC$|>qBW^IF zWMD>SVrFJxR%T;%=3q|d;tR~pJj}~{%+CTW$U-d4A}q>cEY1=v$xau{ZlLoPF7k{W*XGIf#QfghM%u!#RQ@If|n>hGY2($8kI- za3Ux1RZivx^9Yaf7?1M= zPx2H`^Fw~bk9me?`3cYQJTLGfFYz+3@G7tII&bhMZ}B$o@GkH1J|FN?e#X!FkYDgi ze#Ni(4Zr1g{GLDXNB+d0`G}AC3xDNr{GEUBPeyqr;=?nH%4m$v7>vnSjLkTV%V!yn z@tJ@LnTUy*gh`o<$(e%BF(sd8DyC){re!*&X9i|uCT3<9W@R>JXAb6MF22Cr%)`9Q z$NVh7f-JBF^5E5rz1fH1 z?91SmJ$Uu!01o6J4(1RJ0r}A}9;~RXF z(>a4PIg7J7hjTfP^SOWvxrmFogiE=M%lQ^pa3xpqZN9_RT*I|o$Mt-d8@Q31_#WTq zW^UnDZsT_D;7;!1Ztme;?&E$Q;6WbZ2RzIpJj!D{&J#SzQ#{QN`4KB=Z%e=y?yvFOi!JE9r+q}cOyvO@|z)$%ZKj%Y!!7uq0zvegmmf!Jv{=gsk6MyC- zKISj{mA~i- z$>*7hshNgpnU3k1ff<>JnVE%InT^?*gE^UtFEBUrFfa2lKMSxR3$ZYZuqcbMI7_f3 zOR+S|uq?~5JS(swE3q=GuqvyuI%}{dYq2)#urBMdK3`-5He@3<=1XkCrhJ*r*qklc zlC9X9ZP=FW*q$BOk)7C?UD%aj?8ffw!Jh2J-t5D0_GLfz=Kv1mAP(jb4&^Wo=LnAE zD30bBj^!&H$MKxNiJZh&Ihj-V8mIDgPU9PVlhZkaGdYX1IfrvOkMp^J3%Q7kxr9r( zjLZ2JS8yd)@om1t)m+21T*vi%mm9c|oA@5z=Vor;{FdMGd;Y*5`4fNUBR=LY{FT4)cmBaY86}$apHUf&(HVm= z8H=$QhjIBV<1s!HFd-8$F_SPUlQB6{@HwXB^GwCmOvAKH$Mnp=jLgK$%)+e9#_Y_& zoXo`+n45W+m-(2V1z3=USeQjvl*L$_C0LTBSej*6mgQKU672otoWd(4A1ft zp5u95;6+~IWnSS`UgLG%;7#7*ZQkKs-s62fVB{aYM-Tncd-3r8p>NfZfAc;d^m%yr zz|i|&fAhZNuP5fWnuzwqIq$o9QM;kJGAw(k|*G4GQj;#O|kuWi-f*tp>%LODWz>_0L@ScZttqeA3p z86NB|<_P^N!rv7T>@x-x5H+F#g6+l}O+#-{Dm;0w7;)WC{hxqfD>69b$-fC`obtb2 z@{|F*0go+p+`cDJ@iz3=RbzHF5`PXqla|N58IhqDNpi{42 z;lbADlR&$+;o-slr@)Z#U=y@szjmRM3ARC>u7F@e^hu!+6%g!;MqGK@Cl~NkWd)n1 z|HvEcnMTxAMBa|W!-MV9E_sKC2ivDPy5tWFD!fa9mf^uRYL4OI!7gf!E`_@k32MC7 zGogRFFf+(dEG(#!{=>s({goaisGm^!?1=Q>())Mp-?v}8c3qN$Y2Z1b$jAns8ywj( 
zd|tp)=R7}%34UC}^(_d35hopSnHo5IsB#vD(w?gDMUltWit(hO>i=h3|L?+)H@Bg3 zp1l4u+^=2Rmf?#de(c|Ua!KTmjX33{LGXW`a@*nI!6tD~+rcJrP}{-P2eln+77q^( zwu&PvJBrF)8Kf5r3ofgF+lb0+9$A^ILXnY`8EhtpDl^zie!4P)UFCmNX0W;ZFK5=J zO{g-14dKYE-szAiPkKhbP_erd`}gd!`{`o$ zJSjGqBZ6sRZz%NFw6HG}2u%x5wbFh^hbFgRh6%ReBW9RhJv)X6oAFI!go^6dCA?$D z@L*5=Nn!m4_6iR+<|F2q!@)666?!Cy30-N7;2*0r&Jod|f4_%PpS=Fl|2VZ^i~nCr zi5ydrRDw z_g?rPCmTF(_^X_MUUVq_$t&_St>00ii+?grpK@M*&k4cviYMhov`g^VBBHPS%QPK4 z%lJp$;E~3YycPeNrh}&&L0=D^ZUlWj=$t`c51w*_rs;D}qJwEVavDA#WGEIE)J|xc zzVKIiP7>Jgv+ zWi`I;!2jr5q1gX*H3z?$80rBxLK&axt2dPw?MVli*m#hF@ZUQ?M22MLBTUIO%)m^{ z!fedJT+Chm$#ixr;`ILR2f=fpf7Da(c<8^@Q)p5P9vcM>eJ{utYUrTigND8za(MWI zC$SOj9PP<+8$3n|wR2DjCbh=#I`nJZqwAoK9a{Gr(l`9GP)cOO2G5ux8}?!7bEsjz z2zaVtzYJnR4Qu@ko;y8t!vE5w!857gkpIgj4IX7Z)uh2QttY1tG-)ua1rN7!bV*aA zK%pYV3gj4&8PHkG;bF z)ya^iPND>jqgSfZuu=Grp@@DB!hZ@8D);B0+<$os&77e-fk&a#rz-DpDB>TL_e&7` z=ctINyihy;8XOsF=ih=Hp>_`1A!z5{LkIc{_?%sOFw}{YN=*{#j0(CyU<0C6A{iOB^Li$%0dF1Wr1b zDR1(S7aMII7MT5htrN);|3^7-|5;AZcb^R+ng**y=$^koc)U zR5(20KSD*q6NN$%%XQ+=r>EOCNe~li+h-!$HfeBRaJd!!Wvxl}bl&8Vd95{1^_dh; z=Y1|RZ|JLAFdILWH|5iLpO4I2>8~{`RVXUx@~MNIp)Mcv(V)wx3Btp}(>{s*=NgtS z$WSaS=qaH!Ed5{Uk!x6ni1eTbgwiwql^(5D^e1y;rik>QWkczi|4NTqD`M`<5|JLX zm^H9*oQR3DL;v34Swqp0BP?6+IYX#QL*G)fhdzf!SdM_FY9MD26EO=0?Vc+L{6H)t(sLL(t|cKvO*h0p)H_}-NUPf;)aJ;dveJ9r?!Qj zq{pov3{D_SQA{*QsDwnAg$?|&d! z*Ma{xi`r8K)l*QYr9wj^QDk3>914*;OZ7w9o*JAlIxckY8mhP8OLpY1^(x~3ySvwj zADFj%g!x&31zCuNS%gJdjKx`kC0UB4StjDHxj{t5{C(HlFtTDIR8lpTz$2uKDF4y;xXK{qCm1o=o?8U9(VpWUp%;9NaRz zML?+S{yxQ)K}@LE8J(?y;FIs#HTrezQ6ao_C>$2NndGyF%@1{l&=;6CLB^+gLE9iE zm}~y(1^;EMCr31F$58M|E&aOmU?tsWMH zhlh8268+C^Bt2tK*IzdA<6@P464eWFcRus;{-6a7PRPj!g_kp=zLC6+wZ3Q;1vLZng3 zOJ`(Pi1dGUiHQ9Fr^iQ@6Zy3{a<#(VnY$J zEB20vg@1PPMup?~zVqMj-uwUiJkNgjtXZ>W&FV7~JkwGp#NJdUGR@FYgQ6$I1{FgqD9oih}ej-(_=Fx#AYfJnPJrE z(8#C}LnlSYW}Y9Ju3u8L*enCXISpqdBK$QddPICwL~PdfH?>dEKE>RwvDqSX^cx!& z-7ge9A}W69xTFZN+2=>59zK3hTx^bn$eTvBziI4%#4C=9%{g$?*pXu&FUaqOI~u>~Si4ILF9TQIr!6R-Gh^{ZYB{losY zqvC7Tu)lCZY>_fCIb(9g{8a`;lgc0_xa^_HmtCw(`&9pW*)B9i;^(F$Qlw-mre+$Z zWjdy32F4GFNxH=1k!i;S?RNB_*b*^+UPp3MPTCXEIWdBZDVY#^bD5Y7kvDa?;fhMR zqGCb4q?W%lOv`jk&kT%UMrLAW{+Zod#zv-#8yGbrDijx6Iw3Y_^5GU87auipOl;ZY zYKV^;F?2vg_v9TBvE|xUtzNTM-MV#ZRQc-@(XnID)XK+2rcLU&u@w?xE0&45zOs{c zMRZH-+7Yppg05CMA+|~|67&oDR8(x$`pU>Vhv;=il!KMSUIRG-*7*R`;^ z|3z{)CZD_=KOjh^UQ+(*-uP#HR9tjKtI+64 z(W5%HXqq&@b^UXI>-G;Nm0ZNhHF~2#zH`O=l|=V{NFw==8d{P+RUk!ixiS6pml zpV%H{dWCCa^r-0Ao{7KW$B&Nf6&+kw^tkY4-PR|zcbTNBzWv6kz9W40l!;9-Tk=`| zI!jFcn4aO`u}{*`eQ!LvU-;-$iAU$Vu|XscP~mEd=@m|;e{^JqxY42b(V;=nq1dRz zfj8U?2VU2j2PuzWyd0c7UPk}h@$%39lT=jSHdbU!l1$HRO%faY;NUufDj!lNCaCI| zpsN2?4@3W<9whc>=S(i4WN)a-VgHasa?~4>`m2zmV}d#yUM8kzOt0vepsq%!tC2UB z_NZ|3|7~53PO7UhHy#}uKKehdt59@IQU%2Y-8nuc=*DBq#59fR5EFFVaS5^G%fz&d z`KxbEh-niu=nVQ&2W+vrt){Xg_{XfayoS+=%hRbnYxEvG0 z%#?!<+v~*c2ROUE)L3ZNm4%VxiO#j{!j9`G$_aW zt}Dl7k_$Ryr{oT~Jh4MgxqjW&I9R~+iCs}9CUF+7U8QQR>t^AY@bYzKLhPzCF`4Jb zWQqA}S$cm=_W3b?Ep*PuT9=vr*5@>*!{5G)oRzQ^$)w(*afcI5H&9KmYv#IXKFB#mO9 zCFT0_8*}}||0LI62BrH|xOBe`m+pJv()}hO_S@@A_q(8UKmL2^rW_O>`+Z{3{*Vy+ zV{*~{6cp{{q)h&NVW3)peRErgN=kxs>Gnbt;a&C!;#y>{QCDYO*JqoK9nZBBe+!x zrA<6CUE-06H%y`Q;op5i8NxBbx2>Uw#GThY%m`%+hyAsVOm1>PXyPY%m6qJ=Lz#j- zQGG(01FpNP3i1*?WOQg~aDyMw{BOTQS%R2B_Ob>fgu-_uWg;{7OO72K7s?(41=%~D z5XuqWdHwVh${7wz%3d%!g>r>A6WjZM;L%j>V9Um&P@Z5T=F*?TQz&n+9holi&LnzJ zC||HOFBrHY)Ak!UdQfoM{Nd<-d#V#E5Ck|=_`y!qq~}?of{DSwvz;53Mk&J2c2cEG zaZ~(&vB51*s8Be)#8NICKH`SUDDqe0!BCO!zb)mWLHwYJ6bmOBG?C(oNj4ppSd~F5 zDG_WZgi0ocCbg1O;aMVdb2xsRVd1j=TSFNe+@y`^m-IL!R4Vb%TM`eQm|TN#V}ge= 
z(Up>)m?Um_ZW1aTL=Eb%OyY56!^Z_rC*#Hr2p;`~$_1N|nUjuAK0)~)Ea{FtR3RL@ zPpD!zB(Z2Jg#&IVn#$p@poX(1-gHj*cSZBRx)_L%U`A$QW@h1^4ZKQ_pP)XghBN)= zg98PW^fb6^`u)p>z9MmQ%~uWQpaI7kP=7f7f-QZVJb=M0Yazl034~PB#sJa^j`3ZVj z!*Hg9p4KQa%T0$Rw%MSkH4e5DLQN7wlX_ZklM`HS({TJY!@?Dw)YBp(1`i!EqF>?z ztx&Td_|H0v$=@qnQUi3r=E1(m)cwNUr$zXuN0_0O;iLw{Y#FA*wFuhbqG5dttg+s#k1)=ugfW%vafzT8q0ZsWKkx5usNyc+gOjQ_Xm!bN z20~pEga4zK1HmY8rpU>rKV?smqR93>OTvz9`u)YQ`(K$;@p53sK?SFTy^{B>AwLJU z-ZXM=*yZ(7rnwUM@#*E=!q(_heqGq&**{zOOW>Gw2^Yd1sGg(z)xa#rHcSlLr$_x0 zSF`^)3UmuH8?@!_;p@yb?7B`K>GJ-Mb@E~V(35*OY0BBjy)S4||I|%;{!7H`x@mY3 zpVUon3w|Xvl-}V(5*x}ejo|hmBq4N1VtD5H!)~bjKH($(UFG);ACy@6!@?ClKl1+|K7z1h2ysw7Ow83EpN4iBgJ?_OkF_X;C4A}PU<;iUS6MukaQc8yN_dHt$z zOgJp4#niK7!ppAM@MhBeU*h+5_l18gxk5p-q$U-Ym}dB~L~@gg4?+?`V-v&wwMmT& zANlW^)cEj0L1u!d6n`I2CL{*`N8?HGh~a+_$l2L77;lLm%}>!#%?PI6Q7z>qq;Qo3R8PWYEd;XxsB-o7)~6$}VB=6&k_ zjl55DlIsVx@Z+5u@<07wB8D$A{Ngn68fN_SHOzEW&~Ad&*q@J?f}?`Ze_sAc@e4Ic zvKUMJZB_P9^)k!J|Eibp@i$!8?0<xY+xf2NT*yetYflSY@Q;ADv} zw?m7Au%rZ*{6hlwT%W+b$q6+3D}kkdCUDpL7?vK52ir`3L(S{d0e+_Nfz(-;lIF z+?2*fhu^v;&RUTf!f(Zsrj^HnU6Gml4T>Hd6}(#QH+<-*L7|P|ur!G;dqbPTzjOY% z|8Fbb&B2~OSH5Eh#Kp8t2t6K-ojMdfq*~~S;Ac>)9TP%Zf?wg6cA>3_KmPXKF0?Hi z5{&oh6YmI{{MEjbeg%6IKglLPqDVIRPBGcU_WS4C+0gcXPCYp_iKqBaw}ThA!P~yj zj&Q2I!cV9N4-Ex>N3%27?iKDBap6C#85G(TY=_?khIWS|CN=Cmi9fIJ8GFNFe@(nk zhBt@h4!(!w3BF=3B|h$m9XlcPRIr&i^9~E2DtH+i+82(H_#ox!;Ma}!UC)HWf`o%x z!SIZ}KQT)91!cIo4hv^5oW`?(-V=tN3x3B9OA~x2gboDXiGM;BU%guBV6ZVPOYjw} zPYxySJ{;aXEPL=BWFUCZbR@ie!(GAAa9I1BI%F#yuGa3;>IB<9$-i!_!T+~^8S|gs z>t}V|Yz&tFiJ#zai4s3KnTxrZhk2Qg`B{JkS%`&Mghg45#aV(Sc{5A#7M5o6tMan8 z%dtEwup%q5GLxr|sA&Dnx2*@~^1 z{D7sc?RIR>TiJmf*@?+Zf-bhZvKzZIl0Ddyy?7gY^LF0BKJ3eW?9V6;;6M&yGzW7C zV>pzHg4w*?&L1+<{s|llYEN%_%xs4em=|Rcz_3ah=+NEM|q6L`8-eXBv0`)U*L;; ziD&pS&+-+%%GdZh-{3jE$+vi(Z}S4*;YGg7_xL_P;D`K(AM+D_%1iu=pYscT$*=e| zzu~w1j^FbK{>YzrnLqOif8kaB%HQ}q|KK$yFS2gBDJdW&Q!zEeYmo3KEz>bQGcbY~ znTeShUbBQhS(%ODg-iI8gE^Ut;q_Pe6JCOaKY5vt`B{JkS%`&Mghg45#aV(Sc{5A# z7M5limSs7XX9ZSdC01q?R%JC-XARb5E!Jl860WZ8daTa|Y{*7z%qDEgW^B$DY{^z^ z%{FYyc5KgE*?}F|iJjSnUD=J@8Oa{($zHsTy?HzDU?28nKlW!72XG(G!IfOa`?;EH_y8Z|LtM*;xsH!;Js;%;KE{pQ#Lax1PjCyj zavQgE2X}H8cXJQ-@<~3$eSDhFa6g~rb3DLS5U*&6jop10Q-{f07&$oGj@9-ku<$HXeAMitd#E*U*YOdq=cC-f$GDN3xS5ah32xz5ZsT_D;7;!1Ztme;KFO!Jk5BU%?&q_7 zjt6*r@X|^_&LAem;8!f^BaE4@Ay4`;E()?m-#cV@E2a?ul$X_^ABEQ z@;^DeDV6!3shIdGJJ^-Rc3P%mdS+k*Gcpr1GYhja8?!S9b21lmGY|7JAM>*S3$hRk zvj~f_7>lz6OY&xx;w>!AGAzq-EYAw8$V#ltDy+(Ctj-#&$y%(9ENM{*QLa|~k{VjSZ+mg6{{6F8BRIGIzJz&kmW(>R?o zIFqwDn{zmq^EjV(@op~QLN4NBF5x}AmrHpcmvK2)a3xpqey-*kKEMb05ZCfyuHz$I z&quj|k8vY6aWfz16WqeB+{W$P!JXX2-Q2^ye3DObAD`wk+|OtE91rjy5AiUM@F<_mn0FYydt=2^bNSNR%W=Nml7H~ALN^KD+>JG{tu`5xcr2mFv9@ne3% zPkD)-@pFE`FZmU}<~RJ7-|>6?z#sV&FY{+!;V-<(U-=t<=O4Vr#J}JQ#?PBloBx@L zshNgpnU3k1ff3BeOw7zI%*t%c&K%6iT+Gcp%*%Yt&jKvSLM+T8EXram&Jrxin^}st zur$lCEX%PxE3hIfu`;W$Dyy+NYp^D3u{P_lF6*&A8?Yf8u`!#lDVwo5Td*Ztu{GPU zE!(j@Z)FE|WG8lJ7j|Vgc4s7euqS)*HumQ2yn}t%m;KnEQ5?X59K>i2<`BkkD2H)4 zM{p!ZaWuy;mLbM5o?|(V<2iv7If;`wg$cZqQ#p;(IfFAfi?cb0b2*Rmc^B{I0xskt zF6I*6!+W`u_i-7Qa|Ks&74PS2uHgfGkPmS!ALcqf!u5QV8~7MEauYZ6aX!H<+{$g- z&K=yzUEIw*+{-8V6!-CIKEwTdme2735AqNX^9Yaf7?1OLp5RHI;%UCX7x@y;@MWIm zD}0r&@pZnzb9|F;@jTz=1-`?Je3$R>eSW|X`4K_e5-iD^S&Fx?G|R9o%dtEwup%q5GOMsEtFbz3 zuqJD#;r?upt|e^9gR@R&L{V?%+=D;%@HYUOvgExQ|cs z8Sdw^e2xcrkcW7fM|hOSc%0Ak1W)o5PxA%7$d`DAFY_#4;j4U&uk#I_Fe|e$J9986b1^sbFfa2lKMSxR3$ZYZ zuqcbMI7_f3Z)Pdp!qP0mvMk5)tiXz_#LBF~s;tK9tihVB#oDaHx~#|gY`}(W#KvsG zrfkOMY{8an#nx=Ywrt1typ;{FdMGd;Y*5`4cbmXI|kiyvkqs8-M2?yv7vi%>PWuR7}k@Ov`jk z{4dZ1No24c!Hmqr%*?{9%*O1@!JN#++|0wg%*XsJz=ABq!Ysm~EXLw2!IHe0rFaWV 
zvkc3!9Luu;E3y(RvkI%S8mqGgYqAz=vkvRB9_zCK8?q4_vk9BB8Jn{OTe1~fvklv_ z9ozF(c3?+#VrOyoFIe`;7iIX{n3A~e2IgQgfgEKjcvpI)zIgj&s7w_f* zF61IE<`Uk+d%2YNaT%9$1y^zv@8@c+;RAe-4{U62#@j@kMnt+;7Ok1X}-W0`4Z3Y zWuE0Le3h^9b-uxKe3NhSJm2O8zQc=rm+$d?e!vg;5kKZ9{FImY89(P2{E}bsYktFT z`5nLK5B!lo@iKqr75>7j{FT4)cmBa^Op)IF&y-BX)J(&)Ovm)hzzAk!CT3<9W@R>J zXAb6MF6L$)=4C$SX8{&uAr@v47G*IOX9UXiuZFh z*YE*8$cMO=4|5$K;d(yG4Sb9nxrv+kIG^AaZsj&^=ML`VF7Dd(6Mo7|{EVOT3x3J3_%*-bxBQOZ^9TOOpLm%+^9q0ARsPD~_&fjL zHKxd5{%1<2Vrr&gTBc)qW?%#}G7~d13$rpCvoi;CG8c0*5A!k~^Roa8vJeZi2#c~9 zi?akv@@AIeEiBD4EX#5%&kC%_O03K(tjcPv&Kj)ATCB}Ftjl_=&jxJBMr_O`Y|3VA z&K7LRR&32SY|D0R&s*7n9odPU*@a!%jolf^9_-0pyp6qiJMUm0_GLfzXA}o;AO|s; zgE@pT9LixF&Ji5RQ5?-NjAe*%jOSR6<9JTsL{8#lPGJJ?N*|_wZgW<$YYn}M`3XPeC4R=w`31k^ zSNxja@LPVz@A(6N$3qHvJo4z37fJRo3jO5vK3pi4coFE+w)d-U`KXhXLey% zc4K!&vIl#z7jI*4-p)JNhkeCzyvTR?9^dB&{E#2*U*YOdq=cC-f z$GDN3xS5ah32xz5ZsT_D;7;!1Ztme;KFO!Jk5BU%?&q_7jt6*r@X|^ z_&LAem;8!f^BaE4@Ay4`;E()?m-#cV@E2a?ul$X_^ABEQ;{ShDumrd%Q_?yhB~vjq z(=aX5F+DRdf*F~KnVE%InT^?*gE^UtxtWJ~nUDEdfCX8Ig;|6}S&YS5f+cw~OYs(# zW*L@cIhJPyR%9hsW))UtHCAU0)?_W#W*ydLJ=SLfHe@3P!Wa(aFb?Mk zj^rqg<`~8@#5l%tEXQ#?CvYMsaWbbcfp>B$r*S%Ga3*JQHs^3I=W#yo;@w=pgK0eK7xS!ARIUe9a9^zph;ZYvraX!xzJjqi$%@_D0U*Z|Q%(Hxj zuktm%&Nq0DZ}Kgk=i9u%cX*NS@;$!K5BMQJ;>Y}ipYjqvU-Bz{&2RWEzvK7( zfj{ymUgpod!e4lmzw$T!&OdmKDKfkNXG*4GYNlaYrek_$U<5NV6EialvoagAGY4}r z7jrWY^D-avvj7XS5DT*ii?SGtvjj`>W|rbDEX^`3%W^Ew3arRVtjsE`%4)368m!4$ ztj#*C%X+NO25iViY|JKX%4TfN7Hr8@Y|S=o%XVzfTiJmf*@>Omgk zkJTYksy`2&CCPrS^Zd4<35Du3l~{GEUB8dGF3 z|1%|1F*VaLEz>bQGcbY~nTeU1g;|-6*_nemnTxrZhk2Qg`B{JkS%`&Mghg45#aV(S zc{5A#7M5limSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41`X9G55BQ|CeHf1w5XA8Du zE4F4Ewq-lE=dJ9(j_kzF?82_>#_o({5B6j)-p1a%op-Pg`?4SVGl~N^kb@Y_!5qRE z4&^Wo=LnAED30bB#xle>#&aykaXcq*A}4V&r!awcaw?~BI%jYuXK^;?a4zR@KJVh) zT)>4~#Kl~~dw4IG@;)x(a<1S?uHyY%%{6?05Aq?d<-=UZN4TDkaswaZMsDI}KF%k& zg$3qHvJo4z37fJRo3jO5vK3pi4coFE+w)d-U`KXhXLey%c4K!& zvIl#z7jI*4-p)JNhkeCz zyvTR?9^dB&{E#2*U*YOdq=cC-f$GDN3 zxS5ah32xz5ZsT_D;7;!1Ztme;KFO!Jk5BU%?&q_7jt6*r@X|^_&LAe zm;8!f^BaE4@Ay4`;E()?m-#cV@E2a?ul$X_^ABEQitOfprerFnW*VktI;Lj^Mld5Y zF*CC;E3+{>b1)}!F*oxtFY_@!3$P#yu`r9UD2uT;ORywwW+~pn(k#QWEXVS!z>2KI z%B;ewtj6lB!J4ea+N{I6tjGFnz=mwZ#%#i-Y{uqn!Io^r)@;MJY{&Mzl^xiTo!FUO z*p=PbossOpp6tcj*qgWW4)$SR_G5oWaR3K$5TiMmLm0!M9LC`s!I2!r(Hz5Ah8V|q zj^#Lx=LAmVBu?fOCh$&9l-G-}wixF+~pZKT|RlQ!@?IG9A-110$G`nV6Ybn3dU>ojI73xtN=In3wsOp9NTu zg; z@iTtTFZdMm%+4Il$z06MJj}~{%+CTW$U-d4A}q>cEY1=v$(vba@izA6?Yx71*q8m-pHUpZfgHqW4(1TXa43gyI7e_KM{zXA zFqR?4F`i>Nj^jCj6FG^KIfV(llT$g3(>a4PIg7J7hjTfP^LZEV<^nF{A};0<-otyj zl=pENmvaSIaux6AYOdh}e2@=uEg$AOKEm~UlpFXMH*ym<^Km}GE!@g&+|C``$z9yd zJ>1JD`4so@X+FdKe3sAg01xsI5Az6*@)(cvd7j`&p5ke~z!&)v&+uiQw@qK>45BU*4<|q7=m-rb!=NJ5vU-4^x!*BT=zvmD9kw5V= zf94hb!mIq1zwvke!D~#B%lyxjOvThp!?aAt^vu8rW@IL2W)@~;HfCoI=43ABW*+8c zKIUfu7Gxn7W)T);F&1YDmgLPW#amdKWmuNwSe_MFk(F4PRalkPSe-RkleJizby%16 zSf35pkd4@wP1uyp*qklclC9X9ZP=FW*q*nt13R)4JF^SBvKzZIl0Ddyy?7gY^LF0B zKJ3eW?9V6;;6M&yGzW7CV>pzHg4w*?&L1+<{s|llYEN%_%xs4em=|Rcz_3ah=+NE zM|q6L`8-eXBv0`)U*L;;iD&pS&+-+%%GdZh-{3jE$+vi(Z}S4*;YGg7_xL_P;D`K( zAM+D_%1iu=pYscT$*=e|zu~w1j^FbK{>YzrnLqOif8kaB%HQ}q|KK&I$Zh^-N~U6J zreRv9V|r#_1T!)dGcyabG8?lq2XitPb2AU~G9UA^01L7Z3$qA|vKWiA1WWQ}mf|fe z%`z;@axBjZtjJ2N%qpzPYOKy0tjSue%{r{hdaTa|Y{*7z%qDEgW^B$DY{^z^%{FYy zc5KgE*?}F|iJjSnUD=J@8Oa{($zHsTy?HzDU?28nKlW!72XG(G!IfOa`?;EH_y8Z|LtM*;xsH!;Js;%;KE{pQ#Lax1PjCyjavQgE z2X}H8cXJQ-@<~3$eSDhFa6g~rb3DLS5 zU*&6jop10Q-{f07&$oGj@9-ku<$HXeAMitd#EE>b1cVkJST7>Cvh^T zFoAb+DyMNeXK*HGaW?00F6VJR@8aEDz=d4I#azOBcrTaoJ}%>OuHZ_p;{9CBHGF^% 
z@*%F}!(7KlxSo%410Ul?ZsKM>&L_BqTe*$fxq~~oi@Ujpd-)`v;yymjXSkow@;M&h zK_22^9^p|Q<8eOE6FkXNJk1yQB46SezRa_Hg|G58zRovzj&JfUp6A=Vz;}3&@A5sq z&ky(^KjO#ygrD*fKjY{8f?x70e$8+AEx+UU{DD96Ctl{yyux32mA~>g{?0#mjVbb) z|Cy4hn3`#rmg$(D85qHg%*4#h!mP~3?99QO%*EWy!@SJL{4BtNEX2Yr!lEq3;w-_E zyqTqV3rn*M%d#BHvjQu!5-YO`tFjuavj%Ij7HhK(>#`o}vjH2j5gW4!o3a_3vjtnS z6; z{FdMGd;Y*5`4cbmXI|kiyvkqs8-M2?yv7vy%>PWuR7}k@Ov`jk&kT%UMrLAWW?@!l zV|M0XPUd26=3!puV}2H3K^9_R7GY5qV{w*XN#4v-yoIG%hGkifLMGrO=WyRkbX z*@HdVi?^{iZ|5EC!@lgt{*2-P4&)$4b1;W6hC?}w!#RQ@If|n>hOrDWj`1AJaU9PH zoXAO>%qdLZot(;PoX#1X$yuDuIh@ORoX@*>Hy3ar7jZF{@E+dFrM!>JxST7vlB;+> zS91*?;DdaKYxywO@e!`)qujv9xRIN z=d*l{2Y8T&c$i0cl*f3S&+`ON@)S?=1-{6ac!n?YEMMWPe2uU34W8qhe2eG#HZSlU zUgW!ckMHvXe#npbF+bs_yu{D=IlthS{EA=m8-C00_&tB%kNk<3`7^KZ7hdJB{Eff! z4_;%6{N{hAWGbd+8m47Bre_95Fe5WDGqW%&voSk!Feh^{H}fzr^D#dQupkSuFpID# zi?KLMuq1D0Dc-`;EW@%a$MUSeimb%Stir0S#_FuWnykgzti!sj$NFr*hHS*fY{I5& z#^!9nmTbk=Y{Rx}$M(FH9oUhb*qL3}mEG8#k?g^q?8V#Io44~0_F-T4V}C|*00(jq zqdAyE7{j3)#^D^nksQU*9K%?K7{_>y$UG*0IX&g3l4<{ZxD zJkIA`yqgQSkc+sOOL!0O-Y%Q^HFZ#W8BD1 z+|0-M1h;T2w{bgna3^V$^He++PU`w`QYqnuqwqtwV$`0(vPVCGs z?8{WxIDi8=h|wI(A&lWr4&!i+;7E?*XpUhlLyTiQ z$8sFUa{?!F5+`#C6L=@5avG;|24`{>XLAncavtaNF5b-rT*yUS%q6^s_i`!k<1#Mi z3a;cT-p|!s!w2{vAL3d*%yoQ(>-i`*@G)-WCT`~Ae1coJmD{+TJGhg(xSM;pmrwF3 z?&H&ZhWq&}pW^`@wJUf_$J@t zdA`jHe1{kL|FBe#QFvT!!-mK99ox2T+qP}nwrx9UlOj%=q>R&sjcq&mp6^}f{;`j} zu75MLW}nyaI^MvWcnfdi9lVS8@IF4khxiB|;}d*}&+s|Ez?b+6U*j8mi|_C~e!!3T z2|wc({EFXDNKF4xLKziQQ9~UKG|@sE9dyw{9|H_A!WhQExEK%PV**Twi7+uH!K9cB zlVb`@iK#F(roptB4%1@>%!rvVGiJf8m<_XI4$O(UFgNDGyqFL3V*xCPg|ILd!J=3U zi(?5aiKVbKmcg=E4$ET&tcaDcGFHK=SPiRV4XlZ^ur}7gx>yhEV*_l6jj%B`!KT;@ zn_~-XiLJ0Tw!ya84%=e~?1-JPGj_qQ*bTd55A2D(us8O>zSs}@;{Y6pgK#ho!J#+| zhvNtwiKB2dj=`}w4#(pJoQRWfGETv%I1Q)c44jFxa5m1txi}B!;{sfWi*PY6!KJti zm*WatiK}omuEDjq4%g!b+=!cSGj74HxDB`C4%~^m@H_k-cjF%X0e{4w@MqkM`*1%V zz=L=Q591O11%Jh(cnpu@Z}>a@fq&v(_&1)wlXwbG;~6}Q=kPrKga6_Myoi_ZGG4)} z_#a-w>v#ii;w`+5cknLW!~6IEAL1i?j8E_>KEvnu0$<`Qe2s7LExyC|_yIrSC;W_G z@GE{pAqo9Q31w7JMGbW{&_oMubkIc)eGD+f2xAxr<6=CFj|ng#Cc?y+1e0PiOpYlq zC8omEmta2uj}5RPHp0f(1e;q9k zCAPxW*aq8TJ8X{~up@TD&e#RJVmIuLJ+LSC!rs^i`(i)rj{|TZ4#L4W1c%}<9F8M! 
zB#y$-I0nb!I2?}?a3W5^$v6e4;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ1efA6T#hSn zC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!td~V+>Lwi2mBF#!k=+3?!*0f01x6J zJd8*17yK2E;xRmqzv1ur2mXnF;oo=yPvR*&jc4#Ip2PF_5B`f6@FHHq%XkH^;(vGz zuj388iMQ}J-od+g5AWjxe29Js)Gh-IairFwb=D?ho3v**0%!~OjKNi4(SO^Pa5iE+uusD{$l2{5$ zV;L-q<*+*1(!r3u|K?tc&%qJ~qIH*a#bA6KsmjusOECme>kg zV;gLX?XW#|z>e4nJ7X8@irug~_Q0Ol3wvW9?2G-dKMufwI0y&h5FCoba5#>@kvIxR z;}{%^<8VAqz==2sC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8xCj^H5?qSQa5=8PmADF5 z;~HFx>u^18z>T;GH{%xEira8I?!cY63%|qfaX0S4AMi)~34g}DxDWT^0X&F@@Gu_1 zU+`BvipTIc{)WHfANVK!g@5A-Jc+09G@ik;cn;6wKlm?Rz>9bZFXI)wivQs?ypA{U zCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00!;vgK1LvSb#!{ImrN8%_Pjbm^u zj>GXd0Vm=loQzX&Do(@cI0I+mES!yVa4ycn`M3ZV;v!s(OK>SJ!{xXFSK=yMjcaf% zuEX`X0XO0%+>BdrD{jN>xC3|MF8mI^$KALGf50E{C;S=r;y&Du2k;;s!ozq3f5Bhz zC?3P(_#6I?f8d|^7ygYW@FbqX(|88Y;yFBz|KPuP0Wabuyo^`yD*lJp@H*bWn|KRv z;~l(<_wYVGz=!wCN zR8d164K&e08y$4fLmvYSF~S(e!MGR?<6{C$h>0*UCc&hb43lFDOo^#5HKxI|m=4op z2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4EQzJCG?u}#SPsi$ z1+0jburgM`s#p!HV-2i{wXinU!Ma!v>th3Kh>fr@Ho>OY44Y#MY>BO~HMYUF*bduc z2keNQurqeSuGkH`V-M_!y|6d-!M@lJ`{Mu{h=Xu24#A-~42R*ZsI1b0- z1e}PIa57H8sW=U%;|!dMvv4-f!MQjO=i>rgh>LJBF2SX^442~yT#2i2HLk(6xDMCj z2Hc37a5HYft+)-h;||=3yYM^w9(Us&`~iQ&pYUhgi~Ddt9>9Zm2oK{C`~`o-qj(ID z<8Syo{(*nuU-&nkz>|0iPvaRpi|6n>{)7MG1-yut@G@S(tN0&Y!|QkhZ{jVyjd$=a z-oyL&03YHbe2h=_xJ%n;wSu!U+^n_Lm>tIM+s$AP(=-O zG|)r~ZFJB@4}A! z!pc|$t70{*jy13**23CY2kT-ztd9+_AvVIs*aVwmGi;76uqC#_*4PHyVmoY)9k3&I z!p_(QyJ9!&jyrj=$mW z_y_)pf8pPF0#D*8JdJ1YES|&j_z(Vz7w{rp!pnFCui}4r4X@)3yotB)Hr~Ozcn|O6 z1AK^&@G(BYr}zw?;|qL=ukba#!MFGh-{S}Th@bE?e!;K!4TY4#|1A+Elu85)v!9&z?xVKYhxX(i}kQRHo%712peM)Y>LgWIkv!-*a}-?8*Gd1uswFbj@Su1 zV;Ag--LO0Mz@FF(dt)E$i~X=a4#0sp2nXX39E!tmIF7)PI0{GO7#xe^a6C@Hi8u)- z;}o2V({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn z;}+bC+i*MXz@4}Yzr*iwH}1h7@JIX!f5yGI5BK8%Jcx(zFdo5Q@K-#F$M87*hQH$< z_$U5_f8z-}iKp;1p24$t4$tF1_%B|-i+Bky;}yJ$|KT;fjyLco-oo2>2k+uNypIp? zAwI&#_ynKgGklIO@Fl*&*Z2nC;yZkgAMhi7!q4~xzv4F(Qqg~uP(}q+)KEtQO|;NP z2VL~g#{ff&Fotn3F2=+7m;e)EB20`)FexU(SI818ZU}tc`WBF4n{P*Z>=1BW#RKuqigf=GX#TVk>NoZLlr2!}iz#J7Op7j9suR zcEj%21AAgG?2Ub}FZRR!H~D!}YiUH{vGTj9YLk zZo}=k19##s{0_gz-M9yTz#s7^{2BM+KHQH7@E{(-!*~RL!C&zx9>e4K8~%=e;Gg&x z{*5Q_B%Z?4cm~hnIXsX5;J%!rvVGiJf8m<_XI z4$O(UFgNDGyqFL3V*xCPg|ILd!J=3Ui(?5aiKVbKmcg=E4$ET&tcaDcGFHK=SPiRV z4XlZ^ur}7gx>yhEV*_l6jj%B`!KT;@n_~-XiLJ0Tw!ya84%=e~?1-JPGj_qQ*bTd5 z5A2D(us8O>zSs}@;{Y6pgK#ho!J#+|hvNtwiKB2dj=`}w4#(pJoQRWfGETv%I1Q)c z44jFxa5m1txi}B!;{sfWi*PY6!KJtim*WatiK}omuEDjq4%g!b+=!cSGj74HxDB`C z4%~^m@H_k-cjF%X0e{4w@MqkM`*1%Vz=L=Q591O11%Jh(cnpu@Z}>a@fq&v(_&1)w zlXwbG;~6}Q=kPrKga6_Myoi_ZGG4)}_#a-w>v#ii;w`+5cknLW!~6IEAL1i?j8E_> zKEvnu0$<`Qe2s7LExyC|_yIrSC;W_G@GE{pAr1XU31w7JMGbW{&_oMubkIc)eGD+f z2xAxr<6=CFj|ng#Cc?y+1e0PiOpYlqC8omEmta2uj}5RPHp0f(1e;q9kCAPxW*aq8TJ8X{~up@TD&e#RJVmIuLJ+LSC z!rs^i`(i)rj{|TZ4#L4W1c%}<9F8M!B#y$-I0nb!I2?}?a3W5^$v6e4;xwF&GjJx( z!r3?n=i)q^j|*@iF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN z!td~V+>Lwi2mBF#!k=+3?!*0f01x6JJd8*17yK2E;xRmqzv1ur2mXnF;oo=yPvR*& zjc4#Ip2PF_5B`f6@FHHq%XkH^;(vGzuj388iMQ}J-od+g5AWjxe29Js)Gh-IairFwb=D?ho3v**0 z%!~OjKNi4(SO^Pa5iE+uusD{$l2{5$V;L-q<*+*1(!r3u|K? 
ztc&%qJ~qIH*a#bA6KsmjusOECme>kgV;gLX?XW#|z>e4nJ7X8@irug~_Q0Ol3wvW9 z?2G-dKMufwI0y&h5FCoba5#>@kvIxR;}{%^<8VAqz==2sC*u^Hiqmj9&cK;C3uogT zoQv~tJ}$t8xCj^H5?qSQa5=8PmADF5;~HFx>u^18z>T;GH{%xEira8I?!cY63%|qf zaX0S4AMi)~34g}DxDWT^0X&F@@Gu_1U+`BvipTIc{)WHfANVK!g@5A-Jc+09G@ik; zcn;6wKlm?Rz>9bZFXI)wivQs?ypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4 z!q@l)-{L!bk00!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQzX&Do(@cI0I+mES!yVa4ycn z`M3ZV;v!s(OK>SJ!{xXFSK=yMjcaf%uEX`X0XO0%+>BdrD{jN>xC3|MF8mI^$KALG zf50E{C;S=r;y&Du2k;;s!ozq3f5BhzC?3P(_#6I?f8d|^7ygYW@FbqX(|88Y;yFBz z|KPuP0Wabuyo^`yD*lJp@H*bWn|KRv;~l(<_wYVGz=!wCNR8d164K&e08y$4fLm&IJ>X5Kn-*L5i9Tf$k zQ-c3(6aF{ol<~s|M-Ec8FhB>X@stP+9!n{P_RP z<+@^#YSpS$`u4x|e_LgtPpgS_QdIl@4<`zeAPb713Ywq`hF}VoU<;1m3ZCE#fe;Fj z5EJ4EafNt7d?A65P)H;s7Lo`_g=9i8H9{NCLyzsMaU{- z6S50Agq%VyA-9l6$SdR%@(TrofI)5ohC(BuvCu?lDl`+C3oV3}LMx%Q&_-x0 zv=iD39fXcTC!w>@Md&JY6S@mMgq}h#p|{XS=qvOS`U?Yufx;kRurNdzDhv~b3nPS) z!YE<1Fh&?Fj1$HS6NHJvBw?~JMVKl~6Q&C@gqgxDVYVDgMYt;bCtMS*3pa$D!Y$#pa7VZ++!O8#4}^!pBjK^|M0hGZ z6P^n%gqOlA;kEEacq_aU-U}aukHRP6v+zauDt!Cj=NCmulto2UMNQO2Lo`K8v_(gB zMNjm_Kn%r5jEQl?xMDmpzL-EvC?*mUi%G+&X~eW*Ix)SNLCh#- z5;Kcg#H?aAF}s*U%qiv)bBlSzykb5vzgR#lC>9b6i$%nuVllC}SVAl*mJ&;gWyG># zIkCK0L98fN5-W>U#HwO7vAS48tSQzKYm0Tnx?(-CzSux)C^ixsi%rC)Vl%P1*g|Y6 zwh~*5ZN#=>JF&gkLF_1Y5<81s#I9mDvAftq>?!sVdy9RA5;u!m#I52sal5!f+$ru7zZ1U~ zcZ++(AH*NUpTwWVz2ZJ`zj#1AC>|0Ii$}y?#9zgu;xX~K_?!5<_=otX_?P&%ctSiW zo)S-sXT-DOIq|&skNB^6LA)ql5-*Ea#H-?e;x+NQctgA?-V$$%cf`BmJ@LNyKzt}Z z5+93C#HZpj@wxayd?~&XUyEf*eSL!GAmj*}! zr9skQX^1pb8YT^wMo1&2QPOB>j5JmnCykdTNE4+=(qw6hG*y}=O_yd!Go@M5Y-x@( zSDGiymljA1rA5+WX^FH{S|%-*R!A$QRnlr{jkH!;C#{z@NE@Y1(q?Ikv{l+BZI^aP zJEdLHchdLLZfTG7gY={Hlk~H+SK24-mkvk=r9;wT>4@};^s97KIwl>Lev^Ke{*eBZ z{*wNdPDm%EQ_^YajC58yC!Lr6k^Yq~NEf9`(q-w2bXEFKx+YzhZb&z!TheXmj&xVL zC*7AGNDrk)(qrj~^i+B#J(pfcFQr$~Yw3;jR(dDBmp(`zrBBjl>5KGL`u4x8Uz8ij63OS{m zN=_}Ok<-fQ~an{r<_a9E$5N*%K7B{asj!ZTu3e~7mT(Uard&&|E!UCj%Jt;>as#=c z+(>RLH<6pl&E)2C3%RA-N^UK;k=x4cGBMD zraViYEzgnX%Jby;@&b9GyhvUwFOiqZ%jD(q3VEfxN?t9mk=M%W%KPN~@&Wmvd`LblACZ5Nf0d8Q$K>Pk zZ}RW*AM&5_U-IAb3HhXaNg zOTI1Nk?+d)Sf04h+ z-~JbVQIQl`Q503t6kRbCQ?V3VaTHha6kiFHP>Gb75=V)v#8cuc36z9NA|;;$N-ib0l1Is_amwN-d?fQb(z) z)Kls!4U~pTBc-v@L}{utQ<^I+l$J^>rM1#VX{)qT+AAHDj!Gw`v(iQBs&rGjD?OB+ zN-w3i(nsm5^i%pP1C)WvAZ4&JL>a0KQ-&)el#$9PWwbIz8LNy_#w!z)iOM8pvNA=P zs!UU+D>Iat$}DBJGDn%K%v0to3zUV*B4x3%L|LjVQ{IqD2b6=#A?2`gMEOPe zRXM60Q;sXYDZeX!D1Rz{DSs;`l#|LS<+O4}Ijfvg&MW^Y|0)-hi^?VCvT{Yas{E&1 zQ?4sFl$**e<+gH1xvSh$?kf+Jhsq=6vGPQDsytJkD=(Cn$}8oy@X)m1&!R|7RvBQ>VRQRAxd)c9%wHKCeF zO{^wSld8$oK@M+0^W64mGEmOUk%c;%W)Cq*_WXt(H;Cs^!%3Y6Z2TT1l;}R#B^})zs>04Yj6P zORcTeQR}Mp)cR@zwV~QbZLBs?o2t#!=4uPIrP@kut+r9ys_oSFY6rEW+DYxKc2T>k z-PG=C54ES-OYN=pQTwX>)c)!Kb)Y&(9jp#fhpNNW;pzx=q&i9+t&UO0s^ir0>I8M7 zI!T?ZPEn_-)70te40WbDOP#IGQRk}j)cNWHb)mXQU92uqm#WLuIQYAx=G!vZc(?Y+tlsq4t1xxOZ`s$Ufr$kQGZZ>RDV)`R`;s=)cxuK^`Lr4 zJ**y4e^Gx`kE+MiObng>IL*@{lrg}@gt=>`Zs`u3U>I3zm`bd4OK2e{l&(!DY3-zV?N`0-qQQxZX z)c5KK^`rVp{j7dbzpCF`N0zScl%s5R0WYfZGKS~IP=) z(mHEhw60n=t-IDk>#6n9dTV{OzFI%6zcxS{s14EvYeTf5+AwXnHbNVzjnYPIW3;i_ zIBmQ(L7S*e(k5$Dw5i%OZMrr?o2kvxW@~e_x!OE!zP3PHs4dbKYfH4H+A?jqwnAH} ztbw5{4UZM(KZ+o|o+zSF+fc58dIAG9B}pR}K~z1lu) zzji=7s2$P{Ye%$Sv|qKO+A;08_M7&*_J{VT_Lugzc0xO;ozhNgXSB20IqkgmkM^&2 zLA$73(k^RPw5!^G+BNOEc0;?V-O_GrceK0OJ?*~sKzpb?(jIG1w5QrL?YZ_sd#SzB zUTbf(x7s`Hz4k%-sD08tYhSdl+BZ$mMP1TmUC~ut({iS)#J5bdmXdLBKmo=?xO7tjmph4jLD5xuBhOfRmN&`av2^wN46y{ukNFRxe7E9#Z> z%6b*Os$Na6uGi3O>b3ORdL6y4UQe&DH_#jEjr7KP6TPY4OmD8Y&|B)Q^wxSCy{+C( zZ?AXIJL;YE&UzQUtKLoTuJ_P;>b>;ddLO;7-cRqZ56}ndgY?1r5PhgVOdqa~&`0W{ z^wIhleXKrCAFof)C+d^*$@&z1sya+CO`W$_(K2M*oFVGk2i}c0%5`C$@ 
zOkb|A&{yiK^ws(reXYJuU$1Y_H|m@8&H5I7tG-R&uJ6!y>bvys^zZfE`X2oU{YU*L z{bzlzzE9t;AJ7l#hxEhx5&akaSN*7dOh2ywrvI-0q5rA>rT?v;&`;{8^wato{j7dY zKd=9z|EpinFY1@{%lZ}ls{WsTO~0<+&~NIu^xOI!{jPpbzpp>gAL@_v$NCffss2oV zuD{S<>aXVyK$B1jhGvXTwjD$uaBe9XhNNOZAk{c}Mkk}Q(Z%R$bThgeJ&c}4FQd27$LMSHGx{3?jDf}=W3VyA7-|eN zh8rV{k;W)vv@ymQYm76-8xxF)#w261F~yi_Of#k%GmM$WEMvAY$Czu(Gv*r$jD^M` zW3jQsSZXXYmK!UKmBuP#wXw!nYpgTY8yk#`#wKI4vBlVGY%{hSJB*#iF5^4ndt5K`H9gZe12Z%uGiJsyCE(I z1~a3X$;@nKF|(T4%zeh<`ep;Oq1ni6Y&J2Qn$67S zW(%{W*~)BfwlUk9?acOO2eYHu$?R-)F}s@G%}~cj`&*4$26Lmi$=qyiF}IrA% z{LcK|+->eLe=vVEe=>hI_nQ06{pJDlpn1qVY#uRxF@H6Wn#auJ=5OZj<{##t=3nOD z<_YtpdCEL(o-xmw=gjlwKjy#Y1@oeL$-Hb{F|V5cnb*wg<_+_vdCR#&+@In3a!YBS#hkmRy-@dmB319C9)D*Nvxz+GAp^2!b)kSvQk@V zth81-E4`J$%4lV>GFw@!tX4KFyOqPrY2~tVTY0R!Rz54gRlq7}6|xFjMXaJ$F{`*$ z!YXN%vPxTJtg==)tGrdgs%TZRDqB^os#Z0tx>dufY1Oi7TXn3uRz0h})xc_KHL@C8 zO{}I?Gpo7P!fI)?vRYehthQD=tG(61>S%ScI$K?=u2wgzyVb+$Y4x&tTYap)RzIu1 zHNYBZ4YCGXL#(0JFl)Fq!WwCfvPN5Dtg+TOYrHkVnrKb3CR+GuUEHd|Y)t=2Yc zyS2mGY3;JUv%a@>TYIb@tRJnPte>sD);?>$b-+4k9kLEvN3370U#+9oG3&VXoAtZ( zhxMoRm-V-G!a8Z4vQArPth3fR>%8@k^{;iox@cXpE?ZZutJZ(kHS4-{!@6nRvTj>< zth?4d>%R5CdT2ee9$QbWr`9v;x%I+&X}z*uTW_ql);sIH^}+gReX>4VU#zdzH%qWZ zTe4+au~l2Mb=$B_+p=xjv0dAt)0$JZ)dPG+L`Rkb{0FUoz2c}=dg3yx$N9_9y_m{&(3cbunXFS?80^t zyQp2vE^e2wOWLLE(smiUtX`*TiUJc)^;1at=-OUZ+Eae+MVpqb{D&=-OcW9_pp1~z3kq0 zAG@#J&+cyzum{?M?7{XBd#F9k9&V4YN7|$8(e@a7tUb;iZ%?o%+LP?b_7r=nJ1`) zK5qYJ|8DXxU797!$9NAGE)zKW?F&xve9NTdm*YOLhcLJ1Lx$PAVt0lg3Hwq;t|c8JvtxCMUC##mVYqbFw=*oSaTBC%2Qw$?N2E@;e2b zf=(f)uv5e->J)Q|J0+ZwPAR9fQ^qOllyk~E6`YDrC8x4e#i{C4bE-QvoSIH8r?yka zsq55p>N^dbhE5}=vD3t9>NInjJ1v}+PAjLi)5dA*v~$`!9h{C%C#SR1#p&vFbGkb{ zoSsfEr?=C`>Fe}!`a1)hfzBXjurtIN>I`#+J0qNt&M0TJGsYR~jC0026P$_8BxkZS z#hL0%bEZ2poSDunXSOrPnd{7R<~s|Vh0Y>pv9rWk>MV1XJ1d-(&MIfMv&LELtaH{o z8=Q^KCTFv=#o6j?bGAD>oSn`t=R4Kt>9JHI)U6#ChsGbDlddoR`ik=e6_3dF#A$-a8+h zkIpCOv-8FI>U?tqS9B#;b`@83HCJ~H*K{q{b{*GsJ=b>wH*_O6=Eia3y7AojZUQ%< zo5)S~0P>r<=>o?dEaw zy7}DvZUMKTTgWZ!7IBNZ#oXd<3AdzM$}R1dam%{p-12S(x1w9gt?X8DtGd
z-BdTz&2Op$29-@cpVS2b8p-1XbdbA#+$LeuPdRCo}#DfX?nVzp=at@dbXaU=jwTS zzFwdg>P337UZR)kWqP?@p;zivdbM7o*Xnh8z22ZV>P>pH-lDhaZF;-jp?B(Cdbi%A z_v(FmzdoQ3>O=alKBAB6WBRy0p-<{l`m{cy&+2pfyuP3>>Pz~vzM`+{Yx=srp>OJ2 z`nJBK@9KN{zJ8z|>PPyqexjf1XZpE*pQDN!{-VF?Z~D9b zp?~UM`nUe0|LO=~5yK*dMGlJ+7BwtdSoE+MVKKvk!h*v>!a~Dhg~bm0_h{t*mIeMF zi>Cjco%`Pv!-C=2|2sbz;r||0{^yF_zx{Ix@!$SAJo<0{+z$M=|KE;R5Wg3UAsC9W zFgC`)e@-y}TRe=92{0ih!o>LB4b7x{Cqn~Gv{0ap5*>8WLxnzS48#Anv{LY$5>sJn zOoM4L9j3<&m=QB!X3Tp5^R>vAx6Ki2@tb=v29@fVO*bp0GV{C#= zu^BeU7T6M7VQXxIZLuA;#}3#LJ7H(+f?cs2cE=vr6MJEA?1O!=ABJOp9DoCH5Dvy6 zI24EBa2$anaTJcmF*p{-;dq>Y6LAtw#wj=zr{Q#*firOy&c-=77w6%8T!0I45iZ6h zxD=P+a$JEcaTTt{HMkbn;d@fE(tH~1Fc;d}gmAMq1@#xM94zu|ZMfj{vV{>DG}7yoJeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCk zu@=_GI#?I$VSQ|X4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay6L!Wf7}zfg)D631 z4-D+Z2kM2rF)&jUs4w=zaO{r*a3BuC!8inm;xHVJBXA^+!qGSe$Kp5~j}verPQuAJ z1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G z1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00F!wSOQC8 zU{N7ZX)J?fu^g7izyM;Pz%E3fN>~{KyN!XWVl}LeHLxbu!rE8|>ta2uj}5RPHp0f( z1e;t7=sX#5U6}HAU*cRJid+dN6u@iR2F4z?V`xt?`V_*p*P)`gjj|J+DeXuX~ z!*J}618^V?!ofHMhvG0Cjw5g+j>6G62FKz!9FG%lB2L1|I0dKTG@Onza3;>e**FL1 z;yj#>3veMW!o|1*m*O&9jw^5_uEN#02G`;`T#p-YBW}XYxCOW3Hr$Roa3}7<-M9z$ z;y&Du2k;;s!ozq3kK!>rjwkRWp2E|32G8O-JdYRfB3{DFcm=QGHN1{D@Fw2E+js}> z;yt{N5AY#A!pHaopW-uojxX>fzQWh|2H)a4e2*XSBYwiq_yxb>H~fx2@F)Jl-}ndr z;y;5j{}vGgqw0YIlj?z@U{s8Tfi3nxF)$_uVK9bZC8WLxnzS48!D@0#jltOpR$UEvCctm;p0lCJan{2Fik2 zF&hR}%md}XoR|x9V;;yqV zBX+{h*af>{H|&mq{l7pxu^0BnKG+xgVL0~30XPr`;b0tsLva`m#}POZN8xB3gJW?V zj>ic&5hvkfoPtwv8cxRcnl zHU@^<1I5L77#|a0LQI5-F$pHcWN4s?77DacqJu7asL)4^VVE3KU`kAdsWAp5^R>vAx6Ki2@tb=v29@fVO*bp0GV{C#=u^BeU7T6M7VQXxI zZLuA;#}3#LJ7H(+f?cs2cE=vr6MJEA?1O!=ABJOp9DoCH5Dvy6I24EBa2$anaTJcm zF*p{-;dq>Y6LAtw#wj=zr{Q#*firOy&c-=77w6%8T!0I45iZ6hxD=P+a$JEcaTTt{ zHMkbn;d@fE(t zH~1Fc;d}gmAMq1@#xM94zu|ZMfj{vV{>DG}7ylWP{kMo1ST7G08KYoSjE2!MFlrts zCI(?JhF~ZLw$1~^#yA)k<6(RZOrQr!h>0*UCc&hb3=K5VLV-3)bkIc)75b<#43lFD zOo^#5HKxI|m=4op2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4 zEQzJCG?u}#SPla#?tv;`MXZE@x%WU-uqsx=>R1D7VlAwVb+9hh!}{0&8)74Dj7_j9 zHpAxF0$XA$Y>k0g@<45|9k#~~*bzHnXY7Jqu^V>B9@rCmVQ&l!rw8hb{V*K+;{Y6p zgK#ho!J#+|hvNtwiKB2dj=`}w4#(pJoQRWfGETv%I1Q)c44jFxa5m1txi}B!;{sfW zi*PY6!KJtim*WatiK}omuEDjq4%g!b+=!cSGj74HxDB`C4%~^ma5wJ3y|@qe;{iN~ zhwv~S!J~K#kK+kEiKp;1p24$t4$tESyoi_ZGG4)}cnz=P4ZMlB@HXDTyLb=p;{$w% zkMJ=*!Ke5PpW_RBiLdZAzQMQn4&UPk{D`0MGk(FZ_zl0~5B!P0@HhU!zxdCT{J%xS zNEjKTU{s8T(J=ct8Ud)I2u>cmtLRc7! 
zU{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG2G+z{SR3nLU95-ou>m&3M%Wme zU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cO}p7vo`kOn?b75hlhYm=u$tfhJlg&_;<4y6B-oA2o(y za!i3KF%_o9G?*6CVS3Df88H)P#w?f>vtf43fjKc3=Egjj7xQ6$EPw^E5EjNFSQLw4 zaV&u)u@siZGFTSNVR@{86|oXl#wu79t6_Dlfi{5Fg=Ve1cE$ z89v7s_!3{?YkY%m@g2U$5BL#3;b;7UU-27$#~=6;f8lTZgMTqX4Bmf?gpn}{M#X3t z9b;fj48mXx!BC8au`v$D#dsJW6JSD2go!Z;CdFiEpota=v{9miE_$fYM~z{a98+LQ zOogd24W`9(m>x4=M$CknF$-qJY?vK$U{1`1xiJss#eA3_3t&MkgoUvP7R6#%97|wH zEQO`943@=mSRN~2MXZFCu?kkjYFHg>U`?!rwXqJ?#d=sD8(>3hgpIKYHpOPx99v*Z zY=y0{4YtL0*d9AzN9=^1u?u#^ZrB}rU{CCYy|EAW#eNu${c!*e#6dV1hu}~ghQo0L zj>J(o8pq&R9Eam^0#3w9I2otlRGfy>aR$!BSvVW#;9Q)C^Kk(##6`Fmm*7%dhRbmU zuEbTi8rR@jT!-s%18&4kxEZ(LR@{c$aR=_iUAPb0(F&@Up1eg#LVPZ^zNii83XrhGzZItMsiykWUQDYb;#}t?nQ(|SQBeuZLEWJu^!gP2G|fAVPkB9O|cm^#}?QUTVZQ# zgKe=Lw#N?G5j$aL?1Ejf8+OMY*b{qUZ|sA8u^)zGe;j}VaS#s1AvhF=;cy&*BXJat z#xXb+$KiOKfD>^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB z#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X z#xr;p&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(= z#y9vD-{E`wfFJP_e#S5O6~Ezk{DD957yiaS_!lDt@&02ZjEqq*Dn`TT7z1Ns5C&ri zhGHy?jd3t8#>4oS025*&OpHk|DJDY$O|(#;jS?Mn(L;qkY7E2Vm;zH`Dol-OFfFFT z^q2uNVkXRtSuiVR!|a#?b7C&cjd?IH=EMA001ILvER034C>F!wSOQC8DJ+d;uq>9t z@>l^YVkNAMRj?{n!|GTAYho>|jdidt*2DVP02^W>Y>Z8?DK^9A*aBN(D{PHzur0R3 z_SgYCVkhj3U9c;5!|vDvdtxu_jeW2$_QP=Oj{|TZ4#L4W1c%}<9F8M!B#y$-I0nb! zI2?}?a3W5^$v6e4;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ1efA6T#hSnC9cBNxCYna zI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD|9>T+T1drk|JdP*uB%Z?4cm~hn zIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6KElWN1fSwFe2y>hCBDMf_y*tN zJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@|6+t--hYgQkueHJ#b_8EV_-}S!e9)+P>hAK zF%HJXco-iOU_wlUi7^Q##bjupi53d9QKEw`dZ^GxjbWG^Q(#I=g{d(Orp0ua9y4G@ z%!HXS3ueV^m>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr) ztb~=Z3RcBxSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1 z?1Y`M3wFhB*d2RdPwa)gu@Cmeei)AZaR3g)K{yzP;7}Zf!*K+T#8EgJ$KY5ThvRVq zPQ*z#8K>Y>oQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_z zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$N ze#B4s8Nc9H{D$B02mZug_#6M=UyKmK`;U<@GDg9u7!9Li42+3E7>pqpim@;@#=*E4 z594D3Oo)jvF($#Jm<$ax(L#YXN_5ag4;A{TF$|Mq3QUQqFg2#Zw3rUlV+PEKnJ_bE z!K|1Kvttg-iMcR0=E1y}5A$OIEQp1$Fc!h0SPY9}2`q`Fur!vzvRDqwV+E{;m9R2a z!Kzpdt78qUiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4ur;>9w%88aV+ZVrov<@@ z!LHa1yJHXRiM_Bl_QAf`55uuP4#0sp2nXX39E!tmIF7)PI0{GO7#xe^a6C@Hi8u)- z;}o2V({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn z;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3?i+Bky z;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LOkN62c z;}`sj-|##Bz@PXFf8!tgixEP3|1lCq#wZvSqhWN6fiW=%gE0g{F&4(gI2ae>VSG%0 z2{92S#w3^&lc9koS}4#)i4MBxp+X-uhGB9{fhjQ+rp7dw7SmyR%zzm&6K2LNm=&{O zcFch}F&E~>JeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>b zb*zCku@=_GI#?I$VSQ|X4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2B zckF>Zu^0BnKG+xgVL0~30XPr`;b0tsLva`m#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv z8cxR&yZK`exYu?QB$VptqYU`Z^6rLhc_#d264D_}*egq5)hR>f*q9cy4s ztcA6)4%WqbSRWf;Lu`bNu?aTCX4o8CU`uR;t+5TZ#dg>pJ77obgq^VqcExVk9eZF; z?1jCt5B9}=7>@mM01m`KI2ecEP#lKCaRiRUQ8*gM;8+}o<8cB`#7Q_Ar{GkahSPBd z&csv02a#7(#vx8PRXhTCxm z?!;ZV8~5N|+=u(|03O6cco>i1Q9Opn@dTd4Q+OKB;8{F}=kWqw#7lS?ui#a@hS%{1 z-o#sY8}Hy_!ytyQ+$Tc@ddubSNIy=;9Go$@9_hE#83Dczu;H=hTriA z{={GS8~@;6j1ZgmA0uI8jDk@y8b-$$7!!jq7(*}=V_|HJgK;q)#>WJh5EEfyOoB-< z85(G!g#vAq=%9-pD)dof7$(OQm=aTAYD|M^F&(DI444r!VP?#NSuq=C#~hdwb75}G zgLyF@=Enk95DQ^pEP_R`7#7D8SQ1NNX)J?fu^g7i3Rn>Rk0dY#~N4@Yhi7y zgLSbU*2f0e5F24*Y=TX(88*ij*b-Y|YixsUu^qO@4%iVpVQ1`uU9lT>#~#=ddtq88 
z#yz+f_u+m#fCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU z#yfZy@8NxXfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m- z#y|KMBgEnT$4D3%qhM5whS4zw#>5~D#t;m}SQs1QU|fuc@i74=#6*}FlVDOzh6b8w zp+FlYI_RQ@3VqZVhRHDnro>d38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<- z^|1jq#75W{n_yFHhRv}Bw!~K08rxu7Y=`Z!19rqt*crQESL}w}u?P0VUf3J^U|;Nq z;n*Js;6NONgK-EB#bG!cN8m^tg`;r{j>T~}9w*>LoP?8c3QomoI2~u;Oq_+YaSqPK zc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@9yj1d+=QEP3vR`2xE*)kPTYmNaS!gr zeYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%`9xvcUyo8tW3SPx)cpY!xO}vG-@eba_ zdw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR=9zWnm{DhzJ3x36K_#J=XPyB_y@elsR z2yuD;F%m|`C>Rx^VRVdvF);{(F$6;~7RJUn7#HJVd`y4|F%c%lB$yPFp@Ak^D9}cU z4!Y=}LLW7TVRB4?DKQnM#x$4~(_wndfEh6pX2vX-6|-S>%z-&E7v{!1m>2V5ek_0m zu@Dxmq=6{}%&tbsML7S_f(SQqPIeQbaY zu@N@LCfF34VRLMOEwL50#x~d%+hKd`fE}?DcE&E)6}w?~?14S87xut%&aTpHA5jYY@;bUuCPRAKI6KCOUoP%?59?r)F zxDXfNVqAhtaTzYh6}S>t;c8riYjGW}#|^j-exUdJ1F6K~;dyn}b~9^S_X z_z)lAV|;>7@fkkH7x)ri;cI+@Z}AIFT9}{3gOoWLs2`0s4XrPG}3baw8gD!fg z&_|77m>g4JN=${RF%720beJA9U`EV@nK27y#cY@zb6`%)g}E^g=EZ!N9}8eXEQE!z z2o}X+SR6}WNi2n>u?&{Qa#$WKU`4Eim9Yv|#cEg`YhX>Rg|)E`*2Q{Q9~)ppY=n)m z2{y%M*c@A6OKgR$u?@DxcGw;}U`OnPov{mc#ctRgdtguOg}t#4_Qie}j{R`}4#Yt? z7>D3c9EQVj1dhZ}I2y;`SR9AraRN@nNjMp&;8dK3({TpQ#925S=ipqNhx2g(F2qH+ z7?_uyXKhx_pW9>ha< z7?0plJch^d1fIlGcpA^(Sv-g5@d94NOL!Tt;8nba*YO74#9Me9@8Dg$hxhRTKEy}( z7@y!%e1^~Q1-`^r_!{5fTYQJ_@dJLuPxu+X;8*;H-|+|j#9#Ou|KMMY5TExSBVlBW zf>ALVM#mT!6N4}qLogI$VQh?paWNjo#{`%V6JcUZf=MwM8fc=00&SG&po<sJnOoM4L9j3<&m=QB!X3T_y z7RM4;5=&ueEQ4jS9G1rlSP?5>Wvqf#u^Lv#8dwu+VQs8~b+I1S#|GFC8)0K?f=#g* zHpdp&5?f(wY=dpF9k#~~*bzHnXY7Jqu^V>B9@rCmVQ=h%eX$>gV}Bfg191=z#vwQq zhv9G>fg^Dgj>a)K7RTXuoPZN?5>Cb`I2EVibew@RaTdct8Ud)I2u>cmtLRc7!U{NfF#j(Wy z>DI2pwJ86E(NEARND6{rAtk7!N-9W~peUV-MR!YgDy_7LfJF!fCDNfF3P>sfidcw+ z7@W26v){dcd+)Q~^Iq5a<6P%{u4g_oiXLAncavq=O z3!KjdT*yUS%q3jP7rBftaXDY+3a;cTzQR}e8eiudT+KCH%XNH{>$!n%@om1tce#<9 zxS3nHmG5yI-{*Gj;7;!1Ztme;?&E$Q;0OGW2YHAe@i0H;Cp^NVJjPFXoF{mar+AuY z_!&RvS$@HDJkJaKl3(#6zvd-=!*BT=zvmCU%pZA$Kk;Y&!e99tukv?Z;~)H!fAMc# zXOP|a&xkQoFeOtlHPbLH(=k0WFe5WDGjC!RX64Py#_Y_&oXo}CyoGs~m$xz>^YbOY%P6&j(nFrCEk$S&rrTARpqxtiXz_#LBF~ zs(gfxvKp)NG1g#B)?#heVO`c^eKuf2Hsa%K%qDz-PqHbW;?r!#=4`>1Y{k}W!?tY4 z_UyopOkgJ_vNOA|E4#5fd$1>au{Zm$FZ;1S2XG(?yQ zj^_kUqim91~X_=1cnSmLZ ziJ5s5voI@fW;SML4(4Po=H@NT!@Rte`Iw)#u>cG5cHY4|c^B_yA>PBnEW)BJ#^St} zC0LU8@qRwQQY_6fEX#5%&j7@E4hlV@KwIX*ZBrla}C#W9pB`7Zs1#d zoA2;lZsaCz<`!<{d)&tNxt%+>le@T^d$^bTxSt330YBtH9^ywl%#ZmAkMJmu@lzh> z37+IBp5_^T#?N_{U+^5y^8&x*SG>rtd5PcfTYksy`2#QWM_%Di{F%S-SN_JU{GHeM z2mj<>{F~PqV$2jw$y7|uG)&8MOwSC=$V|-4o0x@Jc{8&yJ9986b1^q>VIJn? 
zt<1;#yp07|khk*=-pRXoHw*C|7G@C^Wib}#y)40!ypQ+u0hVHEmSI_zV|hNvhxjln zup%q5GOMsEAK{~{#_D{GHCU6iSetcNm-Sem4cL&4_&6K037_DTY|5wjG@G$GTd*Zt zu{GPUE!(j@JFp`Y*olek%r5N8ZtTt;?8#p2%|7hQe(cWy9LPZ&%pn}gXE=<*If5fO zilaG(V>yoFIe`;7iIX{nQ#p;(`7EE~49?^%&gLA>4~#Kl~~rF@ag_!5`%Wv<{#uHq|vm9O!2zQNU8!?j$; zH@Ti0_!i&hJA9WLxrv*(g$=ML`VF7D_z@5DV}8OT zJj!GIl*f63CwYped4`|ybDrfFJje6Az%Tg~FY;?%;y3)3-|>6?z{~uRSNIcu<}dt} zzws)6=QaMpKlvB`=5+?SjsJ`oGX+yJ6;m?}(=r{?GXpa+6EpKBW?@#|%xuig9L&jF z%*|Vvhk1D`^D#efV*wWA?Yx6`@-E)ZLcE8CS%gJdjKz5`ORyyG=^K9^p|QLEg?gcqi}T z-7Lg=SeQjvl*L$__p$^_@;=_r2Uv=wS%zg2KI%B;ewe1wm(8msd$ z)?iK6Vr|x8UDjiLHef?G;^S=0CVYZVvMHb9(`?4(Y{8an#nx=Ywrt1t?7)spU?(QB zGrO=WyRkcauqS)5H~X+J`>{UKiOcyiS8yd)@fE(x*Z4Z$ z;A*bnTCU@pT+a=Bi*NHCzRQi=#Le8ot$dH$_&&FD2X}H8cXJQ-av%5e06*Y|Jjg@* zh==(xKj9G`JnRyelFe`6n zHfCoI=43AB<}J*_yu6k9n4h$3qHvJoF=V>aOv ze3DK16rW}@HfIaAWGl928@6RTwr2-+WCA-ek)7FvUD=J@*@HdVi@n*0ec6xwIe-H> zh=VzVL-`DcaX3eCBu8;H$8apiaXcq*A}4V&r*JB#aXO#nbDY7MoWKjc9k;zvBpkNF9Y@F<{5s* z&v}+#@Ep(c0>9)}yvVP4iQn*Be#h_m126MOUg1ytnZNK?{>H2Po!9sW|Kwl%o7WlS zHU2YV%oI$?R7}k@Ov`jk&kW4SOw7!in1xw+GqW)}b1)}!F*k2v9_Hn(%*Xt^jRjbc zxAP9($-8(r3-KNnW)T);F&5{&EWwhzkN5KdmSSm^VOf@Ac|OR8_%JK5A}g^ntFS5` z;iIg^>U@kfSd+C_n{`;1^;n+`*pQ9*I2*GGpWu^h%BT1=o3S}tuq9iuHQTT)+p#@6 zup<-LiHYpYF6_!~?9LwS$zJTuKJ3eW?9Txl$Uz*;AsotQIE=$Nf+IPKqdA6SIgaBw zffG52lR1S`IgQi#ET7{H&g3l4<{ZxDJU-7CIG+o+kc+sOOSqITav5LZa=y$JT**~@ zg|G58zRowenrpb0>-Z+ua|7Sv+kA)baw9i!Gq-Rn-{UsE&+Xj7o!rIU+{3-x$NfCO z5BMPu@(@4bVSda{c!WoJjGyv2Pw*s9@ifoyGk(sq{DS9ro)`Efzv4xH%}e}--|{5BgRa@luX6cOvAKH$Mnp=jLgK$ zyop(ul{YgRvoi;CG8c367Up4I-pYK;&)Zml1$jH~;GMjSce4=hVPO_wQ5Iuy-pdj! z$@_ReA7Ck#W*L@cIhN;xe25RT0xPl-E3*o#@)17DYOK!3Sc5fLi?vyYby<(~*?x=|oA3!f$)WjExyfn z_%1hc6E|}UxAHx11N?v=@*ofKBOd0*{Den%l*jlfkMjgi z@)S?=3_s)NJj*Y5j^}xSU-BznF@F)JvU-&D3<5m97Yy5+M z@-P0)>kRT4{~0l63Z`T#re+$ZWjdy324-X?X68-I!mPZR*_fRP&v6E4au#QE4(DZ8{CnmBpyRa*}u{(RP zCws9s`>-$ju|EfJAO~?Uhj1vL;V=&82#(|^j^-GSyhxsu-;SnC? zF@DP9Ji(JZ#nU{)&-gjd@(Z5hd0ya`{E8R(H81fSe#`IpJ%8Y3{>Ur*i9hof{>tBY zmA~^E|KOkei+}SvgWHV%j2JToQ!*7(GY!)+9n&)dGcpr1^Co6tR^H5P%+4Il$z06M zTbPG=c`NfVKW}3J7Ub=`gLm>S-pxY1hlN>$MOlo+c`r+_B=6(>e1N4`nq^p)S;k(?(P29{a+{*X3jqh_i zcW@_naX0sHFZXdj5AXwi$b&q@k9e3L^AjH7Q6A%`JkAq5$x}SdGyII7^DMvMIiBYQ ze#x(Rkzex?zu~w1j^FbKUgnRy!k_pvf8nqEjaT_QukjE5$-nqFuQMnR+(;oK#!SJK zOvThp!?aAt^vuAF%*4#RiCLJHH!~ZvGY4}r7jyF#=3!pm%6!bv+gN}Fc{}gmoxF>8 zvk>oLVHROg7GrVV%MvWf`*=SeU@4Yn8J1-^mgj?fh!3*@E3y(RvkI&75kAUltj@<+ zgEd);wONOCS&#MEfDPG*kFznG@CiQ2rhJM|vl*MS1zWNeTeA(@vK`yA13NN-otVhZ z?82_>#_sIFp6tcm?8Cn7$Nn6^fgHra9KxY|hQm0VBRG|E;n)$H**WO@;z?j``peQ+{sb?!e%{6cEXdn=2k+!vyqkr14-2yhi?SGt^In!3o*YaRz5{7H4w~=W-sO=L?+A z1zgBQT+Ah0$``qeFL60v<_fOlD!#&3`5Is68(hsbT+4NQlk2&GZ}Dxu!*{uno4A=< zxRvj58{g-4?%+=D;%@HYUhd<59^eQ3kOz5)AMr3h<|jPDqddk>d7LMBlBal@XZRUE z=UINib3D%r{E}bsBERM(e#3A19lz%fyv!eYg+K9U{=#4R8?W+rUgID9lYjAVUT1K- z@t+Z6reI2@Vrr&gTBc)qW?)8UVrJgNEX>NAnT^?*gE^Utxp@ooFfVUqKIZ3bEWm=i zop%qg78X`Iey`5b3(CTDRr=Ws6P@p-<$`CPz-T*Sp(!litX%lHzP^JT8!O0ME7 ze3h^9b-uyXT*I|o$2Ym28~7IA<~w|s8@Y*_xrJN#9=Gv*Zs!i}!9`5Bn?&kr1 zzz=zlhxic>^J9L(BRtAu{FKLef+u;3r+J2-@pGQ#7d*%FyudH{6)*B@Ug9_Wmf!Jv z{=m!pkyrQ=f95azmA~;Sf9Ey+!9V#I|K@cDcNqT}F=h&;WGbd+8m47Bre_9bWF}_j zP0YfqyqVdUojI73xtN=`Fc0(cR_0@V-o^qf$lG}b@8n&)n}v7}3$qA|vKWi=UY1}< z-pBj-086nn%djlVu{XLAnc zavq=O3!KjdT*yUS%q3jP7rBftaXDY+3a;cTzQR}e8eiudT+KCH%XNH{>$!n%@om1t zce#<9xS3nHmG5yI-{*Gj;7;!1Ztme;?&E$Q;0OGW2YHAe@i0H;Cp^NVJjPFXoF{ma zr+AuY_!&RvS$@HDJkJaKl3(#6zvd-=!*BT=zvmCU%pZA$Kk;Y&!e99tukv?Z;~)H! 
zfAMc#XK<(SpAloGU`nQ9YNlaYrek_$U`A$QX5Pds%*vaYjoF!lIhl*Oc?4~#Kl~~rF@ag_!5`% zWv<{#uHq|vm9O!2zQNU8!?j$;H@Ti0_!i&hJA9WLxrv*(g$=ML`VF7D_z@5DV}8OTJj!GIl*f63CwYped4`|ybDrfFJje6Az%Tg~FY;?% z;y3)3-|>6?z{~uRSNIcu<}dt}zws)6=QaMpKlvB`=5+>l8UGnEW(uZcDyC){re!*& zX9i|uCT8YM%)+d^nc0|~Ihd2Vn47mS5A*U?=3{=|#sVzJ+j$4?n~>mS9QV$NTvJOR+S|uq?~5JRjske3%tjk(F4PRalje@KIJ{bw0)#tjSue%{r{h zdaTa|Y{*7@oQ>IpPw+`LiCwP*lc$#PU89(P)e!+7*&kOvLU-2Tp<|Tf^Z}}a+=MTKhA9;m8@n`s}Tn5wCRFp5*8OqG1z#M_g1&YF-d>A-{>|Lsu;cTZ?Ov(V(Q^~}j(o1|~s zq;I>VZ~LTghoo=EneCHK^k4FoGAUmrOaJdPR_9`zl(TColX6z&Q0nADhCvj>L5d({ zkSa(WqzTdn>4Nk@h9F~*Daah$6l4jq1~&)Ug6u(#AZL&($Q|4gN2<8V1f`!4NU~#Y{SQ@+-EDK%=mIp5fD}t55 zs^FF2)!?<@_27+Qb+9H_8>|c74Auu5g13UVgLi^=gN?zaU~{k~*c!YSYzy8Gwg)?c zox!ePcd#ee8|(}A2M2-=f)9g(!J*)z;BfG9@JVnaI2s%aJ`IisCxVl~so->QCipD) zJUAPC5u6Lo2N!}bgRg>%!Pmj1;G5vv;Je`a;D_LH@MCZ#_$l}~_$Bx?_${~^{2p8j z{s{gI{tEsMt|v8r7)D_nrU+ApslwD@nlNpcE=(V02s4J6!pz}KVU{p!cypL7%pT?l zbB4LX+~F-@o-l8CYnU(0AKn%g2n&X{hj)Z`hIfT`hlRp>!op#ZuxMB;EFRt)mIzCR z_l5U|4}_({(qWmfY*;QVA3hj96h0hQ2rGt_!pdQluxj{7_-I%ytR6lV)(C5cwZhtA zov?0LFRULn2pfit!pFnLVUzHQ@X4@g_*D3G*eq-wwg_8>t-{t}o3L%zE^Hrm2s?%e zVW%)L>>PFpyN2Du?qQFxXV@$39rg+PhW*0+;ec>pI4B$(4he^b&xFIm;o*pIWH>4u z9gYddhU3EV;e>EvI4PVQP6?-m)57WDv*B~$jBsW+E1Vt93Fn6M!so*m!ujEXaACM8 zTpTV5mxeEf%fgq!<>AZWig0DPDtsk;HGD06J$xfv9j*!2hU>yN!}Z~Y@U8Ie@SX78 zaAUYB+#GHRw}$V9+rsz5?ct7aXSgfe9qtMDhWo<(;eqgj@Wb$6cqsfRJRE);ei9xD zkA}y>Ps8KkiST52Dm)#Y2|o)z56^~Qgy+KZ;f3(a@T>4*_;q+G{3iT1{4V@H{2{y? z{uo{fe+qvNe+hpLe+#dMzlYbtKf*u5zrw%6>q!@X7)4PWrHE2SsiM?Tnka3QE=nI| zh%!c*qRi1vQI;rcbaRv~${yv2az?qL+|eyjo+xj0Ym_g_AKexehzdrxM|VVbMt4Pb zM}?w$qQX&;sAyCyDjwY%m5544_eJ+d4@9M+(ovbHY*a2PA3Yd76g?bOh$=>vqRLT~ zsA}{`^k`HqsvbQS)re|FwW8Wlov3b9FRC9kh#E$XqQ|4gQIqJ2=*g&Q^i=e8)GTTq zwTN0qt)kXZo2YHnE@~fjh&o0IQKu*|>Kt{6x<=ii?op4ZXVfd|9rcO&M*X7x(ST@R zG$q>bSU~LIvjl*eG(msjz-6#Pov|}iRff>DmopVi9U-ykIqJ4MCYRO(S_*C z=&R^r^mTM8`X>4|`Y!rD`XRa;{TN+|eu{pMeu;jKev7U~zem@iKcYXQzoNgR>q)o% zFplClP7$Y!Q^l#{G;!KEU7SA75NC`t#hK%q;w*92_~tlUoITDF=Ztg3x#L^nJaOLm z);M3BKfWz45EqPZkMD@@jPHu?jtj;2#D(J`anZO~Ts*!vE)kcE?~Ct`ABaoErQK7KHMD1JDu5Lb*V#g*eKan<;d_|dppTs?j)t`XOaYsIzWI&s~&UR*zJ5I2k) z#gE60<0kPF@sn}W_^J5mxLMphZV|VPTg9#8HgVgyUEDtI5O<6d;!bg5+&S(Nca6Kn z-Qyl{&$w6IJMI(rjr+y@;{ox&cu+hz9ug0YpNWUX!{ZV0$aqvdIvx{`jmO2~;|cM^ zcv3t$o)S-ur^VCbXXEGM8S%_`Ry;eN6VHw3#m~nt#Pj0?@xpjfyf|JGFO6S}m&Gr| z%j1{h74gb=Rs2f)YW!OKdi+MbI$jg6jn~C*#_Qt^@mulR@jLOm@y2*lygA+yZ;juJ zx5e+r+v6Sa&Ujb6JKhuTjrYa-;{)*r@rUuj_)z>&d^r9%{v6o4wn30*7nKv;Dv+`zUV|M0XPUd26-oiZ0 z%UhX`|5o(dY!_fb{-=P&v6E4au#QE4(D6o4wn30*7nKv;Dv+`zUV|M0XPUd26-oiZ0%UhX``FR@) zupn>e9lVow@opC4JuJ*3EXram&U;ycC3zq3=L0Oo(k#QWEXVSEkPq=;R$xU|Vr5og zRX)N;S&h~C7;CU5Yq2)#urBMdJ{zzh8}V^AW)nWaC)t!w@o6?=bGBehwqk3xVOzFi zdv;()Ca@C|*_mC~mEG8#J=l}I*qeRWm;KnE12~X_IG95?l+SP&hjRo+aui2%499XD z$8!QFauO$V3a4@!r}J4p#~GZ-S)9!|oXdH9o-c4d7jPjLaWR*0DPQC=zQpBxnJc)G ztN03EDZ`3ryLZ@kLid5wSYPyWTfd7VkG0R9>O88K!GrerFnW*VktI;Lj^ zW@IL2=1t7Pth|}on4LM8lew7uS|&N&!aPiVN0XdxWj^NTZ7jfoOnzOHobKSAyo-0U z5bt4O7GY5qV{zWgSej*6mgQKU5Aq>C%nGc?O03K(tjb6DD66qL zA7c&HWG&Wa9oA(%)@K7YWFtP##%#hT_#~V1DL&0+Y|a*J$yRL5Hf+mwY|jqt$OLv` zB0IASyRsX*vj=;!7kjf0`?4SVa{vc&5C?Mzhw>Q?<8Y4PNRHxYj^S92<9JTsL{8#l zPT^Ee<8(gD=Qx8iIg7J7hjTfP&+`S&=K?O|A};01%An|c#&W862IZM{EpxA2VUln zyuzROGk@W){Eb)nJFoE%{>i`iH?K1(0N_Rn88K!GrerFnW*VktI;Lj^W@IL2=1t7P zth|}on4LM8lew6iw=fU$@>b?!e%{6cEXdn=2k+!vyqkr14-2yhi?SGt^In!3o*YaRz5{7H4w~=W-sO z=L?+A1zgBQT+Ah0$``qeFL60v<_fOlD!#&3`5Is68(hsbT+4NQlk2&GZ}Dxu!*{un zo4A=d7LMBlBal@ zXZRUE=UINib3D%r{E}ZW`K|kZ4$12~IVmQuUHbvaL4uQ0v}@g`{mg_?31yOB$d^m3 zo;2|O6F-o+-* z>Z$vGShzY@!{ih6P3)I+@X(o)Q}k-z 
z@xGZ!O*{0)cA>;!Nj6W4X?-}Ul@co?E!rn0KLSqr)Bg~(nj$Ig^+wQYV&SCRCKgFv z6ir%8j(heUAX@dt8E-_={tqXL{?~~{B>iY&@_ld3E!t}!XF=|%2H(Mb?DTkOUr((yYy{8v-&VE=@QZLf?sj97K5ddq^8c{RLuV#jOsbfKOU-Xo{&xwNlWvNcYu)I=2|pz+O1g9OZJqE} z;@sqw-pSF`aZ;rvmQGqEl$e~ljmAm1J~`t*&Gm0h*|$yN_~cGHBk90@S}JjN((1n# z{$EMOf$NOpGlDCN~8FYKS)DA@;^7g?HYfh=E7`8~-V zONX9HcI*9Fi(E@G)6V4slWpGQ!9CZq+-O|k@BjbmfB&zvXyT!yF7yAD=K0@TDyNgL lkmO5cd-BSExm41&>C^k)FP5w>mOh=jw*U7lrf=Wk{}(x7+h70y literal 0 HcmV?d00001 diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py index 25fd86d899c08..d0365cb2c30b3 100644 --- a/pandas/io/tests/generate_legacy_storage_files.py +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -5,7 +5,7 @@ SparseSeries, SparseDataFrame, Index, MultiIndex, bdate_range, to_msgpack, date_range, period_range, - Timestamp, Categorical, Period) + Timestamp, NaT, Categorical, Period) from pandas.compat import u import os import sys @@ -140,6 +140,13 @@ def create_data(): int16=Categorical(np.arange(1000)), int32=Categorical(np.arange(10000))) + timestamp = dict(normal=Timestamp('2011-01-01'), + nat=NaT, + tz=Timestamp('2011-01-01', tz='US/Eastern'), + freq=Timestamp('2011-01-01', freq='D'), + both=Timestamp('2011-01-01', tz='Asia/Tokyo', + freq='M')) + return dict(series=series, frame=frame, panel=panel, @@ -149,7 +156,8 @@ def create_data(): sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), sp_frame=dict(float=_create_sp_frame()), - cat=cat) + cat=cat, + timestamp=timestamp) def create_pickle_data(): diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index ad7d6c3c9f94f..0a491a69af8e2 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -8,7 +8,7 @@ from distutils.version import LooseVersion from pandas import compat -from pandas.compat import u +from pandas.compat import u, PY3 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, period_range, Index, Categorical) from pandas.core.common import PerformanceWarning @@ -58,6 +58,19 @@ def check_arbitrary(a, b): assert_series_equal(a, b) elif isinstance(a, Index): assert_index_equal(a, b) + elif isinstance(a, Categorical): + # Temp, + # Categorical.categories is changed from str to bytes in PY3 + # maybe the same as GH 13591 + if PY3 and b.categories.inferred_type == 'string': + pass + else: + tm.assert_categorical_equal(a, b) + elif a is NaT: + assert b is NaT + elif isinstance(a, Timestamp): + assert a == b + assert a.freq == b.freq else: assert(a == b) @@ -815,8 +828,8 @@ def check_min_structure(self, data): for typ, v in self.minimum_structure.items(): assert typ in data, '"{0}" not found in unpacked data'.format(typ) for kind in v: - assert kind in data[ - typ], '"{0}" not found in data["{1}"]'.format(kind, typ) + msg = '"{0}" not found in data["{1}"]'.format(kind, typ) + assert kind in data[typ], msg def compare(self, vf, version): # GH12277 encoding default used to be latin-1, now utf-8 @@ -839,8 +852,8 @@ def compare(self, vf, version): # use a specific comparator # if available - comparator = getattr( - self, "compare_{typ}_{dt}".format(typ=typ, dt=dt), None) + comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt) + comparator = getattr(self, comp_method, None) if comparator is not None: comparator(result, expected, typ, version) else: @@ -872,9 +885,8 @@ def read_msgpacks(self, version): n = 0 for f in os.listdir(pth): # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and - 
version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): + if (compat.PY3 and version.startswith('0.17.') and + f.split('.')[-4][-1] == '2'): continue vf = os.path.join(pth, f) try: diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index e337ad4dcfed2..55c14fee9e3ed 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -46,6 +46,12 @@ def compare_element(self, result, expected, typ, version=None): if typ.startswith('sp_'): comparator = getattr(tm, "assert_%s_equal" % typ) comparator(result, expected, exact_indices=False) + elif typ == 'timestamp': + if expected is pd.NaT: + assert result is pd.NaT + else: + tm.assert_equal(result, expected) + tm.assert_equal(result.freq, expected.freq) else: comparator = getattr(tm, "assert_%s_equal" % typ, tm.assert_almost_equal) diff --git a/pandas/lib.pxd b/pandas/lib.pxd index 36c91faa00036..554b0248e97ea 100644 --- a/pandas/lib.pxd +++ b/pandas/lib.pxd @@ -1,3 +1,4 @@ # prototypes for sharing cdef bint is_null_datetimelike(v) +cpdef bint is_period(val) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 262e036ff44f1..234ac7ea2c60c 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -33,7 +33,7 @@ def is_bool(object obj): def is_complex(object obj): return util.is_complex_object(obj) -def is_period(object val): +cpdef bint is_period(object val): """ Return a boolean if this is a Period object """ return util.is_period_object(val) @@ -538,9 +538,6 @@ def is_time_array(ndarray[object] values): return False return True -def is_period(object o): - from pandas import Period - return isinstance(o,Period) def is_period_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index aca0d0dbc107b..af2e295ae0cfc 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -24,7 +24,7 @@ cimport cython from datetime cimport * cimport util cimport lib -from lib cimport is_null_datetimelike +from lib cimport is_null_datetimelike, is_period import lib from pandas import tslib from tslib import Timedelta, Timestamp, iNaT, NaT @@ -484,8 +484,11 @@ def extract_freq(ndarray[object] values): for i in range(n): p = values[i] + try: - return p.freq + # now Timestamp / NaT has freq attr + if is_period(p): + return p.freq except AttributeError: pass diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b86b248ead290..a6246790f83cb 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -965,7 +965,7 @@ def test_indexing_with_datetime_tz(self): # indexing - fast_xs df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) result = df.iloc[5] - expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', offset='D') + expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D') self.assertEqual(result, expected) result = df.loc[5] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a80a3af56b18f..c632704b7c5eb 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -426,10 +426,10 @@ def test_constructor_with_datetime_tz(self): # indexing result = s.iloc[0] self.assertEqual(result, Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', offset='D')) + tz='US/Eastern', freq='D')) result = s[0] self.assertEqual(result, Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', offset='D')) + 
tz='US/Eastern', freq='D')) result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index c4ccef13f2844..1b1db90ea713d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2365,7 +2365,7 @@ def test_reset_index_datetime(self): 'a': np.arange(6, dtype='int64')}, columns=['level_0', 'level_1', 'a']) expected['level_1'] = expected['level_1'].apply( - lambda d: pd.Timestamp(d, offset='D', tz=tz)) + lambda d: pd.Timestamp(d, freq='D', tz=tz)) assert_frame_equal(df.reset_index(), expected) def test_reset_index_period(self): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 83cb768b37aaa..9b36bc5907066 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -558,7 +558,7 @@ def _generate(cls, start, end, periods, name, offset, @property def _box_func(self): - return lambda x: Timestamp(x, offset=self.offset, tz=self.tz) + return lambda x: Timestamp(x, freq=self.offset, tz=self.tz) def _convert_for_op(self, value): """ Convert value to be insertable to ndarray """ @@ -1199,8 +1199,9 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, l) - converted = tslib.ints_to_pydatetime( - data[start_i:end_i], tz=self.tz, offset=self.offset, box=True) + converted = tslib.ints_to_pydatetime(data[start_i:end_i], + tz=self.tz, freq=self.freq, + box=True) for v in converted: yield v diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 360944e355b4d..17b6dd12a5c02 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -124,10 +124,11 @@ def test_minmax(self): def test_numpy_minmax(self): dr = pd.date_range(start='2016-01-15', end='2016-01-20') - self.assertEqual(np.min(dr), Timestamp( - '2016-01-15 00:00:00', offset='D')) - self.assertEqual(np.max(dr), Timestamp( - '2016-01-20 00:00:00', offset='D')) + + self.assertEqual(np.min(dr), + Timestamp('2016-01-15 00:00:00', freq='D')) + self.assertEqual(np.max(dr), + Timestamp('2016-01-20 00:00:00', freq='D')) errmsg = "the 'out' parameter is not supported" tm.assertRaisesRegexp(ValueError, errmsg, np.min, dr, out=0) @@ -148,11 +149,11 @@ def test_round(self): elt = rng[1] expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 01:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, offset='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), ]) expected_elt = expected_rng[1] @@ -175,10 +176,10 @@ def test_repeat(self): freq='30Min', tz=tz) expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), ]) tm.assert_index_equal(rng.repeat(reps), 
expected_rng) @@ -192,10 +193,10 @@ def test_numpy_repeat(self): freq='30Min', tz=tz) expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), ]) tm.assert_index_equal(np.repeat(rng, reps), expected_rng) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index b0caa1f6a77cb..e594d31e57296 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3884,36 +3884,36 @@ def test_datetimeindex_accessors(self): self.assertEqual(dti.is_month_start[0], 1) tests = [ - (Timestamp('2013-06-01', offset='M').is_month_start, 1), - (Timestamp('2013-06-01', offset='BM').is_month_start, 0), - (Timestamp('2013-06-03', offset='M').is_month_start, 0), - (Timestamp('2013-06-03', offset='BM').is_month_start, 1), - (Timestamp('2013-02-28', offset='Q-FEB').is_month_end, 1), - (Timestamp('2013-02-28', offset='Q-FEB').is_quarter_end, 1), - (Timestamp('2013-02-28', offset='Q-FEB').is_year_end, 1), - (Timestamp('2013-03-01', offset='Q-FEB').is_month_start, 1), - (Timestamp('2013-03-01', offset='Q-FEB').is_quarter_start, 1), - (Timestamp('2013-03-01', offset='Q-FEB').is_year_start, 1), - (Timestamp('2013-03-31', offset='QS-FEB').is_month_end, 1), - (Timestamp('2013-03-31', offset='QS-FEB').is_quarter_end, 0), - (Timestamp('2013-03-31', offset='QS-FEB').is_year_end, 0), - (Timestamp('2013-02-01', offset='QS-FEB').is_month_start, 1), - (Timestamp('2013-02-01', offset='QS-FEB').is_quarter_start, 1), - (Timestamp('2013-02-01', offset='QS-FEB').is_year_start, 1), - (Timestamp('2013-06-30', offset='BQ').is_month_end, 0), - (Timestamp('2013-06-30', offset='BQ').is_quarter_end, 0), - (Timestamp('2013-06-30', offset='BQ').is_year_end, 0), - (Timestamp('2013-06-28', offset='BQ').is_month_end, 1), - (Timestamp('2013-06-28', offset='BQ').is_quarter_end, 1), - (Timestamp('2013-06-28', offset='BQ').is_year_end, 0), - (Timestamp('2013-06-30', offset='BQS-APR').is_month_end, 0), - (Timestamp('2013-06-30', offset='BQS-APR').is_quarter_end, 0), - (Timestamp('2013-06-30', offset='BQS-APR').is_year_end, 0), - (Timestamp('2013-06-28', offset='BQS-APR').is_month_end, 1), - (Timestamp('2013-06-28', offset='BQS-APR').is_quarter_end, 1), - (Timestamp('2013-03-29', offset='BQS-APR').is_year_end, 1), - (Timestamp('2013-11-01', offset='AS-NOV').is_year_start, 1), - (Timestamp('2013-10-31', offset='AS-NOV').is_year_end, 1), + (Timestamp('2013-06-01', freq='M').is_month_start, 1), + (Timestamp('2013-06-01', freq='BM').is_month_start, 0), + (Timestamp('2013-06-03', freq='M').is_month_start, 0), + (Timestamp('2013-06-03', freq='BM').is_month_start, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1), + (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1), + (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0), + 
(Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0), + (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1), + (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1), + (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1), + (Timestamp('2013-06-30', freq='BQ').is_month_end, 0), + (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0), + (Timestamp('2013-06-30', freq='BQ').is_year_end, 0), + (Timestamp('2013-06-28', freq='BQ').is_month_end, 1), + (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1), + (Timestamp('2013-06-28', freq='BQ').is_year_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0), + (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1), + (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1), + (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1), + (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1), + (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1), (Timestamp('2012-02-01').days_in_month, 29), (Timestamp('2013-02-01').days_in_month, 28)] diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index c6436163b9edb..ce88edcf4249b 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -255,6 +255,21 @@ def test_constructor_keyword(self): hour=1, minute=2, second=3, microsecond=999999)), repr(Timestamp('2015-11-12 01:02:03.999999'))) + def test_constructor_offset_depr(self): + # GH 12160 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp('2011-01-01', offset='D') + self.assertEqual(ts.freq, 'D') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.assertEqual(ts.offset, 'D') + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp('2011-01-01', offset='D', freq='D') + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') @@ -312,13 +327,13 @@ def test_repr(self): self.assertNotIn(freq_repr, repr(date_tz)) self.assertEqual(date_tz, eval(repr(date_tz))) - date_freq = Timestamp(date, offset=freq) + date_freq = Timestamp(date, freq=freq) self.assertIn(date, repr(date_freq)) self.assertNotIn(tz_repr, repr(date_freq)) self.assertIn(freq_repr, repr(date_freq)) self.assertEqual(date_freq, eval(repr(date_freq))) - date_tz_freq = Timestamp(date, tz=tz, offset=freq) + date_tz_freq = Timestamp(date, tz=tz, freq=freq) self.assertIn(date, repr(date_tz_freq)) self.assertIn(tz_repr, repr(date_tz_freq)) self.assertIn(freq_repr, repr(date_tz_freq)) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 0db4282808a26..e45523be738df 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -63,6 +63,7 @@ from pandas.compat import parse_date, string_types, iteritems, StringIO, callabl import operator import collections +import warnings # initialize numpy import_array() @@ -86,23 +87,24 @@ try: except NameError: # py3 basestring = str -cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset): +cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, + object tz, object freq): cdef _Timestamp ts_base ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - ts_base.value = value - ts_base.offset = offset + ts_base.freq = freq ts_base.nanosecond = dts.ps / 1000 return ts_base 
-cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset): +cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, + object tz, object freq): return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) -def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): +def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == True) cdef: @@ -113,9 +115,9 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, pandas_datetimestruct, object, object) - if box and util.is_string_object(offset): + if box and util.is_string_object(freq): from pandas.tseries.frequencies import to_offset - offset = to_offset(offset) + freq = to_offset(freq) if box: func_create = create_timestamp_from_ts @@ -130,7 +132,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): result[i] = NaT else: pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) - result[i] = func_create(value, dts, tz, offset) + result[i] = func_create(value, dts, tz, freq) elif _is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): value = arr[i] @@ -138,7 +140,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): result[i] = NaT else: pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) - dt = create_datetime_from_ts(value, dts, tz, offset) + dt = create_datetime_from_ts(value, dts, tz, freq) dt = dt + tz.utcoffset(dt) if box: dt = Timestamp(dt) @@ -163,7 +165,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): new_tz = tz pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts) - result[i] = func_create(value, dts, new_tz, offset) + result[i] = func_create(value, dts, new_tz, freq) else: for i in range(n): @@ -172,7 +174,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): result[i] = NaT else: pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) - result[i] = func_create(value, dts, None, offset) + result[i] = func_create(value, dts, None, freq) return result @@ -259,10 +261,10 @@ class Timestamp(_Timestamp): """ @classmethod - def fromordinal(cls, ordinal, offset=None, tz=None): + def fromordinal(cls, ordinal, freq=None, tz=None, offset=None): """ passed an ordinal, translate and convert to a ts note: by definition there cannot be any tz info on the ordinal itself """ - return cls(datetime.fromordinal(ordinal),offset=offset,tz=tz) + return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz, offset=offset) @classmethod def now(cls, tz=None): @@ -309,11 +311,12 @@ class Timestamp(_Timestamp): def combine(cls, date, time): return cls(datetime.combine(date, time)) - def __new__(cls, - object ts_input=_no_input, object offset=None, tz=None, unit=None, - year=None, month=None, day=None, - hour=None, minute=None, second=None, microsecond=None, - tzinfo=None): + def __new__(cls, object ts_input=_no_input, + object freq=None, tz=None, unit=None, + year=None, month=None, day=None, + hour=None, minute=None, second=None, microsecond=None, + tzinfo=None, + object offset=None): # The parameter list folds together legacy parameter names (the first # four) and positional and keyword parameter names from pydatetime. 
# @@ -338,15 +341,24 @@ class Timestamp(_Timestamp): cdef _TSObject ts cdef _Timestamp ts_base + if offset is not None: + # deprecate offset kwd in 0.19.0, GH13593 + if freq is not None: + msg = "Can only specify freq or offset, not both" + raise TypeError(msg) + warnings.warn("offset is deprecated. Use freq instead", + FutureWarning) + freq = offset + if ts_input is _no_input: # User passed keyword arguments. return Timestamp(datetime(year, month, day, hour or 0, minute or 0, second or 0, microsecond or 0, tzinfo), tz=tzinfo) - elif is_integer_object(offset): + elif is_integer_object(freq): # User passed positional arguments: # Timestamp(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]]) - return Timestamp(datetime(ts_input, offset, tz, unit or 0, + return Timestamp(datetime(ts_input, freq, tz, unit or 0, year or 0, month or 0, day or 0, hour), tz=hour) ts = convert_to_tsobject(ts_input, tz, unit, 0, 0) @@ -354,9 +366,9 @@ class Timestamp(_Timestamp): if ts.value == NPY_NAT: return NaT - if util.is_string_object(offset): + if util.is_string_object(freq): from pandas.tseries.frequencies import to_offset - offset = to_offset(offset) + freq = to_offset(freq) # make datetime happy ts_base = _Timestamp.__new__(cls, ts.dts.year, ts.dts.month, @@ -365,7 +377,7 @@ class Timestamp(_Timestamp): # fill out rest of data ts_base.value = ts.value - ts_base.offset = offset + ts_base.freq = freq ts_base.nanosecond = ts.dts.ps / 1000 return ts_base @@ -433,16 +445,18 @@ class Timestamp(_Timestamp): return self.tzinfo @property - def freq(self): - return self.offset + def offset(self): + warnings.warn(".offset is deprecated. Use .freq instead", + FutureWarning) + return self.freq def __setstate__(self, state): self.value = state[0] - self.offset = state[1] + self.freq = state[1] self.tzinfo = state[2] def __reduce__(self): - object_state = self.value, self.offset, self.tzinfo + object_state = self.value, self.freq, self.tzinfo return (Timestamp, object_state) def to_period(self, freq=None): @@ -491,7 +505,7 @@ class Timestamp(_Timestamp): @property def freqstr(self): - return getattr(self.offset, 'freqstr', self.offset) + return getattr(self.freq, 'freqstr', self.freq) @property def is_month_start(self): @@ -602,7 +616,7 @@ class Timestamp(_Timestamp): def replace(self, **kwds): return Timestamp(datetime.replace(self, **kwds), - offset=self.offset) + freq=self.freq) def to_pydatetime(self, warn=True): """ @@ -911,16 +925,6 @@ cdef inline bint _is_multiple(int64_t us, int64_t mult): return us % mult == 0 -def apply_offset(ndarray[object] values, object offset): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] new_values - object boxed - - result = np.empty(n, dtype='M8[ns]') - new_values = result.view('i8') - - cdef inline bint _cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1: if op == Py_EQ: return lhs == rhs @@ -955,7 +959,7 @@ cdef str _NDIM_STRING = "ndim" cdef class _Timestamp(datetime): cdef readonly: int64_t value, nanosecond - object offset # frequency reference + object freq # frequency reference def __hash__(_Timestamp self): if self.nanosecond: @@ -1029,9 +1033,9 @@ cdef class _Timestamp(datetime): pass tz = ", tz='{0}'".format(zone) if zone is not None else "" - offset = ", offset='{0}'".format(self.offset.freqstr) if self.offset is not None else "" + freq = ", freq='{0}'".format(self.freq.freqstr) if self.freq is not None else "" - return "Timestamp('{stamp}'{tz}{offset})".format(stamp=stamp, tz=tz, offset=offset) + return 
"Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp, tz=tz, freq=freq) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -1083,17 +1087,17 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') - return Timestamp(self.value + other_int, tz=self.tzinfo, offset=self.offset) + return Timestamp(self.value + other_int, tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): - if self.offset is None: + if self.freq is None: raise ValueError("Cannot add integral value to Timestamp " - "without offset.") - return Timestamp((self.offset * other).apply(self), offset=self.offset) + "without freq.") + return Timestamp((self.freq * other).apply(self), freq=self.freq) elif isinstance(other, timedelta) or hasattr(other, 'delta'): nanos = _delta_to_nanoseconds(other) - result = Timestamp(self.value + nanos, tz=self.tzinfo, offset=self.offset) + result = Timestamp(self.value + nanos, tz=self.tzinfo, freq=self.freq) if getattr(other, 'normalize', False): result = Timestamp(normalize_date(result)) return result From c989570319464fe3b7227e69db9f27601ab7a66d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 10 Jun 2016 03:29:17 +0100 Subject: [PATCH 083/359] CLN: Remove the engine parameter in CSVFormatter and to_csv closes #13419 xref #11274 --- doc/source/whatsnew/v0.19.0.txt | 9 ++ pandas/core/frame.py | 1 - pandas/formats/format.py | 133 ++-------------------------- pandas/tests/formats/test_format.py | 6 -- pandas/tests/frame/test_to_csv.py | 109 +++++++++-------------- 5 files changed, 56 insertions(+), 202 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a6c3c0c5d7f79..3e05003389b54 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -436,6 +436,15 @@ Deprecations - top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) + +.. _whatsnew_0190.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) + + .. 
_whatsnew_0190.performance: Performance Improvements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e804271d8afa9..356abc67b168a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1342,7 +1342,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=columns, header=header, index=index, index_label=index_label, mode=mode, chunksize=chunksize, quotechar=quotechar, - engine=kwds.get("engine"), tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote, diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 0c6a15db4ccfe..cc46ed57aeff0 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -30,7 +30,6 @@ import itertools import csv -import warnings common_docstring = """ Parameters @@ -1326,15 +1325,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, compression=None, quoting=None, line_terminator='\n', - chunksize=None, engine=None, tupleize_cols=False, - quotechar='"', date_format=None, doublequote=True, - escapechar=None, decimal='.'): - - if engine is not None: - warnings.warn("'engine' keyword is deprecated and will be " - "removed in a future version", FutureWarning, - stacklevel=3) - self.engine = engine # remove for 0.18 + chunksize=None, tupleize_cols=False, quotechar='"', + date_format=None, doublequote=True, escapechar=None, + decimal='.'): + self.obj = obj if path_or_buf is None: @@ -1369,11 +1363,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.date_format = date_format - # GH3457 - if not self.obj.columns.is_unique and engine == 'python': - raise NotImplementedError("columns.is_unique == False not " - "supported with engine='python'") - self.tupleize_cols = tupleize_cols self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and not self.tupleize_cols) @@ -1430,108 +1419,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if not index: self.nlevels = 0 - # original python implem. 
of df.to_csv - # invoked by df.to_csv(engine=python) - def _helper_csv(self, writer, na_rep=None, cols=None, header=True, - index=True, index_label=None, float_format=None, - date_format=None): - if cols is None: - cols = self.columns - - has_aliases = isinstance(header, (tuple, list, np.ndarray, Index)) - if has_aliases or header: - if index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(self.obj.index, MultiIndex): - index_label = [] - for i, name in enumerate(self.obj.index.names): - if name is None: - name = '' - index_label.append(name) - else: - index_label = self.obj.index.name - if index_label is None: - index_label = [''] - else: - index_label = [index_label] - elif not isinstance(index_label, - (list, tuple, np.ndarray, Index)): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] - - if has_aliases: - if len(header) != len(cols): - raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(header)))) - else: - write_cols = header - else: - write_cols = cols - encoded_cols = list(write_cols) - - writer.writerow(encoded_labels + encoded_cols) - else: - encoded_cols = list(cols) - writer.writerow(encoded_cols) - - if date_format is None: - date_formatter = lambda x: Timestamp(x)._repr_base - else: - - def strftime_with_nulls(x): - x = Timestamp(x) - if notnull(x): - return x.strftime(date_format) - - date_formatter = lambda x: strftime_with_nulls(x) - - data_index = self.obj.index - - if isinstance(self.obj.index, PeriodIndex): - data_index = self.obj.index.to_timestamp() - - if isinstance(data_index, DatetimeIndex) and date_format is not None: - data_index = Index([date_formatter(x) for x in data_index]) - - values = self.obj.copy() - values.index = data_index - values.columns = values.columns.to_native_types( - na_rep=na_rep, float_format=float_format, date_format=date_format, - quoting=self.quoting) - values = values[cols] - - series = {} - for k, v in compat.iteritems(values._series): - series[k] = v._values - - nlevels = getattr(data_index, 'nlevels', 1) - for j, idx in enumerate(data_index): - row_fields = [] - if index: - if nlevels == 1: - row_fields = [idx] - else: # handle MultiIndex - row_fields = list(idx) - for i, col in enumerate(cols): - val = series[col][j] - if lib.checknull(val): - val = na_rep - - if float_format is not None and com.is_float(val): - val = float_format % val - elif isinstance(val, (np.datetime64, Timestamp)): - val = date_formatter(val) - - row_fields.append(val) - - writer.writerow(row_fields) - def save(self): # create the writer & save if hasattr(self.path_or_buf, 'write'): @@ -1555,17 +1442,7 @@ def save(self): else: self.writer = csv.writer(f, **writer_kwargs) - if self.engine == 'python': - # to be removed in 0.13 - self._helper_csv(self.writer, na_rep=self.na_rep, - float_format=self.float_format, - cols=self.cols, header=self.header, - index=self.index, - index_label=self.index_label, - date_format=self.date_format) - - else: - self._save() + self._save() finally: if close: diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index c5e9c258b293a..7a282e7eb14ad 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3329,12 +3329,6 @@ def test_to_csv_date_format(self): self.assertEqual(df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d'), expected_ymd_sec) - # deprecation GH11274 - def 
test_to_csv_engine_kw_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]}) - df.to_csv(engine='python') - def test_period(self): # GH 12615 df = pd.DataFrame({'A': pd.period_range('2013-01', diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index c23702ef46ad2..55c7ebb183ce5 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -10,7 +10,7 @@ from pandas.compat import (lmap, range, lrange, StringIO, u) from pandas.parser import CParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, - date_range, read_csv, compat) + date_range, read_csv, compat, to_datetime) import pandas as pd from pandas.util.testing import (assert_almost_equal, @@ -139,7 +139,7 @@ def test_to_csv_from_csv5(self): self.tzframe.to_csv(path) result = pd.read_csv(path, index_col=0, parse_dates=['A']) - converter = lambda c: pd.to_datetime(result[c]).dt.tz_localize( + converter = lambda c: to_datetime(result[c]).dt.tz_localize( 'UTC').dt.tz_convert(self.tzframe[c].dt.tz) result['B'] = converter('B') result['C'] = converter('C') @@ -162,15 +162,6 @@ def test_to_csv_cols_reordering(self): assert_frame_equal(df[cols], rs_c, check_names=False) - def test_to_csv_legacy_raises_on_dupe_cols(self): - df = mkdf(10, 3) - df.columns = ['a', 'a', 'b'] - with ensure_clean() as path: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - self.assertRaises(NotImplementedError, - df.to_csv, path, engine='python') - def test_to_csv_new_dupe_cols(self): import pandas as pd @@ -712,7 +703,6 @@ def test_to_csv_dups_cols(self): cols.extend([0, 1, 2]) df.columns = cols - from pandas import to_datetime with ensure_clean() as filename: df.to_csv(filename) result = read_csv(filename, index_col=0) @@ -993,72 +983,57 @@ def test_to_csv_compression_value_error(self): filename, compression="zip") def test_to_csv_date_format(self): - from pandas import to_datetime with ensure_clean('__tmp_to_csv_date_format__') as path: - for engine in [None, 'python']: - w = FutureWarning if engine == 'python' else None - - dt_index = self.tsframe.index - datetime_frame = DataFrame( - {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) - - with tm.assert_produces_warning(w, check_stacklevel=False): - datetime_frame.to_csv( - path, date_format='%Y%m%d', engine=engine) - - # Check that the data was put in the specified format - test = read_csv(path, index_col=0) - - datetime_frame_int = datetime_frame.applymap( - lambda x: int(x.strftime('%Y%m%d'))) - datetime_frame_int.index = datetime_frame_int.index.map( - lambda x: int(x.strftime('%Y%m%d'))) + dt_index = self.tsframe.index + datetime_frame = DataFrame( + {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) + datetime_frame.to_csv(path, date_format='%Y%m%d') - assert_frame_equal(test, datetime_frame_int) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) - with tm.assert_produces_warning(w, check_stacklevel=False): - datetime_frame.to_csv( - path, date_format='%Y-%m-%d', engine=engine) + datetime_frame_int = datetime_frame.applymap( + lambda x: int(x.strftime('%Y%m%d'))) + datetime_frame_int.index = datetime_frame_int.index.map( + lambda x: int(x.strftime('%Y%m%d'))) - # Check that the data was put in the specified format - test = read_csv(path, index_col=0) - datetime_frame_str = datetime_frame.applymap( - lambda x: x.strftime('%Y-%m-%d')) - datetime_frame_str.index = 
datetime_frame_str.index.map( - lambda x: x.strftime('%Y-%m-%d')) + assert_frame_equal(test, datetime_frame_int) - assert_frame_equal(test, datetime_frame_str) + datetime_frame.to_csv(path, date_format='%Y-%m-%d') - # Check that columns get converted - datetime_frame_columns = datetime_frame.T + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.applymap( + lambda x: x.strftime('%Y-%m-%d')) + datetime_frame_str.index = datetime_frame_str.index.map( + lambda x: x.strftime('%Y-%m-%d')) - with tm.assert_produces_warning(w, check_stacklevel=False): - datetime_frame_columns.to_csv( - path, date_format='%Y%m%d', engine=engine) + assert_frame_equal(test, datetime_frame_str) - test = read_csv(path, index_col=0) + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + datetime_frame_columns.to_csv(path, date_format='%Y%m%d') - datetime_frame_columns = datetime_frame_columns.applymap( - lambda x: int(x.strftime('%Y%m%d'))) - # Columns don't get converted to ints by read_csv - datetime_frame_columns.columns = ( - datetime_frame_columns.columns - .map(lambda x: x.strftime('%Y%m%d'))) + test = read_csv(path, index_col=0) - assert_frame_equal(test, datetime_frame_columns) + datetime_frame_columns = datetime_frame_columns.applymap( + lambda x: int(x.strftime('%Y%m%d'))) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = ( + datetime_frame_columns.columns + .map(lambda x: x.strftime('%Y%m%d'))) - # test NaTs - nat_index = to_datetime( - ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) - nat_frame = DataFrame({'A': nat_index}, index=nat_index) + assert_frame_equal(test, datetime_frame_columns) - with tm.assert_produces_warning(w, check_stacklevel=False): - nat_frame.to_csv( - path, date_format='%Y-%m-%d', engine=engine) + # test NaTs + nat_index = to_datetime( + ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) + nat_frame = DataFrame({'A': nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format='%Y-%m-%d') - test = read_csv(path, parse_dates=[0, 1], index_col=0) + test = read_csv(path, parse_dates=[0, 1], index_col=0) - assert_frame_equal(test, nat_frame) + assert_frame_equal(test, nat_frame) def test_to_csv_with_dst_transitions(self): @@ -1077,7 +1052,7 @@ def test_to_csv_with_dst_transitions(self): # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) - result.index = pd.to_datetime(result.index).tz_localize( + result.index = to_datetime(result.index).tz_localize( 'UTC').tz_convert('Europe/London') assert_frame_equal(result, df) @@ -1089,9 +1064,9 @@ def test_to_csv_with_dst_transitions(self): with ensure_clean('csv_date_format_with_dst') as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) - result.index = pd.to_datetime(result.index).tz_localize( + result.index = to_datetime(result.index).tz_localize( 'UTC').tz_convert('Europe/Paris') - result['idx'] = pd.to_datetime(result['idx']).astype( + result['idx'] = to_datetime(result['idx']).astype( 'datetime64[ns, Europe/Paris]') assert_frame_equal(result, df) From c2cc68d6eb233bc74c6bd032650704030c4b9a9d Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 10 Jul 2016 18:02:26 -0400 Subject: [PATCH 084/359] BUG: Block/DTI doesnt handle tzlocal properly Author: sinhrks Closes #13583 from sinhrks/tzlocal and squashes the following commits: 93f59a3 [sinhrks] BUG: DTI doesnt handle tzlocal properly --- doc/source/whatsnew/v0.19.0.txt | 2 ++ 
pandas/tools/tests/test_merge.py | 12 ++++++++ pandas/tseries/tests/test_timezones.py | 40 ++++++++++++++++++++++++++ pandas/tslib.pyx | 7 +++-- 4 files changed, 58 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 3e05003389b54..70c466ed51681 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -492,6 +492,8 @@ Bug Fixes - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) - Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) +- Bug in ``.tz_localize`` with ``dateutil.tz.tzlocal`` may return incorrect result (:issue:`13583`) +- Bug in ``DatetimeTZDtype`` dtype with ``dateutil.tz.tzlocal`` cannot be regarded as valid dtype (:issue:`13583`) - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) - Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index c8d1bae78dad3..6c448de741e0c 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1263,6 +1263,18 @@ def test_concat_tz_series_with_datetimelike(self): result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + def test_concat_tz_series_tzlocal(self): + # GH 13583 + tm._skip_if_no_dateutil() + import dateutil + x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] + y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y)) + self.assertEqual(result.dtype, 'datetime64[ns, tzlocal()]') + def test_concat_period_series(self): x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index d68ff793c9b6a..71a041d5139a2 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -1061,6 +1061,46 @@ def test_tslib_tz_convert_dst(self): self.assert_numpy_array_equal(idx.hour, np.array([4, 4], dtype=np.int32)) + def test_tzlocal(self): + # GH 13583 + ts = Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()) + self.assertEqual(ts.tz, dateutil.tz.tzlocal()) + self.assertTrue("tz='tzlocal()')" in repr(ts)) + + tz = tslib.maybe_get_tz('tzlocal()') + self.assertEqual(tz, dateutil.tz.tzlocal()) + + # get offset using normal datetime for test + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = offset.total_seconds() * 1000000000 + self.assertEqual(ts.value + offset, Timestamp('2011-01-01').value) + + def test_tz_localize_tzlocal(self): + # GH 13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = 
date_range(start='2001-01-01', end='2001-03-01') + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + def test_tz_convert_tzlocal(self): + # GH 13583 + # tz_convert doesn't affect to internal + dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + class TestTimeZoneCacheKey(tm.TestCase): def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index e45523be738df..62f8b10e3eea2 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1595,7 +1595,9 @@ cpdef inline object maybe_get_tz(object tz): Otherwise, just return tz. """ if isinstance(tz, string_types): - if tz.startswith('dateutil/'): + if tz == 'tzlocal()': + tz = _dateutil_tzlocal() + elif tz.startswith('dateutil/'): zone = tz[9:] tz = _dateutil_gettz(zone) # On Python 3 on Windows, the filename is not always set correctly. @@ -3771,7 +3773,6 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return np.array([], dtype=np.int64) # Convert to UTC - if _get_zone(tz1) != 'UTC': utc_dates = np.empty(n, dtype=np.int64) if _is_tzlocal(tz1): @@ -3825,7 +3826,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): dts.min, dts.sec, dts.us, tz2) delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 result[i] = v + delta - return result + return result # Convert UTC to other timezone trans, deltas, typ = _get_dst_info(tz2) From 2e8c993d68e6edb5afaa54b0742fac8f01a04abb Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Mon, 11 Jul 2016 10:37:08 +0900 Subject: [PATCH 085/359] BUG: Series contains NaT with object dtype comparison incorrect (#13592) closes #9005 --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/ops.py | 35 +++++---- pandas/indexes/base.py | 20 +++-- pandas/lib.pyx | 12 +-- pandas/tests/series/test_operators.py | 101 ++++++++++++++++++++++---- pandas/tseries/base.py | 2 +- pandas/tseries/tdi.py | 2 +- pandas/tseries/tests/test_base.py | 78 ++++++++++++++++++++ 8 files changed, 208 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 70c466ed51681..706ec903daaa2 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -527,6 +527,8 @@ Bug Fixes - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) +- Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) +- Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 34ab3ae6863b5..0af7b6d80ce0e 100644 --- 
a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -27,7 +27,8 @@ is_integer_dtype, is_categorical_dtype, is_object_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, PerformanceWarning, ABCSeries) + is_bool_dtype, PerformanceWarning, + ABCSeries, ABCIndex) # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -664,6 +665,22 @@ def wrapper(left, right, name=name, na_op=na_op): return wrapper +def _comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + y = lib.list_to_object_array(y) + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y.values + + result = lib.vec_compare(x, y, op) + else: + result = lib.scalar_compare(x, y, op) + return result + + def _comp_method_SERIES(op, name, str_rep, masker=False): """ Wrapper function for Series arithmetic operations, to avoid @@ -680,16 +697,7 @@ def na_op(x, y): return op(y, x) if is_object_dtype(x.dtype): - if isinstance(y, list): - y = lib.list_to_object_array(y) - - if isinstance(y, (np.ndarray, ABCSeries)): - if not is_object_dtype(y.dtype): - result = lib.vec_compare(x, y.astype(np.object_), op) - else: - result = lib.vec_compare(x, y, op) - else: - result = lib.scalar_compare(x, y, op) + result = _comp_method_OBJECT_ARRAY(op, x, y) else: # we want to compare like types @@ -713,12 +721,11 @@ def na_op(x, y): (not isscalar(y) and needs_i8_conversion(y))): if isscalar(y): + mask = isnull(x) y = _index.convert_scalar(x, _values_from_object(y)) else: + mask = isnull(x) | isnull(y) y = y.view('i8') - - mask = isnull(x) - x = x.view('i8') try: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index ad27010714f63..e697dc63c2cdb 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -31,6 +31,7 @@ is_list_like, is_bool_dtype, is_integer_dtype, is_float_dtype, needs_i8_conversion) +from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin from pandas.core.config import get_option @@ -3182,8 +3183,11 @@ def _evaluate_compare(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - func = getattr(self.values, op) - result = func(np.asarray(other)) + if is_object_dtype(self) and self.nlevels == 1: + # don't pass MultiIndex + result = _comp_method_OBJECT_ARRAY(op, self.values, other) + else: + result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index # for now just return the indexing array directly @@ -3196,12 +3200,12 @@ def _evaluate_compare(self, other): return _evaluate_compare - cls.__eq__ = _make_compare('__eq__') - cls.__ne__ = _make_compare('__ne__') - cls.__lt__ = _make_compare('__lt__') - cls.__gt__ = _make_compare('__gt__') - cls.__le__ = _make_compare('__le__') - cls.__ge__ = _make_compare('__ge__') + cls.__eq__ = _make_compare(operator.eq) + cls.__ne__ = _make_compare(operator.ne) + cls.__lt__ = _make_compare(operator.lt) + cls.__gt__ = _make_compare(operator.gt) + cls.__le__ = _make_compare(operator.le) + cls.__ge__ = _make_compare(operator.ge) @classmethod def _add_numericlike_set_methods_disabled(cls): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index a9c7f93097f1b..7cbb502315b64 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -768,12 +768,12 @@ def scalar_compare(ndarray[object] values, object val, object 
op): raise ValueError('Unrecognized operator') result = np.empty(n, dtype=bool).view(np.uint8) - isnull_val = _checknull(val) + isnull_val = checknull(val) if flag == cpython.Py_NE: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = True elif isnull_val: result[i] = True @@ -785,7 +785,7 @@ def scalar_compare(ndarray[object] values, object val, object op): elif flag == cpython.Py_EQ: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = False elif isnull_val: result[i] = False @@ -798,7 +798,7 @@ def scalar_compare(ndarray[object] values, object val, object op): else: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = False elif isnull_val: result[i] = False @@ -864,7 +864,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): x = left[i] y = right[i] - if _checknull(x) or _checknull(y): + if checknull(x) or checknull(y): result[i] = True else: result[i] = cpython.PyObject_RichCompareBool(x, y, flag) @@ -873,7 +873,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): x = left[i] y = right[i] - if _checknull(x) or _checknull(y): + if checknull(x) or checknull(y): result[i] = False else: result[i] = cpython.PyObject_RichCompareBool(x, y, flag) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 6ab382beb7973..9c401e9ce6da8 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -980,24 +980,97 @@ def test_comparison_invalid(self): self.assertRaises(TypeError, lambda: x <= y) def test_more_na_comparisons(self): - left = Series(['a', np.nan, 'c']) - right = Series(['a', np.nan, 'd']) + for dtype in [None, object]: + left = Series(['a', np.nan, 'c'], dtype=dtype) + right = Series(['a', np.nan, 'd'], dtype=dtype) - result = left == right - expected = Series([True, False, False]) - assert_series_equal(result, expected) + result = left == right + expected = Series([True, False, False]) + assert_series_equal(result, expected) - result = left != right - expected = Series([False, True, True]) - assert_series_equal(result, expected) + result = left != right + expected = Series([False, True, True]) + assert_series_equal(result, expected) - result = left == np.nan - expected = Series([False, False, False]) - assert_series_equal(result, expected) + result = left == np.nan + expected = Series([False, False, False]) + assert_series_equal(result, expected) - result = left != np.nan - expected = Series([True, True, True]) - assert_series_equal(result, expected) + result = left != np.nan + expected = Series([True, True, True]) + assert_series_equal(result, expected) + + def test_nat_comparisons(self): + data = [([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')], + [pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]), + + ([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')], + [pd.NaT, pd.NaT, pd.Timedelta('3 days')]), + + ([pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')], + [pd.NaT, pd.NaT, pd.Period('2011-03', freq='M')])] + + # add lhs / rhs switched data + data = data + [(r, l) for l, r in data] + + for l, r in data: + for dtype in [None, object]: + left = Series(l, dtype=dtype) + + # Series, Index + for right in [Series(r, dtype=dtype), Index(r, dtype=dtype)]: + expected = Series([False, False, True]) + assert_series_equal(left == right, expected) + + expected = Series([True, True, False]) + assert_series_equal(left != right, expected) 
+ + expected = Series([False, False, False]) + assert_series_equal(left < right, expected) + + expected = Series([False, False, False]) + assert_series_equal(left > right, expected) + + expected = Series([False, False, True]) + assert_series_equal(left >= right, expected) + + expected = Series([False, False, True]) + assert_series_equal(left <= right, expected) + + def test_nat_comparisons_scalar(self): + data = [[pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')], + + [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')], + + [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')]] + + for l in data: + for dtype in [None, object]: + left = Series(l, dtype=dtype) + + expected = Series([False, False, False]) + assert_series_equal(left == pd.NaT, expected) + assert_series_equal(pd.NaT == left, expected) + + expected = Series([True, True, True]) + assert_series_equal(left != pd.NaT, expected) + assert_series_equal(pd.NaT != left, expected) + + expected = Series([False, False, False]) + assert_series_equal(left < pd.NaT, expected) + assert_series_equal(pd.NaT > left, expected) + assert_series_equal(left <= pd.NaT, expected) + assert_series_equal(pd.NaT >= left, expected) + + assert_series_equal(left > pd.NaT, expected) + assert_series_equal(pd.NaT < left, expected) + assert_series_equal(left >= pd.NaT, expected) + assert_series_equal(pd.NaT <= left, expected) def test_comparison_different_length(self): a = Series(['a', 'b', 'c']) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 2e3d1ace9734c..4bafac873ea09 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -142,7 +142,7 @@ def _evaluate_compare(self, other, op): other = type(self)(other) # compare - result = getattr(self.asi8, op)(other.asi8) + result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 84f357481a28e..af4c46e2d16fa 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -36,7 +36,7 @@ def _td_index_cmp(opname, nat_result=False): def wrapper(self, other): func = getattr(super(TimedeltaIndex, self), opname) - if _is_convertible_to_td(other): + if _is_convertible_to_td(other) or other is tslib.NaT: other = _to_m8(other) result = func(other) if com.isnull(other): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 17b6dd12a5c02..68cea17ba3fc9 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -458,6 +458,32 @@ def test_sub_period(self): with tm.assertRaises(TypeError): p - idx + def test_comp_nat(self): + left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')]) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, 
expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: @@ -1238,6 +1264,32 @@ def test_addition_ops(self): expected = Timestamp('20130102') self.assertEqual(result, expected) + def test_comp_nat(self): + left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')]) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 @@ -2039,6 +2091,32 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_comp_nat(self): + left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, + pd.Period('2011-01-03')]) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) From 5605f99105b6a97fb950fbebbf83d2cac0b23ef1 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Mon, 11 Jul 2016 16:31:10 +0900 Subject: [PATCH 086/359] CLN/TST: Add tests for nan/nat mixed input (#13477) closes #13467 --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/indexes/base.py | 3 +- pandas/src/inference.pyx | 72 ++++++-- pandas/src/util.pxd | 2 +- pandas/tests/indexes/test_base.py | 43 +++++ pandas/tests/series/test_constructors.py | 21 ++- pandas/tests/test_infer_and_convert.py | 209 +++++++++++++++++++++++ pandas/tslib.pyx | 10 +- 8 files changed, 334 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 706ec903daaa2..046690e28dba5 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -484,8 +484,9 @@ Bug Fixes - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - - Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`) +- Bug in ``Index`` and 
``Series`` created with ``NaN`` and ``NaT`` mixed data may not have ``datetime64`` dtype (:issue:`13324`) +- Bug in ``Index`` and ``Series`` may ignore ``np.datetime64('nat')`` and ``np.timdelta64('nat')`` to infer dtype (:issue:`13324`) - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index e697dc63c2cdb..0bb80be013275 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -243,8 +243,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # don't support boolean explicity ATM pass elif inferred != 'string': - if (inferred.startswith('datetime') or - tslib.is_timestamp_array(subarr)): + if inferred.startswith('datetime'): if (lib.is_datetime_with_singletz_array(subarr) or 'tz' in kwargs): diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 234ac7ea2c60c..9f96037c97c62 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -103,6 +103,7 @@ def infer_dtype(object _values): Py_ssize_t i, n object val ndarray values + bint seen_pdnat = False, seen_val = False if isinstance(_values, np.ndarray): values = _values @@ -141,17 +142,34 @@ def infer_dtype(object _values): values = values.ravel() # try to use a valid value - for i in range(n): - val = util.get_value_1d(values, i) - if not is_null_datetimelike(val): - break + for i from 0 <= i < n: + val = util.get_value_1d(values, i) - if util.is_datetime64_object(val) or val is NaT: + # do not use is_nul_datetimelike to keep + # np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(val): + pass + elif val is NaT: + seen_pdnat = True + else: + seen_val = True + break + + # if all values are nan/NaT + if seen_val is False and seen_pdnat is True: + return 'datetime' + # float/object nan is handled in latter logic + + if util.is_datetime64_object(val): if is_datetime64_array(values): return 'datetime64' elif is_timedelta_or_timedelta64_array(values): return 'timedelta' + elif is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + elif util.is_integer_object(val): # a timedelta will show true here as well if is_timedelta(val): @@ -200,17 +218,15 @@ def infer_dtype(object _values): if is_bytes_array(values): return 'bytes' - elif is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' - elif is_period(val): if is_period_array(values): return 'period' for i in range(n): val = util.get_value_1d(values, i) - if util.is_integer_object(val): + if (util.is_integer_object(val) and + not util.is_timedelta64_object(val) and + not util.is_datetime64_object(val)): return 'mixed-integer' return 'mixed' @@ -237,20 +253,46 @@ def is_possible_datetimelike_array(object arr): return False return seen_datetime or seen_timedelta + cdef inline bint is_null_datetimelike(v): # determine if we have a null for a timedelta/datetime (or integer versions)x if util._checknull(v): return True + elif v is NaT: + return True elif util.is_timedelta64_object(v): return v.view('int64') == iNaT elif util.is_datetime64_object(v): return v.view('int64') == iNaT elif util.is_integer_object(v): return v == iNaT + return False + + +cdef inline bint is_null_datetime64(v): + # determine if we have a null for a datetime (or integer 
versions)x, + # excluding np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + elif util.is_datetime64_object(v): + return v.view('int64') == iNaT + return False + + +cdef inline bint is_null_timedelta64(v): + # determine if we have a null for a timedelta (or integer versions)x, + # excluding np.datetime64('nat') + if util._checknull(v): + return True elif v is NaT: return True + elif util.is_timedelta64_object(v): + return v.view('int64') == iNaT return False + cdef inline bint is_datetime(object o): return PyDateTime_Check(o) @@ -420,7 +462,7 @@ def is_datetime_array(ndarray[object] values): # return False for all nulls for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_datetime64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -437,7 +479,7 @@ def is_datetime64_array(ndarray values): # return False for all nulls for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_datetime64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -481,7 +523,7 @@ def is_timedelta_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -496,7 +538,7 @@ def is_timedelta64_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -512,7 +554,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index 96a23a91cc7c2..fcb5583a0a6e7 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -98,4 +98,4 @@ cdef inline bint _checknan(object val): return not cnp.PyArray_Check(val) and val != val cdef inline bint is_period_object(object val): - return getattr(val,'_typ','_typ') == 'period' + return getattr(val, '_typ', '_typ') == 'period' diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d535eaa238567..67869901b068e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -203,6 +203,49 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) self.assert_index_equal(result, expected) + def test_index_ctor_infer_nan_nat(self): + # GH 13467 + exp = pd.Float64Index([np.nan, np.nan]) + self.assertEqual(exp.dtype, np.float64) + tm.assert_index_equal(Index([np.nan, np.nan]), exp) + tm.assert_index_equal(Index(np.array([np.nan, np.nan])), exp) + + exp = pd.DatetimeIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + tm.assert_index_equal(Index([pd.NaT, pd.NaT]), exp) + tm.assert_index_equal(Index(np.array([pd.NaT, pd.NaT])), exp) + + exp = pd.DatetimeIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + + for data in [[pd.NaT, np.nan], [np.nan, pd.NaT], + [np.nan, np.datetime64('nat')], + [np.datetime64('nat'), np.nan]]: + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + exp = pd.TimedeltaIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'timedelta64[ns]') + + for data in [[np.nan, np.timedelta64('nat')], + [np.timedelta64('nat'), np.nan], + [pd.NaT, np.timedelta64('nat')], + [np.timedelta64('nat'), pd.NaT]]: + + 
tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + # mixed np.datetime64/timedelta64 nat results in object + data = [np.datetime64('nat'), np.timedelta64('nat')] + exp = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + data = [np.timedelta64('nat'), np.datetime64('nat')] + exp = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + def test_index_ctor_infer_periodindex(self): xp = period_range('2012-1-1', freq='M', periods=3) rs = Index(xp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c632704b7c5eb..2a7e8a957977f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -252,6 +252,24 @@ def test_constructor_pass_none(self): expected = Series(index=Index([None])) assert_series_equal(s, expected) + def test_constructor_pass_nan_nat(self): + # GH 13467 + exp = Series([np.nan, np.nan], dtype=np.float64) + self.assertEqual(exp.dtype, np.float64) + tm.assert_series_equal(Series([np.nan, np.nan]), exp) + tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) + + exp = Series([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) + + tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) + + tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) + def test_constructor_cast(self): self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float) @@ -688,8 +706,9 @@ def test_constructor_dtype_timedelta64(self): td = Series([np.timedelta64(300000000), pd.NaT]) self.assertEqual(td.dtype, 'timedelta64[ns]') + # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), tslib.iNaT]) - self.assertEqual(td.dtype, 'timedelta64[ns]') + self.assertEqual(td.dtype, 'object') td = Series([np.timedelta64(300000000), np.nan]) self.assertEqual(td.dtype, 'timedelta64[ns]') diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index a6941369b35be..5f016322f101f 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -180,6 +180,207 @@ def test_datetime(self): index = Index(dates) self.assertEqual(index.inferred_type, 'datetime64') + def test_infer_dtype_datetime(self): + + arr = np.array([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.datetime64('2011-01-01'), + np.datetime64('2011-01-01')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, pd.Timestamp('2011-01-02'), n]) + 
self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # different type of nat + arr = np.array([np.timedelta64('nat'), + np.datetime64('2011-01-02')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.datetime64('2011-01-02'), + np.timedelta64('nat')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + # mixed datetime + arr = np.array([datetime(2011, 1, 1), + pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # should be datetime? + arr = np.array([np.datetime64('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Timestamp('2011-01-02'), + np.datetime64('2011-01-01')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed-integer') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_timedelta(self): + + arr = np.array([pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([np.timedelta64(1, 'D'), + np.timedelta64(2, 'D')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([timedelta(1), timedelta(2)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timedelta('1 days')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, pd.Timedelta('1 days'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + # different type of nat + arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'floating') + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([None, np.nan, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + # pd.NaT + arr = np.array([pd.NaT]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([pd.NaT, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([None, pd.NaT, 
None]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # np.datetime64(nat) + arr = np.array([np.datetime64('nat')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([np.timedelta64('nat')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + # datetime / timedelta mixed + arr = np.array([pd.NaT, np.datetime64('nat'), + np.timedelta64('nat'), np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + self.assertTrue(pd.lib.is_datetime_array(arr)) + self.assertTrue(pd.lib.is_datetime64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_array(arr)) + self.assertFalse(pd.lib.is_timedelta64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + self.assertFalse(pd.lib.is_datetime_array(arr)) + self.assertFalse(pd.lib.is_datetime64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_array(arr)) + self.assertTrue(pd.lib.is_timedelta64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), + np.timedelta64('nat')]) + self.assertFalse(pd.lib.is_datetime_array(arr)) + self.assertFalse(pd.lib.is_datetime64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_array(arr)) + self.assertFalse(pd.lib.is_timedelta64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT]) + self.assertTrue(pd.lib.is_datetime_array(arr)) + self.assertTrue(pd.lib.is_datetime64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_array(arr)) + self.assertTrue(pd.lib.is_timedelta64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, np.nan], dtype=object) + self.assertFalse(pd.lib.is_datetime_array(arr)) + self.assertFalse(pd.lib.is_datetime64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_array(arr)) + self.assertFalse(pd.lib.is_timedelta64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) + def test_date(self): dates = [date(2012, 1, x) for x in range(1, 20)] @@ -244,6 +445,13 @@ def test_categorical(self): result = lib.infer_dtype(Series(arr)) self.assertEqual(result, 'categorical') + def test_is_period(self): + self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) + self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) + self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) + self.assertFalse(lib.is_period(1)) + self.assertFalse(lib.is_period(np.nan)) + class TestConvert(tm.TestCase): @@ -437,6 +645,7 @@ def test_convert_downcast_int64(self): result = lib.downcast_int64(arr, na_values) self.assert_numpy_array_equal(result, expected) + if __name__ == '__main__': import nose diff 
--git a/pandas/tslib.pyx b/pandas/tslib.pyx index 62f8b10e3eea2..fe4de11864522 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -843,15 +843,6 @@ cdef _tz_format(object obj, object zone): except: return ', tz=%s' % zone -def is_timestamp_array(ndarray[object] values): - cdef int i, n = len(values) - if n == 0: - return False - for i in range(n): - if not is_timestamp(values[i]): - return False - return True - cpdef object get_value_box(ndarray arr, object loc): cdef: @@ -957,6 +948,7 @@ cdef str _NDIM_STRING = "ndim" # (see Timestamp class above). This will serve as a C extension type that # shadows the python class, where we do any heavy lifting. cdef class _Timestamp(datetime): + cdef readonly: int64_t value, nanosecond object freq # frequency reference From 2f7fdd07eb0925016a28cf9ff324e351eac0c4df Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Jul 2016 17:02:24 +0200 Subject: [PATCH 087/359] BUG: groupby apply on selected columns yielding scalar (GH13568) (#13585) closes #13568 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/groupby.py | 5 ++++- pandas/tests/test_groupby.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 046690e28dba5..4cc16aac15f8b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -491,6 +491,7 @@ Bug Fixes - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) +- Bug in ``groupby(..).apply(..)`` when the passed function returns scalar values per group (:issue:`13468`). 
- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) - Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) - Bug in ``.tz_localize`` with ``dateutil.tz.tzlocal`` may return incorrect result (:issue:`13583`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8d33c27481d93..077acc1e81444 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3403,11 +3403,14 @@ def first_non_None_value(values): return self._reindex_output(result) + # values are not series or array-like but scalars else: # only coerce dates if we find at least 1 datetime coerce = True if any([isinstance(x, Timestamp) for x in values]) else False - return (Series(values, index=key_index, name=self.name) + # self.name not passed through to Series as the result + # should not take the name of original selection of columns + return (Series(values, index=key_index) ._convert(datetime=True, coerce=coerce)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index efcba758e3b38..a52f22fe2032a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2584,6 +2584,16 @@ def test_apply_series_yield_constant(self): result = self.df.groupby(['A', 'B'])['C'].apply(len) self.assertEqual(result.index.names[:2], ('A', 'B')) + def test_apply_frame_yield_constant(self): + # GH13568 + result = self.df.groupby(['A', 'B']).apply(len) + self.assertTrue(isinstance(result, Series)) + self.assertIsNone(result.name) + + result = self.df.groupby(['A', 'B'])[['C', 'D']].apply(len) + self.assertTrue(isinstance(result, Series)) + self.assertIsNone(result.name) + def test_apply_frame_to_series(self): grouped = self.df.groupby(['A', 'B']) result = grouped.apply(len) From 65849d3c9feea395d6e6a124f7a3b11ecdb943cb Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Mon, 11 Jul 2016 20:21:06 +0200 Subject: [PATCH 088/359] TST: Clean up tests of DataFrame.sort_{index,values} (#13496) * TST: Clean up tests of DataFrame.sort_{index,values} * Factor out Series sorting tests to own file. * Delegate deprecated sort() and order() to their own tests. Before this commit, the `Series.sort_values()` tests relied on deprecated `Series.sort()` and `Series.order()` as the source of truth. However they both merely called `Series.sort_values()` under the hood. This commit consolidates the core test logic against `.sort_values()` directly, while `.sort()` and `.order()` merely check for equivalence with `.sort_values()`. Also removes some no-op assertions that had rotted from the old days of `sort()`/`order()`. * Remove 'by' docstring from Series.sort_values * Document defaults for optional sorting args * Move more sort_values, sort_index tests to be together. * Add test for Series.sort_index(sort_remaining=True) * Improve `sort_values` tests when multiple `by`s Duplicates values in the test DataFrame are necessary to fully test this feature. 
* PEP8 cleanup * Annotate tests with GH issue * Fix indentation - docstring string replacement --- pandas/core/frame.py | 8 +- pandas/core/generic.py | 32 +++--- pandas/core/series.py | 3 +- pandas/tests/frame/test_sorting.py | 116 ++++++++++---------- pandas/tests/series/test_analytics.py | 136 ------------------------ pandas/tests/series/test_sorting.py | 146 ++++++++++++++++++++++++++ 6 files changed, 226 insertions(+), 215 deletions(-) create mode 100644 pandas/tests/series/test_sorting.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 356abc67b168a..b4509c999a5da 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -68,8 +68,12 @@ # --------------------------------------------------------------------- # Docstring templates -_shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame', - axes_single_arg="{0, 1, 'index', 'columns'}") +_shared_doc_kwargs = dict( + axes='index, columns', klass='DataFrame', + axes_single_arg="{0, 1, 'index', 'columns'}", + optional_by=""" + by : str or list of str + Name or list of names which refer to the axis items.""") _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. If None, will attempt to use diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7b271df4085cc..1aadc50b76f95 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -37,10 +37,13 @@ # goal is to be able to define the docs close to function, while still being # able to share _shared_docs = dict() -_shared_doc_kwargs = dict(axes='keywords for axes', klass='NDFrame', - axes_single_arg='int or labels for object', - args_transpose='axes to permute (int or label for' - ' object)') +_shared_doc_kwargs = dict( + axes='keywords for axes', klass='NDFrame', + axes_single_arg='int or labels for object', + args_transpose='axes to permute (int or label for object)', + optional_by=""" + by : str or list of str + Name or list of names which refer to the axis items.""") def is_dictlike(x): @@ -1961,21 +1964,20 @@ def add_suffix(self, suffix): .. versionadded:: 0.17.0 Parameters - ---------- - by : string name or list of names which refer to the axis items - axis : %(axes)s to direct sorting - ascending : bool or list of bool + ----------%(optional_by)s + axis : %(axes)s to direct sorting, default 0 + ascending : bool or list of bool, default True Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - inplace : bool + inplace : bool, default False if True, perform operation in-place - kind : {`quicksort`, `mergesort`, `heapsort`} + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more information. `mergesort` is the only stable algorithm. For DataFrames, this option is only applied when sorting on a single column or label. - na_position : {'first', 'last'} + na_position : {'first', 'last'}, default 'last' `first` puts NaNs at the beginning, `last` puts NaNs at the end Returns @@ -1997,16 +1999,16 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if not None, sort on values in specified index level(s) ascending : boolean, default True Sort ascending vs. descending - inplace : bool + inplace : bool, default False if True, perform operation in-place - kind : {`quicksort`, `mergesort`, `heapsort`} + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. 
See also ndarray.np.sort for more information. `mergesort` is the only stable algorithm. For DataFrames, this option is only applied when sorting on a single column or label. - na_position : {'first', 'last'} + na_position : {'first', 'last'}, default 'last' `first` puts NaNs at the beginning, `last` puts NaNs at the end - sort_remaining : bool + sort_remaining : bool, default True if true and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level diff --git a/pandas/core/series.py b/pandas/core/series.py index e2726bef0bd03..8015670212181 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -62,7 +62,8 @@ axes='index', klass='Series', axes_single_arg="{0, 'index'}", inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - duplicated='Series') + duplicated='Series', + optional_by='') def _coerce_method(converter): diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index ff2159f8b6f40..4d57216c8f870 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -21,75 +21,68 @@ class TestDataFrameSorting(tm.TestCase, TestData): _multiprocess_can_split_ = True - def test_sort_values(self): - # API for 9816 + def test_sort_index(self): + # GH13496 - # sort_index frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - frame.sort(columns='A') - with tm.assert_produces_warning(FutureWarning): - frame.sort() - + # axis=0 : sort rows by index labels unordered = frame.ix[[3, 2, 4, 1]] - expected = unordered.sort_index() - result = unordered.sort_index(axis=0) + expected = frame assert_frame_equal(result, expected) - unordered = frame.ix[:, [2, 1, 3, 0]] - expected = unordered.sort_index(axis=1) + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + assert_frame_equal(result, expected) + # axis=1 : sort columns by column names + unordered = frame.ix[:, [2, 1, 3, 0]] result = unordered.sort_index(axis=1) - assert_frame_equal(result, expected) + assert_frame_equal(result, frame) + + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.ix[:, ::-1] assert_frame_equal(result, expected) - # sortlevel - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + def test_sort_index_multiindex(self): + # GH13496 + + # sort rows by specified level of multi-index + mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) result = df.sort_index(level='A', sort_remaining=False) expected = df.sortlevel('A', sort_remaining=False) assert_frame_equal(result, expected) + # sort columns by specified level of multi-index df = df.T result = df.sort_index(level='A', axis=1, sort_remaining=False) expected = df.sortlevel('A', axis=1, sort_remaining=False) assert_frame_equal(result, expected) - # MI sort, but no by + # MI sort, but no level: sort_level has no effect mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) result = df.sort_index(sort_remaining=False) expected = df.sort_index() assert_frame_equal(result, expected) - def test_sort_index(self): + def test_sort(self): frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) - # axis=0 - unordered = frame.ix[[3, 2, 4, 1]] - sorted_df = unordered.sort_index(axis=0) - expected = frame 
- assert_frame_equal(sorted_df, expected) - - sorted_df = unordered.sort_index(ascending=False) - expected = frame[::-1] - assert_frame_equal(sorted_df, expected) - - # axis=1 - unordered = frame.ix[:, ['D', 'B', 'C', 'A']] - sorted_df = unordered.sort_index(axis=1) - expected = frame - assert_frame_equal(sorted_df, expected) + # 9816 deprecated + with tm.assert_produces_warning(FutureWarning): + frame.sort(columns='A') + with tm.assert_produces_warning(FutureWarning): + frame.sort() - sorted_df = unordered.sort_index(axis=1, ascending=False) - expected = frame.ix[:, ::-1] - assert_frame_equal(sorted_df, expected) + def test_sort_values(self): + frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], + index=[1, 2, 3], columns=list('ABC')) # by column sorted_df = frame.sort_values(by='A') @@ -109,16 +102,17 @@ def test_sort_index(self): sorted_df = frame.sort_values(by=['A'], ascending=[False]) assert_frame_equal(sorted_df, expected) - # check for now - sorted_df = frame.sort_values(by='A') - assert_frame_equal(sorted_df, expected[::-1]) - expected = frame.sort_values(by='A') + # multiple bys + sorted_df = frame.sort_values(by=['B', 'C']) + expected = frame.loc[[2, 1, 3]] assert_frame_equal(sorted_df, expected) - expected = frame.sort_values(by=['A', 'B'], ascending=False) - sorted_df = frame.sort_values(by=['A', 'B']) + sorted_df = frame.sort_values(by=['B', 'C'], ascending=False) assert_frame_equal(sorted_df, expected[::-1]) + sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) + assert_frame_equal(sorted_df, expected) + self.assertRaises(ValueError, lambda: frame.sort_values( by=['A', 'B'], axis=2, inplace=True)) @@ -130,6 +124,25 @@ def test_sort_index(self): with assertRaisesRegexp(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) + def test_sort_values_inplace(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + sorted_df = frame.copy() + sorted_df.sort_values(by='A', inplace=True) + expected = frame.sort_values(by='A') + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by='A', ascending=False, inplace=True) + expected = frame.sort_values(by='A', ascending=False) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True) + expected = frame.sort_values(by=['A', 'B'], ascending=False) + assert_frame_equal(sorted_df, expected) + def test_sort_index_categorical_index(self): df = (DataFrame({'A': np.arange(6, dtype='int64'), @@ -361,25 +374,6 @@ def test_sort_index_different_sortorder(self): result = idf['C'].sort_index(ascending=[1, 0]) assert_series_equal(result, expected['C']) - def test_sort_inplace(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) - - sorted_df = frame.copy() - sorted_df.sort_values(by='A', inplace=True) - expected = frame.sort_values(by='A') - assert_frame_equal(sorted_df, expected) - - sorted_df = frame.copy() - sorted_df.sort_values(by='A', ascending=False, inplace=True) - expected = frame.sort_values(by='A', ascending=False) - assert_frame_equal(sorted_df, expected) - - sorted_df = frame.copy() - sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True) - expected = frame.sort_values(by=['A', 'B'], ascending=False) - assert_frame_equal(sorted_df, expected) - def test_sort_index_duplicates(self): # with 9816, these are all translated to .sort_values diff --git 
a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 0dbff0a028619..d9e2d8096c8d7 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -5,7 +5,6 @@ from distutils.version import LooseVersion import nose -import random from numpy import nan import numpy as np @@ -1418,141 +1417,6 @@ def test_is_monotonic(self): self.assertFalse(s.is_monotonic) self.assertTrue(s.is_monotonic_decreasing) - def test_sort_values(self): - - ts = self.ts.copy() - - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - ts.sort() - - self.assert_series_equal(ts, self.ts.sort_values()) - self.assert_index_equal(ts.index, self.ts.sort_values().index) - - ts.sort_values(ascending=False, inplace=True) - self.assert_series_equal(ts, self.ts.sort_values(ascending=False)) - self.assert_index_equal(ts.index, - self.ts.sort_values(ascending=False).index) - - # GH 5856/5853 - # Series.sort_values operating on a view - df = DataFrame(np.random.randn(10, 4)) - s = df.iloc[:, 0] - - def f(): - s.sort_values(inplace=True) - - self.assertRaises(ValueError, f) - - # test order/sort inplace - # GH6859 - ts1 = self.ts.copy() - ts1.sort_values(ascending=False, inplace=True) - ts2 = self.ts.copy() - ts2.sort_values(ascending=False, inplace=True) - assert_series_equal(ts1, ts2) - - ts1 = self.ts.copy() - ts1 = ts1.sort_values(ascending=False, inplace=False) - ts2 = self.ts.copy() - ts2 = ts.sort_values(ascending=False) - assert_series_equal(ts1, ts2) - - def test_sort_index(self): - rindex = list(self.ts.index) - random.shuffle(rindex) - - random_order = self.ts.reindex(rindex) - sorted_series = random_order.sort_index() - assert_series_equal(sorted_series, self.ts) - - # descending - sorted_series = random_order.sort_index(ascending=False) - assert_series_equal(sorted_series, - self.ts.reindex(self.ts.index[::-1])) - - def test_sort_index_inplace(self): - - # For #11402 - rindex = list(self.ts.index) - random.shuffle(rindex) - - # descending - random_order = self.ts.reindex(rindex) - result = random_order.sort_index(ascending=False, inplace=True) - self.assertIs(result, None, - msg='sort_index() inplace should return None') - assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) - - # ascending - random_order = self.ts.reindex(rindex) - result = random_order.sort_index(ascending=True, inplace=True) - self.assertIs(result, None, - msg='sort_index() inplace should return None') - assert_series_equal(random_order, self.ts) - - def test_sort_API(self): - - # API for 9816 - - # sortlevel - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - s = Series([1, 2], mi) - backwards = s.iloc[[1, 0]] - - res = s.sort_index(level='A') - assert_series_equal(backwards, res) - - # sort_index - rindex = list(self.ts.index) - random.shuffle(rindex) - - random_order = self.ts.reindex(rindex) - sorted_series = random_order.sort_index(level=0) - assert_series_equal(sorted_series, self.ts) - - # compat on axis - sorted_series = random_order.sort_index(axis=0) - assert_series_equal(sorted_series, self.ts) - - self.assertRaises(ValueError, lambda: random_order.sort_values(axis=1)) - - sorted_series = random_order.sort_index(level=0, axis=0) - assert_series_equal(sorted_series, self.ts) - - self.assertRaises(ValueError, - lambda: random_order.sort_index(level=0, axis=1)) - - def test_order(self): - - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - self.ts.order() - - ts = self.ts.copy() - ts[:5] = np.NaN - 
vals = ts.values - - result = ts.sort_values() - self.assertTrue(np.isnan(result[-5:]).all()) - self.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) - - result = ts.sort_values(na_position='first') - self.assertTrue(np.isnan(result[:5]).all()) - self.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) - - # something object-type - ser = Series(['A', 'B'], [1, 2]) - # no failure - ser.sort_values() - - # ascending=False - ordered = ts.sort_values(ascending=False) - expected = np.sort(ts.valid().values)[::-1] - assert_almost_equal(expected, ordered.valid().values) - ordered = ts.sort_values(ascending=False, na_position='first') - assert_almost_equal(expected, ordered.valid().values) - def test_nsmallest_nlargest(self): # float, int, datetime64 (use i8), timedelts64 (same), # object that are numbers, object that are strings diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py new file mode 100644 index 0000000000000..826201adbdb50 --- /dev/null +++ b/pandas/tests/series/test_sorting.py @@ -0,0 +1,146 @@ +# coding=utf-8 + +import numpy as np +import random + +from pandas import (DataFrame, Series, MultiIndex) + +from pandas.util.testing import (assert_series_equal, assert_almost_equal) +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesSorting(TestData, tm.TestCase): + + _multiprocess_can_split_ = True + + def test_sort(self): + + ts = self.ts.copy() + + # 9816 deprecated + with tm.assert_produces_warning(FutureWarning): + ts.sort() # sorts inplace + self.assert_series_equal(ts, self.ts.sort_values()) + + def test_order(self): + + # 9816 deprecated + with tm.assert_produces_warning(FutureWarning): + result = self.ts.order() + self.assert_series_equal(result, self.ts.sort_values()) + + def test_sort_values(self): + + # check indexes are reordered corresponding with the values + ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D']) + expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C']) + result = ser.sort_values() + self.assert_series_equal(expected, result) + + ts = self.ts.copy() + ts[:5] = np.NaN + vals = ts.values + + result = ts.sort_values() + self.assertTrue(np.isnan(result[-5:]).all()) + self.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) + + # na_position + result = ts.sort_values(na_position='first') + self.assertTrue(np.isnan(result[:5]).all()) + self.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) + + # something object-type + ser = Series(['A', 'B'], [1, 2]) + # no failure + ser.sort_values() + + # ascending=False + ordered = ts.sort_values(ascending=False) + expected = np.sort(ts.valid().values)[::-1] + assert_almost_equal(expected, ordered.valid().values) + ordered = ts.sort_values(ascending=False, na_position='first') + assert_almost_equal(expected, ordered.valid().values) + + # inplace=True + ts = self.ts.copy() + ts.sort_values(ascending=False, inplace=True) + self.assert_series_equal(ts, self.ts.sort_values(ascending=False)) + self.assert_index_equal(ts.index, + self.ts.sort_values(ascending=False).index) + + # GH 5856/5853 + # Series.sort_values operating on a view + df = DataFrame(np.random.randn(10, 4)) + s = df.iloc[:, 0] + + def f(): + s.sort_values(inplace=True) + + self.assertRaises(ValueError, f) + + def test_sort_index(self): + rindex = list(self.ts.index) + random.shuffle(rindex) + + random_order = self.ts.reindex(rindex) + sorted_series = random_order.sort_index() + assert_series_equal(sorted_series, self.ts) + + # descending + sorted_series = 
random_order.sort_index(ascending=False) + assert_series_equal(sorted_series, + self.ts.reindex(self.ts.index[::-1])) + + # compat on level + sorted_series = random_order.sort_index(level=0) + assert_series_equal(sorted_series, self.ts) + + # compat on axis + sorted_series = random_order.sort_index(axis=0) + assert_series_equal(sorted_series, self.ts) + + self.assertRaises(ValueError, lambda: random_order.sort_values(axis=1)) + + sorted_series = random_order.sort_index(level=0, axis=0) + assert_series_equal(sorted_series, self.ts) + + self.assertRaises(ValueError, + lambda: random_order.sort_index(level=0, axis=1)) + + def test_sort_index_inplace(self): + + # For #11402 + rindex = list(self.ts.index) + random.shuffle(rindex) + + # descending + random_order = self.ts.reindex(rindex) + result = random_order.sort_index(ascending=False, inplace=True) + self.assertIs(result, None, + msg='sort_index() inplace should return None') + assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) + + # ascending + random_order = self.ts.reindex(rindex) + result = random_order.sort_index(ascending=True, inplace=True) + self.assertIs(result, None, + msg='sort_index() inplace should return None') + assert_series_equal(random_order, self.ts) + + def test_sort_index_multiindex(self): + + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + # implicit sort_remaining=True + res = s.sort_index(level='A') + assert_series_equal(backwards, res) + + # GH13496 + # rows share same level='A': sort has no effect without remaining lvls + res = s.sort_index(level='A', sort_remaining=False) + assert_series_equal(s, res) From 8dbc0f49ccd8617c1ac5c2daf38b55db4335efa1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Jul 2016 09:07:41 +0200 Subject: [PATCH 089/359] DOC: asfreq clarify original NaNs are not filled (GH9963) (#13617) --- pandas/core/generic.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1aadc50b76f95..b4bcae47cbbdf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3916,16 +3916,20 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, def asfreq(self, freq, method=None, how=None, normalize=False): """ - Convert all TimeSeries inside to specified frequency using DateOffset - objects. Optionally provide fill method to pad/backfill missing values. + Convert TimeSeries to specified frequency. + + Optionally provide filling method to pad/backfill missing values. 
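
The clarified behaviour is easiest to see with a small example. The dates and
values below are illustrative assumptions, not taken from this patch; the point
is only that the fill method acts on the rows created by the new frequency, not
on missing values that were already there.

    import numpy as np
    import pandas as pd

    # the original series already contains a NaN at 2016-01-03
    s = pd.Series([1.0, np.nan, 3.0],
                  index=pd.date_range('2016-01-01', periods=3, freq='2D'))

    out = s.asfreq('D', method='ffill')
    # the rows introduced by the daily frequency are forward-filled
    # (e.g. 2016-01-02 becomes 1.0), but the NaN that was already
    # present at 2016-01-03 is left as NaN
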
Parameters ---------- freq : DateOffset object, or string - method : {'backfill', 'bfill', 'pad', 'ffill', None} - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill method + method : {'backfill'/'bfill', 'pad'/'ffill'}, default None + Method to use for filling holes in reindexed Series (note this + does not fill NaNs that already were present): + + * 'pad' / 'ffill': propagate last valid observation forward to next + valid + * 'backfill' / 'bfill': use NEXT valid observation to fill how : {'start', 'end'}, default end For PeriodIndex only, see PeriodIndex.asfreq normalize : bool, default False From 93b7d1319731304f388717f2651f3a278749c517 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 12 Jul 2016 06:51:03 -0400 Subject: [PATCH 090/359] BUG: Invalid Timedelta op may raise ValueError Author: sinhrks Closes #13624 from sinhrks/timedelta_comp and squashes the following commits: 856df95 [sinhrks] BUG: Invalid Timedelta op may raise ValueError --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/tdi.py | 10 +++++--- pandas/tseries/tests/test_timedeltas.py | 32 +++++++++++++++++++++++++ pandas/tseries/timedeltas.py | 4 ++-- pandas/tslib.pyx | 9 ++++++- 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 4cc16aac15f8b..8661d87a617ba 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -543,7 +543,7 @@ Bug Fixes - Bug in ``.to_html``, ``.to_latex`` and ``.to_string`` silently ignore custom datetime formatter passed through the ``formatters`` key word (:issue:`10690`) - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - +- Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index af4c46e2d16fa..dbc0078b67ae7 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -35,16 +35,20 @@ def _td_index_cmp(opname, nat_result=False): """ def wrapper(self, other): + msg = "cannot compare a TimedeltaIndex with type {0}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is tslib.NaT: - other = _to_m8(other) + try: + other = _to_m8(other) + except ValueError: + # failed to parse as timedelta + raise TypeError(msg.format(type(other))) result = func(other) if com.isnull(other): result.fill(nat_result) else: if not com.is_list_like(other): - raise TypeError("cannot compare a TimedeltaIndex with type " - "{0}".format(type(other))) + raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values result = func(other) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index c3bd62849bf82..4f985998d5e20 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -472,6 +472,21 @@ class Other: self.assertTrue(td.__mul__(other) is NotImplemented) self.assertTrue(td.__floordiv__(td) is NotImplemented) + def test_ops_error_str(self): + # GH 13624 + td = Timedelta('1 day') + + for l, r in [(td, 
'a'), ('a', td)]: + + with tm.assertRaises(TypeError): + l + r + + with tm.assertRaises(TypeError): + l > r + + self.assertFalse(l == r) + self.assertTrue(l != r) + def test_fields(self): def check(value): # that we are int/long like @@ -1432,6 +1447,23 @@ def test_comparisons_nat(self): expected = np.array([True, True, True, True, True, False]) self.assert_numpy_array_equal(result, expected) + def test_ops_error_str(self): + # GH 13624 + tdi = TimedeltaIndex(['1 day', '2 days']) + + for l, r in [(tdi, 'a'), ('a', tdi)]: + with tm.assertRaises(TypeError): + l + r + + with tm.assertRaises(TypeError): + l > r + + with tm.assertRaises(TypeError): + l == r + + with tm.assertRaises(TypeError): + l != r + def test_map(self): rng = timedelta_range('1 day', periods=10) diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 7ff5d7adcaa35..5a28218500858 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -74,8 +74,8 @@ def _convert_listlike(arg, box, unit, name=None): value = arg.astype('timedelta64[{0}]'.format( unit)).astype('timedelta64[ns]', copy=False) else: - value = tslib.array_to_timedelta64( - _ensure_object(arg), unit=unit, errors=errors) + value = tslib.array_to_timedelta64(_ensure_object(arg), + unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) if box: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index fe4de11864522..650b4c7979d8d 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2912,10 +2912,17 @@ class Timedelta(_Timedelta): if not self._validate_ops_compat(other): return NotImplemented - other = Timedelta(other) if other is NaT: return NaT + + try: + other = Timedelta(other) + except ValueError: + # failed to parse as timedelta + return NotImplemented + return Timedelta(op(self.value, other.value), unit='ns') + f.__name__ = name return f From dbd53306e4a1c091cf41426d1297648b042c771c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 12 Jul 2016 06:52:54 -0400 Subject: [PATCH 091/359] CLN: Cleanup ops.py Author: sinhrks Closes #13605 from sinhrks/ops_cln2 and squashes the following commits: 729997b [sinhrks] CLN: Cleanup ops.py --- pandas/core/ops.py | 365 +++++++++++++------------- pandas/tests/frame/test_operators.py | 66 ++++- pandas/tests/series/test_operators.py | 74 +++--- 3 files changed, 284 insertions(+), 221 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 0af7b6d80ce0e..3aaca1eea486e 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -20,7 +20,6 @@ from pandas.compat import bind_method import pandas.core.missing as missing import pandas.algos as _algos -import pandas.core.algorithms as algos from pandas.core.common import (is_list_like, notnull, isnull, _values_from_object, _maybe_match_name, needs_i8_conversion, is_datetimelike_v_numeric, @@ -258,30 +257,87 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, exclude=exclude) -class _TimeOp(object): +class _Op(object): + """ - Wrapper around Series datetime/time/timedelta arithmetic operations. - Generally, you should use classmethod ``maybe_convert_for_time_op`` as an - entry point. + Wrapper around Series arithmetic operations. + Generally, you should use classmethod ``_Op.get_op`` as an entry point. + + This validates and coerces lhs and rhs depending on its dtype and + based on op. See _TimeOp also. 
+ + Parameters + ---------- + left : Series + lhs of op + right : object + rhs of op + name : str + name of op + na_op : callable + a function which wraps op """ - fill_value = iNaT + + fill_value = np.nan wrap_results = staticmethod(lambda x: x) dtype = None def __init__(self, left, right, name, na_op): + self.left = left + self.right = right + + self.name = name + self.na_op = na_op + + self.lvalues = left + self.rvalues = right + + @classmethod + def get_op(cls, left, right, name, na_op): + """ + Get op dispatcher, returns _Op or _TimeOp. + + If ``left`` and ``right`` are appropriate for datetime arithmetic with + operation ``name``, processes them and returns a ``_TimeOp`` object + that stores all the required values. Otherwise, it will generate + either a ``_Op``, indicating that the operation is performed via + normal numpy path. + """ + is_timedelta_lhs = is_timedelta64_dtype(left) + is_datetime_lhs = (is_datetime64_dtype(left) or + is_datetime64tz_dtype(left)) - # need to make sure that we are aligning the data if isinstance(left, ABCSeries) and isinstance(right, ABCSeries): - left, right = left.align(right, copy=False) + # avoid repated alignment + if not left.index.equals(right.index): + left, right = left.align(right, copy=False) + + index, lidx, ridx = left.index.join(right.index, how='outer', + return_indexers=True) + # if DatetimeIndex have different tz, convert to UTC + left.index = index + right.index = index + + if not (is_datetime_lhs or is_timedelta_lhs): + return _Op(left, right, name, na_op) + else: + return _TimeOp(left, right, name, na_op) + + +class _TimeOp(_Op): + """ + Wrapper around Series datetime/time/timedelta arithmetic operations. + Generally, you should use classmethod ``_Op.get_op`` as an entry point. + """ + fill_value = iNaT + + def __init__(self, left, right, name, na_op): + super(_TimeOp, self).__init__(left, right, name, na_op) lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name, other=lvalues) - self.name = name - self.na_op = na_op - # left - self.left = left self.is_offset_lhs = self._is_offset(left) self.is_timedelta_lhs = is_timedelta64_dtype(lvalues) self.is_datetime64_lhs = is_datetime64_dtype(lvalues) @@ -292,7 +348,6 @@ def __init__(self, left, right, name, na_op): self.is_floating_lhs = left.dtype.kind == 'f' # right - self.right = right self.is_offset_rhs = self._is_offset(right) self.is_datetime64_rhs = is_datetime64_dtype(rvalues) self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues) @@ -543,26 +598,6 @@ def _is_offset(self, arr_or_obj): else: return False - @classmethod - def maybe_convert_for_time_op(cls, left, right, name, na_op): - """ - if ``left`` and ``right`` are appropriate for datetime arithmetic with - operation ``name``, processes them and returns a ``_TimeOp`` object - that stores all the required values. Otherwise, it will generate - either a ``NotImplementedError`` or ``None``, indicating that the - operation is unsupported for datetimes (e.g., an unsupported r_op) or - that the data is not the right type for time ops. 
- """ - # decide if we can do it - is_timedelta_lhs = is_timedelta64_dtype(left) - is_datetime_lhs = (is_datetime64_dtype(left) or - is_datetime64tz_dtype(left)) - - if not (is_datetime_lhs or is_timedelta_lhs): - return None - - return cls(left, right, name, na_op) - def _arith_method_SERIES(op, name, str_rep, fill_zeros=None, default_axis=None, **eval_kwargs): @@ -615,53 +650,28 @@ def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): return NotImplemented - time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name, - na_op) + converted = _Op.get_op(left, right, name, na_op) - if time_converted is None: - lvalues, rvalues = left, right - dtype = None - wrap_results = lambda x: x - elif time_converted is NotImplemented: - return NotImplemented - else: - left, right = time_converted.left, time_converted.right - lvalues, rvalues = time_converted.lvalues, time_converted.rvalues - dtype = time_converted.dtype - wrap_results = time_converted.wrap_results - na_op = time_converted.na_op + left, right = converted.left, converted.right + lvalues, rvalues = converted.lvalues, converted.rvalues + dtype = converted.dtype + wrap_results = converted.wrap_results + na_op = converted.na_op if isinstance(rvalues, ABCSeries): - rindex = getattr(rvalues, 'index', rvalues) name = _maybe_match_name(left, rvalues) lvalues = getattr(lvalues, 'values', lvalues) rvalues = getattr(rvalues, 'values', rvalues) - if left.index.equals(rindex): - index = left.index - else: - index, lidx, ridx = left.index.join(rindex, how='outer', - return_indexers=True) - - if lidx is not None: - lvalues = algos.take_1d(lvalues, lidx) - - if ridx is not None: - rvalues = algos.take_1d(rvalues, ridx) - - result = wrap_results(safe_na_op(lvalues, rvalues)) - return left._constructor(result, index=index, - name=name, dtype=dtype) + # _Op aligns left and right else: - # scalars + name = left.name if (hasattr(lvalues, 'values') and not isinstance(lvalues, pd.DatetimeIndex)): lvalues = lvalues.values - result = wrap_results(safe_na_op(lvalues, rvalues)) - return left._constructor(result, - index=left.index, name=left.name, - dtype=dtype) - + result = wrap_results(safe_na_op(lvalues, rvalues)) + return left._constructor(result, index=left.index, + name=name, dtype=dtype) return wrapper @@ -895,6 +905,32 @@ def wrapper(self, other): _op_descriptions[reverse_op]['reverse'] = k +_flex_doc_SERIES = """ +%s of series and other, element-wise (binary operator `%s`). + +Equivalent to ``%s``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other: Series or scalar value +fill_value : None or float value, default None (NaN) + Fill missing (NaN) values with this value. If both Series are + missing, the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Returns +------- +result : Series + +See also +-------- +Series.%s +""" + + def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs): op_name = name.replace('__', '') @@ -904,30 +940,8 @@ def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, else: equiv = 'series ' + op_desc['op'] + ' other' - doc = """ - %s of series and other, element-wise (binary operator `%s`). - - Equivalent to ``%s``, but with support to substitute a fill_value for - missing data in one of the inputs. 
- - Parameters - ---------- - other: Series or scalar value - fill_value : None or float value, default None (NaN) - Fill missing (NaN) values with this value. If both Series are - missing, the result will be missing - level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - - Returns - ------- - result : Series - - See also - -------- - Series.%s - """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + doc = _flex_doc_SERIES % (op_desc['desc'], op_name, equiv, + op_desc['reverse']) @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -983,6 +997,75 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): result : DataFrame """ +_flex_doc_FRAME = """ +%s of dataframe and other, element-wise (binary operator `%s`). + +Equivalent to ``%s``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill missing (NaN) values with this value. If both DataFrame + locations are missing, the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Notes +----- +Mismatched indices will be unioned together + +Returns +------- +result : DataFrame + +See also +-------- +DataFrame.%s +""" + + +def _align_method_FRAME(left, right, axis): + """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ + + def to_series(right): + msg = 'Unable to coerce to Series, length must be {0}: given {1}' + if axis is not None and left._get_axis_name(axis) == 'index': + if len(left.index) != len(right): + raise ValueError(msg.format(len(left.index), len(right))) + right = left._constructor_sliced(right, index=left.index) + else: + if len(left.columns) != len(right): + raise ValueError(msg.format(len(left.columns), len(right))) + right = left._constructor_sliced(right, index=left.columns) + return right + + if isinstance(right, (list, tuple)): + right = to_series(right) + + elif isinstance(right, np.ndarray) and right.ndim: # skips np scalar + + if right.ndim == 1: + right = to_series(right) + + elif right.ndim == 2: + if left.shape != right.shape: + msg = ("Unable to coerce to DataFrame, " + "shape must be {0}: given {1}") + raise ValueError(msg.format(left.shape, right.shape)) + + right = left._constructor(right, index=left.index, + columns=left.columns) + else: + msg = 'Unable to coerce to Series/DataFrame, dim must be <= 2: {0}' + raise ValueError(msg.format(right.shape, )) + + return right + def _arith_method_FRAME(op, name, str_rep=None, default_axis='columns', fill_zeros=None, **eval_kwargs): @@ -1027,75 +1110,20 @@ def na_op(x, y): else: equiv = 'dataframe ' + op_desc['op'] + ' other' - doc = """ - %s of dataframe and other, element-wise (binary operator `%s`). - - Equivalent to ``%s``, but with support to substitute a fill_value for - missing data in one of the inputs. - - Parameters - ---------- - other : Series, DataFrame, or constant - axis : {0, 1, 'index', 'columns'} - For Series input, axis to match Series index on - fill_value : None or float value, default None - Fill missing (NaN) values with this value. 
If both DataFrame - locations are missing, the result will be missing - level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - - Notes - ----- - Mismatched indices will be unioned together - - Returns - ------- - result : DataFrame - - See also - -------- - DataFrame.%s - """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + doc = _flex_doc_FRAME % (op_desc['desc'], op_name, equiv, + op_desc['reverse']) else: doc = _arith_doc_FRAME % name @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): + + other = _align_method_FRAME(self, other, axis) + if isinstance(other, pd.DataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) elif isinstance(other, ABCSeries): return self._combine_series(other, na_op, fill_value, axis, level) - elif isinstance(other, (list, tuple)): - if axis is not None and self._get_axis_name(axis) == 'index': - # TODO: Get all of these to use _constructor_sliced - # casted = self._constructor_sliced(other, index=self.index) - casted = pd.Series(other, index=self.index) - else: - # casted = self._constructor_sliced(other, index=self.columns) - casted = pd.Series(other, index=self.columns) - return self._combine_series(casted, na_op, fill_value, axis, level) - elif isinstance(other, np.ndarray) and other.ndim: # skips np scalar - if other.ndim == 1: - if axis is not None and self._get_axis_name(axis) == 'index': - # casted = self._constructor_sliced(other, - # index=self.index) - casted = pd.Series(other, index=self.index) - else: - # casted = self._constructor_sliced(other, - # index=self.columns) - casted = pd.Series(other, index=self.columns) - return self._combine_series(casted, na_op, fill_value, axis, - level) - elif other.ndim == 2: - # casted = self._constructor(other, index=self.index, - # columns=self.columns) - casted = pd.DataFrame(other, index=self.index, - columns=self.columns) - return self._combine_frame(casted, na_op, fill_value, level) - else: - raise ValueError("Incompatible argument shape: %s" % - (other.shape, )) else: if fill_value is not None: self = self.fillna(fill_value) @@ -1135,39 +1163,14 @@ def na_op(x, y): @Appender('Wrapper for flexible comparison methods %s' % name) def f(self, other, axis=default_axis, level=None): + + other = _align_method_FRAME(self, other, axis) + if isinstance(other, pd.DataFrame): # Another DataFrame return self._flex_compare_frame(other, na_op, str_rep, level) elif isinstance(other, ABCSeries): return self._combine_series(other, na_op, None, axis, level) - - elif isinstance(other, (list, tuple)): - if axis is not None and self._get_axis_name(axis) == 'index': - casted = pd.Series(other, index=self.index) - else: - casted = pd.Series(other, index=self.columns) - - return self._combine_series(casted, na_op, None, axis, level) - - elif isinstance(other, np.ndarray): - if other.ndim == 1: - if axis is not None and self._get_axis_name(axis) == 'index': - casted = pd.Series(other, index=self.index) - else: - casted = pd.Series(other, index=self.columns) - - return self._combine_series(casted, na_op, None, axis, level) - - elif other.ndim == 2: - casted = pd.DataFrame(other, index=self.index, - columns=self.columns) - - return self._flex_compare_frame(casted, na_op, str_rep, level) - - else: - raise ValueError("Incompatible argument shape: %s" % - (other.shape, )) - else: return self._combine_const(other, na_op) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 
ee7c296f563f0..e2e0f568e4098 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -417,10 +417,11 @@ def test_arith_flex_frame(self): # ndim >= 3 ndim_5 = np.ones(self.frame.shape + (3, 4, 5)) - with assertRaisesRegexp(ValueError, 'shape'): + msg = "Unable to coerce to Series/DataFrame" + with assertRaisesRegexp(ValueError, msg): f(self.frame, ndim_5) - with assertRaisesRegexp(ValueError, 'shape'): + with assertRaisesRegexp(ValueError, msg): getattr(self.frame, op)(ndim_5) # res_add = self.frame.add(self.frame) @@ -581,8 +582,9 @@ def _check_unaligned_frame(meth, op, df, other): # scalar assert_frame_equal(f(0), o(df, 0)) # NAs + msg = "Unable to coerce to Series/DataFrame" assert_frame_equal(f(np.nan), o(df, np.nan)) - with assertRaisesRegexp(ValueError, 'shape'): + with assertRaisesRegexp(ValueError, msg): f(ndim_5) # Series @@ -662,6 +664,17 @@ def _test_seq(df, idx_ser, col_ser): exp = DataFrame({'col': [False, True, False]}) assert_frame_equal(result, exp) + def test_dti_tz_convert_to_utc(self): + base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], tz='UTC') + idx1 = base.tz_convert('Asia/Tokyo')[:2] + idx2 = base.tz_convert('US/Eastern')[1:] + + df1 = DataFrame({'A': [1, 2]}, index=idx1) + df2 = DataFrame({'A': [1, 1]}, index=idx2) + exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base) + assert_frame_equal(df1 + df2, exp) + def test_arith_flex_series(self): df = self.simple @@ -1176,6 +1189,53 @@ def test_inplace_ops_identity(self): assert_frame_equal(df2, expected) self.assertIs(df._data, df2._data) + def test_alignment_non_pandas(self): + index = ['A', 'B', 'C'] + columns = ['X', 'Y', 'Z'] + df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) + + align = pd.core.ops._align_method_FRAME + + for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3])]: + + tm.assert_series_equal(align(df, val, 'index'), + Series([1, 2, 3], index=df.index)) + tm.assert_series_equal(align(df, val, 'columns'), + Series([1, 2, 3], index=df.columns)) + + # length mismatch + msg = 'Unable to coerce to Series, length must be 3: given 2' + for val in [[1, 2], (1, 2), np.array([1, 2])]: + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'index') + + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'columns') + + val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(align(df, val, 'index'), + DataFrame(val, index=df.index, + columns=df.columns)) + tm.assert_frame_equal(align(df, val, 'columns'), + DataFrame(val, index=df.index, + columns=df.columns)) + + # shape mismatch + msg = 'Unable to coerce to DataFrame, shape must be' + val = np.array([[1, 2, 3], [4, 5, 6]]) + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'index') + + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'columns') + + val = np.zeros((3, 3, 3)) + with tm.assertRaises(ValueError): + align(df, val, 'index') + with tm.assertRaises(ValueError): + align(df, val, 'columns') + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 9c401e9ce6da8..5ebe528ff8cab 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -571,11 +571,11 @@ def run_ops(ops, get_ser, test_ser): td2 / td1 # ## datetime64 ### - dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), Timestamp( - '20120103')]) + dt1 = 
Series([Timestamp('20111230'), Timestamp('20120101'), + Timestamp('20120103')]) dt1.iloc[2] = np.nan - dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), Timestamp( - '20120104')]) + dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), + Timestamp('20120104')]) ops = ['__add__', '__mul__', '__floordiv__', '__truediv__', '__div__', '__pow__', '__radd__', '__rmul__', '__rfloordiv__', '__rtruediv__', '__rdiv__', '__rpow__'] @@ -607,9 +607,10 @@ def run_ops(ops, get_ser, test_ser): ops = ['__mul__', '__floordiv__', '__truediv__', '__div__', '__pow__', '__rmul__', '__rfloordiv__', '__rtruediv__', '__rdiv__', '__rpow__'] - dt1 = Series( - date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + + tz = 'US/Eastern' + dt1 = Series(date_range('2000-01-01 09:00:00', periods=5, + tz=tz), name='foo') dt2 = dt1.copy() dt2.iloc[2] = np.nan td1 = Series(timedelta_range('1 days 1 min', periods=5, freq='H')) @@ -618,58 +619,48 @@ def run_ops(ops, get_ser, test_ser): run_ops(ops, dt1, td1) result = dt1 + td1[0] - expected = ( - dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt2 + td2[0] - expected = ( - dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) # odd numpy behavior with scalar timedeltas if not _np_version_under1p8: result = td1[0] + dt1 - expected = ( - dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = td2[0] + dt2 - expected = ( - dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt1 - td1[0] - expected = ( - dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) self.assertRaises(TypeError, lambda: td1[0] - dt1) result = dt2 - td2[0] - expected = ( - dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) self.assertRaises(TypeError, lambda: td2[0] - dt2) result = dt1 + td1 - expected = ( - dt1.dt.tz_localize(None) + td1).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) + td1).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt2 + td2 - expected = ( - dt2.dt.tz_localize(None) + td2).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) + td2).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt1 - td1 - expected = ( - dt1.dt.tz_localize(None) - td1).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) - td1).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt2 - td2 - expected = ( - dt2.dt.tz_localize(None) - td2).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz) + 
assert_series_equal(result, exp) self.assertRaises(TypeError, lambda: td1 - dt1) self.assertRaises(TypeError, lambda: td2 - dt2) @@ -1555,3 +1546,12 @@ def test_datetime64_with_index(self): df['expected'] = df['date'] - df.index.to_series() df['result'] = df['date'] - df.index assert_series_equal(df['result'], df['expected'], check_names=False) + + def test_dti_tz_convert_to_utc(self): + base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + tz='UTC') + idx1 = base.tz_convert('Asia/Tokyo')[:2] + idx2 = base.tz_convert('US/Eastern')[1:] + + res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) + assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) From 7c357d20f6cd0f379790c200e91075a179ebab75 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 12 Jul 2016 13:19:49 -0400 Subject: [PATCH 092/359] CLN: Removed outtype in DataFrame.to_dict (#13627) Follows up from #8486 in 0.15.0 by removing outtype in DataFrame.to_dict() --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8661d87a617ba..f457b8d4bd1f6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -443,6 +443,7 @@ Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) +- ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) .. _whatsnew_0190.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4509c999a5da..e01fc6dca6be3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -818,7 +818,6 @@ def from_dict(cls, data, orient='columns', dtype=None): return cls(data, index=index, columns=columns, dtype=dtype) - @deprecate_kwarg(old_arg_name='outtype', new_arg_name='orient') def to_dict(self, orient='dict'): """Convert DataFrame to dictionary. 
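
A quick illustration, not part of the patch above and using only long-standing public pandas API, of the ``orient`` layouts that now fully replace the removed ``outtype`` keyword in ``DataFrame.to_dict``:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=['x', 'y'])

    # default orient='dict': {column -> {index -> value}}
    df.to_dict()
    # {'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}

    # orient='records': list of per-row dicts; the index is dropped
    df.to_dict(orient='records')
    # [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]

    # orient='list': {column -> [values]}
    df.to_dict(orient='list')
    # {'a': [1, 2], 'b': [3, 4]}

With the ``deprecate_kwarg`` decorator removed, the old spelling, e.g. ``df.to_dict(outtype='records')``, now fails with a ``TypeError`` (unexpected keyword argument) rather than emitting a deprecation warning.
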
From 27d29158780bc7127bd944fc41eed3b74f38870b Mon Sep 17 00:00:00 2001 From: yui-knk Date: Tue, 12 Jul 2016 22:14:09 -0400 Subject: [PATCH 093/359] CLN: Fix compile time warnings This commit suppresses these warnings warning: comparison of constant -1 with expression\ of type 'PANDAS_DATETIMEUNIT' is always true\ [-Wtautological-constant-out-of-range-compare] Author: yui-knk Closes #13607 from yui-knk/fix_c_warning and squashes the following commits: e9eee1d [yui-knk] CLN: Fix compile time warnings --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/src/datetime/np_datetime_strings.c | 28 ++++------------------- pandas/src/ujson/python/objToJSON.c | 2 +- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f457b8d4bd1f6..fb09f99f2a7fe 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -536,6 +536,7 @@ Bug Fixes - Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) +- Clean some compile time warnings in datetime parsing (:issue:`13607`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index 3a1d37f86cc28..b633d6cde0820 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -460,7 +460,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -503,7 +503,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -975,7 +975,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -1005,11 +1005,6 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { int len = 0; - /* If no unit is provided, return the maximum length */ - if (base == -1) { - return PANDAS_DATETIME_MAX_ISO8601_STRLEN; - } - switch (base) { /* Generic units can only be used to represent NaT */ /*case PANDAS_FR_GENERIC:*/ @@ -1146,28 +1141,13 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, local = 0; } - /* Automatically detect a good unit */ - if (base == -1) { - base = lossless_unit_from_datetimestruct(dts); - /* - * If there's a timezone, use at least minutes precision, - * and never split up hours and minutes by default - */ - if ((base < PANDAS_FR_m && local) || base == PANDAS_FR_h) { - base = PANDAS_FR_m; - } - /* Don't split up dates by default */ - else if (base < PANDAS_FR_D) { - base = PANDAS_FR_D; - } - } /* * Print weeks with the same precision as days. 
* * TODO: Could print weeks with YYYY-Www format if the week * epoch is a Monday. */ - else if (base == PANDAS_FR_W) { + if (base == PANDAS_FR_W) { base = PANDAS_FR_D; } diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 925c18cd23d8f..1080e9548ba56 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -450,7 +450,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) { - int base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; + PANDAS_DATETIMEUNIT base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; if (((PyObjectEncoder*) tc->encoder)->datetimeIso) { From 06103dd7735335e51fcd77a36b2e8a714286a059 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Jul 2016 12:31:44 +0200 Subject: [PATCH 094/359] Pin IPython for doc build to 4.x (see #13639) --- ci/requirements-2.7_DOC_BUILD.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index b87a41df4191d..a07721c75cf34 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,4 +1,4 @@ -ipython +ipython=4 ipykernel sphinx nbconvert From 7dd4091458d9117e57d2ad9ce3126855bd00108c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 13 Jul 2016 07:51:59 -0400 Subject: [PATCH 095/359] CLN: reorg type inference & introspection closes #12503 Author: Jeff Reback Closes #13147 from jreback/types and squashes the following commits: 244649a [Jeff Reback] CLN: reorg type inference & introspection --- ci/lint.sh | 2 +- doc/source/whatsnew/v0.19.0.txt | 22 +- pandas/__init__.py | 2 +- pandas/api/__init__.py | 1 + pandas/api/tests/__init__.py | 0 pandas/api/tests/test_api.py | 213 +++ pandas/api/types/__init__.py | 4 + pandas/compat/numpy/function.py | 3 +- pandas/computation/ops.py | 8 +- pandas/computation/pytables.py | 4 +- pandas/computation/tests/test_eval.py | 19 +- pandas/core/algorithms.py | 145 +- pandas/core/api.py | 2 +- pandas/core/base.py | 31 +- pandas/core/categorical.py | 33 +- pandas/core/common.py | 1656 +------------------ pandas/core/config_init.py | 2 +- pandas/core/convert.py | 127 -- pandas/core/frame.py | 113 +- pandas/core/generic.py | 106 +- pandas/core/groupby.py | 94 +- pandas/core/indexing.py | 49 +- pandas/core/internals.py | 129 +- pandas/core/missing.py | 73 +- pandas/core/nanops.py | 27 +- pandas/core/ops.py | 36 +- pandas/core/panel.py | 26 +- pandas/core/reshape.py | 10 +- pandas/core/series.py | 69 +- pandas/core/strings.py | 25 +- pandas/core/window.py | 41 +- pandas/formats/format.py | 43 +- pandas/formats/printing.py | 4 +- pandas/formats/style.py | 7 +- pandas/indexes/base.py | 79 +- pandas/indexes/category.py | 38 +- pandas/indexes/multi.py | 34 +- pandas/indexes/numeric.py | 22 +- pandas/indexes/range.py | 18 +- pandas/io/common.py | 4 +- pandas/io/data.py | 4 +- pandas/io/excel.py | 26 +- pandas/io/html.py | 4 +- pandas/io/packers.py | 8 +- pandas/io/parsers.py | 44 +- pandas/io/pickle.py | 6 +- pandas/io/pytables.py | 37 +- pandas/io/sql.py | 16 +- pandas/io/stata.py | 16 +- pandas/io/tests/test_sql.py | 15 +- pandas/io/tests/test_stata.py | 2 +- pandas/sparse/array.py | 46 +- pandas/sparse/frame.py | 10 +- pandas/sparse/list.py | 4 +- pandas/sparse/panel.py | 6 +- pandas/sparse/series.py | 8 +- pandas/src/testing.pyx | 12 +- 
pandas/stats/moments.py | 4 +- pandas/stats/ols.py | 2 +- pandas/tests/frame/test_apply.py | 6 +- pandas/tests/frame/test_constructors.py | 3 +- pandas/tests/frame/test_dtypes.py | 8 +- pandas/tests/frame/test_indexing.py | 16 +- pandas/tests/indexing/test_indexing.py | 17 +- pandas/tests/series/test_constructors.py | 13 +- pandas/tests/series/test_datetime_values.py | 7 +- pandas/tests/series/test_indexing.py | 24 +- pandas/tests/series/test_quantile.py | 6 +- pandas/tests/test_base.py | 8 +- pandas/tests/test_categorical.py | 61 +- pandas/tests/test_common.py | 658 +------- pandas/tests/test_generic.py | 4 +- pandas/tests/test_graphics.py | 6 +- pandas/tests/test_groupby.py | 12 +- pandas/tests/test_infer_and_convert.py | 653 -------- pandas/tests/test_lib.py | 1 + pandas/tests/test_multilevel.py | 5 +- pandas/tests/test_nanops.py | 4 +- pandas/tests/test_panel.py | 5 +- pandas/tests/test_panel4d.py | 4 +- pandas/tests/test_strings.py | 7 +- pandas/tests/types/test_cast.py | 193 +++ pandas/tests/types/test_common.py | 22 + pandas/tests/types/test_dtypes.py | 19 +- pandas/tests/types/test_generic.py | 36 +- pandas/tests/types/test_inference.py | 820 +++++++++ pandas/tests/types/test_io.py | 116 ++ pandas/tests/types/test_missing.py | 243 +++ pandas/tests/types/test_types.py | 40 - pandas/tools/merge.py | 46 +- pandas/tools/pivot.py | 6 +- pandas/tools/plotting.py | 67 +- pandas/tools/tile.py | 14 +- pandas/tools/util.py | 19 +- pandas/tseries/base.py | 44 +- pandas/tseries/common.py | 16 +- pandas/tseries/converter.py | 28 +- pandas/tseries/frequencies.py | 23 +- pandas/tseries/index.py | 43 +- pandas/tseries/offsets.py | 4 +- pandas/tseries/period.py | 62 +- pandas/tseries/tdi.py | 33 +- pandas/tseries/tests/test_bin_groupby.py | 6 +- pandas/tseries/tests/test_period.py | 4 +- pandas/tseries/tests/test_resample.py | 5 +- pandas/tseries/tests/test_timeseries.py | 3 +- pandas/tseries/tests/test_timezones.py | 2 +- pandas/tseries/timedeltas.py | 8 +- pandas/tseries/tools.py | 35 +- pandas/tseries/util.py | 4 +- pandas/types/api.py | 121 +- pandas/types/cast.py | 860 ++++++++++ pandas/types/common.py | 448 +++++ pandas/types/concat.py | 47 +- pandas/types/inference.py | 104 ++ pandas/types/missing.py | 394 +++++ pandas/util/testing.py | 20 +- pandas/util/validators.py | 4 +- 118 files changed, 4944 insertions(+), 4134 deletions(-) create mode 100644 pandas/api/__init__.py create mode 100644 pandas/api/tests/__init__.py create mode 100644 pandas/api/tests/test_api.py create mode 100644 pandas/api/types/__init__.py delete mode 100644 pandas/core/convert.py delete mode 100644 pandas/tests/test_infer_and_convert.py create mode 100644 pandas/tests/types/test_cast.py create mode 100644 pandas/tests/types/test_common.py create mode 100644 pandas/tests/types/test_inference.py create mode 100644 pandas/tests/types/test_io.py create mode 100644 pandas/tests/types/test_missing.py delete mode 100644 pandas/tests/types/test_types.py create mode 100644 pandas/types/cast.py create mode 100644 pandas/types/common.py create mode 100644 pandas/types/inference.py create mode 100644 pandas/types/missing.py diff --git a/ci/lint.sh b/ci/lint.sh index a4c960084040f..9f582f72fcdd7 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -8,7 +8,7 @@ RET=0 if [ "$LINT" ]; then echo "Linting" - for path in 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' + for path in 'api' 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 
'computation' 'util' do echo "linting -> pandas/$path" flake8 pandas/$path --filename '*.py' diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index fb09f99f2a7fe..bef02a06135de 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -10,6 +10,7 @@ users upgrade to this version. Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` +- pandas development api, see :ref:`here ` .. contents:: What's new in v0.18.2 :local: @@ -20,6 +21,25 @@ Highlights include: New features ~~~~~~~~~~~~ +.. _whatsnew_0190.dev_api: + +pandas development API +^^^^^^^^^^^^^^^^^^^^^^ + +As part of making pandas APi more uniform and accessible in the future, we have created a standard +sub-package of pandas, ``pandas.api`` to hold public API's. We are starting by exposing type +introspection functions in ``pandas.api.types``. More sub-packages and officially sanctioned API's +will be published in future versions of pandas. + +The following are now part of this API: + +.. ipython:: python + + import pprint + from pandas.api import types + funcs = [ f for f in dir(types) if not f.startswith('_') ] + pprint.pprint(funcs) + .. _whatsnew_0190.enhancements.asof_merge: :func:`merge_asof` for asof-style time-series joining @@ -227,7 +247,7 @@ Other enhancements - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A top-level function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) .. 
_whatsnew_0190.api: diff --git a/pandas/__init__.py b/pandas/__init__.py index 350898c9925e7..2d91c97144e3c 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -16,7 +16,7 @@ if missing_dependencies: raise ImportError("Missing required dependencies {0}".format(missing_dependencies)) - +del hard_dependencies, dependency, missing_dependencies # numpy compat from pandas.compat.numpy import * diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py new file mode 100644 index 0000000000000..fcbf42f6dabc4 --- /dev/null +++ b/pandas/api/__init__.py @@ -0,0 +1 @@ +""" public toolkit API """ diff --git a/pandas/api/tests/__init__.py b/pandas/api/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py new file mode 100644 index 0000000000000..3f6c97441d659 --- /dev/null +++ b/pandas/api/tests/test_api.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- + +import pandas as pd +from pandas.core import common as com +from pandas import api +from pandas.api import types +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +class Base(object): + + def check(self, namespace, expected, ignored=None): + # see which names are in the namespace, minus optional + # ignored ones + # compare vs the expected + + result = sorted([f for f in dir(namespace) if not f.startswith('_')]) + if ignored is not None: + result = sorted(list(set(result) - set(ignored))) + + expected = sorted(expected) + tm.assert_almost_equal(result, expected) + + +class TestPDApi(Base, tm.TestCase): + + # these are optionally imported based on testing + # & need to be ignored + ignored = ['tests', 'rpy', 'sandbox', 'locale'] + + # top-level sub-packages + lib = ['api', 'compat', 'computation', 'core', + 'indexes', 'formats', 'pandas', + 'test', 'tools', 'tseries', + 'types', 'util', 'options', 'io'] + + # top-level packages that are c-imports, should rename to _* + # to avoid naming conflicts + lib_to_rename = ['algos', 'hashtable', 'tslib', 'msgpack', 'sparse', + 'json', 'lib', 'index', 'parser'] + + # these are already deprecated; awaiting removal + deprecated_modules = ['ols', 'stats'] + + # misc + misc = ['IndexSlice', 'NaT'] + + # top-level classes + classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset', + 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index', + 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', + 'Period', 'PeriodIndex', 'RangeIndex', + 'Series', 'SparseArray', 'SparseDataFrame', + 'SparseSeries', 'TimeGrouper', 'Timedelta', + 'TimedeltaIndex', 'Timestamp'] + + # these are already deprecated; awaiting removal + deprecated_classes = ['SparsePanel', 'TimeSeries', 'WidePanel', + 'SparseTimeSeries'] + + # these should be deperecated in the future + deprecated_classes_in_future = ['Panel', 'Panel4D', + 'SparseList', 'Term'] + + # these should be removed from top-level namespace + remove_classes_from_top_level_namespace = ['Expr'] + + # external modules exposed in pandas namespace + modules = ['np', 'datetime', 'datetools'] + + # top-level functions + funcs = ['bdate_range', 'concat', 'crosstab', 'cut', + 'date_range', 'eval', + 'factorize', 'get_dummies', 'get_store', + 'infer_freq', 'isnull', 'lreshape', + 'match', 'melt', 'notnull', 'offsets', + 'merge', 'merge_ordered', 'merge_asof', + 'period_range', + 'pivot', 'pivot_table', 'plot_params', 'qcut', + 'scatter_matrix', + 'show_versions', 'timedelta_range', 'unique', + 'value_counts', 'wide_to_long'] + + # top-level 
option funcs + funcs_option = ['reset_option', 'describe_option', 'get_option', + 'option_context', 'set_option', + 'set_eng_float_format'] + + # top-level read_* funcs + funcs_read = ['read_clipboard', 'read_csv', 'read_excel', 'read_fwf', + 'read_gbq', 'read_hdf', 'read_html', 'read_json', + 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', + 'read_sql_query', 'read_sql_table', 'read_stata', + 'read_table'] + + # top-level to_* funcs + funcs_to = ['to_datetime', 'to_msgpack', + 'to_numeric', 'to_pickle', 'to_timedelta'] + + # these should be deperecated in the future + deprecated_funcs_in_future = ['pnow', 'groupby', 'info'] + + # these are already deprecated; awaiting removal + deprecated_funcs = ['ewma', 'ewmcorr', 'ewmcov', 'ewmstd', 'ewmvar', + 'ewmvol', 'expanding_apply', 'expanding_corr', + 'expanding_count', 'expanding_cov', 'expanding_kurt', + 'expanding_max', 'expanding_mean', 'expanding_median', + 'expanding_min', 'expanding_quantile', + 'expanding_skew', 'expanding_std', 'expanding_sum', + 'expanding_var', 'fama_macbeth', 'rolling_apply', + 'rolling_corr', 'rolling_count', 'rolling_cov', + 'rolling_kurt', 'rolling_max', 'rolling_mean', + 'rolling_median', 'rolling_min', 'rolling_quantile', + 'rolling_skew', 'rolling_std', 'rolling_sum', + 'rolling_var', 'rolling_window', 'ordered_merge'] + + def test_api(self): + + self.check(pd, + self.lib + self.lib_to_rename + self.misc + + self.modules + self.deprecated_modules + + self.classes + self.deprecated_classes + + self.deprecated_classes_in_future + + self.remove_classes_from_top_level_namespace + + self.funcs + self.funcs_option + + self.funcs_read + self.funcs_to + + self.deprecated_funcs + + self.deprecated_funcs_in_future, + self.ignored) + + +class TestApi(Base, tm.TestCase): + + allowed = ['tests', 'types'] + + def test_api(self): + + self.check(api, self.allowed) + + +class TestTypes(Base, tm.TestCase): + + allowed = ['is_any_int_dtype', 'is_bool', 'is_bool_dtype', + 'is_categorical', 'is_categorical_dtype', 'is_complex', + 'is_complex_dtype', 'is_datetime64_any_dtype', + 'is_datetime64_dtype', 'is_datetime64_ns_dtype', + 'is_datetime64tz_dtype', 'is_datetimetz', 'is_dtype_equal', + 'is_extension_type', 'is_float', 'is_float_dtype', + 'is_floating_dtype', 'is_int64_dtype', 'is_integer', + 'is_integer_dtype', 'is_number', 'is_numeric_dtype', + 'is_object_dtype', 'is_scalar', 'is_sparse', + 'is_string_dtype', 'is_timedelta64_dtype', + 'is_timedelta64_ns_dtype', + 'is_re', 'is_re_compilable', + 'is_dict_like', 'is_iterator', + 'is_list_like', 'is_hashable', + 'is_named_tuple', 'is_sequence', + 'pandas_dtype'] + + def test_types(self): + + self.check(types, self.allowed) + + def check_deprecation(self, fold, fnew): + with tm.assert_produces_warning(FutureWarning): + try: + result = fold('foo') + expected = fnew('foo') + self.assertEqual(result, expected) + except TypeError: + self.assertRaises(TypeError, + lambda: fnew('foo')) + except AttributeError: + self.assertRaises(AttributeError, + lambda: fnew('foo')) + + def test_deprecation_core_common(self): + + # test that we are in fact deprecating + # the pandas.core.common introspectors + for t in self.allowed: + self.check_deprecation(getattr(com, t), getattr(types, t)) + + def test_deprecation_core_common_moved(self): + + # these are in pandas.types.common + l = ['is_datetime_arraylike', + 'is_datetime_or_timedelta_dtype', + 'is_datetimelike', + 'is_datetimelike_v_numeric', + 'is_datetimelike_v_object', + 'is_datetimetz', + 'is_int_or_datetime_dtype', + 
'is_period_arraylike', + 'is_string_like', + 'is_string_like_dtype'] + + from pandas.types import common as c + for t in l: + self.check_deprecation(getattr(com, t), getattr(c, t)) + + def test_removed_from_core_common(self): + + for t in ['is_null_datelike_scalar', + 'ensure_float']: + self.assertRaises(AttributeError, lambda: getattr(com, t)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py new file mode 100644 index 0000000000000..ee217543f0420 --- /dev/null +++ b/pandas/api/types/__init__.py @@ -0,0 +1,4 @@ +""" public toolkit API """ + +from pandas.types.api import * # noqa +del np # noqa diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 15bf6d31b7109..adc17c7514832 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,8 @@ from numpy import ndarray from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) -from pandas.core.common import is_bool, is_integer, UnsupportedFunctionCall +from pandas.core.common import UnsupportedFunctionCall +from pandas.types.common import is_integer, is_bool from pandas.compat import OrderedDict diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 7a0743f6b2778..96a04cff9372e 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -7,11 +7,11 @@ import numpy as np +from pandas.types.common import is_list_like, is_scalar import pandas as pd from pandas.compat import PY3, string_types, text_type import pandas.core.common as com from pandas.formats.printing import pprint_thing, pprint_thing_encoded -import pandas.lib as lib from pandas.core.base import StringMixin from pandas.computation.common import _ensure_decoded, _result_type_many from pandas.computation.scope import _DEFAULT_GLOBALS @@ -100,7 +100,7 @@ def update(self, value): @property def isscalar(self): - return lib.isscalar(self._value) + return is_scalar(self._value) @property def type(self): @@ -229,7 +229,7 @@ def _in(x, y): try: return x.isin(y) except AttributeError: - if com.is_list_like(x): + if is_list_like(x): try: return y.isin(x) except AttributeError: @@ -244,7 +244,7 @@ def _not_in(x, y): try: return ~x.isin(y) except AttributeError: - if com.is_list_like(x): + if is_list_like(x): try: return ~y.isin(x) except AttributeError: diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index d6d55d15fec30..e375716b0d606 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,6 +7,8 @@ from datetime import datetime, timedelta import numpy as np import pandas as pd + +from pandas.types.common import is_list_like import pandas.core.common as com from pandas.compat import u, string_types, DeepChainMap from pandas.core.base import StringMixin @@ -127,7 +129,7 @@ def pr(left, right): def conform(self, rhs): """ inplace conform rhs """ - if not com.is_list_like(rhs): + if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): rhs = rhs.ravel() diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 5019dd392a567..066df0521fef6 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -13,6 +13,7 @@ from numpy.random import randn, rand, randint import numpy as np +from pandas.types.common import is_list_like, is_scalar import pandas as pd from 
pandas.core import common as com from pandas import DataFrame, Series, Panel, date_range @@ -200,7 +201,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - scalar_with_in_notin = (lib.isscalar(rhs) and (cmp1 in skip_these or + scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or cmp2 in skip_these)) if scalar_with_in_notin: with tm.assertRaises(TypeError): @@ -253,7 +254,7 @@ def check_operands(left, right, cmp_op): def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) - if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): + if cmp1 in ('in', 'not in') and not is_list_like(rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) @@ -331,7 +332,7 @@ def check_pow(self, lhs, arith1, rhs): expected = self.get_expected_pow_result(lhs, rhs) result = pd.eval(ex, engine=self.engine, parser=self.parser) - if (lib.isscalar(lhs) and lib.isscalar(rhs) and + if (is_scalar(lhs) and is_scalar(rhs) and _is_py3_complex_incompat(result, expected)): self.assertRaises(AssertionError, tm.assert_numpy_array_equal, result, expected) @@ -364,16 +365,16 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) - if lib.isscalar(rhs) and cmp1 in skip_these: + if is_scalar(rhs) and cmp1 in skip_these: self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) else: # compound - if lib.isscalar(lhs) and lib.isscalar(rhs): + if is_scalar(lhs) and is_scalar(rhs): lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) - if lib.isscalar(expected): + if is_scalar(expected): expected = not expected else: expected = ~expected @@ -643,17 +644,17 @@ def test_identical(self): x = 1 result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, 1) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = 1.5 result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, 1.5) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = False result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, False) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = np.array([1]) result = pd.eval('x', engine=self.engine, parser=self.parser) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4b40bce79cbb5..c3ba734353a8d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -7,10 +7,31 @@ import numpy as np from pandas import compat, lib, tslib, _np_version_under1p8 +from pandas.types.cast import _maybe_promote +from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex +from pandas.types.common import (is_integer_dtype, + is_int64_dtype, + is_categorical_dtype, + is_extension_type, + is_datetimetz, + is_period_arraylike, + is_datetime_or_timedelta_dtype, + is_float_dtype, + needs_i8_conversion, + is_categorical, + is_datetime64_dtype, + is_timedelta64_dtype, + is_scalar, + _ensure_platform_int, + _ensure_object, + _ensure_float64, + _ensure_int64, + is_list_like) +from pandas.types.missing import isnull + import pandas.core.common as com import pandas.algos as algos import pandas.hashtable as htable -from pandas.types import api as gt from pandas.compat 
import string_types from pandas.tslib import iNaT @@ -105,12 +126,12 @@ def isin(comps, values): boolean array same length as comps """ - if not com.is_list_like(comps): + if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) - if not com.is_list_like(values): + if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) @@ -126,15 +147,15 @@ def isin(comps, values): f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing - if com.is_datetime64_dtype(comps): + if is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') - elif com.is_timedelta64_dtype(comps): + elif is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') - elif com.is_int64_dtype(comps): + elif is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) @@ -171,20 +192,20 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): vals = np.asarray(values) # localize to UTC - is_datetimetz = com.is_datetimetz(values) - if is_datetimetz: + is_datetimetz_type = is_datetimetz(values) + if is_datetimetz_type: values = DatetimeIndex(values) vals = values.tz_localize(None) - is_datetime = com.is_datetime64_dtype(vals) - is_timedelta = com.is_timedelta64_dtype(vals) + is_datetime = is_datetime64_dtype(vals) + is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel, True) - labels = com._ensure_platform_int(labels) + labels = _ensure_platform_int(labels) uniques = uniques.to_array() @@ -194,7 +215,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) - t.map_locations(com._ensure_object(uniques)) + t.map_locations(_ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ @@ -202,8 +223,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): dtype=object)) for f in [lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types)]]) - sorter = com._ensure_platform_int(t.lookup( - com._ensure_object(ordered))) + sorter = _ensure_platform_int(t.lookup( + _ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -214,7 +235,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.take(sorter) - if is_datetimetz: + if is_datetimetz_type: # reset tz uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( @@ -267,7 +288,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.codes - if com.is_extension_type(values) and not com.is_datetimetz(values): + if is_extension_type(values) and not is_datetimetz(values): # handle Categorical and sparse, # datetime tz can be handeled in ndarray path result = Series(values).values.value_counts(dropna=dropna) @@ -298,9 +319,9 @@ def 
value_counts(values, sort=True, ascending=False, normalize=False, def _value_counts_arraylike(values, dropna=True): - is_datetimetz = com.is_datetimetz(values) - is_period = (isinstance(values, gt.ABCPeriodIndex) or - com.is_period_arraylike(values)) + is_datetimetz_type = is_datetimetz(values) + is_period = (isinstance(values, ABCPeriodIndex) or + is_period_arraylike(values)) orig = values @@ -308,7 +329,7 @@ def _value_counts_arraylike(values, dropna=True): values = Series(values).values dtype = values.dtype - if com.is_datetime_or_timedelta_dtype(dtype) or is_period: + if is_datetime_or_timedelta_dtype(dtype) or is_period: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex @@ -327,8 +348,8 @@ def _value_counts_arraylike(values, dropna=True): keys = keys.astype(dtype) # dtype handling - if is_datetimetz: - if isinstance(orig, gt.ABCDatetimeIndex): + if is_datetimetz_type: + if isinstance(orig, ABCDatetimeIndex): tz = orig.tz else: tz = orig.dt.tz @@ -336,15 +357,15 @@ def _value_counts_arraylike(values, dropna=True): if is_period: keys = PeriodIndex._simple_new(keys, freq=freq) - elif com.is_integer_dtype(dtype): - values = com._ensure_int64(values) + elif is_integer_dtype(dtype): + values = _ensure_int64(values) keys, counts = htable.value_count_scalar64(values, dropna) - elif com.is_float_dtype(dtype): - values = com._ensure_float64(values) + elif is_float_dtype(dtype): + values = _ensure_float64(values) keys, counts = htable.value_count_scalar64(values, dropna) else: - values = com._ensure_object(values) - mask = com.isnull(values) + values = _ensure_object(values) + mask = isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) @@ -366,8 +387,8 @@ def mode(values): constructor = Series dtype = values.dtype - if com.is_integer_dtype(values): - values = com._ensure_int64(values) + if is_integer_dtype(values): + values = _ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): @@ -375,11 +396,11 @@ def mode(values): values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) - elif com.is_categorical_dtype(values): + elif is_categorical_dtype(values): result = constructor(values.mode()) else: - mask = com.isnull(values) - values = com._ensure_object(values) + mask = isnull(values) + values = _ensure_object(values) res = htable.mode_object(values, mask) try: res = sorted(res) @@ -459,7 +480,7 @@ def quantile(x, q, interpolation_method='fraction'): """ x = np.asarray(x) - mask = com.isnull(x) + mask = isnull(x) x = x[~mask] @@ -486,7 +507,7 @@ def _get_score(at): return score - if lib.isscalar(q): + if is_scalar(q): return _get_score(q) else: q = np.asarray(q, np.float64) @@ -593,18 +614,18 @@ def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ - if com.is_float_dtype(dtype): - return f(htable.Float64HashTable, com._ensure_float64) - elif com.is_integer_dtype(dtype): - return f(htable.Int64HashTable, com._ensure_int64) - elif com.is_datetime64_dtype(dtype): + if is_float_dtype(dtype): + return f(htable.Float64HashTable, _ensure_float64) + elif is_integer_dtype(dtype): + return f(htable.Int64HashTable, _ensure_int64) + elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' - return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) - elif 
com.is_timedelta64_dtype(dtype): + return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) + elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' - return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) + return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) else: - return f(htable.PyObjectHashTable, com._ensure_object) + return f(htable.PyObjectHashTable, _ensure_object) _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), @@ -614,20 +635,20 @@ def _hashtable_algo(f, dtype, return_dtype=None): def _get_data_algo(values, func_map): - if com.is_float_dtype(values): + if is_float_dtype(values): f = func_map['float64'] - values = com._ensure_float64(values) + values = _ensure_float64(values) - elif com.needs_i8_conversion(values): + elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') - elif com.is_integer_dtype(values): + elif is_integer_dtype(values): f = func_map['int64'] - values = com._ensure_int64(values) + values = _ensure_int64(values) else: f = func_map['generic'] - values = com._ensure_object(values) + values = _ensure_object(values) return f, values @@ -689,7 +710,7 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): if arr.dtype != out.dtype: arr = arr.astype(out.dtype) if arr.shape[axis] > 0: - arr.take(com._ensure_platform_int(indexer), axis=axis, out=out) + arr.take(_ensure_platform_int(indexer), axis=axis, out=out) if needs_masking: outindexer = [slice(None)] * arr.ndim outindexer[axis] = mask @@ -830,7 +851,7 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): return func def func(arr, indexer, out, fill_value=np.nan): - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) _take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info) @@ -854,7 +875,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, out : ndarray or None, default None Optional output array, must be appropriate type to hold input and fill_value together, if indexer has any -1 value entries; call - common._maybe_promote to determine this type for any fill_value + _maybe_promote to determine this type for any fill_value fill_value : any, default np.nan Fill value to replace -1 values with mask_info : tuple of (ndarray, boolean) @@ -868,24 +889,24 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ # dispatch to internal type takes - if com.is_categorical(arr): + if is_categorical(arr): return arr.take_nd(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif com.is_datetimetz(arr): + elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = com._maybe_promote(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: @@ -931,7 +952,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, 
func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: @@ -957,11 +978,11 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if row_idx is None: row_idx = np.arange(arr.shape[0], dtype=np.int64) else: - row_idx = com._ensure_int64(row_idx) + row_idx = _ensure_int64(row_idx) if col_idx is None: col_idx = np.arange(arr.shape[1], dtype=np.int64) else: - col_idx = com._ensure_int64(col_idx) + col_idx = _ensure_int64(col_idx) indexer = row_idx, col_idx if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() @@ -969,7 +990,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = com._maybe_promote(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: @@ -1032,7 +1053,7 @@ def diff(arr, n, axis=0): na = np.nan dtype = arr.dtype is_timedelta = False - if com.needs_i8_conversion(arr): + if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view('i8') na = tslib.iNaT diff --git a/pandas/core/api.py b/pandas/core/api.py index 0a6992bfebd70..579f21eb4ada8 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,7 +5,7 @@ import numpy as np from pandas.core.algorithms import factorize, match, unique, value_counts -from pandas.core.common import isnull, notnull +from pandas.types.missing import isnull, notnull from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.formats.format import set_eng_float_format diff --git a/pandas/core/base.py b/pandas/core/base.py index 13a6b4b7b4ce0..a0dfebdfde356 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,6 +4,12 @@ from pandas import compat from pandas.compat import builtins import numpy as np + +from pandas.types.missing import isnull +from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndex +from pandas.types.common import (_ensure_object, is_object_dtype, + is_list_like, is_scalar) + from pandas.core import common as com import pandas.core.nanops as nanops import pandas.lib as lib @@ -11,7 +17,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError -from pandas.types import api as gt from pandas.formats.printing import pprint_thing _shared_docs = dict() @@ -121,7 +126,7 @@ def __sizeof__(self): """ if hasattr(self, 'memory_usage'): mem = self.memory_usage(deep=True) - if not lib.isscalar(mem): + if not is_scalar(mem): mem = mem.sum() return int(mem) @@ -293,15 +298,15 @@ def name(self): @property def _selection_list(self): - if not isinstance(self._selection, (list, tuple, gt.ABCSeries, - gt.ABCIndex, np.ndarray)): + if not isinstance(self._selection, (list, tuple, ABCSeries, + ABCIndex, np.ndarray)): return [self._selection] return self._selection @cache_readonly def _selected_obj(self): - if self._selection is None or isinstance(self.obj, gt.ABCSeries): + if self._selection is None or isinstance(self.obj, ABCSeries): return self.obj else: return self.obj[self._selection] @@ -313,7 +318,7 @@ def ndim(self): @cache_readonly def _obj_with_exclusions(self): if 
self._selection is not None and isinstance(self.obj, - gt.ABCDataFrame): + ABCDataFrame): return self.obj.reindex(columns=self._selection_list) if len(self.exclusions) > 0: @@ -325,7 +330,7 @@ def __getitem__(self, key): if self._selection is not None: raise Exception('Column(s) %s already selected' % self._selection) - if isinstance(key, (list, tuple, gt.ABCSeries, gt.ABCIndex, + if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) @@ -553,7 +558,7 @@ def _agg(arg, func): if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], - gt.ABCDataFrame): + ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame @@ -682,7 +687,7 @@ def _gotitem(self, key, ndim, subset=None): **kwargs) self._reset_cache() if subset.ndim == 2: - if lib.isscalar(key) and key in subset or com.is_list_like(key): + if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self @@ -903,7 +908,7 @@ def argmin(self, axis=None): @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ - return com.isnull(self).any() + return isnull(self).any() def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): @@ -980,7 +985,7 @@ def nunique(self, dropna=True): """ uniqs = self.unique() n = len(uniqs) - if dropna and com.isnull(uniqs).any(): + if dropna and isnull(uniqs).any(): n -= 1 return n @@ -1053,7 +1058,7 @@ def memory_usage(self, deep=False): return self.values.memory_usage(deep=deep) v = self.values.nbytes - if deep and com.is_object_dtype(self): + if deep and is_object_dtype(self): v += lib.memory_usage_of_objects(self.values) return v @@ -1195,7 +1200,7 @@ def drop_duplicates(self, keep='first', inplace=False): False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) def duplicated(self, keep='first'): - keys = com._values_from_object(com._ensure_object(self.values)) + keys = com._values_from_object(_ensure_object(self.values)) duplicated = lib.duplicated(keys, keep=keep) try: return self._constructor(duplicated, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f4aeaf9184d09..79d8bfbf57f12 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -7,6 +7,22 @@ from pandas import compat, lib from pandas.compat import u +from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex +from pandas.types.missing import isnull, notnull +from pandas.types.cast import (_possibly_infer_to_datetimelike, + _coerce_indexer_dtype) +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (_ensure_int64, + _ensure_object, + _ensure_platform_int, + is_dtype_equal, + is_datetimelike, + is_categorical_dtype, + is_integer_dtype, is_bool, + is_list_like, is_sequence, + is_scalar) +from pandas.core.common import is_null_slice + from pandas.core.algorithms import factorize, take_1d from pandas.core.base import (PandasObject, PandasDelegate, NoNewAttributesMixin, _shared_docs) @@ -16,13 +32,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) -from pandas.core.common import ( - ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull, - is_dtype_equal, is_categorical_dtype, is_integer_dtype, - _possibly_infer_to_datetimelike, 
is_list_like, - is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64, - _coerce_indexer_dtype) -from pandas.types.api import CategoricalDtype from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option @@ -64,7 +73,7 @@ def f(self, other): # With cat[0], for example, being ``np.int64(1)`` by the time it gets # into this function would become ``np.array(1)``. other = lib.item_from_zerodim(other) - if lib.isscalar(other): + if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) return getattr(self._codes, op)(i) @@ -968,7 +977,7 @@ def shift(self, periods): if codes.ndim > 1: raise NotImplementedError("Categorical with ndim > 1.") if np.prod(codes.shape) and (periods != 0): - codes = np.roll(codes, com._ensure_platform_int(periods), axis=0) + codes = np.roll(codes, _ensure_platform_int(periods), axis=0) if periods > 0: codes[:periods] = -1 else: @@ -1148,7 +1157,7 @@ def value_counts(self, dropna=True): counts : Series """ from numpy import bincount - from pandas.core.common import isnull + from pandas.types.missing import isnull from pandas.core.series import Series from pandas.core.index import CategoricalIndex @@ -1182,7 +1191,7 @@ def get_values(self): Index if datetime / periods """ # if we are a datetime and period index, return Index to keep metadata - if com.is_datetimelike(self.categories): + if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) return np.array(self) @@ -1933,7 +1942,7 @@ def _convert_to_list_like(list_like): if (is_sequence(list_like) or isinstance(list_like, tuple) or isinstance(list_like, types.GeneratorType)): return list(list_like) - elif lib.isscalar(list_like): + elif is_scalar(list_like): return [list_like] else: # is this reached? diff --git a/pandas/core/common.py b/pandas/core/common.py index 28bae362a3411..99dd2e9f5b8a9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2,23 +2,66 @@ Misc tools for implementing data structures """ -import re -import collections -import numbers +import sys +import warnings from datetime import datetime, timedelta from functools import partial import numpy as np -import pandas as pd -import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import (long, zip, map, string_types, - iteritems) -from pandas.types import api as gt -from pandas.types.api import * # noqa +from pandas.compat import long, zip, iteritems from pandas.core.config import get_option +from pandas.types.generic import ABCSeries +from pandas.types.common import _NS_DTYPE, is_integer +from pandas.types.inference import _iterable_not_string +from pandas.types.missing import isnull +from pandas.api import types +from pandas.types import common + +# back-compat of public API +# deprecate these functions +m = sys.modules['pandas.core.common'] +for t in [t for t in dir(types) if not t.startswith('_')]: + + def outer(t=t): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.core.common.{t} is deprecated. 
" + "import from the public API: " + "pandas.api.types.{t} instead".format(t=t), + FutureWarning, stacklevel=2) + return getattr(types, t)(*args, **kwargs) + return wrapper + + setattr(m, t, outer(t)) + +# back-compat for non-public functions +# deprecate these functions +for t in ['is_datetime_arraylike', + 'is_datetime_or_timedelta_dtype', + 'is_datetimelike', + 'is_datetimelike_v_numeric', + 'is_datetimelike_v_object', + 'is_datetimetz', + 'is_int_or_datetime_dtype', + 'is_period_arraylike', + 'is_string_like', + 'is_string_like_dtype']: + + def outer(t=t): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.core.common.{t} is deprecated. " + "These are not longer public API functions, " + "but can be imported from " + "pandas.types.common.{t} instead".format(t=t), + FutureWarning, stacklevel=2) + return getattr(common, t)(*args, **kwargs) + return wrapper + + setattr(m, t, outer(t)) class PandasError(Exception): @@ -58,322 +101,6 @@ def __str__(self): self.class_instance.__class__.__name__) -_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name - for t in ['O', 'int8', 'uint8', 'int16', 'uint16', - 'int32', 'uint32', 'int64', 'uint64']]) - -_NS_DTYPE = np.dtype('M8[ns]') -_TD_DTYPE = np.dtype('m8[ns]') -_INT64_DTYPE = np.dtype(np.int64) -_DATELIKE_DTYPES = set([np.dtype(t) - for t in ['M8[ns]', 'M8[ns]', - 'm8[ns]', 'm8[ns]']]) -_int8_max = np.iinfo(np.int8).max -_int16_max = np.iinfo(np.int16).max -_int32_max = np.iinfo(np.int32).max -_int64_max = np.iinfo(np.int64).max - - -def isnull(obj): - """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) - - Parameters - ---------- - arr : ndarray or object value - Object to check for null-ness - - Returns - ------- - isnulled : array-like of bool or bool - Array or bool indicating whether an object is null or if an array is - given which of the element is null. - - See also - -------- - pandas.notnull: boolean inverse of pandas.isnull - """ - return _isnull(obj) - - -def _isnull_new(obj): - if lib.isscalar(obj): - return lib.checknull(obj) - # hack (for now) because MI registers as ndarray - elif isinstance(obj, pd.MultiIndex): - raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (gt.ABCSeries, np.ndarray, pd.Index)): - return _isnull_ndarraylike(obj) - elif isinstance(obj, gt.ABCGeneric): - return obj._constructor(obj._data.isnull(func=isnull)) - elif isinstance(obj, list) or hasattr(obj, '__array__'): - return _isnull_ndarraylike(np.asarray(obj)) - else: - return obj is None - - -def _isnull_old(obj): - """Detect missing values. Treat None, NaN, INF, -INF as null. - - Parameters - ---------- - arr: ndarray or object value - - Returns - ------- - boolean ndarray or boolean - """ - if lib.isscalar(obj): - return lib.checknull_old(obj) - # hack (for now) because MI registers as ndarray - elif isinstance(obj, pd.MultiIndex): - raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (gt.ABCSeries, np.ndarray, pd.Index)): - return _isnull_ndarraylike_old(obj) - elif isinstance(obj, gt.ABCGeneric): - return obj._constructor(obj._data.isnull(func=_isnull_old)) - elif isinstance(obj, list) or hasattr(obj, '__array__'): - return _isnull_ndarraylike_old(np.asarray(obj)) - else: - return obj is None - - -_isnull = _isnull_new - - -def _use_inf_as_null(key): - """Option change callback for null/inf behaviour - Choose which replacement for numpy.isnan / ~numpy.isfinite is used. 
- - Parameters - ---------- - flag: bool - True means treat None, NaN, INF, -INF as null (old way), - False means None and NaN are null, but INF, -INF are not null - (new way). - - Notes - ----- - This approach to setting global module values is discussed and - approved here: - - * http://stackoverflow.com/questions/4859217/ - programmatically-creating-variables-in-python/4859312#4859312 - """ - flag = get_option(key) - if flag: - globals()['_isnull'] = _isnull_old - else: - globals()['_isnull'] = _isnull_new - - -def _isnull_ndarraylike(obj): - - values = getattr(obj, 'values', obj) - dtype = values.dtype - - if is_string_dtype(dtype): - if is_categorical_dtype(values): - from pandas import Categorical - if not isinstance(values, Categorical): - values = values.values - result = values.isnull() - else: - - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj(values.ravel()) - result[...] = vec.reshape(shape) - - elif is_datetimelike(obj): - # this is the NaT pattern - result = values.view('i8') == tslib.iNaT - else: - result = np.isnan(values) - - # box - if isinstance(obj, gt.ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) - - return result - - -def _isnull_ndarraylike_old(obj): - values = getattr(obj, 'values', obj) - dtype = values.dtype - - if is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj_old(values.ravel()) - result[:] = vec.reshape(shape) - - elif dtype in _DATELIKE_DTYPES: - # this is the NaT pattern - result = values.view('i8') == tslib.iNaT - else: - result = ~np.isfinite(values) - - # box - if isinstance(obj, gt.ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) - - return result - - -def notnull(obj): - """Replacement for numpy.isfinite / ~numpy.isnan which is suitable for use - on object arrays. - - Parameters - ---------- - arr : ndarray or object value - Object to check for *not*-null-ness - - Returns - ------- - isnulled : array-like of bool or bool - Array or bool indicating whether an object is *not* null or if an array - is given which of the element is *not* null. - - See also - -------- - pandas.isnull : boolean inverse of pandas.notnull - """ - res = isnull(obj) - if lib.isscalar(res): - return not res - return ~res - - -def is_null_datelike_scalar(other): - """ test whether the object is a null datelike, e.g. Nat - but guard against passing a non-scalar """ - if other is pd.NaT or other is None: - return True - elif lib.isscalar(other): - - # a timedelta - if hasattr(other, 'dtype'): - return other.view('i8') == tslib.iNaT - elif is_integer(other) and other == tslib.iNaT: - return True - return isnull(other) - return False - - -def array_equivalent(left, right, strict_nan=False): - """ - True if two arrays, left and right, have equal non-NaN elements, and NaNs - in corresponding locations. False otherwise. It is assumed that left and - right are NumPy arrays of the same dtype. The behavior of this function - (particularly with respect to NaNs) is not defined if the dtypes are - different. 
- - Parameters - ---------- - left, right : ndarrays - strict_nan : bool, default False - If True, consider NaN and None to be different. - - Returns - ------- - b : bool - Returns True if the arrays are equivalent. - - Examples - -------- - >>> array_equivalent( - ... np.array([1, 2, np.nan]), - ... np.array([1, 2, np.nan])) - True - >>> array_equivalent( - ... np.array([1, np.nan, 2]), - ... np.array([1, 2, np.nan])) - False - """ - - left, right = np.asarray(left), np.asarray(right) - - # shape compat - if left.shape != right.shape: - return False - - # Object arrays can contain None, NaN and NaT. - # string dtypes must be come to this path for NumPy 1.7.1 compat - if is_string_dtype(left) or is_string_dtype(right): - - if not strict_nan: - # pd.isnull considers NaN and None to be equivalent. - return lib.array_equivalent_object(_ensure_object(left.ravel()), - _ensure_object(right.ravel())) - - for left_value, right_value in zip(left, right): - if left_value is tslib.NaT and right_value is not tslib.NaT: - return False - - elif isinstance(left_value, float) and np.isnan(left_value): - if (not isinstance(right_value, float) or - not np.isnan(right_value)): - return False - else: - if left_value != right_value: - return False - return True - - # NaNs can occur in float and complex arrays. - if is_float_dtype(left) or is_complex_dtype(left): - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - - # numpy will will not allow this type of datetimelike vs integer comparison - elif is_datetimelike_v_numeric(left, right): - return False - - # M8/m8 - elif needs_i8_conversion(left) and needs_i8_conversion(right): - if not is_dtype_equal(left.dtype, right.dtype): - return False - - left = left.view('i8') - right = right.view('i8') - - # NaNs cannot occur otherwise. - try: - return np.array_equal(left, right) - except AttributeError: - # see gh-13388 - # - # NumPy v1.7.1 has a bug in its array_equal - # function that prevents it from correctly - # comparing two arrays with complex dtypes. - # This bug is corrected in v1.8.0, so remove - # this try-except block as soon as we stop - # supporting NumPy versions < 1.8.0 - if not is_dtype_equal(left.dtype, right.dtype): - return False - - left = left.tolist() - right = right.tolist() - - return left == right - - -def _iterable_not_string(x): - return (isinstance(x, collections.Iterable) and - not isinstance(x, compat.string_types)) - - def flatten(l): """Flatten an arbitrarily nested sequence. @@ -398,510 +125,6 @@ def flatten(l): yield el -def _coerce_indexer_dtype(indexer, categories): - """ coerce the indexer input array to the smallest dtype possible """ - l = len(categories) - if l < _int8_max: - return _ensure_int8(indexer) - elif l < _int16_max: - return _ensure_int16(indexer) - elif l < _int32_max: - return _ensure_int32(indexer) - return _ensure_int64(indexer) - - -def _coerce_to_dtypes(result, dtypes): - """ given a dtypes and a result set, coerce the result elements to the - dtypes - """ - if len(result) != len(dtypes): - raise AssertionError("_coerce_to_dtypes requires equal len arrays") - - from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type - - def conv(r, dtype): - try: - if isnull(r): - pass - elif dtype == _NS_DTYPE: - r = lib.Timestamp(r) - elif dtype == _TD_DTYPE: - r = _coerce_scalar_to_timedelta_type(r) - elif dtype == np.bool_: - # messy. non 0/1 integers do not get converted. 
- if is_integer(r) and r not in [0, 1]: - return int(r) - r = bool(r) - elif dtype.kind == 'f': - r = float(r) - elif dtype.kind == 'i': - r = int(r) - except: - pass - - return r - - return [conv(r, dtype) for r, dtype in zip(result, dtypes)] - - -def _infer_fill_value(val): - """ - infer the fill value for the nan/NaT from the provided - scalar/ndarray/list-like if we are a NaT, return the correct dtyped - element to provide proper block construction - """ - - if not is_list_like(val): - val = [val] - val = np.array(val, copy=False) - if is_datetimelike(val): - return np.array('NaT', dtype=val.dtype) - elif is_object_dtype(val.dtype): - dtype = lib.infer_dtype(_ensure_object(val)) - if dtype in ['datetime', 'datetime64']: - return np.array('NaT', dtype=_NS_DTYPE) - elif dtype in ['timedelta', 'timedelta64']: - return np.array('NaT', dtype=_TD_DTYPE) - return np.nan - - -def _infer_dtype_from_scalar(val): - """ interpret the dtype from a scalar """ - - dtype = np.object_ - - # a 1-element ndarray - if isinstance(val, np.ndarray): - if val.ndim != 0: - raise ValueError( - "invalid ndarray passed to _infer_dtype_from_scalar") - - dtype = val.dtype - val = val.item() - - elif isinstance(val, compat.string_types): - - # If we create an empty array using a string to infer - # the dtype, NumPy will only allocate one character per entry - # so this is kind of bad. Alternately we could use np.repeat - # instead of np.empty (but then you still don't want things - # coming out as np.str_! - - dtype = np.object_ - - elif isinstance(val, (np.datetime64, - datetime)) and getattr(val, 'tzinfo', None) is None: - val = lib.Timestamp(val).value - dtype = np.dtype('M8[ns]') - - elif isinstance(val, (np.timedelta64, timedelta)): - val = lib.Timedelta(val).value - dtype = np.dtype('m8[ns]') - - elif is_bool(val): - dtype = np.bool_ - - elif is_integer(val): - if isinstance(val, np.integer): - dtype = type(val) - else: - dtype = np.int64 - - elif is_float(val): - if isinstance(val, np.floating): - dtype = type(val) - else: - dtype = np.float64 - - elif is_complex(val): - dtype = np.complex_ - - return dtype, val - - -def _is_na_compat(arr, fill_value=np.nan): - """ - Parameters - ---------- - arr: a numpy array - fill_value: fill value, default to np.nan - - Returns - ------- - True if we can fill using this fill_value - """ - dtype = arr.dtype - if isnull(fill_value): - return not (is_bool_dtype(dtype) or - is_integer_dtype(dtype)) - return True - - -def _maybe_fill(arr, fill_value=np.nan): - """ - if we have a compatiable fill_value and arr dtype, then fill - """ - if _is_na_compat(arr, fill_value): - arr.fill(fill_value) - return arr - - -def _maybe_promote(dtype, fill_value=np.nan): - - # if we passed an array here, determine the fill value by dtype - if isinstance(fill_value, np.ndarray): - if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): - fill_value = tslib.iNaT - else: - - # we need to change to object type as our - # fill_value is of object type - if fill_value.dtype == np.object_: - dtype = np.dtype(np.object_) - fill_value = np.nan - - # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, (np.datetime64, np.timedelta64)): - # for now: refuse to upcast datetime64 - # (this is because datetime64 will not implicitly upconvert - # to object correctly as of numpy 1.6.1) - if isnull(fill_value): - fill_value = tslib.iNaT - else: - if issubclass(dtype.type, np.datetime64): - try: - fill_value = lib.Timestamp(fill_value).value - except: - # the proper thing to do here 
would probably be to upcast - # to object (but numpy 1.6.1 doesn't do this properly) - fill_value = tslib.iNaT - elif issubclass(dtype.type, np.timedelta64): - try: - fill_value = lib.Timedelta(fill_value).value - except: - # as for datetimes, cannot upcast to object - fill_value = tslib.iNaT - else: - fill_value = tslib.iNaT - elif is_datetimetz(dtype): - if isnull(fill_value): - fill_value = tslib.iNaT - elif is_float(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - dtype = np.float64 - elif is_bool(fill_value): - if not issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif is_integer(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - # upcast to prevent overflow - arr = np.asarray(fill_value) - if arr != arr.astype(dtype): - dtype = arr.dtype - elif is_complex(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, (np.integer, np.floating)): - dtype = np.complex128 - elif fill_value is None: - if is_float_dtype(dtype) or is_complex_dtype(dtype): - fill_value = np.nan - elif is_integer_dtype(dtype): - dtype = np.float64 - fill_value = np.nan - elif is_datetime_or_timedelta_dtype(dtype): - fill_value = tslib.iNaT - else: - dtype = np.object_ - else: - dtype = np.object_ - - # in case we have a string that looked like a number - if is_categorical_dtype(dtype): - pass - elif is_datetimetz(dtype): - pass - elif issubclass(np.dtype(dtype).type, compat.string_types): - dtype = np.object_ - - return dtype, fill_value - - -def _maybe_upcast_putmask(result, mask, other): - """ - A safe version of putmask that potentially upcasts the result - - Parameters - ---------- - result : ndarray - The destination array. This will be mutated in-place if no upcasting is - necessary. - mask : boolean ndarray - other : ndarray or scalar - The source array or value - - Returns - ------- - result : ndarray - changed : boolean - Set to true if the result array was upcasted - """ - - if mask.any(): - # Two conversions for date-like dtypes that can't be done automatically - # in np.place: - # NaN -> NaT - # integer or integer array -> date-like array - if result.dtype in _DATELIKE_DTYPES: - if lib.isscalar(other): - if isnull(other): - other = result.dtype.type('nat') - elif is_integer(other): - other = np.array(other, dtype=result.dtype) - elif is_integer_dtype(other): - other = np.array(other, dtype=result.dtype) - - def changeit(): - - # try to directly set by expanding our array to full - # length of the boolean - try: - om = other[mask] - om_at = om.astype(result.dtype) - if (om == om_at).all(): - new_result = result.values.copy() - new_result[mask] = om_at - result[:] = new_result - return result, False - except: - pass - - # we are forced to change the dtype of the result as the input - # isn't compatible - r, _ = _maybe_upcast(result, fill_value=other, copy=True) - np.place(r, mask, other) - - return r, True - - # we want to decide whether place will work - # if we have nans in the False portion of our mask then we need to - # upcast (possibly), otherwise we DON't want to upcast (e.g. 
if we - # have values, say integers, in the success portion then it's ok to not - # upcast) - new_dtype, _ = _maybe_promote(result.dtype, other) - if new_dtype != result.dtype: - - # we have a scalar or len 0 ndarray - # and its nan and we are changing some values - if (lib.isscalar(other) or - (isinstance(other, np.ndarray) and other.ndim < 1)): - if isnull(other): - return changeit() - - # we have an ndarray and the masking has nans in it - else: - - if isnull(other[mask]).any(): - return changeit() - - try: - np.place(result, mask, other) - except: - return changeit() - - return result, False - - -def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): - """ provide explict type promotion and coercion - - Parameters - ---------- - values : the ndarray that we want to maybe upcast - fill_value : what we want to fill with - dtype : if None, then use the dtype of the values, else coerce to this type - copy : if True always make a copy even if no upcast is required - """ - - if is_extension_type(values): - if copy: - values = values.copy() - else: - if dtype is None: - dtype = values.dtype - new_dtype, fill_value = _maybe_promote(dtype, fill_value) - if new_dtype != values.dtype: - values = values.astype(new_dtype) - elif copy: - values = values.copy() - - return values, fill_value - - -def _possibly_cast_item(obj, item, dtype): - chunk = obj[item] - - if chunk.values.dtype != dtype: - if dtype in (np.object_, np.bool_): - obj[item] = chunk.astype(np.object_) - elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: %s" % dtype) - - -def _possibly_downcast_to_dtype(result, dtype): - """ try to cast to the specified dtype (e.g. convert back to bool/int - or could be an astype of float64->float32 - """ - - if lib.isscalar(result): - return result - - def trans(x): - return x - - if isinstance(dtype, compat.string_types): - if dtype == 'infer': - inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) - if inferred_type == 'boolean': - dtype = 'bool' - elif inferred_type == 'integer': - dtype = 'int64' - elif inferred_type == 'datetime64': - dtype = 'datetime64[ns]' - elif inferred_type == 'timedelta64': - dtype = 'timedelta64[ns]' - - # try to upcast here - elif inferred_type == 'floating': - dtype = 'int64' - if issubclass(result.dtype.type, np.number): - - def trans(x): # noqa - return x.round() - else: - dtype = 'object' - - if isinstance(dtype, compat.string_types): - dtype = np.dtype(dtype) - - try: - - # don't allow upcasts here (except if empty) - if dtype.kind == result.dtype.kind: - if (result.dtype.itemsize <= dtype.itemsize and - np.prod(result.shape)): - return result - - if issubclass(dtype.type, np.floating): - return result.astype(dtype) - elif dtype == np.bool_ or issubclass(dtype.type, np.integer): - - # if we don't have any elements, just astype it - if not np.prod(result.shape): - return trans(result).astype(dtype) - - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - # if we have any nulls, then we are done - if isnull(arr).any() or not np.allclose(arr, - trans(arr).astype(dtype)): - return result - - # a comparable, e.g. 
a Decimal may slip in here - elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, - float, bool)): - return result - - if (issubclass(result.dtype.type, (np.object_, np.number)) and - notnull(result).all()): - new_result = trans(result).astype(dtype) - try: - if np.allclose(new_result, result): - return new_result - except: - - # comparison of an object dtype with a number type could - # hit here - if (new_result == result).all(): - return new_result - - # a datetimelike - elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: - try: - result = result.astype(dtype) - except: - if dtype.tz: - # convert to datetime and change timezone - result = pd.to_datetime(result).tz_localize(dtype.tz) - - except: - pass - - return result - - -def _maybe_convert_string_to_object(values): - """ - - Convert string-like and string-like array to convert object dtype. - This is to avoid numpy to handle the array as str dtype. - """ - if isinstance(values, string_types): - values = np.array([values], dtype=object) - elif (isinstance(values, np.ndarray) and - issubclass(values.dtype.type, (np.string_, np.unicode_))): - values = values.astype(object) - return values - - -def _maybe_convert_scalar(values): - """ - Convert a python scalar to the appropriate numpy dtype if possible - This avoids numpy directly converting according to platform preferences - """ - if lib.isscalar(values): - dtype, values = _infer_dtype_from_scalar(values) - try: - values = dtype(values) - except TypeError: - pass - return values - - -def _lcd_dtypes(a_dtype, b_dtype): - """ return the lcd dtype to hold these types """ - - if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): - return _NS_DTYPE - elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): - return _TD_DTYPE - elif is_complex_dtype(a_dtype): - if is_complex_dtype(b_dtype): - return a_dtype - return np.float64 - elif is_integer_dtype(a_dtype): - if is_integer_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - return np.int64 - return np.float64 - elif is_float_dtype(a_dtype): - if is_float_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - else: - return np.float64 - elif is_integer(b_dtype): - return np.float64 - return np.object - - def _consensus_name_attr(objs): name = objs[0].name for obj in objs[1:]: @@ -909,66 +132,20 @@ def _consensus_name_attr(objs): return None return name -# ---------------------------------------------------------------------- -# Lots of little utilities - - -def _validate_date_like_dtype(dtype): - try: - typ = np.datetime_data(dtype)[0] - except ValueError as e: - raise TypeError('%s' % e) - if typ != 'generic' and typ != 'ns': - raise ValueError('%r is too specific of a frequency, try passing %r' % - (dtype.name, dtype.type.__name__)) - - -def _invalidate_string_dtypes(dtype_set): - """Change string like dtypes to object for - ``DataFrame.select_dtypes()``. - """ - non_string_dtypes = dtype_set - _string_dtypes - if non_string_dtypes != dtype_set: - raise TypeError("string dtypes are not allowed, use 'object' instead") - - -def _get_dtype_from_object(dtype): - """Get a numpy dtype.type-style object. This handles the datetime64[ns] - and datetime64[ns, TZ] compat - - Notes - ----- - If nothing can be found, returns ``object``. 
- """ - # type object from a dtype - if isinstance(dtype, type) and issubclass(dtype, np.generic): - return dtype - elif is_categorical(dtype): - return gt.CategoricalDtype().type - elif is_datetimetz(dtype): - return gt.DatetimeTZDtype(dtype).type - elif isinstance(dtype, np.dtype): # dtype object - try: - _validate_date_like_dtype(dtype) - except TypeError: - # should still pass if we don't have a datelike - pass - return dtype.type - elif isinstance(dtype, compat.string_types): - if dtype == 'datetime' or dtype == 'timedelta': - dtype += '64' - - try: - return _get_dtype_from_object(getattr(np, dtype)) - except (AttributeError, TypeError): - # handles cases like _get_dtype(int) - # i.e., python objects that are valid dtypes (unlike user-defined - # types, in general) - # TypeError handles the float16 typecode of 'e' - # further handle internal types - pass - return _get_dtype_from_object(np.dtype(dtype)) +def _maybe_match_name(a, b): + a_has = hasattr(a, 'name') + b_has = hasattr(b, 'name') + if a_has and b_has: + if a.name == b.name: + return a.name + else: + return None + elif a_has: + return a.name + elif b_has: + return b.name + return None def _get_info_slice(obj, indexer): @@ -1005,225 +182,8 @@ def _maybe_box_datetimelike(value): _values_from_object = lib.values_from_object -def _possibly_castable(arr): - # return False to force a non-fastpath - - # check datetime64[ns]/timedelta64[ns] are valid - # otherwise try to coerce - kind = arr.dtype.kind - if kind == 'M' or kind == 'm': - return arr.dtype in _DATELIKE_DTYPES - - return arr.dtype.name not in _POSSIBLY_CAST_DTYPES - - -def _possibly_convert_platform(values): - """ try to do platform conversion, allow ndarray or list here """ - - if isinstance(values, (list, tuple)): - values = lib.list_to_object_array(values) - if getattr(values, 'dtype', None) == np.object_: - if hasattr(values, '_values'): - values = values._values - values = lib.maybe_convert_objects(values) - - return values - - -def _possibly_cast_to_datetime(value, dtype, errors='raise'): - """ try to cast the array/value to a datetimelike dtype, converting float - nan to iNaT - """ - from pandas.tseries.timedeltas import to_timedelta - from pandas.tseries.tools import to_datetime - - if dtype is not None: - if isinstance(dtype, compat.string_types): - dtype = np.dtype(dtype) - - is_datetime64 = is_datetime64_dtype(dtype) - is_datetime64tz = is_datetime64tz_dtype(dtype) - is_timedelta64 = is_timedelta64_dtype(dtype) - - if is_datetime64 or is_datetime64tz or is_timedelta64: - - # force the dtype if needed - if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): - if dtype.name == 'datetime64[ns]': - dtype = _NS_DTYPE - else: - raise TypeError("cannot convert datetimelike to " - "dtype [%s]" % dtype) - elif is_datetime64tz: - - # our NaT doesn't support tz's - # this will coerce to DatetimeIndex with - # a matching dtype below - if lib.isscalar(value) and isnull(value): - value = [value] - - elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): - if dtype.name == 'timedelta64[ns]': - dtype = _TD_DTYPE - else: - raise TypeError("cannot convert timedeltalike to " - "dtype [%s]" % dtype) - - if lib.isscalar(value): - if value == tslib.iNaT or isnull(value): - value = tslib.iNaT - else: - value = np.array(value, copy=False) - - # have a scalar array-like (e.g. 
NaT) - if value.ndim == 0: - value = tslib.iNaT - - # we have an array of datetime or timedeltas & nulls - elif np.prod(value.shape) or not is_dtype_equal(value.dtype, - dtype): - try: - if is_datetime64: - value = to_datetime(value, errors=errors)._values - elif is_datetime64tz: - # input has to be UTC at this point, so just - # localize - value = to_datetime( - value, - errors=errors).tz_localize(dtype.tz) - elif is_timedelta64: - value = to_timedelta(value, errors=errors)._values - except (AttributeError, ValueError, TypeError): - pass - - # coerce datetimelike to object - elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): - if is_object_dtype(dtype): - ints = np.asarray(value).view('i8') - return tslib.ints_to_pydatetime(ints) - - # we have a non-castable dtype that was passed - raise TypeError('Cannot cast datetime64 to %s' % dtype) - - else: - - is_array = isinstance(value, np.ndarray) - - # catch a datetime/timedelta that is not of ns variety - # and no coercion specified - if is_array and value.dtype.kind in ['M', 'm']: - dtype = value.dtype - - if dtype.kind == 'M' and dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) - - elif dtype.kind == 'm' and dtype != _TD_DTYPE: - value = to_timedelta(value) - - # only do this if we have an array and the dtype of the array is not - # setup already we are not an integer/object, so don't bother with this - # conversion - elif not (is_array and not (issubclass(value.dtype.type, np.integer) or - value.dtype == np.object_)): - value = _possibly_infer_to_datetimelike(value) - - return value - - -def _possibly_infer_to_datetimelike(value, convert_dates=False): - """ - we might have an array (or single object) that is datetime like, - and no dtype is passed don't change the value unless we find a - datetime/timedelta set - - this is pretty strict in that a datetime/timedelta is REQUIRED - in addition to possible nulls/string likes - - ONLY strings are NOT datetimelike - - Parameters - ---------- - value : np.array / Series / Index / list-like - convert_dates : boolean, default False - if True try really hard to convert dates (such as datetime.date), other - leave inferred dtype 'date' alone - - """ - - if isinstance(value, (gt.ABCDatetimeIndex, gt.ABCPeriodIndex)): - return value - elif isinstance(value, gt.ABCSeries): - if isinstance(value._values, gt.ABCDatetimeIndex): - return value._values - - v = value - if not is_list_like(v): - v = [v] - v = np.array(v, copy=False) - shape = v.shape - if not v.ndim == 1: - v = v.ravel() - - if len(v): - - def _try_datetime(v): - # safe coerce to datetime64 - try: - v = tslib.array_to_datetime(v, errors='raise') - except ValueError: - - # we might have a sequence of the same-datetimes with tz's - # if so coerce to a DatetimeIndex; if they are not the same, - # then these stay as object dtype - try: - from pandas import to_datetime - return to_datetime(v) - except: - pass - - except: - pass - - return v.reshape(shape) - - def _try_timedelta(v): - # safe coerce to timedelta64 - - # will try first with a string & object conversion - from pandas.tseries.timedeltas import to_timedelta - try: - return to_timedelta(v)._values.reshape(shape) - except: - return v - - # do a quick inference for perf - sample = v[:min(3, len(v))] - inferred_type = lib.infer_dtype(sample) - - if (inferred_type in ['datetime', 'datetime64'] or - (convert_dates and inferred_type in ['date'])): - value = _try_datetime(v) - elif inferred_type in ['timedelta', 'timedelta64']: - value = _try_timedelta(v) - - # It's possible 
to have nulls intermixed within the datetime or - # timedelta. These will in general have an inferred_type of 'mixed', - # so have to try both datetime and timedelta. - - # try timedelta first to avoid spurious datetime conversions - # e.g. '00:00:01' is a timedelta but technically is also a datetime - elif inferred_type in ['mixed']: - - if lib.is_possible_datetimelike_array(_ensure_object(v)): - value = _try_timedelta(v) - if lib.infer_dtype(value) in ['mixed']: - value = _try_datetime(v) - - return value - - def is_bool_indexer(key): - if isinstance(key, (gt.ABCSeries, np.ndarray)): + if isinstance(key, (ABCSeries, np.ndarray)): if key.dtype == np.object_: key = np.asarray(_values_from_object(key)) @@ -1250,12 +210,6 @@ def _default_index(n): return RangeIndex(0, n, name=None) -def ensure_float(arr): - if issubclass(arr.dtype.type, (np.integer, np.bool_)): - arr = arr.astype(float) - return arr - - def _mut_exclusive(**kwargs): item1, item2 = kwargs.items() label1, val1 = item1 @@ -1287,6 +241,10 @@ def _all_not_none(*args): return True +def _count_not_none(*args): + return sum(x is not None for x in args) + + def _try_sort(iterable): listed = list(iterable) try: @@ -1295,10 +253,6 @@ def _try_sort(iterable): return listed -def _count_not_none(*args): - return sum(x is not None for x in args) - - def iterpairs(seq): """ Parameters @@ -1451,349 +405,6 @@ def _maybe_make_list(obj): return [obj] return obj -# TYPE TESTING - -is_bool = lib.is_bool - -is_integer = lib.is_integer - -is_float = lib.is_float - -is_complex = lib.is_complex - - -def is_string_like(obj): - return isinstance(obj, (compat.text_type, compat.string_types)) - - -def is_iterator(obj): - # python 3 generators have __next__ instead of next - return hasattr(obj, 'next') or hasattr(obj, '__next__') - - -def is_number(obj): - return isinstance(obj, (numbers.Number, np.number)) - - -def is_period_arraylike(arr): - """ return if we are period arraylike / PeriodIndex """ - if isinstance(arr, pd.PeriodIndex): - return True - elif isinstance(arr, (np.ndarray, gt.ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'period' - return getattr(arr, 'inferred_type', None) == 'period' - - -def is_datetime_arraylike(arr): - """ return if we are datetime arraylike / DatetimeIndex """ - if isinstance(arr, gt.ABCDatetimeIndex): - return True - elif isinstance(arr, (np.ndarray, gt.ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' - return getattr(arr, 'inferred_type', None) == 'datetime' - - -def is_datetimelike(arr): - return (arr.dtype in _DATELIKE_DTYPES or - isinstance(arr, gt.ABCPeriodIndex) or - is_datetimetz(arr)) - - -def _coerce_to_dtype(dtype): - """ coerce a string / np.dtype to a dtype """ - if is_categorical_dtype(dtype): - dtype = gt.CategoricalDtype() - elif is_datetime64tz_dtype(dtype): - dtype = gt.DatetimeTZDtype(dtype) - else: - dtype = np.dtype(dtype) - return dtype - - -def _get_dtype(arr_or_dtype): - if isinstance(arr_or_dtype, np.dtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype) - elif isinstance(arr_or_dtype, gt.CategoricalDtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, gt.DatetimeTZDtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, compat.string_types): - if is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtype.construct_from_string(arr_or_dtype) - elif is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtype.construct_from_string(arr_or_dtype) - - if hasattr(arr_or_dtype, 
'dtype'): - arr_or_dtype = arr_or_dtype.dtype - return np.dtype(arr_or_dtype) - - -def _get_dtype_type(arr_or_dtype): - if isinstance(arr_or_dtype, np.dtype): - return arr_or_dtype.type - elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype).type - elif isinstance(arr_or_dtype, gt.CategoricalDtype): - return gt.CategoricalDtypeType - elif isinstance(arr_or_dtype, gt.DatetimeTZDtype): - return gt.DatetimeTZDtypeType - elif isinstance(arr_or_dtype, compat.string_types): - if is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtypeType - elif is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtypeType - return _get_dtype_type(np.dtype(arr_or_dtype)) - try: - return arr_or_dtype.dtype.type - except AttributeError: - return type(None) - - -def is_dtype_equal(source, target): - """ return a boolean if the dtypes are equal """ - try: - source = _get_dtype(source) - target = _get_dtype(target) - return source == target - except (TypeError, AttributeError): - - # invalid comparison - # object == category will hit this - return False - - -def is_any_int_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.integer) - - -def is_integer_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_int64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.int64) - - -def is_int_or_datetime_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) or - issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_datetime64_dtype(arr_or_dtype): - try: - tipo = _get_dtype_type(arr_or_dtype) - except TypeError: - return False - return issubclass(tipo, np.datetime64) - - -def is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtype.is_dtype(arr_or_dtype) - - -def is_datetime64_any_dtype(arr_or_dtype): - return (is_datetime64_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) - - -def is_datetime64_ns_dtype(arr_or_dtype): - try: - tipo = _get_dtype(arr_or_dtype) - except TypeError: - return False - return tipo == _NS_DTYPE - - -def is_timedelta64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.timedelta64) - - -def is_timedelta64_ns_dtype(arr_or_dtype): - tipo = _get_dtype(arr_or_dtype) - return tipo == _TD_DTYPE - - -def is_datetime_or_timedelta_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, (np.datetime64, np.timedelta64)) - - -def is_numeric_v_string_like(a, b): - """ - numpy doesn't like to compare numeric arrays vs scalar string-likes - - return a boolean result if this is the case for a,b or b,a - - """ - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) - - is_a_numeric_array = is_a_array and is_numeric_dtype(a) - is_b_numeric_array = is_b_array and is_numeric_dtype(b) - is_a_string_array = is_a_array and is_string_like_dtype(a) - is_b_string_array = is_b_array and is_string_like_dtype(b) - - is_a_scalar_string_like = not is_a_array and is_string_like(a) - is_b_scalar_string_like = not is_b_array and is_string_like(b) - - return ((is_a_numeric_array and is_b_scalar_string_like) or - (is_b_numeric_array and is_a_scalar_string_like) or - (is_a_numeric_array and is_b_string_array) or - (is_b_numeric_array and is_a_string_array)) - - -def is_datetimelike_v_numeric(a, b): - # return if we have an i8 convertible and numeric 
comparison - if not hasattr(a, 'dtype'): - a = np.asarray(a) - if not hasattr(b, 'dtype'): - b = np.asarray(b) - - def is_numeric(x): - return is_integer_dtype(x) or is_float_dtype(x) - - is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_numeric(b)) or - (is_datetimelike(b) and is_numeric(a))) - - -def is_datetimelike_v_object(a, b): - # return if we have an i8 convertible and object comparsion - if not hasattr(a, 'dtype'): - a = np.asarray(a) - if not hasattr(b, 'dtype'): - b = np.asarray(b) - - def f(x): - return is_object_dtype(x) - - def is_object(x): - return is_integer_dtype(x) or is_float_dtype(x) - - is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_object(b)) or - (is_datetimelike(b) and is_object(a))) - - -def needs_i8_conversion(arr_or_dtype): - return (is_datetime_or_timedelta_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) - - -def is_numeric_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, (np.number, np.bool_)) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_string_dtype(arr_or_dtype): - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('O', 'S', 'U') - - -def is_string_like_dtype(arr_or_dtype): - # exclude object as its a mixed dtype - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('S', 'U') - - -def is_float_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.floating) - - -def is_floating_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return isinstance(tipo, np.floating) - - -def is_bool_dtype(arr_or_dtype): - try: - tipo = _get_dtype_type(arr_or_dtype) - except ValueError: - # this isn't even a dtype - return False - return issubclass(tipo, np.bool_) - - -def is_sparse(array): - """ return if we are a sparse array """ - return isinstance(array, (gt.ABCSparseArray, gt.ABCSparseSeries)) - - -def is_datetimetz(array): - """ return if we are a datetime with tz array """ - return ((isinstance(array, gt.ABCDatetimeIndex) and - getattr(array, 'tz', None) is not None) or - is_datetime64tz_dtype(array)) - - -def is_extension_type(value): - """ - if we are a klass that is preserved by the internals - these are internal klasses that we represent (and don't use a np.array) - """ - if is_categorical(value): - return True - elif is_sparse(value): - return True - elif is_datetimetz(value): - return True - return False - - -def is_categorical(array): - """ return if we are a categorical possibility """ - return isinstance(array, gt.ABCCategorical) or is_categorical_dtype(array) - - -def is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtype.is_dtype(arr_or_dtype) - - -def is_complex_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.complexfloating) - - -def is_object_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.object_) - - -def is_re(obj): - return isinstance(obj, re._pattern_type) - - -def is_re_compilable(obj): - try: - re.compile(obj) - except TypeError: - return False - else: - return True - - -def is_list_like(arg): - return (hasattr(arg, '__iter__') and - not isinstance(arg, compat.string_and_binary_types)) - - -def is_dict_like(arg): - return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') - - -def is_named_tuple(arg): - return isinstance(arg, tuple) and hasattr(arg, '_fields') - def is_null_slice(obj): """ we have a null slice """ @@ -1807,47 +418,6 @@ def is_full_slice(obj, l): obj.step 
is None) -def is_hashable(arg): - """Return True if hash(arg) will succeed, False otherwise. - - Some types will pass a test against collections.Hashable but fail when they - are actually hashed with hash(). - - Distinguish between these and other types by trying the call to hash() and - seeing if they raise TypeError. - - Examples - -------- - >>> a = ([],) - >>> isinstance(a, collections.Hashable) - True - >>> is_hashable(a) - False - """ - # unfortunately, we can't use isinstance(arg, collections.Hashable), which - # can be faster than calling hash, because numpy scalars on Python 3 fail - # this test - - # reconsider this decision once this numpy bug is fixed: - # https://github.com/numpy/numpy/issues/5562 - - try: - hash(arg) - except TypeError: - return False - else: - return True - - -def is_sequence(x): - try: - iter(x) - len(x) # it has a length - return not isinstance(x, compat.string_and_binary_types) - except (TypeError, AttributeError): - return False - - def _get_callable_name(obj): # typical case has name if hasattr(obj, '__name__'): @@ -1875,74 +445,6 @@ def _apply_if_callable(maybe_callable, obj, **kwargs): return maybe_callable -_string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, - compat.text_type))) - -_ensure_float64 = algos.ensure_float64 -_ensure_float32 = algos.ensure_float32 -_ensure_int64 = algos.ensure_int64 -_ensure_int32 = algos.ensure_int32 -_ensure_int16 = algos.ensure_int16 -_ensure_int8 = algos.ensure_int8 -_ensure_platform_int = algos.ensure_platform_int -_ensure_object = algos.ensure_object - - -def _astype_nansafe(arr, dtype, copy=True): - """ return a view if copy is False, but - need to be very careful as the result shape could change! """ - if not isinstance(dtype, np.dtype): - dtype = _coerce_to_dtype(dtype) - - if issubclass(dtype.type, compat.text_type): - # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) - elif issubclass(dtype.type, compat.string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) - elif is_datetime64_dtype(arr): - if dtype == object: - return tslib.ints_to_pydatetime(arr.view(np.int64)) - elif dtype == np.int64: - return arr.view(dtype) - elif dtype != _NS_DTYPE: - raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % - (arr.dtype, dtype)) - return arr.astype(_NS_DTYPE) - elif is_timedelta64_dtype(arr): - if dtype == np.int64: - return arr.view(dtype) - elif dtype == object: - return tslib.ints_to_pytimedelta(arr.view(np.int64)) - - # in py3, timedelta64[ns] are int64 - elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or - (not compat.PY3 and dtype != _TD_DTYPE)): - - # allow frequency conversions - if dtype.kind == 'm': - mask = isnull(arr) - result = arr.astype(dtype).astype(np.float64) - result[mask] = np.nan - return result - - raise TypeError("cannot astype a timedelta from [%s] to [%s]" % - (arr.dtype, dtype)) - - return arr.astype(_TD_DTYPE) - elif (np.issubdtype(arr.dtype, np.floating) and - np.issubdtype(dtype, np.integer)): - - if np.isnan(arr).any(): - raise ValueError('Cannot convert NA to integer') - elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): - # work around NumPy brokenness, #1987 - return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) - - if copy: - return arr.astype(dtype) - return arr.view(dtype) - - def _all_none(*args): for arg in args: if arg is not None: @@ -1988,6 +490,9 @@ class Sentinel(object): return Sentinel() +# 
---------------------------------------------------------------------- +# Detect our environment + def in_interactive_session(): """ check if we're running in an interactive shell @@ -2055,21 +560,6 @@ def in_ipython_frontend(): return False -def _maybe_match_name(a, b): - a_has = hasattr(a, 'name') - b_has = hasattr(b, 'name') - if a_has and b_has: - if a.name == b.name: - return a.name - else: - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - - def _random_state(state=None): """ Helper function for processing random_state arguments. diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3ca2c6cd014bc..5cbc968f06fa7 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -366,7 +366,7 @@ def mpl_style_cb(key): def use_inf_as_null_cb(key): - from pandas.core.common import _use_inf_as_null + from pandas.types.missing import _use_inf_as_null _use_inf_as_null(key) with cf.config_prefix('mode'): diff --git a/pandas/core/convert.py b/pandas/core/convert.py deleted file mode 100644 index 7f4fe73c688f8..0000000000000 --- a/pandas/core/convert.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Functions for converting object to other types -""" - -import numpy as np - -import pandas as pd -from pandas.core.common import (_possibly_cast_to_datetime, is_object_dtype, - isnull) -import pandas.lib as lib - - -# TODO: Remove in 0.18 or 2017, which ever is sooner -def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, - convert_timedeltas=True, copy=True): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - # if we have passed in a list or scalar - if isinstance(values, (list, tuple)): - values = np.array(values, dtype=np.object_) - if not hasattr(values, 'dtype'): - values = np.array([values], dtype=np.object_) - - # convert dates - if convert_dates and values.dtype == np.object_: - - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == 'coerce': - new_values = _possibly_cast_to_datetime(values, 'M8[ns]', - errors='coerce') - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects(values, - convert_datetime=convert_dates) - - # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: - - if convert_timedeltas == 'coerce': - from pandas.tseries.timedeltas import to_timedelta - new_values = to_timedelta(values, coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas) - - # convert to numeric - if values.dtype == np.object_: - if convert_numeric: - try: - new_values = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - except: - pass - else: - # soft-conversion - values = lib.maybe_convert_objects(values) - - values = values.copy() if copy else values - - return values - - -def _soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, - coerce=False, copy=True): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - conversion_count = sum((datetime, numeric, timedelta)) - if conversion_count == 0: - raise ValueError('At least one of datetime, numeric or timedelta must ' - 'be True.') - elif conversion_count > 1 and coerce: - raise 
ValueError("Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True.") - - if isinstance(values, (list, tuple)): - # List or scalar - values = np.array(values, dtype=np.object_) - elif not hasattr(values, 'dtype'): - values = np.array([values], dtype=np.object_) - elif not is_object_dtype(values.dtype): - # If not object, do not attempt conversion - values = values.copy() if copy else values - return values - - # If 1 flag is coerce, ensure 2 others are False - if coerce: - # Immediate return if coerce - if datetime: - return pd.to_datetime(values, errors='coerce', box=False) - elif timedelta: - return pd.to_timedelta(values, errors='coerce', box=False) - elif numeric: - return pd.to_numeric(values, errors='coerce') - - # Soft conversions - if datetime: - values = lib.maybe_convert_objects(values, convert_datetime=datetime) - - if timedelta and is_object_dtype(values.dtype): - # Object check to ensure only run if previous did not convert - values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) - - if numeric and is_object_dtype(values.dtype): - try: - converted = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) - # If all NaNs, then do not-alter - values = converted if not isnull(converted).all() else values - values = values.copy() if copy else values - except: - pass - - return values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e01fc6dca6be3..334526b424be5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,12 +23,43 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import ( - isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, - is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like, - _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype, - is_extension_type, is_datetimetz, _possibly_infer_to_datetimelike, - _dict_compat) +from pandas.types.cast import (_maybe_upcast, + _infer_dtype_from_scalar, + _possibly_cast_to_datetime, + _possibly_infer_to_datetimelike, + _possibly_convert_platform, + _possibly_downcast_to_dtype, + _invalidate_string_dtypes, + _coerce_to_dtypes, + _maybe_upcast_putmask) +from pandas.types.common import (is_categorical_dtype, + is_object_dtype, + is_extension_type, + is_datetimetz, + is_datetime64_dtype, + is_bool_dtype, + is_integer_dtype, + is_float_dtype, + is_integer, + is_scalar, + needs_i8_conversion, + _get_dtype_from_object, + _lcd_dtypes, + _ensure_float, + _ensure_float64, + _ensure_int64, + _ensure_platform_int, + is_list_like, + is_iterator, + is_sequence, + is_named_tuple) +from pandas.types.missing import isnull, notnull + +from pandas.core.common import (PandasError, _try_sort, + _default_index, + _values_from_object, + _maybe_box_datetimelike, + _dict_compat) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, @@ -268,7 +299,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: - if com.is_named_tuple(data[0]) and columns is None: + if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields arrays, columns = _to_arrays(data, columns, dtype=dtype) columns = _ensure_index(columns) @@ -940,7 +971,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, if columns is not None: columns = 
_ensure_index(columns) - if com.is_iterator(data): + if is_iterator(data): if nrows == 0: return cls() @@ -1051,7 +1082,7 @@ def to_records(self, index=True, convert_datetime64=True): y : recarray """ if index: - if com.is_datetime64_dtype(self.index) and convert_datetime64: + if is_datetime64_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: if isinstance(self.index, MultiIndex): @@ -1920,7 +1951,7 @@ def _ixs(self, i, axis=0): copy = True else: new_values = self._data.fast_xs(i) - if lib.isscalar(new_values): + if is_scalar(new_values): return new_values # if we are a copy, mark as such @@ -2072,7 +2103,7 @@ def _getitem_multilevel(self, key): return self._get_item_cache(key) def _getitem_frame(self, key): - if key.values.size and not com.is_bool_dtype(key.values): + if key.values.size and not is_bool_dtype(key.values): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) @@ -2289,7 +2320,7 @@ def select_dtypes(self, include=None, exclude=None): 5 False """ include, exclude = include or (), exclude or () - if not (com.is_list_like(include) and com.is_list_like(exclude)): + if not (is_list_like(include) and is_list_like(exclude)): raise TypeError('include and exclude must both be non-string' ' sequences') selection = tuple(map(frozenset, (include, exclude))) @@ -2300,9 +2331,9 @@ def select_dtypes(self, include=None, exclude=None): # convert the myriad valid dtypes object to a single representation include, exclude = map( - lambda x: frozenset(map(com._get_dtype_from_object, x)), selection) + lambda x: frozenset(map(_get_dtype_from_object, x)), selection) for dtypes in (include, exclude): - com._invalidate_string_dtypes(dtypes) + _invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): @@ -2392,7 +2423,7 @@ def _setitem_array(self, key, value): def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. 
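The select_dtypes hunk above enforces that include/exclude are non-string sequences and mutually disjoint; a short sketch of the user-facing behaviour with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [1.0, 2.0], 'c': ['x', 'y']})
    df.select_dtypes(include=['number'])                       # columns a and b
    df.select_dtypes(include=['float64'], exclude=['object'])  # column b only
    # df.select_dtypes(include='number')   # TypeError in this version:
    #                                      # include and exclude must be non-string sequences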
# df[df > df2] = 0 - if key.values.size and not com.is_bool_dtype(key.values): + if key.values.size and not is_bool_dtype(key.values): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) @@ -2586,7 +2617,7 @@ def reindexer(value): value = _sanitize_index(value, self.index, copy=False) if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: - value = com._possibly_convert_platform(value) + value = _possibly_convert_platform(value) else: value = com._asarray_tuplesafe(value) elif value.ndim == 2: @@ -2602,7 +2633,7 @@ def reindexer(value): # upcast the scalar dtype, value = _infer_dtype_from_scalar(value) value = np.repeat(value, len(self.index)).astype(dtype) - value = com._possibly_cast_to_datetime(value, dtype) + value = _possibly_cast_to_datetime(value, dtype) # return internal types directly if is_extension_type(value): @@ -2916,8 +2947,8 @@ def _maybe_casted_values(index, labels=None): mask = labels == -1 values = values.take(labels) if mask.any(): - values, changed = com._maybe_upcast_putmask(values, mask, - np.nan) + values, changed = _maybe_upcast_putmask(values, mask, + np.nan) return values new_index = _default_index(len(new_obj)) @@ -3131,14 +3162,14 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, raise ValueError('When sorting by column, axis must be 0 (rows)') if not isinstance(by, list): by = [by] - if com.is_sequence(ascending) and len(by) != len(ascending): + if is_sequence(ascending) and len(by) != len(ascending): raise ValueError('Length of ascending (%d) != length of by (%d)' % (len(ascending), len(by))) if len(by) > 1: from pandas.core.groupby import _lexsort_indexer def trans(v): - if com.needs_i8_conversion(v): + if needs_i8_conversion(v): return v.view('i8') return v @@ -3151,7 +3182,7 @@ def trans(v): keys.append(trans(k)) indexer = _lexsort_indexer(keys, orders=ascending, na_position=na_position) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) else: from pandas.core.groupby import _nargsort @@ -3320,7 +3351,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, inplace=inplace, sort_remaining=sort_remaining) def _nsorted(self, columns, n, method, keep): - if not com.is_list_like(columns): + if not is_list_like(columns): columns = [columns] columns = list(columns) ser = getattr(self[columns[0]], method)(n, keep=keep) @@ -3658,28 +3689,28 @@ def combine(self, other, func, fill_value=None, overwrite=True): # if we have different dtypes, possibily promote new_dtype = this_dtype if this_dtype != other_dtype: - new_dtype = com._lcd_dtypes(this_dtype, other_dtype) + new_dtype = _lcd_dtypes(this_dtype, other_dtype) series = series.astype(new_dtype) otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype - needs_i8_conversion = com.needs_i8_conversion(new_dtype) - if needs_i8_conversion: + needs_i8_conversion_i = needs_i8_conversion(new_dtype) + if needs_i8_conversion_i: this_dtype = new_dtype arr = func(series, otherSeries, True) else: arr = func(series, otherSeries) if do_fill: - arr = com.ensure_float(arr) + arr = _ensure_float(arr) arr[this_mask & other_mask] = NA # try to downcast back to the original dtype - if needs_i8_conversion: - arr = com._possibly_cast_to_datetime(arr, this_dtype) + if needs_i8_conversion_i: + arr = _possibly_cast_to_datetime(arr, this_dtype) else: - arr = com._possibly_downcast_to_dtype(arr, this_dtype) + arr = 
_possibly_downcast_to_dtype(arr, this_dtype) result[col] = arr @@ -4581,7 +4612,7 @@ def _dict_round(df, decimals): yield vals def _series_round(s, decimals): - if com.is_integer_dtype(s) or com.is_float_dtype(s): + if is_integer_dtype(s) or is_float_dtype(s): return s.round(decimals) return s @@ -4592,7 +4623,7 @@ def _series_round(s, decimals): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") new_cols = [col for col in _dict_round(self, decimals)] - elif com.is_integer(decimals): + elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] @@ -4634,14 +4665,14 @@ def corr(self, method='pearson', min_periods=1): mat = numeric_df.values if method == 'pearson': - correl = _algos.nancorr(com._ensure_float64(mat), minp=min_periods) + correl = _algos.nancorr(_ensure_float64(mat), minp=min_periods) elif method == 'spearman': - correl = _algos.nancorr_spearman(com._ensure_float64(mat), + correl = _algos.nancorr_spearman(_ensure_float64(mat), minp=min_periods) else: if min_periods is None: min_periods = 1 - mat = com._ensure_float64(mat).T + mat = _ensure_float64(mat).T corrf = nanops.get_corr_func(method) K = len(cols) correl = np.empty((K, K), dtype=float) @@ -4696,7 +4727,7 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = _algos.nancorr(com._ensure_float64(mat), cov=True, + baseCov = _algos.nancorr(_ensure_float64(mat), cov=True, minp=min_periods) return self._constructor(baseCov, index=cols, columns=cols) @@ -4825,7 +4856,7 @@ def _count_level(self, level, axis=0, numeric_only=False): level = count_axis._get_level_number(level) level_index = count_axis.levels[level] - labels = com._ensure_int64(count_axis.labels[level]) + labels = _ensure_int64(count_axis.labels[level]) counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) @@ -4906,7 +4937,7 @@ def f(x): # try to coerce to the original dtypes item by item if we can if axis == 0: - result = com._coerce_to_dtypes(result, self.dtypes) + result = _coerce_to_dtypes(result, self.dtypes) return Series(result, index=labels) @@ -5376,13 +5407,13 @@ def _prep_ndarray(values, copy=True): return np.empty((0, 0), dtype=object) def convert(v): - return com._possibly_convert_platform(v) + return _possibly_convert_platform(v) # we could have a 1-dim or 2-dim list here # this is equiv of np.asarray, but does object conversion # and platform dtype preservation try: - if com.is_list_like(values[0]) or hasattr(values[0], 'len'): + if is_list_like(values[0]) or hasattr(values[0], 'len'): values = np.array([convert(v) for v in values]) else: values = convert(values) @@ -5570,7 +5601,7 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): def convert(arr): if dtype != object and dtype != np.object: arr = lib.maybe_convert_objects(arr, try_float=coerce_float) - arr = com._possibly_cast_to_datetime(arr, dtype) + arr = _possibly_cast_to_datetime(arr, dtype) return arr arrays = [convert(arr) for arr in content] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b4bcae47cbbdf..d6e6f571be53a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,6 +8,29 @@ import pandas.lib as lib import pandas as pd + + +from pandas.types.common import (_coerce_to_dtype, + _ensure_int64, + needs_i8_conversion, + is_scalar, + is_integer, is_bool, + is_bool_dtype, + 
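The _dict_round/_series_round helpers above back DataFrame.round with per-column precision; a minimal usage sketch (data is illustrative):

    import pandas as pd

    df = pd.DataFrame({'a': [1.234, 2.345], 'b': [3.456, 4.567]})
    df.round(1)                                    # every numeric column to 1 decimal
    df.round({'a': 1, 'b': 2})                     # per-column precision
    df.round(pd.Series([1, 2], index=['a', 'b']))  # same, but the index must be unique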
is_numeric_dtype, + is_datetime64_dtype, + is_timedelta64_dtype, + is_list_like, + is_dict_like, + is_re_compilable) +from pandas.types.cast import _maybe_promote, _maybe_upcast_putmask +from pandas.types.missing import isnull, notnull +from pandas.types.generic import ABCSeries, ABCPanel + +from pandas.core.common import (_values_from_object, + _maybe_box_datetimelike, + SettingWithCopyError, SettingWithCopyWarning, + AbstractMethodError) + from pandas.core.base import PandasObject from pandas.core.index import (Index, MultiIndex, _ensure_index, InvalidIndexError) @@ -25,11 +48,6 @@ from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, isidentifier, set_function_name) -from pandas.core.common import (isnull, notnull, is_list_like, - _values_from_object, _maybe_promote, - _maybe_box_datetimelike, ABCSeries, - SettingWithCopyError, SettingWithCopyWarning, - AbstractMethodError) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config @@ -46,10 +64,6 @@ Name or list of names which refer to the axis items.""") -def is_dictlike(x): - return isinstance(x, (dict, com.ABCSeries)) - - def _single_replace(self, to_replace, method, inplace, limit): if self.ndim != 1: raise TypeError('cannot replace {0} with method {1} on a {2}' @@ -116,7 +130,7 @@ def _validate_dtype(self, dtype): """ validate the passed dtype """ if dtype is not None: - dtype = com._coerce_to_dtype(dtype) + dtype = _coerce_to_dtype(dtype) # a compound dtype if dtype.kind == 'V': @@ -310,7 +324,7 @@ def _from_axes(cls, data, axes, **kwargs): def _get_axis_number(self, axis): axis = self._AXIS_ALIASES.get(axis, axis) - if com.is_integer(axis): + if is_integer(axis): if axis in self._AXIS_NAMES: return axis else: @@ -717,8 +731,8 @@ def rename_axis(self, mapper, axis=0, copy=True, inplace=False): 1 2 5 2 3 6 """ - non_mapper = lib.isscalar(mapper) or (com.is_list_like(mapper) and not - com.is_dict_like(mapper)) + non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not + is_dict_like(mapper)) if non_mapper: return self._set_axis_name(mapper, axis=axis) else: @@ -912,7 +926,7 @@ def bool(self): v = self.squeeze() if isinstance(v, (bool, np.bool_)): return bool(v) - elif lib.isscalar(v): + elif is_scalar(v): raise ValueError("bool cannot act on a non-boolean single element " "{0}".format(self.__class__.__name__)) @@ -1764,10 +1778,10 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): else: return self.take(loc, axis=axis, convert=True) - if not lib.isscalar(loc): + if not is_scalar(loc): new_index = self.index[loc] - if lib.isscalar(loc): + if is_scalar(loc): new_values = self._data.fast_xs(loc) # may need to box a datelike-scalar @@ -2340,7 +2354,7 @@ def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, index = _ensure_index(index) if indexer is not None: - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) # TODO: speed up on homogeneous DataFrame objects new_data = new_data.reindex_indexer(index, indexer, axis=baxis, @@ -3202,10 +3216,10 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return self if self.ndim == 1: - if isinstance(value, (dict, com.ABCSeries)): + if isinstance(value, (dict, ABCSeries)): from pandas import Series value = Series(value) - elif not com.is_list_like(value): + elif not is_list_like(value): pass else: raise ValueError("invalid fill value with a %s" % @@ -3215,7 +3229,7 @@ def 
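The bool() method touched above only acts on a single boolean element; anything else raises. A short sketch, assuming a pandas of this vintage:

    import pandas as pd

    pd.Series([True]).bool()            # True
    # pd.Series([1]).bool()             # ValueError: non-boolean single element
    # pd.Series([True, False]).bool()   # ValueError: truth value is ambiguous (not a single element)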
fillna(self, value=None, method=None, axis=None, inplace=False, inplace=inplace, downcast=downcast) - elif isinstance(value, (dict, com.ABCSeries)): + elif isinstance(value, (dict, ABCSeries)): if axis == 1: raise NotImplementedError('Currently only can fill ' 'with dict/Series column ' @@ -3228,7 +3242,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, obj = result[k] obj.fillna(v, limit=limit, inplace=True) return result - elif not com.is_list_like(value): + elif not is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, downcast=downcast) @@ -3354,7 +3368,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, and play with this method to gain intuition about how it works. """ - if not com.is_bool(regex) and to_replace is not None: + if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is " "not a bool") if axis is not None: @@ -3367,15 +3381,15 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat - if not is_dictlike(to_replace) and not is_dictlike(regex): + if not is_dict_like(to_replace) and not is_dict_like(regex): to_replace = [to_replace] if isinstance(to_replace, (tuple, list)): return _single_replace(self, to_replace, method, inplace, limit) - if not is_dictlike(to_replace): - if not is_dictlike(regex): + if not is_dict_like(to_replace): + if not is_dict_like(regex): raise TypeError('If "to_replace" and "value" are both None' ' and "to_replace" is not a list, then ' 'regex must be a mapping') @@ -3385,7 +3399,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, items = list(compat.iteritems(to_replace)) keys, values = zip(*items) - are_mappings = [is_dictlike(v) for v in values] + are_mappings = [is_dict_like(v) for v in values] if any(are_mappings): if not all(are_mappings): @@ -3418,8 +3432,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self new_data = self._data - if is_dictlike(to_replace): - if is_dictlike(value): # {'A' : NA} -> {'A' : 0} + if is_dict_like(to_replace): + if is_dict_like(value): # {'A' : NA} -> {'A' : 0} res = self if inplace else self.copy() for c, src in compat.iteritems(to_replace): if c in value and c in self: @@ -3429,7 +3443,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return None if inplace else res # {'A': NA} -> 0 - elif not com.is_list_like(value): + elif not is_list_like(value): for k, src in compat.iteritems(to_replace): if k in self: new_data = new_data.replace(to_replace=src, @@ -3441,8 +3455,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, raise TypeError('value argument must be scalar, dict, or ' 'Series') - elif com.is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] - if com.is_list_like(value): + elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if is_list_like(value): if len(to_replace) != len(value): raise ValueError('Replacement lists must match ' 'in length. 
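The fillna dispatch above accepts a dict or Series of per-column fill values, but only column-wise; a quick sketch with hypothetical data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1.0, np.nan], 'b': [np.nan, 2.0]})
    df.fillna({'a': 0, 'b': -1})      # per-column fill values
    # df.fillna({'a': 0}, axis=1)     # NotImplementedError in this version:
    #                                 # dict/Series filling is column-by-column only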
Expecting %d got %d ' % @@ -3458,8 +3472,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, value=value, inplace=inplace, regex=regex) elif to_replace is None: - if not (com.is_re_compilable(regex) or - com.is_list_like(regex) or is_dictlike(regex)): + if not (is_re_compilable(regex) or + is_list_like(regex) or is_dict_like(regex)): raise TypeError("'regex' must be a string or a compiled " "regular expression or a list or dict of " "strings or regular expressions, you " @@ -3470,7 +3484,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: # dest iterable dict-like - if is_dictlike(value): # NA -> {'A' : 0, 'B' : -1} + if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} new_data = self._data for k, v in compat.iteritems(value): @@ -3480,7 +3494,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, inplace=inplace, regex=regex) - elif not com.is_list_like(value): # NA -> 0 + elif not is_list_like(value): # NA -> 0 new_data = self._data.replace(to_replace=to_replace, value=value, inplace=inplace, regex=regex) @@ -3792,14 +3806,14 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 3 0.230930 0.000000 4 1.100000 0.570967 """ - if isinstance(self, com.ABCPanel): + if isinstance(self, ABCPanel): raise NotImplementedError("clip is not supported yet for panels") axis = nv.validate_clip_with_axis(axis, args, kwargs) # GH 2747 (arguments were reversed) if lower is not None and upper is not None: - if lib.isscalar(lower) and lib.isscalar(upper): + if is_scalar(lower) and is_scalar(upper): lower, upper = min(lower, upper), max(lower, upper) result = self @@ -4485,10 +4499,12 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, new_other = np.array(other, dtype=self.dtype) except ValueError: new_other = np.array(other) + except TypeError: + new_other = other # we can end up comparing integers and m8[ns] # which is a numpy no no - is_i8 = com.needs_i8_conversion(self.dtype) + is_i8 = needs_i8_conversion(self.dtype) if is_i8: matches = False else: @@ -4497,7 +4513,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if matches is False or not matches.all(): # coerce other to a common dtype if we can - if com.needs_i8_conversion(self.dtype): + if needs_i8_conversion(self.dtype): try: other = np.array(other, dtype=self.dtype) except: @@ -4550,7 +4566,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, dtype, fill_value = _maybe_promote(other.dtype) new_other = np.empty(len(icond), dtype=dtype) new_other.fill(fill_value) - com._maybe_upcast_putmask(new_other, icond, other) + _maybe_upcast_putmask(new_other, icond, other) other = new_other else: @@ -5058,7 +5074,7 @@ def describe_categorical_1d(data): if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - if com.is_datetime64_dtype(data): + if is_datetime64_dtype(data): asint = data.dropna().values.view('i8') names += ['top', 'freq', 'first', 'last'] result += [lib.Timestamp(top), freq, @@ -5071,11 +5087,11 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name) def describe_1d(data): - if com.is_bool_dtype(data): + if is_bool_dtype(data): return describe_categorical_1d(data) - elif com.is_numeric_dtype(data): + elif is_numeric_dtype(data): return describe_numeric_1d(data) - elif com.is_timedelta64_dtype(data): + elif is_timedelta64_dtype(data): return describe_numeric_1d(data) else: return describe_categorical_1d(data) @@ 
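The clip hunk above (GH 2747) silently re-orders scalar bounds that were passed in reverse; a minimal sketch:

    import pandas as pd

    s = pd.Series([-2.0, 0.0, 3.0])
    s.clip(lower=-1, upper=1)    # [-1.0, 0.0, 1.0]
    s.clip(lower=1, upper=-1)    # same result: scalar bounds are swapped internally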
-5162,7 +5178,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1) if freq is None: - mask = com.isnull(_values_from_object(self)) + mask = isnull(_values_from_object(self)) np.putmask(rs.values, mask, np.nan) return rs diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 077acc1e81444..6179857978b7b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -13,6 +13,25 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat.numpy import _np_version_under1p8 + +from pandas.types.common import (_DATELIKE_DTYPES, + is_numeric_dtype, + is_timedelta64_dtype, is_datetime64_dtype, + is_categorical_dtype, + is_datetime_or_timedelta_dtype, + is_bool, is_integer_dtype, + is_complex_dtype, + is_bool_dtype, + is_scalar, + _ensure_float64, + _ensure_platform_int, + _ensure_int64, + _ensure_object, + _ensure_float) +from pandas.types.cast import _possibly_downcast_to_dtype +from pandas.types.missing import isnull, notnull, _maybe_fill + +from pandas.core.common import _values_from_object, AbstractMethodError from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) from pandas.core.categorical import Categorical @@ -30,14 +49,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com -from pandas.core.common import(_possibly_downcast_to_dtype, isnull, - notnull, _DATELIKE_DTYPES, is_numeric_dtype, - is_timedelta64_dtype, is_datetime64_dtype, - is_categorical_dtype, _values_from_object, - is_datetime_or_timedelta_dtype, is_bool, - is_bool_dtype, AbstractMethodError, - _maybe_fill) -from pandas.core.config import option_context, is_callable +from pandas.core.config import option_context import pandas.lib as lib from pandas.lib import Timestamp import pandas.tslib as tslib @@ -662,7 +674,7 @@ def apply(self, func, *args, **kwargs): # resolve functions to their callable functions prior, this # wouldn't be needed if args or kwargs: - if is_callable(func): + if callable(func): @wraps(func) def f(g): @@ -752,7 +764,7 @@ def _try_cast(self, result, obj): else: dtype = obj.dtype - if not lib.isscalar(result): + if not is_scalar(result): result = _possibly_downcast_to_dtype(result, dtype) return result @@ -817,7 +829,7 @@ def _python_agg_general(self, func, *args, **kwargs): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): - values = com.ensure_float(values) + values = _ensure_float(values) output[name] = self._try_cast(values[mask], result) @@ -1595,7 +1607,7 @@ def size(self): """ ids, _, ngroup = self.group_info - ids = com._ensure_platform_int(ids) + ids = _ensure_platform_int(ids) out = np.bincount(ids[ids != -1], minlength=ngroup or None) return Series(out, index=self.result_index, dtype='int64') @@ -1631,7 +1643,7 @@ def group_info(self): comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) - comp_ids = com._ensure_int64(comp_ids) + comp_ids = _ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups def _get_compressed_labels(self): @@ -1671,7 +1683,7 @@ def get_group_levels(self): name_list = [] for ping, labels in zip(self.groupings, self.recons_labels): - labels = com._ensure_platform_int(labels) + labels = _ensure_platform_int(labels) levels = ping.group_index.take(labels) name_list.append(levels) @@ -1780,11 +1792,11 @@ def _cython_operation(self, kind, values, how, 
axis): values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): - values = _algos.ensure_float64(values) - elif com.is_integer_dtype(values): + values = _ensure_float64(values) + elif is_integer_dtype(values): values = values.astype('int64', copy=False) - elif is_numeric and not com.is_complex_dtype(values): - values = _algos.ensure_float64(values) + elif is_numeric and not is_complex_dtype(values): + values = _ensure_float64(values) else: values = values.astype(object) @@ -1793,7 +1805,7 @@ def _cython_operation(self, kind, values, how, axis): kind, how, values, is_numeric) except NotImplementedError: if is_numeric: - values = _algos.ensure_float64(values) + values = _ensure_float64(values) func, dtype_str = self._get_cython_function( kind, how, values, is_numeric) else: @@ -1821,7 +1833,7 @@ def _cython_operation(self, kind, values, how, axis): result = self._transform( result, accum, values, labels, func, is_numeric) - if com.is_integer_dtype(result): + if is_integer_dtype(result): if len(result[result == tslib.iNaT]) > 0: result = result.astype('float64') result[result == tslib.iNaT] = np.nan @@ -1834,7 +1846,7 @@ def _cython_operation(self, kind, values, how, axis): result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( - com._ensure_object(result), + _ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] @@ -1996,7 +2008,7 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): def __init__(self, bins, binlabels, filter_empty=False, mutated=False): - self.bins = com._ensure_int64(bins) + self.bins = _ensure_int64(bins) self.binlabels = _ensure_index(binlabels) self._filter_empty_groups = filter_empty self.mutated = mutated @@ -2061,7 +2073,7 @@ def group_info(self): obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) - rep = com._ensure_platform_int(rep) + rep = _ensure_platform_int(rep) if ngroups == len(self.bins): comp_ids = np.repeat(np.arange(ngroups), rep) else: @@ -2449,7 +2461,7 @@ def is_in_obj(gpr): def _is_label_like(val): return (isinstance(val, compat.string_types) or - (val is not None and lib.isscalar(val))) + (val is not None and is_scalar(val))) def _convert_grouper(axis, grouper): @@ -2671,7 +2683,7 @@ def _aggregate_multiple_funcs(self, arg, _level): results[name] = obj.aggregate(func) if isinstance(list(compat.itervalues(results))[0], - com.ABCDataFrame): + DataFrame): # let higher level handle if _level: @@ -2870,9 +2882,9 @@ def nunique(self, dropna=True): 'val.dtype must be object, got %s' % val.dtype val, _ = algos.factorize(val, sort=False) sorter = np.lexsort((val, ids)) - isnull = lambda a: a == -1 + _isnull = lambda a: a == -1 else: - isnull = com.isnull + _isnull = isnull ids, val = ids[sorter], val[sorter] @@ -2882,7 +2894,7 @@ def nunique(self, dropna=True): inc = np.r_[1, val[1:] != val[:-1]] # 1st item of each group is a new unique observation - mask = isnull(val) + mask = _isnull(val) if dropna: inc[idx] = 1 inc[mask] = 0 @@ -2998,8 +3010,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) - if com.is_integer_dtype(out): - out = com._ensure_int64(out) + if is_integer_dtype(out): + out = _ensure_int64(out) return Series(out, index=mi, name=self.name) # for compat. 
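The nunique implementation above counts distinct values per group, with dropna controlling whether missing values count; a sketch with illustrative data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'k': ['a', 'a', 'b'], 'v': [1.0, 1.0, np.nan]})
    df.groupby('k')['v'].nunique()               # a -> 1, b -> 0
    df.groupby('k')['v'].nunique(dropna=False)   # a -> 1, b -> 1 (NaN counted as a value)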
with algos.value_counts need to ensure every @@ -3029,8 +3041,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) - if com.is_integer_dtype(out): - out = com._ensure_int64(out) + if is_integer_dtype(out): + out = _ensure_int64(out) return Series(out, index=mi, name=self.name) def count(self): @@ -3039,7 +3051,7 @@ def count(self): val = self.obj.get_values() mask = (ids != -1) & ~isnull(val) - ids = com._ensure_platform_int(ids) + ids = _ensure_platform_int(ids) out = np.bincount(ids[mask], minlength=ngroups or None) return Series(out, @@ -3616,7 +3628,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa pass # interpret the result of the filter - if is_bool(res) or (lib.isscalar(res) and isnull(res)): + if is_bool(res) or (is_scalar(res) and isnull(res)): if res and notnull(res): indices.append(self._get_index(name)) else: @@ -3813,7 +3825,7 @@ def count(self): """ Compute count of group, excluding missing values """ from functools import partial from pandas.lib import count_level_2d - from pandas.core.common import _isnull_ndarraylike as isnull + from pandas.types.missing import _isnull_ndarraylike as isnull data, _ = self._get_data_to_aggregate() ids, _, ngroups = self.grouper.group_info @@ -3934,7 +3946,7 @@ class DataSplitter(object): def __init__(self, data, labels, ngroups, axis=0): self.data = data - self.labels = com._ensure_int64(labels) + self.labels = _ensure_int64(labels) self.ngroups = ngroups self.axis = axis @@ -4115,7 +4127,7 @@ def loop(labels, shape): def maybe_lift(lab, size): # pormote nan values return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - labels = map(com._ensure_int64, labels) + labels = map(_ensure_int64, labels) if not xnull: labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) @@ -4331,9 +4343,9 @@ def _get_group_index_sorter(group_index, ngroups): alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters if alpha + beta * ngroups < count * np.log(count): - sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), + sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), ngroups) - return com._ensure_platform_int(sorter) + return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort') @@ -4348,7 +4360,7 @@ def _compress_group_index(group_index, sort=True): size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) table = _hash.Int64HashTable(size_hint) - group_index = com._ensure_int64(group_index) + group_index = _ensure_int64(group_index) # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) @@ -4390,7 +4402,7 @@ def _groupby_indices(values): _, counts = _hash.value_count_scalar64(codes, False) else: reverse, codes, counts = _algos.group_labels( - _values_from_object(com._ensure_object(values))) + _values_from_object(_ensure_object(values))) return _algos.groupby_indices(reverse, codes, counts) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9485f50ed07f1..0cba8308c1c53 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,17 +1,24 @@ # pylint: disable=W0223 -from pandas.core.index import Index, MultiIndex +import numpy as np from pandas.compat import range, zip import pandas.compat as compat +from pandas.types.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.types.common import (is_integer_dtype, + 
is_integer, is_float, + is_categorical_dtype, + is_list_like, + is_sequence, + is_scalar, + _ensure_platform_int) +from pandas.types.missing import isnull, _infer_fill_value + +from pandas.core.index import Index, MultiIndex + import pandas.core.common as com -import pandas.lib as lib -from pandas.core.common import (is_bool_indexer, is_integer_dtype, - _asarray_tuplesafe, is_list_like, isnull, - is_null_slice, is_full_slice, ABCSeries, - ABCDataFrame, ABCPanel, is_float, - _values_from_object, _infer_fill_value, - is_integer) -import numpy as np +from pandas.core.common import (is_bool_indexer, _asarray_tuplesafe, + is_null_slice, is_full_slice, + _values_from_object) # the supported indexers @@ -67,7 +74,7 @@ def __getitem__(self, key): key = tuple(com._apply_if_callable(x, self.obj) for x in key) try: values = self.obj.get_value(*key) - if lib.isscalar(values): + if is_scalar(values): return values except Exception: pass @@ -625,7 +632,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if (sum_aligners == self.ndim and - all([com.is_sequence(_) for _ in indexer])): + all([is_sequence(_) for _ in indexer])): ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values # single indexer @@ -639,7 +646,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): ax = obj.axes[i] # multiple aligners (or null slices) - if com.is_sequence(idx) or isinstance(idx, slice): + if is_sequence(idx) or isinstance(idx, slice): if single_aligner and is_null_slice(idx): continue new_ix = ax[idx] @@ -685,7 +692,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): return ser - elif lib.isscalar(indexer): + elif is_scalar(indexer): ax = self.obj._get_axis(1) if ser.index.equals(ax): @@ -710,7 +717,7 @@ def _align_frame(self, indexer, df): sindexers = [] for i, ix in enumerate(indexer): ax = self.obj.axes[i] - if com.is_sequence(ix) or isinstance(ix, slice): + if is_sequence(ix) or isinstance(ix, slice): if idx is None: idx = ax[ix].ravel() elif cols is None: @@ -761,7 +768,7 @@ def _align_frame(self, indexer, df): val = df.reindex(index=ax)._values return val - elif lib.isscalar(indexer) and is_panel: + elif is_scalar(indexer) and is_panel: idx = self.obj.axes[1] cols = self.obj.axes[2] @@ -857,7 +864,7 @@ def _convert_for_reindex(self, key, axis=0): keyarr = _asarray_tuplesafe(key) if is_integer_dtype(keyarr) and not labels.is_integer(): - keyarr = com._ensure_platform_int(keyarr) + keyarr = _ensure_platform_int(keyarr) return labels.take(keyarr) return keyarr @@ -968,7 +975,7 @@ def _getitem_nested_tuple(self, tup): axis += 1 # if we have a scalar, we are done - if lib.isscalar(obj) or not hasattr(obj, 'ndim'): + if is_scalar(obj) or not hasattr(obj, 'ndim'): break # has the dim of the obj changed? 
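The _apply_if_callable calls above are what let callables be used as indexers, resolved against the object being indexed; a brief sketch, assuming a pandas version with callable indexing (0.18.1 onwards):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    df.loc[lambda d: d['a'] > 1, 'b']              # rows where a > 1, column b
    df.loc[lambda d: d['a'] > 1, lambda d: ['b']]  # both axes may be callables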
@@ -1038,7 +1045,7 @@ def _getitem_iterable(self, key, axis=0): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if com.is_categorical_dtype(labels): + if is_categorical_dtype(labels): keyarr = labels._shallow_copy(keyarr) # have the index handle the indexer and possibly return @@ -1799,7 +1806,7 @@ def check_bool_indexer(ax, key): result = key if isinstance(key, ABCSeries) and not key.index.equals(ax): result = result.reindex(ax) - mask = com.isnull(result._values) + mask = isnull(result._values) if mask.any(): raise IndexingError('Unalignable boolean Series key provided') @@ -1941,9 +1948,9 @@ def _non_reducing_slice(slice_): def pred(part): # true when slice does *not* reduce - return isinstance(part, slice) or com.is_list_like(part) + return isinstance(part, slice) or is_list_like(part) - if not com.is_list_like(slice_): + if not is_list_like(slice_): if not isinstance(slice_, slice): # a 1-d slice, like df.loc[1] slice_ = [[slice_]] diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1ea567f15cb7f..363ac8249eb06 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -10,29 +10,48 @@ from pandas.core.base import PandasObject -from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, - _TD_DTYPE, ABCSeries, is_list_like, - _infer_dtype_from_scalar, is_null_slice, - is_dtype_equal, is_null_datelike_scalar, - _maybe_promote, is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, is_sparse, - array_equivalent, _is_na_compat, - _maybe_convert_string_to_object, - _maybe_convert_scalar, - is_categorical, is_datetimelike_v_numeric, - is_numeric_v_string_like, is_extension_type) +from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype +from pandas.types.common import (_TD_DTYPE, _NS_DTYPE, + _ensure_int64, _ensure_platform_int, + is_integer, + is_dtype_equal, + is_timedelta64_dtype, + is_datetime64_dtype, is_datetimetz, is_sparse, + is_categorical, is_categorical_dtype, + is_integer_dtype, + is_datetime64tz_dtype, + is_object_dtype, + is_datetimelike_v_numeric, + is_numeric_v_string_like, is_extension_type, + is_list_like, + is_re, + is_re_compilable, + is_scalar, + _get_dtype) +from pandas.types.cast import (_possibly_downcast_to_dtype, + _maybe_convert_string_to_object, + _maybe_upcast, + _maybe_convert_scalar, _maybe_promote, + _infer_dtype_from_scalar, + _soft_convert_objects, + _possibly_convert_objects, + _astype_nansafe) +from pandas.types.missing import (isnull, array_equivalent, + _is_na_compat, + is_null_datelike_scalar) +import pandas.types.concat as _concat + +from pandas.types.generic import ABCSeries +from pandas.core.common import is_null_slice import pandas.core.algorithms as algos -from pandas.types.api import DatetimeTZDtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer from pandas.core.categorical import Categorical, maybe_to_categorical from pandas.tseries.index import DatetimeIndex from pandas.formats.printing import pprint_thing -import pandas.core.common as com -import pandas.types.concat as _concat + import pandas.core.missing as missing -import pandas.core.convert as convert from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib @@ -112,8 +131,8 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if 
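check_bool_indexer above aligns a boolean Series key to the axis being indexed and rejects keys that cannot be aligned; a short sketch:

    import pandas as pd

    s = pd.Series([1, 2, 3], index=list('abc'))
    s.loc[pd.Series([True, False, True], index=list('abc'))]   # fine, mask aligns to s.index
    # s.loc[pd.Series([True, False], index=list('ab'))]
    #   IndexingError: Unalignable boolean Series key provided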
com.is_categorical_dtype(dtype): - if dtype == com.CategoricalDtype(): + if is_categorical_dtype(dtype): + if dtype == CategoricalDtype(): return True # this is a pd.Categorical, but is not @@ -137,7 +156,7 @@ def get_values(self, dtype=None): return an internal format, currently just the ndarray this is often overriden to handle to_dense like operations """ - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): return self.values.astype(object) return self.values @@ -481,7 +500,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = com._astype_nansafe(values.ravel(), dtype, copy=True) + values = _astype_nansafe(values.ravel(), dtype, copy=True) values = values.reshape(self.shape) newb = make_block(values, placement=self.mgr_locs, dtype=dtype, @@ -651,7 +670,7 @@ def setitem(self, indexer, value, mgr=None): # cast the values to a type that can hold nan (if necessary) if not self._can_hold_element(value): - dtype, _ = com._maybe_promote(arr_value.dtype) + dtype, _ = _maybe_promote(arr_value.dtype) values = values.astype(dtype) transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) @@ -684,7 +703,7 @@ def _is_scalar_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return all([lib.isscalar(idx) for idx in indexer]) + return all([is_scalar(idx) for idx in indexer]) return False def _is_empty_indexer(indexer): @@ -724,7 +743,7 @@ def _is_empty_indexer(indexer): if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, value.dtype): dtype = value.dtype - elif lib.isscalar(value): + elif is_scalar(value): dtype, _ = _infer_dtype_from_scalar(value) else: dtype = 'infer' @@ -838,7 +857,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, n = np.array(new) # type of the new block - dtype, _ = com._maybe_promote(n.dtype) + dtype, _ = _maybe_promote(n.dtype) # we need to explicitly astype here to make a copy n = n.astype(dtype) @@ -1027,7 +1046,7 @@ def shift(self, periods, axis=0, mgr=None): # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(self.values) + new_values, fill_value = _maybe_upcast(self.values) # make sure array sent to np.roll is c_contiguous f_ordered = new_values.flags.f_contiguous @@ -1036,7 +1055,7 @@ def shift(self, periods, axis=0, mgr=None): axis = new_values.ndim - axis - 1 if np.prod(new_values.shape): - new_values = np.roll(new_values, com._ensure_platform_int(periods), + new_values = np.roll(new_values, _ensure_platform_int(periods), axis=axis) axis_indexer = [slice(None)] * self.ndim @@ -1306,7 +1325,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): from pandas import Float64Index is_empty = values.shape[axis] == 0 - if com.is_list_like(qs): + if is_list_like(qs): ax = Float64Index(qs) if is_empty: @@ -1350,7 +1369,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) - if lib.isscalar(result): + if is_scalar(result): return ax, self.make_block_scalar(result) return ax, make_block(result, placement=np.arange(len(result)), @@ -1591,7 +1610,7 @@ def _can_hold_element(self, element): tipo = element.dtype.type return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) - return com.is_integer(element) + return is_integer(element) def _try_cast(self, element): try: @@ -1600,7 +1619,7 @@ def _try_cast(self, element): return element def should_store(self, value): - return com.is_integer_dtype(value) and value.dtype == self.dtype + return is_integer_dtype(value) and value.dtype == self.dtype class DatetimeLikeBlockMixin(object): @@ -1621,7 +1640,7 @@ def get_values(self, dtype=None): """ return object dtype as boxed values, such as Timestamps/Timedelta """ - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): return lib.map_infer(self.values.ravel(), self._box_func).reshape(self.values.shape) return self.values @@ -1641,7 +1660,7 @@ def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as seconds - if not isinstance(value, np.timedelta64) and com.is_integer(value): + if not isinstance(value, np.timedelta64) and is_integer(value): value = Timedelta(value, unit='s') return super(TimeDeltaBlock, self).fillna(value, **kwargs) @@ -1795,10 +1814,10 @@ def convert(self, *args, **kwargs): new_style |= kw in kwargs if new_style: - fn = convert._soft_convert_objects + fn = _soft_convert_objects fn_inputs = new_inputs else: - fn = convert._possibly_convert_objects + fn = _possibly_convert_objects fn_inputs = ['convert_dates', 'convert_numeric', 'convert_timedeltas'] fn_inputs += ['copy'] @@ -1884,15 +1903,15 @@ def should_store(self, value): def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, mgr=None): - to_rep_is_list = com.is_list_like(to_replace) - value_is_list = com.is_list_like(value) + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list either_list = to_rep_is_list or value_is_list result_blocks = [] blocks = [self] - if not either_list and com.is_re(to_replace): + if not either_list and is_re(to_replace): return self._replace_single(to_replace, value, inplace=inplace, filter=filter, regex=True, convert=convert, mgr=mgr) @@ -1930,10 +1949,10 @@ def replace(self, to_replace, value, inplace=False, filter=None, def _replace_single(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, 
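The TimeDeltaBlock.fillna tweak above means a plain integer fill value is read as a number of seconds; a sketch of that behaviour, assuming a pandas of this vintage (later versions are stricter here):

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.to_timedelta([1, np.nan], unit='d'))
    s.fillna(10)                        # 10 is interpreted as Timedelta(10, unit='s')
    s.fillna(pd.Timedelta(seconds=10))  # equivalent, and more explicit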
mgr=None): # to_replace is regex compilable - to_rep_re = regex and com.is_re_compilable(to_replace) + to_rep_re = regex and is_re_compilable(to_replace) # regex is regex compilable - regex_re = com.is_re_compilable(regex) + regex_re = is_re_compilable(regex) # only one will survive if to_rep_re and regex_re: @@ -2046,7 +2065,7 @@ def _try_coerce_result(self, result): # GH12564: CategoricalBlock is 1-dim only # while returned results could be any dim - if ((not com.is_categorical_dtype(result)) and + if ((not is_categorical_dtype(result)) and isinstance(result, np.ndarray)): result = _block_shape(result, ndim=self.ndim) @@ -2151,7 +2170,7 @@ def _astype(self, dtype, mgr=None, **kwargs): """ # if we are passed a datetime64[ns, tz] - if com.is_datetime64tz_dtype(dtype): + if is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) values = self.values @@ -2167,7 +2186,7 @@ def _can_hold_element(self, element): if is_list_like(element): element = np.array(element) return element.dtype == _NS_DTYPE or element.dtype == np.int64 - return (com.is_integer(element) or isinstance(element, datetime) or + return (is_integer(element) or isinstance(element, datetime) or isnull(element)) def _try_cast(self, element): @@ -2209,7 +2228,7 @@ def _try_coerce_args(self, values, other): "naive Block") other_mask = isnull(other) other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and com.is_integer_dtype(other): + elif hasattr(other, 'dtype') and is_integer_dtype(other): other = other.view('i8') else: try: @@ -2315,7 +2334,7 @@ def external_values(self): def get_values(self, dtype=None): # return object dtype as Timestamps with the zones - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) return lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) @@ -2561,7 +2580,7 @@ def shift(self, periods, axis=0, mgr=None): new_values = self.values.to_dense().take(indexer) # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(new_values) + new_values, fill_value = _maybe_upcast(new_values) if periods > 0: new_values[:periods] = fill_value else: @@ -3491,7 +3510,7 @@ def get(self, item, fastpath=True): indexer = np.arange(len(self.items))[isnull(self.items)] # allow a single nan location indexer - if not lib.isscalar(indexer): + if not is_scalar(indexer): if len(indexer) == 1: loc = indexer.item() else: @@ -3823,7 +3842,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_tuple[0] is None: - _, fill_value = com._maybe_promote(blk.dtype) + _, fill_value = _maybe_promote(blk.dtype) fill_tuple = (fill_value, ) return [blk.take_nd(slobj, axis=0, @@ -3881,7 +3900,7 @@ def _make_na_block(self, placement, fill_value=None): block_shape = list(self.shape) block_shape[0] = len(placement) - dtype, fill_value = com._infer_dtype_from_scalar(fill_value) + dtype, fill_value = _infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) return make_block(block_values, placement=placement) @@ -4560,7 +4579,7 @@ def _possibly_compare(a, b, op): else: result = op(a, b) - if lib.isscalar(result) and (is_a_array or is_b_array): + if is_scalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] if is_a_array: @@ -4611,7 +4630,7 @@ def _factor_indexer(shape, labels): expanded label indexer """ mult = np.array(shape)[::-1].cumprod()[::-1] - return com._ensure_platform_int( + return _ensure_platform_int( np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) @@ -4631,7 +4650,7 @@ def _get_blkno_placements(blknos, blk_count, group=True): """ - blknos = com._ensure_int64(blknos) + blknos = _ensure_int64(blknos) # FIXME: blk_count is unused, but it may avoid the use of dicts in cython for blkno, indexer in lib.get_blkno_indexers(blknos, group): @@ -4721,7 +4740,7 @@ def _putmask_smart(v, m, n): pass # change the dtype - dtype, _ = com._maybe_promote(n.dtype) + dtype, _ = _maybe_promote(n.dtype) nv = v.astype(dtype) try: nv[m] = n[m] @@ -4787,9 +4806,9 @@ def get_empty_dtype_and_na(join_units): if dtype is None: continue - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): upcast_cls = 'category' - elif com.is_datetimetz(dtype): + elif is_datetimetz(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' @@ -5062,8 +5081,8 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return com._get_dtype(com._maybe_promote(self.block.dtype, - self.block.fill_value)[0]) + return _get_dtype(_maybe_promote(self.block.dtype, + self.block.fill_value)[0]) return self._dtype diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 911fcaf529f98..b847415f274db 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -5,10 +5,15 @@ import numpy as np from distutils.version import LooseVersion -import pandas.core.common as com import pandas.algos as algos import pandas.lib as lib from pandas.compat import range, string_types +from pandas.types.common import (is_numeric_v_string_like, + is_float_dtype, is_datetime64_dtype, + is_integer_dtype, _ensure_float64, + is_scalar, + _DATELIKE_DTYPES) +from pandas.types.missing import isnull def mask_missing(arr, values_to_mask): @@ -24,7 +29,7 @@ def mask_missing(arr, 
values_to_mask): except Exception: values_to_mask = np.array(values_to_mask, dtype=object) - na_mask = com.isnull(values_to_mask) + na_mask = isnull(values_to_mask) nonna = values_to_mask[~na_mask] mask = None @@ -32,28 +37,28 @@ def mask_missing(arr, values_to_mask): if mask is None: # numpy elementwise comparison warning - if com.is_numeric_v_string_like(arr, x): + if is_numeric_v_string_like(arr, x): mask = False else: mask = arr == x # if x is a string and arr is not, then we get False and we must # expand the mask to size arr.shape - if lib.isscalar(mask): + if is_scalar(mask): mask = np.zeros(arr.shape, dtype=bool) else: # numpy elementwise comparison warning - if com.is_numeric_v_string_like(arr, x): + if is_numeric_v_string_like(arr, x): mask |= False else: mask |= arr == x if na_mask.any(): if mask is None: - mask = com.isnull(arr) + mask = isnull(arr) else: - mask |= com.isnull(arr) + mask |= isnull(arr) return mask @@ -110,7 +115,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, """ # Treat the original, non-scipy methods first. - invalid = com.isnull(yvalues) + invalid = isnull(yvalues) valid = ~invalid if not valid.any(): @@ -442,12 +447,12 @@ def pad_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_1d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object @@ -456,7 +461,7 @@ def pad_1d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values @@ -467,12 +472,12 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_1d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object @@ -481,7 +486,7 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) @@ -493,12 +498,12 @@ def pad_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_2d_datetime - elif 
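pad_1d/backfill_1d above are the engines behind forward- and back-filling with an optional limit; a minimal sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, np.nan, 4.0])
    s.fillna(method='pad')            # [1.0, 1.0, 1.0, 4.0]
    s.fillna(method='pad', limit=1)   # [1.0, 1.0, nan, 4.0]
    s.fillna(method='backfill')       # [1.0, 4.0, 4.0, 4.0]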
com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.pad_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_2d_inplace_object @@ -507,7 +512,7 @@ def pad_2d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for pad_2d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): @@ -523,12 +528,12 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_2d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object @@ -537,7 +542,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): @@ -570,22 +575,22 @@ def fill_zeros(result, x, y, name, fill): mask the nan's from x """ - if fill is None or com.is_float_dtype(result): + if fill is None or is_float_dtype(result): return result if name.startswith(('r', '__r')): x, y = y, x - is_typed_variable = (hasattr(y, 'dtype') or hasattr(y, 'type')) - is_scalar = lib.isscalar(y) + is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) + is_scalar_type = is_scalar(y) - if not is_typed_variable and not is_scalar: + if not is_variable_type and not is_scalar_type: return result - if is_scalar: + if is_scalar_type: y = np.array(y) - if com.is_integer_dtype(y): + if is_integer_dtype(y): if (y == 0).any(): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f390e3f04a6c3..7b89373dda7ba 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,16 +11,19 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib -from pandas.core.common import (isnull, notnull, _values_from_object, - _maybe_upcast_putmask, _ensure_float64, - _ensure_int64, _ensure_object, is_float, - is_integer, is_complex, is_float_dtype, - is_complex_dtype, is_integer_dtype, - is_bool_dtype, is_object_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, _get_dtype, - is_int_or_datetime_dtype, is_any_int_dtype, - _int64_max) +from pandas.types.common import (_ensure_int64, _ensure_object, + _ensure_float64, _get_dtype, + is_float, is_scalar, + is_integer, is_complex, is_float_dtype, + is_complex_dtype, is_integer_dtype, + is_bool_dtype, is_object_dtype, + is_datetime64_dtype, is_timedelta64_dtype, + is_datetime_or_timedelta_dtype, + is_int_or_datetime_dtype, is_any_int_dtype) +from pandas.types.cast import _int64_max, _maybe_upcast_putmask +from pandas.types.missing import isnull, notnull + +from pandas.core.common import _values_from_object class disallow(object): @@ -351,7 +354,7 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): d = count - dtype.type(ddof) # always return NaN, never inf - if lib.isscalar(count): + if 
is_scalar(count): if count <= ddof: count = np.nan d = np.nan @@ -623,7 +626,7 @@ def _get_counts(mask, axis, dtype=float): return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) - if lib.isscalar(count): + if is_scalar(count): return dtype.type(count) try: return count.astype(dtype) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 3aaca1eea486e..d76f011df3dd8 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -13,21 +13,25 @@ from pandas import compat, lib, tslib import pandas.index as _index from pandas.util.decorators import Appender -import pandas.core.common as com import pandas.computation.expressions as expressions from pandas.lib import isscalar from pandas.tslib import iNaT from pandas.compat import bind_method import pandas.core.missing as missing import pandas.algos as _algos -from pandas.core.common import (is_list_like, notnull, isnull, - _values_from_object, _maybe_match_name, - needs_i8_conversion, is_datetimelike_v_numeric, - is_integer_dtype, is_categorical_dtype, - is_object_dtype, is_timedelta64_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, PerformanceWarning, - ABCSeries, ABCIndex) +from pandas.core.common import (_values_from_object, _maybe_match_name, + PerformanceWarning) +from pandas.types.missing import notnull, isnull +from pandas.types.common import (needs_i8_conversion, + is_datetimelike_v_numeric, + is_integer_dtype, is_categorical_dtype, + is_object_dtype, is_timedelta64_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_bool_dtype, is_datetimetz, + is_list_like, + _ensure_object) +from pandas.types.cast import _maybe_upcast_putmask +from pandas.types.generic import ABCSeries, ABCIndex # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -446,7 +450,7 @@ def _convert_to_array(self, values, name=None, other=None): supplied_dtype = values.dtype inferred_type = supplied_dtype or lib.infer_dtype(values) if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or - com.is_datetimetz(inferred_type)): + is_datetimetz(inferred_type)): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (supplied_dtype is None and other is not None and @@ -463,7 +467,7 @@ def _convert_to_array(self, values, name=None, other=None): hasattr(ovalues, 'tz')): values = pd.DatetimeIndex(values) # datetime array with tz - elif com.is_datetimetz(values): + elif is_datetimetz(values): if isinstance(values, ABCSeries): values = values._values elif not (isinstance(values, (np.ndarray, ABCSeries)) and @@ -625,7 +629,7 @@ def na_op(x, y): "{op}".format(typ=type(x).__name__, op=str_rep)) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = missing.fill_zeros(result, x, y, name, fill_zeros) return result @@ -820,8 +824,8 @@ def na_op(x, y): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? 
else: - x = com._ensure_object(x) - y = com._ensure_object(y) + x = _ensure_object(x) + y = _ensure_object(y) result = lib.vec_binop(x, y, op) else: try: @@ -1095,7 +1099,7 @@ def na_op(x, y): "objects of type {x} and {y}".format( op=name, x=type(x), y=type(y))) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) result = missing.fill_zeros(result, x, y, name, fill_zeros) @@ -1220,7 +1224,7 @@ def na_op(x, y): result = np.empty(len(x), dtype=x.dtype) mask = notnull(x) result[mask] = op(x[mask], y) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = missing.fill_zeros(result, x, y, name, fill_zeros) return result diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 7d0bedcc2b381..4d61563cccce5 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -8,17 +8,21 @@ import numpy as np +from pandas.types.cast import (_infer_dtype_from_scalar, + _possibly_cast_item) +from pandas.types.common import (is_integer, is_list_like, + is_string_like, is_scalar) +from pandas.types.missing import notnull + import pandas.computation.expressions as expressions import pandas.core.common as com import pandas.core.ops as ops import pandas.core.missing as missing from pandas import compat -from pandas import lib from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict) from pandas.compat.numpy import function as nv from pandas.core.categorical import Categorical -from pandas.core.common import (PandasError, _try_sort, _default_index, - _infer_dtype_from_scalar, is_list_like) +from pandas.core.common import PandasError, _try_sort, _default_index from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, _ensure_index, @@ -168,7 +172,7 @@ def _init_data(self, data, copy, dtype, **kwargs): mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy) copy = False dtype = None - elif lib.isscalar(data) and all(x is not None for x in passed_axes): + elif is_scalar(data) and all(x is not None for x in passed_axes): if dtype is None: dtype, data = _infer_dtype_from_scalar(data) values = np.empty([len(x) for x in passed_axes], dtype=dtype) @@ -552,7 +556,7 @@ def set_value(self, *args, **kwargs): made_bigger = not np.array_equal(axes[0], self._info_axis) # how to make this logic simpler? 
if made_bigger: - com._possibly_cast_item(result, args[0], likely_dtype) + _possibly_cast_item(result, args[0], likely_dtype) return result.set_value(*args) @@ -582,7 +586,7 @@ def __setitem__(self, key, value): 'object was {1}'.format( shape[1:], tuple(map(int, value.shape)))) mat = np.asarray(value) - elif lib.isscalar(value): + elif is_scalar(value): dtype, value = _infer_dtype_from_scalar(value) mat = np.empty(shape[1:], dtype=dtype) mat.fill(value) @@ -653,7 +657,7 @@ def round(self, decimals=0, *args, **kwargs): """ nv.validate_round(args, kwargs) - if com.is_integer(decimals): + if is_integer(decimals): result = np.apply_along_axis(np.round, 0, self.values) return self._wrap_result(result, axis=0) raise TypeError("decimals must be an integer") @@ -687,7 +691,7 @@ def dropna(self, axis=0, how='any', inplace=False): axis = self._get_axis_number(axis) values = self.values - mask = com.notnull(values) + mask = notnull(values) for ax in reversed(sorted(set(range(self._AXIS_LEN)) - set([axis]))): mask = mask.sum(ax) @@ -711,7 +715,7 @@ def _combine(self, other, func, axis=0): return self._combine_panel(other, func) elif isinstance(other, DataFrame): return self._combine_frame(other, func, axis=axis) - elif lib.isscalar(other): + elif is_scalar(other): return self._combine_const(other, func) else: raise NotImplementedError("%s is not supported in combine " @@ -924,7 +928,7 @@ def to_frame(self, filter_observations=True): if filter_observations: # shaped like the return DataFrame - mask = com.notnull(self.values).all(axis=0) + mask = notnull(self.values).all(axis=0) # size = mask.sum() selector = mask.ravel() else: @@ -1218,7 +1222,7 @@ def transpose(self, *args, **kwargs): # check if a list of axes was passed in instead as a # single *args element if (len(args) == 1 and hasattr(args[0], '__iter__') and - not com.is_string_like(args[0])): + not is_string_like(args[0])): axes = args[0] else: axes = args diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 8d237016d1b33..4f601a2d377a6 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -6,6 +6,11 @@ import numpy as np +from pandas.types.common import _ensure_platform_int, is_list_like +from pandas.types.cast import _maybe_promote +from pandas.types.missing import notnull +import pandas.types.concat as _concat + from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -14,11 +19,8 @@ from pandas._sparse import IntIndex from pandas.core.categorical import Categorical -from pandas.core.common import notnull, _ensure_platform_int, _maybe_promote from pandas.core.groupby import get_group_index, _compress_group_index -import pandas.core.common as com -import pandas.types.concat as _concat import pandas.core.algorithms as algos import pandas.algos as _algos @@ -1063,7 +1065,7 @@ def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") - if com.is_list_like(item): + if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode))) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8015670212181..2c7f298dde2ec 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,18 +13,33 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, is_bool_indexer, - _default_index, _maybe_upcast, - _asarray_tuplesafe, _infer_dtype_from_scalar, - is_list_like, _values_from_object, - is_categorical_dtype, 
- _possibly_cast_to_datetime, - _possibly_castable, _possibly_convert_platform, - _try_sort, is_extension_type, is_datetimetz, - _maybe_match_name, ABCSparseArray, - _coerce_to_dtype, SettingWithCopyError, - _maybe_box_datetimelike, ABCDataFrame, - _dict_compat, is_integer) +from pandas.types.common import (_coerce_to_dtype, is_categorical_dtype, + is_integer, is_integer_dtype, + is_float_dtype, + is_extension_type, is_datetimetz, + is_datetimelike, + is_timedelta64_dtype, + is_list_like, + is_hashable, + is_iterator, + is_dict_like, + is_scalar, + _ensure_platform_int) +from pandas.types.generic import ABCSparseArray, ABCDataFrame +from pandas.types.cast import (_maybe_upcast, _infer_dtype_from_scalar, + _possibly_convert_platform, + _possibly_cast_to_datetime, _possibly_castable) +from pandas.types.missing import isnull, notnull + +from pandas.core.common import (is_bool_indexer, + _default_index, + _asarray_tuplesafe, + _values_from_object, + _try_sort, + _maybe_match_name, + SettingWithCopyError, + _maybe_box_datetimelike, + _dict_compat) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, Float64Index, _ensure_index) from pandas.core.indexing import check_bool_indexer, maybe_convert_indices @@ -303,7 +318,7 @@ def name(self): @name.setter def name(self, value): - if value is not None and not com.is_hashable(value): + if value is not None and not is_hashable(value): raise TypeError('Series.name must be a hashable type') object.__setattr__(self, '_name', value) @@ -580,7 +595,7 @@ def __getitem__(self, key): try: result = self.index.get_value(self, key) - if not lib.isscalar(result): + if not is_scalar(result): if is_list_like(result) and not isinstance(result, Series): # we need to box if we have a non-unique index here @@ -613,10 +628,10 @@ def __getitem__(self, key): except Exception: raise - if com.is_iterator(key): + if is_iterator(key): key = list(key) - if is_bool_indexer(key): + if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) return self._get_with(key) @@ -710,9 +725,9 @@ def setitem(key, value): elif key is Ellipsis: self[:] = value return - elif is_bool_indexer(key): + elif com.is_bool_indexer(key): pass - elif com.is_timedelta64_dtype(self.dtype): + elif is_timedelta64_dtype(self.dtype): # reassign a null value to iNaT if isnull(value): value = tslib.iNaT @@ -736,7 +751,7 @@ def setitem(key, value): if 'unorderable' in str(e): # pragma: no cover raise IndexError(key) - if is_bool_indexer(key): + if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) try: self._where(~key, value, inplace=True) @@ -1060,7 +1075,7 @@ def _get_repr(self, name=False, header=True, index=True, length=True, def __iter__(self): """ provide iteration over the values of the Series box values if necessary """ - if com.is_datetimelike(self): + if is_datetimelike(self): return (_maybe_box_datetimelike(x) for x in self._values) else: return iter(self._values) @@ -1349,7 +1364,7 @@ def quantile(self, q=0.5, interpolation='linear'): result = self._data.quantile(qs=q, interpolation=interpolation) - if com.is_list_like(q): + if is_list_like(q): return self._constructor(result, index=Float64Index(q), name=self.name) @@ -1481,7 +1496,7 @@ def dot(self, other): @Appender(base._shared_docs['searchsorted']) def searchsorted(self, v, side='left', sorter=None): if sorter is not None: - sorter = com._ensure_platform_int(sorter) + sorter = _ensure_platform_int(sorter) return self._values.searchsorted(Series(v)._values, side=side, sorter=sorter) @@ -1727,7 +1742,7 
@@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.groupby import _lexsort_indexer indexer = _lexsort_indexer(index.labels, orders=ascending) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_index = index.take(indexer) else: new_index, indexer = index.sort_values(return_indexer=True, @@ -2265,8 +2280,8 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, @Appender(generic._shared_docs['rename'] % _shared_doc_kwargs) def rename(self, index=None, **kwargs): - non_mapping = lib.isscalar(index) or (com.is_list_like(index) and - not com.is_dict_like(index)) + non_mapping = is_scalar(index) or (is_list_like(index) and + not is_dict_like(index)) if non_mapping: return self._set_name(index, inplace=kwargs.get('inplace')) return super(Series, self).rename(index=index, **kwargs) @@ -2345,7 +2360,7 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): if convert: indices = maybe_convert_indices(indices, len(self._get_axis(axis))) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) new_index = self.index.take(indices) new_values = self._values.take(indices) return self._constructor(new_values, @@ -2771,7 +2786,7 @@ def _try_cast(arr, take_fast_path): subarr = np.array(data, copy=False) # possibility of nan -> garbage - if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): + if is_float_dtype(data.dtype) and is_integer_dtype(dtype): if not isnull(data).any(): subarr = _try_cast(data, True) elif copy: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a3f687b7fd73c..6ec28f9735850 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,14 +1,19 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import (isnull, notnull, _values_from_object, - is_bool_dtype, - is_list_like, is_categorical_dtype, - is_object_dtype, is_string_like) +from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.missing import isnull, notnull +from pandas.types.common import (is_bool_dtype, + is_categorical_dtype, + is_object_dtype, + is_string_like, + is_list_like, + is_scalar) +from pandas.core.common import _values_from_object + from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin -from pandas.types import api as gt from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib @@ -152,7 +157,7 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) - if isinstance(arr, gt.ABCSeries): + if isinstance(arr, ABCSeries): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -343,7 +348,7 @@ def str_repeat(arr, repeats): ------- repeated : Series/Index of objects """ - if lib.isscalar(repeats): + if is_scalar(repeats): def rep(x): try: @@ -696,7 +701,7 @@ def str_extractall(arr, pat, flags=0): if regex.groups == 0: raise ValueError("pattern contains no capture groups") - if isinstance(arr, gt.ABCIndex): + if isinstance(arr, ABCIndex): arr = arr.to_series().reset_index(drop=True) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) @@ -1538,7 +1543,7 @@ def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) def zfill(self, width): - """" + """ Filling left side of strings in the 
Series/Index with 0. Equivalent to :meth:`str.zfill`. @@ -1820,7 +1825,7 @@ class StringAccessorMixin(object): def _make_str_accessor(self): from pandas.core.index import Index - if (isinstance(self, gt.ABCSeries) and + if (isinstance(self, ABCSeries) and not ((is_categorical_dtype(self.dtype) and is_object_dtype(self.values.categories)) or (is_object_dtype(self.dtype)))): diff --git a/pandas/core/window.py b/pandas/core/window.py index 1e34d18fe3e54..bc4d34529287b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -11,6 +11,15 @@ import numpy as np from collections import defaultdict +from pandas.types.generic import ABCSeries, ABCDataFrame +from pandas.types.common import (is_integer, + is_bool, + is_float_dtype, + is_integer_dtype, + needs_i8_conversion, + is_timedelta64_dtype, + is_list_like, + _ensure_float64) import pandas as pd from pandas.lib import isscalar from pandas.core.base import (PandasObject, SelectionMixin, @@ -64,10 +73,10 @@ def _constructor(self): return Window def validate(self): - if self.center is not None and not com.is_bool(self.center): + if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") if self.min_periods is not None and not \ - com.is_integer(self.min_periods): + is_integer(self.min_periods): raise ValueError("min_periods must be an integer") def _convert_freq(self, how=None): @@ -75,7 +84,7 @@ def _convert_freq(self, how=None): obj = self._selected_obj if (self.freq is not None and - isinstance(obj, (com.ABCSeries, com.ABCDataFrame))): + isinstance(obj, (ABCSeries, ABCDataFrame))): if how is not None: warnings.warn("The how kw argument is deprecated and removed " "in a future version. You can resample prior " @@ -111,7 +120,7 @@ def _gotitem(self, key, ndim, subset=None): self = self._shallow_copy(subset) self._reset_cache() if subset.ndim == 2: - if isscalar(key) and key in subset or com.is_list_like(key): + if isscalar(key) and key in subset or is_list_like(key): self._selection = key return self @@ -150,11 +159,11 @@ def _prep_values(self, values=None, kill_inf=True, how=None): # GH #12373 : rolling functions error on float32 data # make sure the data is coerced to float64 - if com.is_float_dtype(values.dtype): - values = com._ensure_float64(values) - elif com.is_integer_dtype(values.dtype): - values = com._ensure_float64(values) - elif com.needs_i8_conversion(values.dtype): + if is_float_dtype(values.dtype): + values = _ensure_float64(values) + elif is_integer_dtype(values.dtype): + values = _ensure_float64(values) + elif needs_i8_conversion(values.dtype): raise NotImplementedError("ops for {action} for this " "dtype {dtype} are not " "implemented".format( @@ -162,7 +171,7 @@ def _prep_values(self, values=None, kill_inf=True, how=None): dtype=values.dtype)) else: try: - values = com._ensure_float64(values) + values = _ensure_float64(values) except (ValueError, TypeError): raise TypeError("cannot handle this type -> {0}" "".format(values.dtype)) @@ -184,7 +193,7 @@ def _wrap_result(self, result, block=None, obj=None): # coerce if necessary if block is not None: - if com.is_timedelta64_dtype(block.values.dtype): + if is_timedelta64_dtype(block.values.dtype): result = pd.to_timedelta( result.ravel(), unit='ns').values.reshape(result.shape) @@ -345,7 +354,7 @@ def _prep_window(self, **kwargs): window = self._get_window() if isinstance(window, (list, tuple, np.ndarray)): return com._asarray_tuplesafe(window).astype(float) - elif com.is_integer(window): + elif is_integer(window): import 
scipy.signal as sig # the below may pop from kwargs @@ -543,7 +552,7 @@ def _apply(self, func, name=None, window=None, center=None, def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) # GH #12373: rolling functions error on float32 data - return cfunc(com._ensure_float64(arg), + return cfunc(_ensure_float64(arg), window, minp, **kwargs) # calculation function @@ -586,7 +595,7 @@ def count(self): results = [] for b in blocks: - if com.needs_i8_conversion(b.values): + if needs_i8_conversion(b.values): result = b.notnull().astype(int) else: try: @@ -850,7 +859,7 @@ class Rolling(_Rolling_and_Expanding): def validate(self): super(Rolling, self).validate() - if not com.is_integer(self.window): + if not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: raise ValueError("window must be non-negative") @@ -1484,7 +1493,7 @@ def _get_center_of_mass(com, span, halflife, alpha): def _offset(window, center): - if not com.is_integer(window): + if not is_integer(window): window = len(window) offset = (window - 1) / 2. if center else 0 try: diff --git a/pandas/formats/format.py b/pandas/formats/format.py index cc46ed57aeff0..436a9d5d5d4c8 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -10,8 +10,19 @@ import sys +from pandas.types.missing import isnull, notnull +from pandas.types.common import (is_categorical_dtype, + is_float_dtype, + is_period_arraylike, + is_integer_dtype, + is_datetimetz, + is_integer, + is_float, + is_numeric_dtype, + is_datetime64_dtype, + is_timedelta64_dtype) + from pandas.core.base import PandasObject -from pandas.core.common import isnull, notnull, is_numeric_dtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u, @@ -194,7 +205,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done # for Categoricals - if com.is_categorical_dtype(self.tr_series.dtype): + if is_categorical_dtype(self.tr_series.dtype): level_info = self.tr_series._values._repr_categories_info() if footer: footer += "\n" @@ -316,12 +327,12 @@ def should_show_dimensions(self): def _get_formatter(self, i): if isinstance(self.formatters, (list, tuple)): - if com.is_integer(i): + if is_integer(i): return self.formatters[i] else: return None else: - if com.is_integer(i) and i not in self.columns: + if is_integer(i) and i not in self.columns: i = self.columns[i] return self.formatters.get(i, None) @@ -1646,7 +1657,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, def _format_value(self, val): if lib.checknull(val): val = self.na_rep - elif com.is_float(val): + elif is_float(val): if lib.isposinf_scalar(val): val = self.inf_rep elif lib.isneginf_scalar(val): @@ -1867,19 +1878,19 @@ def get_formatted_cells(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.'): - if com.is_categorical_dtype(values): + if is_categorical_dtype(values): fmt_klass = CategoricalArrayFormatter - elif com.is_float_dtype(values.dtype): + elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter - elif com.is_period_arraylike(values): + elif is_period_arraylike(values): fmt_klass = PeriodArrayFormatter - elif com.is_integer_dtype(values.dtype): + elif is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter - elif com.is_datetimetz(values): + elif is_datetimetz(values): fmt_klass = 
Datetime64TZFormatter - elif com.is_datetime64_dtype(values.dtype): + elif is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter - elif com.is_timedelta64_dtype(values.dtype): + elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter @@ -1949,14 +1960,14 @@ def _format(x): if isinstance(vals, Index): vals = vals._values - is_float = lib.map_infer(vals, com.is_float) & notnull(vals) - leading_space = is_float.any() + is_float_type = lib.map_infer(vals, is_float) & notnull(vals) + leading_space = is_float_type.any() fmt_values = [] for i, v in enumerate(vals): - if not is_float[i] and leading_space: + if not is_float_type[i] and leading_space: fmt_values.append(' %s' % _format(v)) - elif is_float[i]: + elif is_float_type[i]: fmt_values.append(float_format(v)) else: fmt_values.append(' %s' % _format(v)) diff --git a/pandas/formats/printing.py b/pandas/formats/printing.py index a4eaec8d5334b..37bd4b63d6f7a 100644 --- a/pandas/formats/printing.py +++ b/pandas/formats/printing.py @@ -2,9 +2,9 @@ printing tools """ +from pandas.types.inference import is_sequence from pandas import compat from pandas.compat import u -import pandas.core.common as com from pandas.core.config import get_option @@ -213,7 +213,7 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items) - elif (com.is_sequence(thing) and + elif (is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, quote_strings=quote_strings, diff --git a/pandas/formats/style.py b/pandas/formats/style.py index 477ecccc03f4f..472fd958d35eb 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -17,10 +17,11 @@ "or `pip install Jinja2`" raise ImportError(msg) +from pandas.types.common import is_float, is_string_like + import numpy as np import pandas as pd from pandas.compat import lzip, range -import pandas.core.common as com from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice try: import matplotlib.pyplot as plt @@ -153,7 +154,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if com.is_float(x): + if is_float(x): return '{:>.{precision}g}'.format(x, precision=self.precision) else: return x @@ -893,7 +894,7 @@ def _highlight_extrema(data, color='yellow', max_=True): def _maybe_wrap_formatter(formatter): - if com.is_string_like(formatter): + if is_string_like(formatter): return lambda x: formatter.format(x) elif callable(formatter): return formatter diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 0bb80be013275..5c9938c932da2 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -12,6 +12,28 @@ from pandas.compat import range, u from pandas.compat.numpy import function as nv from pandas import compat + + +from pandas.types.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex +from pandas.types.missing import isnull, array_equivalent +from pandas.types.common import (_ensure_int64, _ensure_object, + _ensure_platform_int, + is_datetimetz, + is_integer, + is_float, + is_dtype_equal, + is_object_dtype, + is_categorical_dtype, + is_bool_dtype, + is_integer_dtype, is_float_dtype, + needs_i8_conversion, + is_iterator, is_list_like, + is_scalar) +from pandas.types.cast import 
_coerce_indexer_dtype +from pandas.core.common import (is_bool_indexer, + _values_from_object, + _asarray_tuplesafe) + from pandas.core.base import (PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin) import pandas.core.base as base @@ -22,15 +44,6 @@ import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing -from pandas.core.common import (isnull, array_equivalent, - is_object_dtype, is_datetimetz, ABCSeries, - ABCPeriodIndex, ABCMultiIndex, - _values_from_object, is_float, is_integer, - is_iterator, is_categorical_dtype, - _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, - is_integer_dtype, is_float_dtype, - needs_i8_conversion) from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin @@ -223,7 +236,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') else: - subarr = com._asarray_tuplesafe(data, dtype=object) + subarr = _asarray_tuplesafe(data, dtype=object) # _asarray_tuplesafe does not always copy underlying data, # so need to make sure that this happens @@ -264,7 +277,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) - elif data is None or lib.isscalar(data): + elif data is None or is_scalar(data): cls._scalar_data_error(data) else: if (tupleize_cols and isinstance(data, list) and data and @@ -284,7 +297,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # python2 - MultiIndex fails on mixed types pass # other iterable of some kind - subarr = com._asarray_tuplesafe(data, dtype=object) + subarr = _asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) """ @@ -539,7 +552,7 @@ def _coerce_to_ndarray(cls, data): """ if not isinstance(data, (np.ndarray, Index)): - if data is None or lib.isscalar(data): + if data is None or is_scalar(data): cls._scalar_data_error(data) # other iterable of some kind @@ -841,7 +854,7 @@ def to_datetime(self, dayfirst=False): return DatetimeIndex(self.values) def _assert_can_do_setop(self, other): - if not com.is_list_like(other): + if not is_list_like(other): raise TypeError('Input must be Index or array-like') return True @@ -1325,7 +1338,7 @@ def __getitem__(self, key): getitem = self._data.__getitem__ promote = self._shallow_copy - if lib.isscalar(key): + if is_scalar(key): return getitem(key) if isinstance(key, slice): @@ -1338,7 +1351,7 @@ def __getitem__(self, key): key = _values_from_object(key) result = getitem(key) - if not lib.isscalar(result): + if not is_scalar(result): return promote(result) else: return result @@ -1426,7 +1439,7 @@ def _ensure_compat_concat(indexes): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) if self._can_hold_na: taken = self._assert_take_fillable(self.values, indices, allow_fill=allow_fill, @@ -1442,7 +1455,7 @@ def take(self, indices, axis=0, allow_fill=True, def _assert_take_fillable(self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan): """ Internal method to handle NA filling of take """ - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) # only fill if we are 
passing a non-None fill_value if allow_fill and fill_value is not None: @@ -1491,7 +1504,7 @@ def _convert_for_op(self, value): def _assert_can_do_op(self, value): """ Check value is valid for scalar op """ - if not lib.isscalar(value): + if not is_scalar(value): msg = "'value' must be a scalar, passed: {0}" raise TypeError(msg.format(type(value).__name__)) @@ -1706,7 +1719,7 @@ def argsort(self, *args, **kwargs): return result.argsort(*args, **kwargs) def __add__(self, other): - if com.is_list_like(other): + if is_list_like(other): warnings.warn("using '+' to provide set union with Indexes is " "deprecated, use '|' or .union()", FutureWarning, stacklevel=2) @@ -1783,7 +1796,7 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) @@ -1866,7 +1879,7 @@ def intersection(self, other): if self.equals(other): return self._get_consensus_name(other) - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other) @@ -2028,7 +2041,7 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, Index) and lib.isscalar(key): + if isinstance(s, Index) and is_scalar(key): try: return s[key] except (IndexError, ValueError): @@ -2061,7 +2074,7 @@ def get_value(self, series, key): raise e1 except TypeError: # python 3 - if lib.isscalar(key): # pragma: no cover + if is_scalar(key): # pragma: no cover raise IndexError(key) raise InvalidIndexError(key) @@ -2137,7 +2150,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return pself.get_indexer(ptarget, method=method, limit=limit, tolerance=tolerance) - if not com.is_dtype_equal(self.dtype, target.dtype): + if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer(target, method=method, limit=limit, @@ -2161,7 +2174,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): indexer = self._engine.get_indexer(target._values) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def _convert_tolerance(self, tolerance): # override this method on subclasses @@ -2443,7 +2456,7 @@ def _reindex_non_unique(self, target): if len(missing): l = np.arange(len(indexer)) - missing = com._ensure_platform_int(missing) + missing = _ensure_platform_int(missing) missing_labels = target.take(missing) missing_indexer = _ensure_int64(l[~check]) cur_labels = self.take(indexer[check])._values @@ -2541,7 +2554,7 @@ def join(self, other, how='left', level=None, return_indexers=False): result = x, z, y return result - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.join(other, how=how, return_indexers=return_indexers) @@ -2637,8 +2650,8 @@ def _join_non_unique(self, other, how='left', return_indexers=False): [other._values], how=how, sort=True) - left_idx = com._ensure_platform_int(left_idx) - right_idx = com._ensure_platform_int(right_idx) + left_idx = _ensure_platform_int(left_idx) + right_idx = _ensure_platform_int(right_idx) join_index = self.values.take(left_idx) mask = left_idx == -1 @@ 
-2850,9 +2863,9 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): kind=kind) # return a slice - if not lib.isscalar(start_slice): + if not is_scalar(start_slice): raise AssertionError("Start slice bound is non-scalar") - if not lib.isscalar(end_slice): + if not is_scalar(end_slice): raise AssertionError("End slice bound is non-scalar") return slice(start_slice, end_slice, step) @@ -3483,7 +3496,7 @@ def _get_na_value(dtype): def _ensure_frozen(array_like, categories, copy=False): - array_like = com._coerce_indexer_dtype(array_like, categories) + array_like = _coerce_indexer_dtype(array_like, categories) array_like = array_like.view(FrozenNDArray) if copy: array_like = array_like.copy() diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 84b8926f4177f..f1d4fe2f26bdd 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -1,15 +1,21 @@ import numpy as np -import pandas.lib as lib import pandas.index as _index from pandas import compat from pandas.compat.numpy import function as nv +from pandas.types.generic import ABCCategorical, ABCSeries +from pandas.types.common import (is_categorical_dtype, + _ensure_platform_int, + is_list_like, + is_scalar) +from pandas.types.missing import array_equivalent + + from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg) from pandas.core.config import get_option from pandas.indexes.base import Index, _index_shared_docs import pandas.core.base as base -import pandas.core.common as com import pandas.core.missing as missing import pandas.indexes.base as ibase @@ -49,7 +55,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, com.ABCCategorical): + if isinstance(data, ABCCategorical): data = cls._create_categorical(cls, data, categories, ordered) elif isinstance(data, CategoricalIndex): data = data._data @@ -58,7 +64,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, # don't allow scalars # if data is None, then categories must be provided - if lib.isscalar(data): + if is_scalar(data): if data is not None or categories is None: cls._scalar_data_error(data) data = [] @@ -116,7 +122,7 @@ def _create_categorical(self, data, categories=None, ordered=None): ------- Categorical """ - if not isinstance(data, com.ABCCategorical): + if not isinstance(data, ABCCategorical): from pandas.core.categorical import Categorical data = Categorical(data, categories=categories, ordered=ordered) else: @@ -164,7 +170,7 @@ def _is_dtype_compat(self, other): ------ TypeError if the dtypes are not compatible """ - if com.is_categorical_dtype(other): + if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): @@ -172,7 +178,7 @@ def _is_dtype_compat(self, other): "when appending") else: values = other - if not com.is_list_like(values): + if not is_list_like(values): values = [values] other = CategoricalIndex(self._create_categorical( self, other, categories=self.categories, ordered=self.ordered)) @@ -191,7 +197,7 @@ def equals(self, other): try: other = self._is_dtype_compat(other) - return com.array_equivalent(self._data, other) + return array_equivalent(self._data, other) except (TypeError, ValueError): pass @@ -360,7 +366,7 @@ def reindex(self, target, method=None, level=None, limit=None, target = ibase._ensure_index(target) - if not com.is_categorical_dtype(target) and not target.is_unique: + if not 
is_categorical_dtype(target) and not target.is_unique: raise ValueError("cannot reindex with a non-unique indexer") indexer, missing = self.get_indexer_non_unique(np.array(target)) @@ -388,7 +394,7 @@ def reindex(self, target, method=None, level=None, limit=None, # unless we had an inital Categorical to begin with # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) - if com.is_categorical_dtype(target): + if is_categorical_dtype(target): new_target = target._shallow_copy(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -460,7 +466,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def get_indexer_non_unique(self, target): """ this is the same for a CategoricalIndex for get_indexer; the API @@ -491,7 +497,7 @@ def _convert_list_indexer(self, keyarr, kind=None): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.codes, indices, allow_fill=allow_fill, fill_value=fill_value, @@ -591,12 +597,12 @@ def _evaluate_compare(self, other): self, other._values, categories=self.categories, ordered=self.ordered) - if isinstance(other, (com.ABCCategorical, np.ndarray, - com.ABCSeries)): + if isinstance(other, (ABCCategorical, np.ndarray, + ABCSeries)): if len(self.values) != len(other): raise ValueError("Lengths must match to compare") - if isinstance(other, com.ABCCategorical): + if isinstance(other, ABCCategorical): if not self.values.is_dtype_equal(other): raise TypeError("categorical index comparisions must " "have the same categories and ordered " @@ -619,7 +625,7 @@ def _delegate_method(self, name, *args, **kwargs): if 'inplace' in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if lib.isscalar(res): + if is_scalar(res): return res return CategoricalIndex(res, name=self.name) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 05b2045a4850f..365a971f82a3b 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -13,6 +13,21 @@ from pandas.compat import range, zip, lrange, lzip, map from pandas.compat.numpy import function as nv from pandas import compat + + +from pandas.types.common import (_ensure_int64, + _ensure_platform_int, + is_object_dtype, + is_iterator, + is_list_like, + is_scalar) +from pandas.types.missing import isnull, array_equivalent +from pandas.core.common import (_values_from_object, + is_bool_indexer, + is_null_slice, + PerformanceWarning) + + from pandas.core.base import FrozenList import pandas.core.base as base from pandas.util.decorators import (Appender, cache_readonly, @@ -21,13 +36,6 @@ import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing -from pandas.core.common import (isnull, array_equivalent, - is_object_dtype, - _values_from_object, - is_iterator, - _ensure_int64, is_bool_indexer, - is_list_like, is_null_slice, - PerformanceWarning) from pandas.core.config import get_option @@ -798,7 +806,7 @@ def lexsort_depth(self): else: return 0 - int64_labels = [com._ensure_int64(lab) for lab in self.labels] + int64_labels = [_ensure_int64(lab) for lab in 
self.labels] for k in range(self.nlevels, 0, -1): if lib.is_lexsorted(int64_labels[:k]): return k @@ -984,7 +992,7 @@ def __setstate__(self, state): self._reset_identity() def __getitem__(self, key): - if lib.isscalar(key): + if is_scalar(key): retval = [] for lev, lab in zip(self.levels, self.labels): if lab[key] == -1: @@ -1011,7 +1019,7 @@ def __getitem__(self, key): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.labels, indices, allow_fill=allow_fill, fill_value=fill_value, @@ -1313,7 +1321,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): if not ascending: indexer = indexer[::-1] - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_labels = [lab.take(indexer) for lab in self.labels] new_index = MultiIndex(labels=new_labels, levels=self.levels, @@ -1377,7 +1385,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): else: indexer = self_index._engine.get_indexer(target._values) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def reindex(self, target, method=None, level=None, limit=None, tolerance=None): @@ -1759,7 +1767,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # selected from pandas import Series mapper = Series(indexer) - indexer = labels.take(com._ensure_platform_int(indexer)) + indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) m = result.map(mapper)._values diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 89fc05fdcc5f5..86d22e141f781 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -3,13 +3,15 @@ import pandas.algos as _algos import pandas.index as _index +from pandas.types.common import (is_dtype_equal, pandas_dtype, + is_float_dtype, is_object_dtype, + is_integer_dtype, is_scalar) +from pandas.types.missing import array_equivalent, isnull +from pandas.core.common import _values_from_object + from pandas import compat from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly -import pandas.core.common as com -from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype, - is_float_dtype, is_object_dtype, - is_integer_dtype) import pandas.indexes.base as ibase @@ -164,8 +166,8 @@ def equals(self, other): if self.is_(other): return True - return com.array_equivalent(com._values_from_object(self), - com._values_from_object(other)) + return array_equivalent(_values_from_object(self), + _values_from_object(other)) def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None @@ -287,17 +289,17 @@ def _format_native_types(self, na_rep='', float_format=None, decimal='.', def get_value(self, series, key): """ we always want to get an index value, never a value """ - if not lib.isscalar(key): + if not is_scalar(key): raise InvalidIndexError from pandas.core.indexing import maybe_droplevels from pandas.core.series import Series - k = com._values_from_object(key) + k = _values_from_object(key) loc = self.get_loc(k) - new_values = com._values_from_object(series)[loc] + new_values = _values_from_object(series)[loc] - if lib.isscalar(new_values) or new_values is None: + if is_scalar(new_values) or new_values is None: return 
new_values new_index = self[loc] diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 168143fdea047..f680d2da0161e 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -4,14 +4,16 @@ import numpy as np import pandas.index as _index +from pandas.types.common import (is_integer, + is_scalar, + is_int64_dtype) + from pandas import compat from pandas.compat import lrange, range from pandas.compat.numpy import function as nv from pandas.indexes.base import Index, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly -import pandas.core.common as com import pandas.indexes.base as ibase -import pandas.lib as lib from pandas.indexes.numeric import Int64Index @@ -120,7 +122,7 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result = object.__new__(cls) # handle passed None, non-integers - if start is None or not com.is_integer(start): + if start is None or not is_integer(start): try: return RangeIndex(start, stop, step, name=name, **kwargs) except TypeError: @@ -139,7 +141,7 @@ def _simple_new(cls, start, stop=None, step=None, name=None, @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ - if not (dtype is None or com.is_int64_dtype(dtype)): + if not (dtype is None or is_int64_dtype(dtype)): raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex') @cache_readonly @@ -448,7 +450,7 @@ def __getitem__(self, key): """ super_getitem = super(RangeIndex, self).__getitem__ - if lib.isscalar(key): + if is_scalar(key): n = int(key) if n != key: return super_getitem(key) @@ -510,7 +512,7 @@ def __getitem__(self, key): return super_getitem(key) def __floordiv__(self, other): - if com.is_integer(other): + if is_integer(other): if (len(self) == 0 or self._start % other == 0 and self._step % other == 0): @@ -560,7 +562,7 @@ def _evaluate_numeric_binop(self, other): # we don't have a representable op # so return a base index - if not com.is_integer(rstep) or not rstep: + if not is_integer(rstep) or not rstep: raise ValueError else: @@ -577,7 +579,7 @@ def _evaluate_numeric_binop(self, other): # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return # as a Float64Index if we have float-like descriptors - if not all([com.is_integer(x) for x in + if not all([is_integer(x) for x in [rstart, rstop, rstep]]): result = result.astype('float64') diff --git a/pandas/io/common.py b/pandas/io/common.py index 76395928eb011..6f9bddd0fdf9b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -11,8 +11,8 @@ from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat from pandas.formats.printing import pprint_thing -from pandas.core.common import is_number, AbstractMethodError - +from pandas.core.common import AbstractMethodError +from pandas.types.common import is_number try: import pathlib diff --git a/pandas/io/data.py b/pandas/io/data.py index 5fa440e7bb1ff..68151fbb091fa 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -19,7 +19,9 @@ ) import pandas.compat as compat from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime, DatetimeIndex, DateOffset -from pandas.core.common import is_list_like, PandasError + +from pandas.types.common import is_list_like +from pandas.core.common import PandasError from pandas.io.common import urlopen, ZipFile, urlencode from pandas.tseries.offsets import MonthEnd from pandas.util.testing import _network_error_classes diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 
775465ea9372d..703cdbeaa7a8f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -10,6 +10,9 @@ import abc import numpy as np +from pandas.types.common import (is_integer, is_float, + is_bool, is_list_like) + from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, @@ -22,7 +25,6 @@ from pandas.formats.printing import pprint_thing import pandas.compat as compat import pandas.compat.openpyxl_compat as openpyxl_compat -import pandas.core.common as com from warnings import warn from distutils.version import LooseVersion @@ -423,17 +425,17 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = DataFrame() continue - if com.is_list_like(header) and len(header) == 1: + if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None: - if com.is_list_like(header): + if is_list_like(header): header_names = [] control_row = [True for x in data[0]] for row in header: - if com.is_integer(skiprows): + if is_integer(skiprows): row += skiprows data[row], control_row = _fill_mi_header( @@ -444,9 +446,9 @@ def _parse_cell(cell_contents, cell_typ): else: data[header] = _trim_excel_header(data[header]) - if com.is_list_like(index_col): + if is_list_like(index_col): # forward fill values for MultiIndex index - if not com.is_list_like(header): + if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) @@ -459,7 +461,7 @@ def _parse_cell(cell_contents, cell_typ): else: last = data[row][col] - if com.is_list_like(header) and len(header) > 1: + if is_list_like(header) and len(header) > 1: has_index_names = True # GH 12292 : error when read one empty column from excel file @@ -556,21 +558,21 @@ def _pop_header_name(row, index_col): return none_fill(row[0]), row[1:] else: # pop out header name and fill w/ blank - i = index_col if not com.is_list_like(index_col) else max(index_col) + i = index_col if not is_list_like(index_col) else max(index_col) return none_fill(row[i]), row[:i] + [''] + row[i + 1:] def _conv_value(val): # Convert numpy types to Python types for the Excel writers. 
- if com.is_integer(val): + if is_integer(val): val = int(val) - elif com.is_float(val): + elif is_float(val): val = float(val) - elif com.is_bool(val): + elif is_bool(val): val = bool(val) elif isinstance(val, Period): val = "%s" % val - elif com.is_list_like(val): + elif is_list_like(val): val = str(val) return val diff --git a/pandas/io/html.py b/pandas/io/html.py index 609642e248eda..e0d84a9617ae4 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -12,12 +12,12 @@ import numpy as np +from pandas.types.common import is_list_like from pandas.io.common import (EmptyDataError, _is_url, urlopen, parse_url, _validate_header_arg) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) -from pandas.core import common as com from pandas import Series from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing @@ -107,7 +107,7 @@ def _get_skiprows(skiprows): """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) - elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows): + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return skiprows elif skiprows is None: return 0 diff --git a/pandas/io/packers.py b/pandas/io/packers.py index ff06a5f212f8b..14e2c9b371296 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -47,6 +47,10 @@ import numpy as np from pandas import compat from pandas.compat import u, u_safe + +from pandas.types.common import (is_categorical_dtype, is_object_dtype, + needs_i8_conversion, pandas_dtype) + from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, @@ -55,9 +59,7 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame -from pandas.core.common import (PerformanceWarning, - is_categorical_dtype, is_object_dtype, - needs_i8_conversion, pandas_dtype) +from pandas.core.common import PerformanceWarning from pandas.io.common import get_filepath_or_buffer from pandas.core.internals import BlockManager, make_block import pandas.core.internals as internals diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dc9455289b757..84ea2a92b8026 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,20 +2,22 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import (range, lrange, StringIO, lzip, zip, - string_types, map, OrderedDict) -from pandas import compat from collections import defaultdict import re import csv import warnings +import datetime import numpy as np +from pandas import compat +from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas.types.common import (is_integer, _ensure_object, + is_list_like, is_integer_dtype, + is_float, + is_scalar) from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame -import datetime -import pandas.core.common as com from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser @@ -326,11 +328,11 @@ def _validate_nrows(nrows): msg = "'nrows' must be an integer" if nrows is not None: - if com.is_float(nrows): + if is_float(nrows): if 
int(nrows) != nrows: raise ValueError(msg) nrows = int(nrows) - elif not com.is_integer(nrows): + elif not is_integer(nrows): raise ValueError(msg) return nrows @@ -869,7 +871,7 @@ def _clean_options(self, options, engine): # handle skiprows; this is internally handled by the # c-engine, so only need for python parsers if engine != 'c': - if com.is_integer(skiprows): + if is_integer(skiprows): skiprows = lrange(skiprows) skiprows = set() if skiprows is None else set(skiprows) @@ -961,7 +963,7 @@ def _validate_parse_dates_arg(parse_dates): "for the 'parse_dates' parameter") if parse_dates is not None: - if lib.isscalar(parse_dates): + if is_scalar(parse_dates): if not lib.is_bool(parse_dates): raise TypeError(msg) @@ -1021,8 +1023,8 @@ def __init__(self, kwds): is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) if not (is_sequence and - all(map(com.is_integer, self.index_col)) or - com.is_integer(self.index_col)): + all(map(is_integer, self.index_col)) or + is_integer(self.index_col)): raise ValueError("index_col must only contain row numbers " "when specifying a multi-index header") @@ -1047,7 +1049,7 @@ def _should_parse_dates(self, i): name = self.index_names[i] j = self.index_col[i] - if lib.isscalar(self.parse_dates): + if is_scalar(self.parse_dates): return (j == self.parse_dates) or (name == self.parse_dates) else: return (j in self.parse_dates) or (name in self.parse_dates) @@ -1281,7 +1283,7 @@ def _convert_types(self, values, na_values, try_num_bool=True): mask = lib.ismember(values, na_values) na_count = mask.sum() if na_count > 0: - if com.is_integer_dtype(values): + if is_integer_dtype(values): values = values.astype(np.float64) np.putmask(values, mask, np.nan) return values, na_count @@ -1407,10 +1409,10 @@ def _set_noconvert_columns(self): usecols = self.usecols def _set(x): - if usecols and com.is_integer(x): + if usecols and is_integer(x): x = list(usecols)[x] - if not com.is_integer(x): + if not is_integer(x): x = names.index(x) self._reader.set_noconvert(x) @@ -1790,7 +1792,7 @@ def _set_no_thousands_columns(self): noconvert_columns = set() def _set(x): - if com.is_integer(x): + if is_integer(x): noconvert_columns.add(x) else: noconvert_columns.add(self.columns.index(x)) @@ -1954,7 +1956,7 @@ def _convert_data(self, data): def _to_recarray(self, data, columns): dtypes = [] - o = OrderedDict() + o = compat.OrderedDict() # use the columns to "order" the keys # in the unordered 'data' dictionary @@ -2439,7 +2441,7 @@ def converter(*date_cols): try: return tools._to_datetime( - com._ensure_object(strs), + _ensure_object(strs), utc=None, box=False, dayfirst=dayfirst, @@ -2492,7 +2494,7 @@ def _isindex(colspec): if isinstance(parse_spec, list): # list of column lists for colspec in parse_spec: - if lib.isscalar(colspec): + if is_scalar(colspec): if isinstance(colspec, int) and colspec not in data_dict: colspec = orig_names[colspec] if _isindex(colspec): @@ -2569,7 +2571,7 @@ def _clean_na_values(na_values, keep_default_na=True): (k, _floatify_na_values(v)) for k, v in na_values.items() # noqa ]) else: - if not com.is_list_like(na_values): + if not is_list_like(na_values): na_values = [na_values] na_values = _stringify_na_values(na_values) if keep_default_na: @@ -2622,7 +2624,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): if not isinstance(dtype, dict): dtype = defaultdict(lambda: dtype) # Convert column indexes to column names. 
- dtype = dict((columns[k] if com.is_integer(k) else k, v) + dtype = dict((columns[k] if is_integer(k) else k, v) for k, v in compat.iteritems(dtype)) if index_col is None or index_col is False: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index c19dae7f3545e..2358c296f782e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ import numpy as np from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 -import pandas.core.common as com +from pandas.types.common import is_datetime64_dtype, _NS_DTYPE def to_pickle(obj, path): @@ -86,7 +86,7 @@ def _unpickle_array(bytes): # All datetimes should be stored as M8[ns]. When unpickling with # numpy1.6, it will read these as M8[us]. So this ensures all # datetime64 types are read as MS[ns] - if com.is_datetime64_dtype(arr): - arr = arr.view(com._NS_DTYPE) + if is_datetime64_dtype(arr): + arr = arr.view(_NS_DTYPE) return arr diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d4ca717ddbc4e..038ca7ac7775b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -12,11 +12,21 @@ import warnings import os +from pandas.types.common import (is_list_like, + is_categorical_dtype, + is_timedelta64_dtype, + is_datetime64tz_dtype, + is_datetime64_dtype, + _ensure_object, + _ensure_int64, + _ensure_platform_int) +from pandas.types.missing import array_equivalent + import numpy as np import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, - MultiIndex, Int64Index) + MultiIndex, Int64Index, isnull) from pandas.core import config from pandas.io.common import _stringify_path from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -32,7 +42,6 @@ _block2d_to_blocknd, _factor_indexer, _block_shape) from pandas.core.index import _ensure_index -import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter @@ -1677,7 +1686,7 @@ def validate_metadata(self, handler): new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) if new_metadata is not None and cur_metadata is not None \ - and not com.array_equivalent(new_metadata, cur_metadata): + and not array_equivalent(new_metadata, cur_metadata): raise ValueError("cannot append a categorical with " "different categories to the existing") @@ -2566,7 +2575,7 @@ def write_array(self, key, value, items=None): empty_array = self._is_empty_array(value.shape) transposed = False - if com.is_categorical_dtype(value): + if is_categorical_dtype(value): raise NotImplementedError('Cannot store a category dtype in ' 'a HDF5 dataset that uses format=' '"fixed". 
Use format="table".') @@ -2621,12 +2630,12 @@ def write_array(self, key, value, items=None): if empty_array: self.write_array_empty(key, value) else: - if com.is_datetime64_dtype(value.dtype): + if is_datetime64_dtype(value.dtype): self._handle.create_array( self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'datetime64' - elif com.is_datetime64tz_dtype(value.dtype): + elif is_datetime64tz_dtype(value.dtype): # store as UTC # with a zone self._handle.create_array(self.group, key, @@ -2635,7 +2644,7 @@ def write_array(self, key, value, items=None): node = getattr(self.group, key) node._v_attrs.tz = _get_tz(value.tz) node._v_attrs.value_type = 'datetime64' - elif com.is_timedelta64_dtype(value.dtype): + elif is_timedelta64_dtype(value.dtype): self._handle.create_array( self.group, key, value.view('i8')) getattr( @@ -3756,8 +3765,8 @@ def read(self, where=None, columns=None, **kwargs): if len(unique(key)) == len(key): sorter, _ = algos.groupsort_indexer( - com._ensure_int64(key), np.prod(N)) - sorter = com._ensure_platform_int(sorter) + _ensure_int64(key), np.prod(N)) + sorter = _ensure_platform_int(sorter) # create the objs for c in self.values_axes: @@ -3802,7 +3811,7 @@ def read(self, where=None, columns=None, **kwargs): unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_index = long_index.take(indexer) new_values = lp.values.take(indexer, axis=0) @@ -3903,7 +3912,7 @@ def write_data(self, chunksize, dropna=False): # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask - mask = com.isnull(a.data).all(axis=0) + mask = isnull(a.data).all(axis=0) if isinstance(mask, np.ndarray): masks.append(mask.astype('u1', copy=False)) @@ -4522,7 +4531,7 @@ def _convert_string_array(data, encoding, itemsize=None): # create the sized dtype if itemsize is None: - itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + itemsize = lib.max_len_string_array(_ensure_object(data.ravel())) data = np.asarray(data, dtype="S%d" % itemsize) return data @@ -4551,7 +4560,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - itemsize = lib.max_len_string_array(com._ensure_object(data)) + itemsize = lib.max_len_string_array(_ensure_object(data)) if compat.PY3: dtype = "U{0}".format(itemsize) else: @@ -4619,7 +4628,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.terms = None self.coordinates = None - if com.is_list_like(where): + if is_list_like(where): # see if we have a passed coordinate like try: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 324988360c9fe..8485a3f13f047 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -13,13 +13,15 @@ import numpy as np import pandas.lib as lib -import pandas.core.common as com +from pandas.types.missing import isnull +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.common import (is_list_like, + is_datetime64tz_dtype) + from pandas.compat import (lzip, map, zip, raise_with_traceback, string_types, text_type) from pandas.core.api import DataFrame, Series -from pandas.core.common import isnull from pandas.core.base import PandasObject -from pandas.types.api import DatetimeTZDtype from pandas.tseries.tools import to_datetime from contextlib import contextmanager @@ -90,7 +92,7 @@ def 
_handle_date_column(col, format=None): # parse dates as timestamp format = 's' if format is None else format return to_datetime(col, errors='coerce', unit=format, utc=True) - elif com.is_datetime64tz_dtype(col): + elif is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 return (to_datetime(col, errors='coerce') @@ -123,7 +125,7 @@ def _parse_date_columns(data_frame, parse_dates): # we could in theory do a 'nice' conversion from a FixedOffset tz # GH11216 for col_name, df_col in data_frame.iteritems(): - if com.is_datetime64tz_dtype(df_col): + if is_datetime64tz_dtype(df_col): data_frame[col_name] = _handle_date_column(df_col) return data_frame @@ -876,7 +878,7 @@ def _create_table_setup(self): for name, typ, is_index in column_names_and_types] if self.keys is not None: - if not com.is_list_like(self.keys): + if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys @@ -1465,7 +1467,7 @@ def _create_table_setup(self): for cname, ctype, _ in column_names_and_types] if self.keys is not None and len(self.keys): - if not com.is_list_like(self.keys): + if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c7390cf240f8a..bd19102c7f18c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -14,6 +14,10 @@ import sys import struct from dateutil.relativedelta import relativedelta + +from pandas.types.common import (is_categorical_dtype, is_datetime64_dtype, + _ensure_object) + from pandas.core.base import StringMixin from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame @@ -24,7 +28,7 @@ zip, BytesIO from pandas.util.decorators import Appender import pandas as pd -import pandas.core.common as com + from pandas.io.common import get_filepath_or_buffer, BaseIterator from pandas.lib import max_len_string_array, infer_dtype from pandas.tslib import NaT, Timestamp @@ -358,7 +362,7 @@ def _datetime_to_stata_elapsed_vec(dates, fmt): def parse_dates_safe(dates, delta=False, year=False, days=False): d = {} - if com.is_datetime64_dtype(dates.values): + if is_datetime64_dtype(dates.values): if delta: delta = dates - stata_epoch d['delta'] = delta.values.astype( @@ -396,7 +400,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): index = dates.index if bad_loc.any(): dates = Series(dates) - if com.is_datetime64_dtype(dates): + if is_datetime64_dtype(dates): dates[bad_loc] = to_datetime(stata_epoch) else: dates[bad_loc] = stata_epoch @@ -1746,7 +1750,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? 
- itemsize = max_len_string_array(com._ensure_object(column.values)) + itemsize = max_len_string_array(_ensure_object(column.values)) return chr(max(itemsize, 1)) elif dtype == np.float64: return chr(255) @@ -1784,7 +1788,7 @@ def _dtype_to_default_stata_fmt(dtype, column): if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Writing general object arrays is not supported') - itemsize = max_len_string_array(com._ensure_object(column.values)) + itemsize = max_len_string_array(_ensure_object(column.values)) if itemsize > 244: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" @@ -1880,7 +1884,7 @@ def _prepare_categoricals(self, data): """Check for categorical columns, retain categorical information for Stata file and convert categorical data to int""" - is_cat = [com.is_categorical_dtype(data[col]) for col in data] + is_cat = [is_categorical_dtype(data[col]) for col in data] self._is_col_cat = is_cat self._value_labels = [] if not any(is_cat): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 9a995c17f0445..e5a49c5213a48 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -31,11 +31,12 @@ from datetime import datetime, date, time +from pandas.types.common import (is_object_dtype, is_datetime64_dtype, + is_datetime64tz_dtype) from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types -from pandas.core import common as com from pandas.core.datetools import format as date_format import pandas.io.sql as sql @@ -1275,7 +1276,7 @@ def test_datetime_with_timezone(self): def check(col): # check that a column is either datetime64[ns] # or datetime64[ns, UTC] - if com.is_datetime64_dtype(col.dtype): + if is_datetime64_dtype(col.dtype): # "2000-01-01 00:00:00-08:00" should convert to # "2000-01-01 08:00:00" @@ -1285,7 +1286,7 @@ def check(col): # "2000-06-01 07:00:00" self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00')) - elif com.is_datetime64tz_dtype(col.dtype): + elif is_datetime64tz_dtype(col.dtype): self.assertTrue(str(col.dt.tz) == 'UTC') # "2000-01-01 00:00:00-08:00" should convert to @@ -1311,9 +1312,9 @@ def check(col): # even with the same versions of psycopg2 & sqlalchemy, possibly a # Postgrsql server version difference col = df.DateColWithTz - self.assertTrue(com.is_object_dtype(col.dtype) or - com.is_datetime64_dtype(col.dtype) or - com.is_datetime64tz_dtype(col.dtype), + self.assertTrue(is_object_dtype(col.dtype) or + is_datetime64_dtype(col.dtype) or + is_datetime64tz_dtype(col.dtype), "DateCol loaded with incorrect type -> {0}" .format(col.dtype)) @@ -1327,7 +1328,7 @@ def check(col): self.conn, chunksize=1)), ignore_index=True) col = df.DateColWithTz - self.assertTrue(com.is_datetime64tz_dtype(col.dtype), + self.assertTrue(is_datetime64tz_dtype(col.dtype), "DateCol loaded with incorrect type -> {0}" .format(col.dtype)) self.assertTrue(str(col.dt.tz) == 'UTC') diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 830c68d62efad..5f45d1b547e62 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -15,7 +15,7 @@ import pandas as pd from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series -from pandas.core.common import is_categorical_dtype +from pandas.types.common import is_categorical_dtype 
from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 0312fb023f7fd..35233d1b6ba94 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -15,6 +15,14 @@ from pandas.compat import range from pandas.compat.numpy import function as nv +from pandas.types.generic import ABCSparseArray, ABCSparseSeries +from pandas.types.common import (is_float, is_integer, + is_integer_dtype, _ensure_platform_int, + is_list_like, + is_scalar) +from pandas.types.cast import _possibly_convert_platform +from pandas.types.missing import isnull, notnull + from pandas._sparse import SparseIndex, BlockIndex, IntIndex import pandas._sparse as splib import pandas.index as _index @@ -40,13 +48,13 @@ def wrapper(self, other): if len(self) != len(other): raise AssertionError("length mismatch: %d vs. %d" % (len(self), len(other))) - if not isinstance(other, com.ABCSparseArray): + if not isinstance(other, ABCSparseArray): other = SparseArray(other, fill_value=self.fill_value) if name[0] == 'r': return _sparse_array_op(other, self, op, name[1:]) else: return _sparse_array_op(self, other, op, name) - elif lib.isscalar(other): + elif is_scalar(other): new_fill_value = op(np.float64(self.fill_value), np.float64(other)) return _wrap_result(name, op(self.sp_values, other), @@ -120,7 +128,7 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', if index is not None: if data is None: data = np.nan - if not lib.isscalar(data): + if not is_scalar(data): raise Exception("must only pass scalars with an index ") values = np.empty(len(index), dtype='float64') values.fill(data) @@ -177,7 +185,7 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', @classmethod def _simple_new(cls, data, sp_index, fill_value): - if (com.is_integer_dtype(data) and com.is_float(fill_value) and + if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float @@ -288,7 +296,7 @@ def __getitem__(self, key): """ """ - if com.is_integer(key): + if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): data_slice = self.values[key] @@ -340,11 +348,11 @@ def take(self, indices, axis=0, allow_fill=True, if axis: raise ValueError("axis must be 0, input was {0}".format(axis)) - if com.is_integer(indices): + if is_integer(indices): # return scalar return self[indices] - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) n = len(self) if allow_fill and fill_value is not None: # allow -1 to indicate self.fill_value, @@ -380,7 +388,7 @@ def take(self, indices, axis=0, allow_fill=True, return self._simple_new(new_values, sp_index, self.fill_value) def __setitem__(self, key, value): - # if com.is_integer(key): + # if is_integer(key): # self.values[key] = value # else: # raise Exception("SparseArray does not support seting non-scalars @@ -395,7 +403,7 @@ def __setslice__(self, i, j, value): j = 0 slobj = slice(i, j) # noqa - # if not lib.isscalar(value): + # if not is_scalar(value): # raise Exception("SparseArray does not support seting non-scalars # via slices") @@ -445,12 +453,12 @@ def count(self): @property def _null_fill_value(self): - return com.isnull(self.fill_value) + return isnull(self.fill_value) @property def _valid_sp_values(self): sp_vals = self.sp_values - mask = com.notnull(sp_vals) + mask = 
notnull(sp_vals) return sp_vals[mask] @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) @@ -466,7 +474,7 @@ def fillna(self, value, downcast=None): fill_value=value) else: new_values = self.sp_values.copy() - new_values[com.isnull(new_values)] = value + new_values[isnull(new_values)] = value return self._simple_new(new_values, self.sp_index, fill_value=self.fill_value) @@ -498,7 +506,7 @@ def cumsum(self, axis=0, *args, **kwargs): nv.validate_cumsum(args, kwargs) # TODO: gh-12855 - return a SparseArray here - if com.notnull(self.fill_value): + if notnull(self.fill_value): return self.to_dense().cumsum() # TODO: what if sp_values contains NaN?? @@ -569,7 +577,7 @@ def _maybe_to_dense(obj): def _maybe_to_sparse(array): - if isinstance(array, com.ABCSparseSeries): + if isinstance(array, ABCSparseSeries): array = SparseArray(array.values, sparse_index=array.sp_index, fill_value=array.fill_value, copy=True) if not isinstance(array, SparseArray): @@ -588,15 +596,15 @@ def _sanitize_values(arr): else: # scalar - if lib.isscalar(arr): + if is_scalar(arr): arr = [arr] # ndarray if isinstance(arr, np.ndarray): pass - elif com.is_list_like(arr) and len(arr) > 0: - arr = com._possibly_convert_platform(arr) + elif is_list_like(arr) and len(arr) > 0: + arr = _possibly_convert_platform(arr) else: arr = np.asarray(arr) @@ -624,8 +632,8 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") - if com.isnull(fill_value): - mask = com.notnull(arr) + if isnull(fill_value): + mask = notnull(arr) else: mask = arr != fill_value diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 52a6e6edf0896..811d8019c7fee 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -10,13 +10,15 @@ from pandas import compat import numpy as np +from pandas.types.missing import isnull, notnull +from pandas.types.common import _ensure_platform_int + +from pandas.core.common import _try_sort from pandas.compat.numpy import function as nv -from pandas.core.common import isnull, _try_sort from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) -import pandas.core.common as com import pandas.core.algorithms as algos from pandas.core.internals import (BlockManager, create_block_manager_from_arrays) @@ -520,7 +522,7 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return SparseDataFrame(index=index, columns=self.columns) indexer = self.index.get_indexer(index, method, limit=limit) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) mask = indexer == -1 need_mask = mask.any() @@ -546,7 +548,7 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, if level is not None: raise TypeError('Reindex by level not supported for sparse') - if com.notnull(fill_value): + if notnull(fill_value): raise NotImplementedError("'fill_value' argument is not supported") if limit: diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py index bc10b73a47723..666dae8071053 100644 --- a/pandas/sparse/list.py +++ b/pandas/sparse/list.py @@ -2,9 +2,9 @@ from pandas.core.base import PandasObject from pandas.formats.printing import pprint_thing +from pandas.types.common import is_scalar from pandas.sparse.array import SparseArray import pandas._sparse as splib -import pandas.lib as lib class SparseList(PandasObject): @@ -121,7 +121,7 @@ def append(self, 
value): ---------- value: scalar or array-like """ - if lib.isscalar(value): + if is_scalar(value): value = [value] sparr = SparseArray(value, fill_value=self.fill_value) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 88f396d20a91e..0996cd3bd826a 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -10,6 +10,7 @@ from pandas import compat import numpy as np +from pandas.types.common import is_list_like, is_scalar from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.frame import DataFrame from pandas.core.panel import Panel @@ -18,7 +19,6 @@ import pandas.core.common as com import pandas.core.ops as ops -import pandas.lib as lib class SparsePanelAxis(object): @@ -186,7 +186,7 @@ def _ixs(self, i, axis=0): key = self._get_axis(axis)[i] # xs cannot handle a non-scalar key, so just reindex here - if com.is_list_like(key): + if is_list_like(key): return self.reindex(**{self._get_axis_name(axis): key}) return self.xs(key, axis=axis) @@ -393,7 +393,7 @@ def _combine(self, other, func, axis=0): return self._combineFrame(other, func, axis=axis) elif isinstance(other, Panel): return self._combinePanel(other, func) - elif lib.isscalar(other): + elif is_scalar(other): new_frames = dict((k, func(v, other)) for k, v in self.iteritems()) return self._new_like(new_frames) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 5c7762c56ec6d..951c2ae0c0d5a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -8,8 +8,11 @@ import numpy as np import warnings +from pandas.types.missing import isnull +from pandas.types.common import is_scalar +from pandas.core.common import _values_from_object, _maybe_match_name + from pandas.compat.numpy import function as nv -from pandas.core.common import isnull, _values_from_object, _maybe_match_name from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -18,7 +21,6 @@ import pandas.core.common as com import pandas.core.ops as ops import pandas.index as _index -import pandas.lib as lib from pandas.util.decorators import Appender from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray, @@ -54,7 +56,7 @@ def wrapper(self, other): return _sparse_series_op(self, other, op, name) elif isinstance(other, DataFrame): return NotImplemented - elif lib.isscalar(other): + elif is_scalar(other): if isnull(other) or isnull(self.fill_value): new_fill_value = np.nan else: diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 6780cf311c244..e9563d9168206 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -1,7 +1,8 @@ import numpy as np from pandas import compat -from pandas.core.common import isnull, array_equivalent, is_dtype_equal +from pandas.types.missing import isnull, array_equivalent +from pandas.types.common import is_dtype_equal cdef NUMERIC_TYPES = ( bool, @@ -145,8 +146,15 @@ cpdef assert_almost_equal(a, b, if na != nb: from pandas.util.testing import raise_assert_detail + + # if we have a small diff set, print it + if abs(na-nb) < 10: + r = list(set(a) ^ set(b)) + else: + r = None + raise_assert_detail(obj, '{0} length are different'.format(obj), - na, nb) + na, nb, r) for i in xrange(len(a)): try: diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 46d30ab7fe313..bb475e47206c2 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -6,7 +6,7 @@ import warnings import numpy as np -from pandas import lib +from 
pandas.types.common import is_scalar from pandas.core.api import DataFrame, Series from pandas.util.decorators import Substitution, Appender @@ -226,7 +226,7 @@ def ensure_compat(dispatch, name, arg, func_kw=None, *args, **kwargs): aargs += ',' def f(a, b): - if lib.isscalar(b): + if is_scalar(b): return "{a}={b}".format(a=a, b=b) return "{a}=<{b}>".format(a=a, b=type(b).__name__) aargs = ','.join([f(a, b) for a, b in kwds.items() if b is not None]) diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py index 678689f2d2b30..b533d255bd196 100644 --- a/pandas/stats/ols.py +++ b/pandas/stats/ols.py @@ -13,7 +13,7 @@ from pandas.core.api import DataFrame, Series, isnull from pandas.core.base import StringMixin -from pandas.core.common import _ensure_float64 +from pandas.types.common import _ensure_float64 from pandas.core.index import MultiIndex from pandas.core.panel import Panel from pandas.util.decorators import cache_readonly diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 2b619b84a5994..020b7f1f1ab9d 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -10,7 +10,7 @@ from pandas import (notnull, DataFrame, Series, MultiIndex, date_range, Timestamp, compat) import pandas as pd -import pandas.core.common as com +from pandas.types.dtypes import CategoricalDtype from pandas.util.testing import (assert_series_equal, assert_frame_equal) import pandas.util.testing as tm @@ -45,8 +45,8 @@ def test_apply(self): 'c1': ['C', 'C', 'D', 'D']}) df = df.apply(lambda ts: ts.astype('category')) self.assertEqual(df.shape, (4, 2)) - self.assertTrue(isinstance(df['c0'].dtype, com.CategoricalDtype)) - self.assertTrue(isinstance(df['c1'].dtype, com.CategoricalDtype)) + self.assertTrue(isinstance(df['c0'].dtype, CategoricalDtype)) + self.assertTrue(isinstance(df['c1'].dtype, CategoricalDtype)) def test_apply_mixed_datetimelike(self): # mixed datetimelike diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b42aef9447373..d21db5ba52a45 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,6 +14,7 @@ import numpy.ma as ma import numpy.ma.mrecords as mrecords +from pandas.types.common import is_integer_dtype from pandas.compat import (lmap, long, zip, range, lrange, lzip, OrderedDict, is_platform_little_endian) from pandas import compat @@ -809,7 +810,7 @@ def test_constructor_list_of_lists(self): # GH #484 l = [[1, 'a'], [2, 'b']] df = DataFrame(data=l, columns=["num", "str"]) - self.assertTrue(com.is_integer_dtype(df['num'])) + self.assertTrue(is_integer_dtype(df['num'])) self.assertEqual(df['str'].dtype, np.object_) # GH 4851 diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 5f95ff6b6b601..c650436eefaf3 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,15 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import print_function - from datetime import timedelta import numpy as np - from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, option_context) from pandas.compat import u -from pandas.core import common as com +from pandas.types.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -84,8 +82,8 @@ def test_datetime_with_tz_dtypes(self): tzframe.iloc[1, 2] = pd.NaT result = tzframe.dtypes.sort_index() expected = 
Series([np.dtype('datetime64[ns]'), - com.DatetimeTZDtype('datetime64[ns, US/Eastern]'), - com.DatetimeTZDtype('datetime64[ns, CET]')], + DatetimeTZDtype('datetime64[ns, US/Eastern]'), + DatetimeTZDtype('datetime64[ns, CET]')], ['A', 'B', 'C']) assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index d7fed8131a4f4..578df5ba9101e 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -17,6 +17,9 @@ date_range) import pandas as pd +from pandas.types.common import (is_float_dtype, + is_integer, + is_scalar) from pandas.util.testing import (assert_almost_equal, assert_numpy_array_equal, assert_series_equal, @@ -26,7 +29,6 @@ from pandas.core.indexing import IndexingError import pandas.util.testing as tm -import pandas.lib as lib from pandas.tests.frame.common import TestData @@ -1419,15 +1421,15 @@ def test_setitem_single_column_mixed_datetime(self): # set an allowable datetime64 type from pandas import tslib df.ix['b', 'timestamp'] = tslib.iNaT - self.assertTrue(com.isnull(df.ix['b', 'timestamp'])) + self.assertTrue(isnull(df.ix['b', 'timestamp'])) # allow this syntax df.ix['c', 'timestamp'] = nan - self.assertTrue(com.isnull(df.ix['c', 'timestamp'])) + self.assertTrue(isnull(df.ix['c', 'timestamp'])) # allow this syntax df.ix['d', :] = nan - self.assertTrue(com.isnull(df.ix['c', :]).all() == False) # noqa + self.assertTrue(isnull(df.ix['c', :]).all() == False) # noqa # as of GH 3216 this will now work! # try to set with a list like item @@ -1619,7 +1621,7 @@ def test_set_value_resize(self): res = self.frame.copy() res3 = res.set_value('foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['baz'])) + self.assertTrue(is_float_dtype(res3['baz'])) self.assertTrue(isnull(res3['baz'].drop(['foobar'])).all()) self.assertRaises(ValueError, res3.set_value, 'foobar', 'baz', 'sam') @@ -1662,7 +1664,7 @@ def test_single_element_ix_dont_upcast(self): (int, np.integer))) result = self.frame.ix[self.frame.index[5], 'E'] - self.assertTrue(com.is_integer(result)) + self.assertTrue(is_integer(result)) def test_irow(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2)) @@ -2268,7 +2270,7 @@ def _check_align(df, cond, other, check_dtypes=True): d = df[k].values c = cond[k].reindex(df[k].index).fillna(False).values - if lib.isscalar(other): + if is_scalar(other): o = other else: if isinstance(other, np.ndarray): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a6246790f83cb..44c7f2277293d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -6,6 +6,9 @@ import warnings from datetime import datetime +from pandas.types.common import (is_integer_dtype, + is_float_dtype, + is_scalar) from pandas.compat import range, lrange, lzip, StringIO, lmap, map from pandas.tslib import NaT from numpy import nan @@ -22,7 +25,7 @@ assert_frame_equal, assert_panel_equal, assert_attr_equal, slow) from pandas.formats.printing import pprint_thing -from pandas import concat, lib +from pandas import concat from pandas.core.common import PerformanceWarning import pandas.util.testing as tm @@ -200,7 +203,7 @@ def _print(result, error=None): return try: - if lib.isscalar(rs) and lib.isscalar(xp): + if is_scalar(rs) and is_scalar(xp): self.assertEqual(rs, xp) elif xp.ndim == 1: assert_series_equal(rs, xp) @@ -775,7 +778,7 @@ def test_ix_loc_consistency(self): # this is not an exhaustive case def compare(result, 
expected): - if lib.isscalar(expected): + if is_scalar(expected): self.assertEqual(result, expected) else: self.assertTrue(expected.equals(result)) @@ -2888,8 +2891,8 @@ def test_setitem_dtype_upcast(self): columns=['foo', 'bar', 'baz']) assert_frame_equal(left, right) - self.assertTrue(com.is_integer_dtype(left['foo'])) - self.assertTrue(com.is_integer_dtype(left['baz'])) + self.assertTrue(is_integer_dtype(left['foo'])) + self.assertTrue(is_integer_dtype(left['baz'])) left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, index=list('ab'), @@ -2900,8 +2903,8 @@ def test_setitem_dtype_upcast(self): columns=['foo', 'bar', 'baz']) assert_frame_equal(left, right) - self.assertTrue(com.is_float_dtype(left['foo'])) - self.assertTrue(com.is_float_dtype(left['baz'])) + self.assertTrue(is_float_dtype(left['foo'])) + self.assertTrue(is_float_dtype(left['baz'])) def test_setitem_iloc(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 2a7e8a957977f..b7ec4d570f18b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -8,10 +8,11 @@ import numpy.ma as ma import pandas as pd +from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype from pandas import Index, Series, isnull, date_range, period_range from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp, DatetimeIndex -import pandas.core.common as com + import pandas.lib as lib from pandas.compat import lrange, range, zip, OrderedDict, long @@ -144,11 +145,11 @@ def test_constructor_categorical(self): ValueError, lambda: Series(pd.Categorical([1, 2, 3]), dtype='int64')) cat = Series(pd.Categorical([1, 2, 3]), dtype='category') - self.assertTrue(com.is_categorical_dtype(cat)) - self.assertTrue(com.is_categorical_dtype(cat.dtype)) + self.assertTrue(is_categorical_dtype(cat)) + self.assertTrue(is_categorical_dtype(cat.dtype)) s = Series([1, 2, 3], dtype='category') - self.assertTrue(com.is_categorical_dtype(s)) - self.assertTrue(com.is_categorical_dtype(s.dtype)) + self.assertTrue(is_categorical_dtype(s)) + self.assertTrue(is_categorical_dtype(s.dtype)) def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) @@ -429,7 +430,7 @@ def test_constructor_with_datetime_tz(self): s = Series(dr) self.assertTrue(s.dtype.name == 'datetime64[ns, US/Eastern]') self.assertTrue(s.dtype == 'datetime64[ns, US/Eastern]') - self.assertTrue(com.is_datetime64tz_dtype(s.dtype)) + self.assertTrue(is_datetime64tz_dtype(s.dtype)) self.assertTrue('datetime64[ns, US/Eastern]' in str(s)) # export diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 6e82f81f901a9..c25895548dcb9 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.types.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range) from pandas.tseries.period import PeriodIndex @@ -49,16 +50,16 @@ def test_dt_namespace_accessor(self): def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): - if com.is_integer_dtype(result): + if is_integer_dtype(result): result = result.astype('int64') - elif not com.is_list_like(result): + elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name) def 
compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) - if not (com.is_list_like(a) and com.is_list_like(b)): + if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 15ca238ee32a0..64ebaa63cc10f 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -7,16 +7,14 @@ import numpy as np import pandas as pd +from pandas.types.common import is_integer, is_scalar from pandas import Index, Series, DataFrame, isnull, date_range from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta -import pandas.core.common as com import pandas.core.datetools as datetools -import pandas.lib as lib - from pandas.compat import lrange, range from pandas import compat from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -375,7 +373,7 @@ def test_getitem_ambiguous_keyerror(self): def test_getitem_unordered_dup(self): obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) - self.assertTrue(lib.isscalar(obj['c'])) + self.assertTrue(is_scalar(obj['c'])) self.assertEqual(obj['c'], 0) def test_getitem_dups_with_missing(self): @@ -1174,23 +1172,23 @@ def test_where_numeric_with_string(self): s = pd.Series([1, 2, 3]) w = s.where(s > 1, 'X') - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') w = s.where(s > 1, ['X', 'Y', 'Z']) - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') w = s.where(s > 1, np.array(['X', 'Y', 'Z'])) - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index e0bff7fbd39e4..7d2517987e526 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -7,7 +7,7 @@ from pandas import (Index, Series, _np_version_under1p9) from pandas.tseries.index import Timestamp -import pandas.core.common as com +from pandas.types.common import is_integer import pandas.util.testing as tm from .common import TestData @@ -96,11 +96,11 @@ def test_quantile_interpolation_dtype(self): # interpolation = linear (default case) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) - self.assertTrue(com.is_integer(q)) + self.assertTrue(is_integer(q)) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) - self.assertTrue(com.is_integer(q)) + self.assertTrue(is_integer(q)) def test_quantile_interpolation_np_lt_1p9(self): # GH #10174 diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 
77ae3ca20d123..2721d8d0e5e69 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -9,7 +9,7 @@ import pandas as pd import pandas.compat as compat -import pandas.core.common as com +from pandas.types.common import is_object_dtype, is_datetimetz import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta) @@ -517,7 +517,7 @@ def test_value_counts_unique_nunique(self): continue # special assign to the numpy array - if com.is_datetimetz(o): + if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = pd.tslib.iNaT @@ -982,8 +982,8 @@ def test_memory_usage(self): res = o.memory_usage() res_deep = o.memory_usage(deep=True) - if (com.is_object_dtype(o) or (isinstance(o, Series) and - com.is_object_dtype(o.index))): + if (is_object_dtype(o) or (isinstance(o, Series) and + is_object_dtype(o.index))): # if there are objects, only deep will pick them up self.assertTrue(res_deep > res) else: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 90876a4541da6..2ca1fc71df20a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -8,12 +8,17 @@ import numpy as np +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (is_categorical_dtype, + is_object_dtype, + is_float_dtype, + is_integer_dtype) + import pandas as pd import pandas.compat as compat -import pandas.core.common as com import pandas.util.testing as tm from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, - Timestamp, CategoricalIndex) + Timestamp, CategoricalIndex, isnull) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -195,18 +200,18 @@ def f(): # This should result in integer categories, not float! cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.categories)) + self.assertTrue(is_integer_dtype(cat.categories)) # https://github.com/pydata/pandas/issues/3678 cat = pd.Categorical([np.nan, 1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.categories)) + self.assertTrue(is_integer_dtype(cat.categories)) # this should result in floats cat = pd.Categorical([np.nan, 1, 2., 3]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) cat = pd.Categorical([np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) # Deprecating NaNs in categoires (GH #10748) # preserve int as far as possible by converting to object if NaN is in @@ -214,23 +219,23 @@ def f(): with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) - self.assertTrue(com.is_object_dtype(cat.categories)) + self.assertTrue(is_object_dtype(cat.categories)) # This doesn't work -> this would probably need some kind of "remember # the original type" feature to try to cast the array interface result # to... 
# vals = np.asarray(cat[cat.notnull()]) - # self.assertTrue(com.is_integer_dtype(vals)) + # self.assertTrue(is_integer_dtype(vals)) with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) - self.assertTrue(com.is_object_dtype(cat.categories)) + self.assertTrue(is_object_dtype(cat.categories)) # but don't do it for floats with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) # corner cases cat = pd.Categorical([1]) @@ -552,7 +557,7 @@ def test_na_flags_int_categories(self): cat = Categorical(labels, categories, fastpath=True) repr(cat) - self.assert_numpy_array_equal(com.isnull(cat), labels == -1) + self.assert_numpy_array_equal(isnull(cat), labels == -1) def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', @@ -2076,15 +2081,15 @@ def test_assignment_to_dataframe(self): result = df.dtypes expected = Series( - [np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D']) + [np.dtype('int32'), CategoricalDtype()], index=['value', 'D']) tm.assert_series_equal(result, expected) df['E'] = s str(df) result = df.dtypes - expected = Series([np.dtype('int32'), com.CategoricalDtype(), - com.CategoricalDtype()], + expected = Series([np.dtype('int32'), CategoricalDtype(), + CategoricalDtype()], index=['value', 'D', 'E']) tm.assert_series_equal(result, expected) @@ -3234,7 +3239,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.iloc[2:4, :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.iloc[2, :] @@ -3244,7 +3249,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.iloc[2, 0] @@ -3254,7 +3259,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.loc["j", :] @@ -3264,7 +3269,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.loc["j", "cats"] @@ -3275,7 +3280,7 @@ def test_slicing_and_getting_ops(self): # res_df = df.ix["j":"k",[0,1]] # doesn't work? 
res_df = df.ix["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.ix["j", :] @@ -3285,7 +3290,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.ix[:, "cats"] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.ix["j", 0] @@ -3318,23 +3323,23 @@ def test_slicing_and_getting_ops(self): res_df = df.iloc[slice(2, 4)] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_df = df.iloc[[2, 3]] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) res_df = df.iloc[:, slice(0, 2)] tm.assert_frame_equal(res_df, df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_df = df.iloc[:, [0, 1]] tm.assert_frame_equal(res_df, df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) def test_slicing_doc_examples(self): @@ -4114,7 +4119,7 @@ def test_astype_to_other(self): s = self.cat['value_group'] expected = s tm.assert_series_equal(s.astype('category'), expected) - tm.assert_series_equal(s.astype(com.CategoricalDtype()), expected) + tm.assert_series_equal(s.astype(CategoricalDtype()), expected) self.assertRaises(ValueError, lambda: s.astype('float64')) cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) @@ -4139,10 +4144,10 @@ def cmp(a, b): # valid conversion for valid in [lambda x: x.astype('category'), - lambda x: x.astype(com.CategoricalDtype()), + lambda x: x.astype(CategoricalDtype()), lambda x: x.astype('object').astype('category'), lambda x: x.astype('object').astype( - com.CategoricalDtype()) + CategoricalDtype()) ]: result = valid(s) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 56b1b542d547e..09dd3f7ab517c 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,21 +1,12 @@ # -*- coding: utf-8 -*- -import collections -from datetime import datetime, timedelta -import re import nose import numpy as np -import pandas as pd -from pandas.tslib import iNaT, NaT -from pandas import (Series, DataFrame, date_range, DatetimeIndex, - TimedeltaIndex, Timestamp, Float64Index) -from pandas import compat -from pandas.compat import range, lrange, lmap, u -from pandas.core.common import notnull, isnull, array_equivalent + +from pandas import Series, Timestamp +from pandas.compat import range, lmap import pandas.core.common as com -import pandas.core.convert as convert import pandas.util.testing as tm -import pandas.core.config as cf _multiprocess_can_split_ = True @@ -28,22 +19,6 @@ def test_mut_exclusive(): assert com._mut_exclusive(major=None, major_axis=None) is None -def test_is_sequence(): - is_seq = com.is_sequence - assert (is_seq((1, 2))) - assert (is_seq([1, 2])) - assert (not is_seq("abcd")) - assert (not is_seq(u("abcd"))) - assert (not is_seq(np.int64)) - - class A(object): - - def __getitem__(self): - return 1 - - assert (not is_seq(A())) - - def test_get_callable_name(): 
from functools import partial getname = com._get_callable_name @@ -68,407 +43,6 @@ def __call__(self): assert getname(1) is None -class TestInferDtype(tm.TestCase): - - def test_infer_dtype_from_scalar(self): - # Test that _infer_dtype_from_scalar is returning correct dtype for int - # and float. - - for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, - np.int32, np.uint64, np.int64]: - data = dtypec(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, type(data)) - - data = 12 - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.int64) - - for dtypec in [np.float16, np.float32, np.float64]: - data = dtypec(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, dtypec) - - data = np.float(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.float64) - - for data in [True, False]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.bool_) - - for data in [np.complex64(1), np.complex128(1)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.complex_) - - import datetime - for data in [np.datetime64(1, 'ns'), pd.Timestamp(1), - datetime.datetime(2000, 1, 1, 0, 0)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'M8[ns]') - - for data in [np.timedelta64(1, 'ns'), pd.Timedelta(1), - datetime.timedelta(1)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'm8[ns]') - - for data in [datetime.date(2000, 1, 1), - pd.Timestamp(1, tz='US/Eastern'), 'foo']: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.object_) - - -def test_notnull(): - assert notnull(1.) - assert not notnull(None) - assert not notnull(np.NaN) - - with cf.option_context("mode.use_inf_as_null", False): - assert notnull(np.inf) - assert notnull(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notnull(arr) - assert result.all() - - with cf.option_context("mode.use_inf_as_null", True): - assert not notnull(np.inf) - assert not notnull(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notnull(arr) - assert result.sum() == 2 - - with cf.option_context("mode.use_inf_as_null", False): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - -def test_isnull(): - assert not isnull(1.) 
- assert isnull(None) - assert isnull(np.NaN) - assert not isnull(np.inf) - assert not isnull(-np.inf) - - # series - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - # frame - for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), - tm.makeMixedDataFrame()]: - result = isnull(df) - expected = df.apply(isnull) - tm.assert_frame_equal(result, expected) - - # panel - for p in [tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) - ]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel_equal(result, expected) - - # panel 4d - for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel4d_equal(result, expected) - - -def test_isnull_lists(): - result = isnull([[False]]) - exp = np.array([[False]]) - assert (np.array_equal(result, exp)) - - result = isnull([[1], [2]]) - exp = np.array([[False], [False]]) - assert (np.array_equal(result, exp)) - - # list of strings / unicode - result = isnull(['foo', 'bar']) - assert (not result.any()) - - result = isnull([u('foo'), u('bar')]) - assert (not result.any()) - - -def test_isnull_nat(): - result = isnull([NaT]) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - result = isnull(np.array([NaT], dtype=object)) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - -def test_isnull_numpy_nat(): - arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), - np.datetime64('NaT', 's')]) - result = isnull(arr) - expected = np.array([True] * 4) - tm.assert_numpy_array_equal(result, expected) - - -def test_isnull_datetime(): - assert (not isnull(datetime.now())) - assert notnull(datetime.now()) - - idx = date_range('1/1/1990', periods=20) - assert (notnull(idx).all()) - - idx = np.asarray(idx) - idx[0] = iNaT - idx = DatetimeIndex(idx) - mask = isnull(idx) - assert (mask[0]) - assert (not mask[1:].any()) - - # GH 9129 - pidx = idx.to_period(freq='M') - mask = isnull(pidx) - assert (mask[0]) - assert (not mask[1:].any()) - - mask = isnull(pidx[1:]) - assert (not mask.any()) - - -class TestIsNull(tm.TestCase): - - def test_0d_array(self): - self.assertTrue(isnull(np.array(np.nan))) - self.assertFalse(isnull(np.array(0.0))) - self.assertFalse(isnull(np.array(0))) - # test object dtype - self.assertTrue(isnull(np.array(np.nan, dtype=object))) - self.assertFalse(isnull(np.array(0.0, dtype=object))) - self.assertFalse(isnull(np.array(0, dtype=object))) - - -class TestNumberScalar(tm.TestCase): - - def test_is_number(self): - - self.assertTrue(com.is_number(True)) - self.assertTrue(com.is_number(1)) - self.assertTrue(com.is_number(1.1)) - self.assertTrue(com.is_number(1 + 3j)) - self.assertTrue(com.is_number(np.bool(False))) - self.assertTrue(com.is_number(np.int64(1))) - self.assertTrue(com.is_number(np.float64(1.1))) - self.assertTrue(com.is_number(np.complex128(1 + 3j))) - self.assertTrue(com.is_number(np.nan)) - - self.assertFalse(com.is_number(None)) - self.assertFalse(com.is_number('x')) - self.assertFalse(com.is_number(datetime(2011, 1, 1))) - self.assertFalse(com.is_number(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_number(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_number(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_number(timedelta(1000))) - self.assertFalse(com.is_number(pd.Timedelta('1 days'))) - - # questionable - 
self.assertFalse(com.is_number(np.bool_(False))) - self.assertTrue(com.is_number(np.timedelta64(1, 'D'))) - - def test_is_bool(self): - self.assertTrue(com.is_bool(True)) - self.assertTrue(com.is_bool(np.bool(False))) - self.assertTrue(com.is_bool(np.bool_(False))) - - self.assertFalse(com.is_bool(1)) - self.assertFalse(com.is_bool(1.1)) - self.assertFalse(com.is_bool(1 + 3j)) - self.assertFalse(com.is_bool(np.int64(1))) - self.assertFalse(com.is_bool(np.float64(1.1))) - self.assertFalse(com.is_bool(np.complex128(1 + 3j))) - self.assertFalse(com.is_bool(np.nan)) - self.assertFalse(com.is_bool(None)) - self.assertFalse(com.is_bool('x')) - self.assertFalse(com.is_bool(datetime(2011, 1, 1))) - self.assertFalse(com.is_bool(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_bool(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_bool(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_bool(timedelta(1000))) - self.assertFalse(com.is_bool(np.timedelta64(1, 'D'))) - self.assertFalse(com.is_bool(pd.Timedelta('1 days'))) - - def test_is_integer(self): - self.assertTrue(com.is_integer(1)) - self.assertTrue(com.is_integer(np.int64(1))) - - self.assertFalse(com.is_integer(True)) - self.assertFalse(com.is_integer(1.1)) - self.assertFalse(com.is_integer(1 + 3j)) - self.assertFalse(com.is_integer(np.bool(False))) - self.assertFalse(com.is_integer(np.bool_(False))) - self.assertFalse(com.is_integer(np.float64(1.1))) - self.assertFalse(com.is_integer(np.complex128(1 + 3j))) - self.assertFalse(com.is_integer(np.nan)) - self.assertFalse(com.is_integer(None)) - self.assertFalse(com.is_integer('x')) - self.assertFalse(com.is_integer(datetime(2011, 1, 1))) - self.assertFalse(com.is_integer(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_integer(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_integer(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_integer(timedelta(1000))) - self.assertFalse(com.is_integer(pd.Timedelta('1 days'))) - - # questionable - self.assertTrue(com.is_integer(np.timedelta64(1, 'D'))) - - def test_is_float(self): - self.assertTrue(com.is_float(1.1)) - self.assertTrue(com.is_float(np.float64(1.1))) - self.assertTrue(com.is_float(np.nan)) - - self.assertFalse(com.is_float(True)) - self.assertFalse(com.is_float(1)) - self.assertFalse(com.is_float(1 + 3j)) - self.assertFalse(com.is_float(np.bool(False))) - self.assertFalse(com.is_float(np.bool_(False))) - self.assertFalse(com.is_float(np.int64(1))) - self.assertFalse(com.is_float(np.complex128(1 + 3j))) - self.assertFalse(com.is_float(None)) - self.assertFalse(com.is_float('x')) - self.assertFalse(com.is_float(datetime(2011, 1, 1))) - self.assertFalse(com.is_float(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_float(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_float(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_float(timedelta(1000))) - self.assertFalse(com.is_float(np.timedelta64(1, 'D'))) - self.assertFalse(com.is_float(pd.Timedelta('1 days'))) - - -def test_downcast_conv(): - # test downcasting - - arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) - result = com._possibly_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) - - arr = np.array([8., 8., 8., 8., 8.9999999999995]) - result = com._possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - arr = np.array([8., 8., 8., 8., 9.0000000000005]) - result = 
com._possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - # conversions - - expected = np.array([1, 2]) - for dtype in [np.float64, object, np.int64]: - arr = np.array([1.0, 2.0], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected, check_dtype=False) - - for dtype in [np.float64, object]: - expected = np.array([1.0, 2.0, np.nan], dtype=dtype) - arr = np.array([1.0, 2.0, np.nan], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected) - - # empties - for dtype in [np.int32, np.float64, np.float32, np.bool_, - np.int64, object]: - arr = np.array([], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'int64') - tm.assert_almost_equal(result, np.array([], dtype=np.int64)) - assert result.dtype == np.int64 - - -def test_array_equivalent(): - assert array_equivalent(np.array([np.nan, np.nan]), - np.array([np.nan, np.nan])) - assert array_equivalent(np.array([np.nan, 1, np.nan]), - np.array([np.nan, 1, np.nan])) - assert array_equivalent(np.array([np.nan, None], dtype='object'), - np.array([np.nan, None], dtype='object')) - assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), - np.array([np.nan, 1 + 1j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1 + 1j], dtype='complex'), np.array( - [np.nan, 1 + 2j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) - assert not array_equivalent( - np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - assert array_equivalent(Float64Index([0, np.nan]), - Float64Index([0, np.nan])) - assert not array_equivalent( - Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), - DatetimeIndex([0, np.nan])) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - assert array_equivalent(TimedeltaIndex([0, np.nan]), - TimedeltaIndex([0, np.nan])) - assert not array_equivalent( - TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), - DatetimeIndex([0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( - [1, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) - - -def test_array_equivalent_str(): - for dtype in ['O', 'S', 'U']: - assert array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'B'], dtype=dtype)) - assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'X'], dtype=dtype)) - - -def test_datetimeindex_from_empty_datetime64_array(): - for unit in ['ms', 'us', 'ns']: - idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) - assert (len(idx) == 0) - - -def test_nan_to_nat_conversions(): - - df = DataFrame(dict({ - 'A': np.asarray( - lrange(10), dtype='float64'), - 'B': Timestamp('20010101') - })) - df.iloc[3:6, :] = np.nan - result = df.loc[4, 'B'].value - assert (result == iNaT) - - s = df['B'].copy() - s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert (isnull(s[8])) - 
- # numpy < 1.7.0 is wrong - from distutils.version import LooseVersion - if LooseVersion(np.__version__) >= '1.7.0': - assert (s[8].value == np.datetime64('NaT').astype(np.int64)) - - def test_any_none(): assert (com._any_none(1, 2, 3, None)) assert (not com._any_none(1, 2, 3, 4)) @@ -567,122 +141,6 @@ def test_groupby(): assert v == expected[k] -def test_is_list_like(): - passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), - Series([]), Series(['a']).str) - fails = (1, '2', object()) - - for p in passes: - assert com.is_list_like(p) - - for f in fails: - assert not com.is_list_like(f) - - -def test_is_dict_like(): - passes = [{}, {'A': 1}, pd.Series([1])] - fails = ['1', 1, [1, 2], (1, 2), range(2), pd.Index([1])] - - for p in passes: - assert com.is_dict_like(p) - - for f in fails: - assert not com.is_dict_like(f) - - -def test_is_named_tuple(): - passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) - fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) - - for p in passes: - assert com.is_named_tuple(p) - - for f in fails: - assert not com.is_named_tuple(f) - - -def test_is_hashable(): - - # all new-style classes are hashable by default - class HashableClass(object): - pass - - class UnhashableClass1(object): - __hash__ = None - - class UnhashableClass2(object): - - def __hash__(self): - raise TypeError("Not hashable") - - hashable = (1, - 3.14, - np.float64(3.14), - 'a', - tuple(), - (1, ), - HashableClass(), ) - not_hashable = ([], UnhashableClass1(), ) - abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) - - for i in hashable: - assert com.is_hashable(i) - for i in not_hashable: - assert not com.is_hashable(i) - for i in abc_hashable_not_really_hashable: - assert not com.is_hashable(i) - - # numpy.array is no longer collections.Hashable as of - # https://github.com/numpy/numpy/pull/5326, just test - # pandas.common.is_hashable() - assert not com.is_hashable(np.array([])) - - # old-style classes in Python 2 don't appear hashable to - # collections.Hashable but also seem to support hash() by default - if compat.PY2: - - class OldStyleClass(): - pass - - c = OldStyleClass() - assert not isinstance(c, collections.Hashable) - assert com.is_hashable(c) - hash(c) # this will not raise - - -def test_ensure_int32(): - values = np.arange(10, dtype=np.int32) - result = com._ensure_int32(values) - assert (result.dtype == np.int32) - - values = np.arange(10, dtype=np.int64) - result = com._ensure_int32(values) - assert (result.dtype == np.int32) - - -def test_is_re(): - passes = re.compile('ad'), - fails = 'x', 2, 3, object() - - for p in passes: - assert com.is_re(p) - - for f in fails: - assert not com.is_re(f) - - -def test_is_recompilable(): - passes = (r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'), - re.compile(r'')) - fails = 1, [], object() - - for p in passes: - assert com.is_re_compilable(p) - - for f in fails: - assert not com.is_re_compilable(f) - - def test_random_state(): import numpy.random as npr # Check with seed @@ -730,83 +188,6 @@ def test_maybe_match_name(): assert (matched == 'y') -class TestMaybe(tm.TestCase): - - def test_maybe_convert_string_to_array(self): - result = com._maybe_convert_string_to_object('x') - tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) - self.assertTrue(result.dtype == object) - - result = com._maybe_convert_string_to_object(1) - self.assertEqual(result, 1) - - arr = np.array(['x', 'y'], dtype=str) - result = com._maybe_convert_string_to_object(arr) - 
tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) - self.assertTrue(result.dtype == object) - - # unicode - arr = np.array(['x', 'y']).astype('U') - result = com._maybe_convert_string_to_object(arr) - tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) - self.assertTrue(result.dtype == object) - - # object - arr = np.array(['x', 2], dtype=object) - result = com._maybe_convert_string_to_object(arr) - tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) - self.assertTrue(result.dtype == object) - - def test_maybe_convert_scalar(self): - - # pass thru - result = com._maybe_convert_scalar('x') - self.assertEqual(result, 'x') - result = com._maybe_convert_scalar(np.array([1])) - self.assertEqual(result, np.array([1])) - - # leave scalar dtype - result = com._maybe_convert_scalar(np.int64(1)) - self.assertEqual(result, np.int64(1)) - result = com._maybe_convert_scalar(np.int32(1)) - self.assertEqual(result, np.int32(1)) - result = com._maybe_convert_scalar(np.float32(1)) - self.assertEqual(result, np.float32(1)) - result = com._maybe_convert_scalar(np.int64(1)) - self.assertEqual(result, np.float64(1)) - - # coerce - result = com._maybe_convert_scalar(1) - self.assertEqual(result, np.int64(1)) - result = com._maybe_convert_scalar(1.0) - self.assertEqual(result, np.float64(1)) - result = com._maybe_convert_scalar(pd.Timestamp('20130101')) - self.assertEqual(result, pd.Timestamp('20130101').value) - result = com._maybe_convert_scalar(datetime(2013, 1, 1)) - self.assertEqual(result, pd.Timestamp('20130101').value) - result = com._maybe_convert_scalar(pd.Timedelta('1 day 1 min')) - self.assertEqual(result, pd.Timedelta('1 day 1 min').value) - - -class TestConvert(tm.TestCase): - - def test_possibly_convert_objects_copy(self): - values = np.array([1, 2]) - - out = convert._possibly_convert_objects(values, copy=False) - self.assertTrue(values is out) - - out = convert._possibly_convert_objects(values, copy=True) - self.assertTrue(values is not out) - - values = np.array(['apply', 'banana']) - out = convert._possibly_convert_objects(values, copy=False) - self.assertTrue(values is out) - - out = convert._possibly_convert_objects(values, copy=True) - self.assertTrue(values is not out) - - def test_dict_compat(): data_datetime64 = {np.datetime64('1990-03-15'): 1, np.datetime64('2015-03-15'): 2} @@ -817,39 +198,6 @@ def test_dict_compat(): assert (com._dict_compat(data_unchanged) == data_unchanged) -def test_is_timedelta(): - assert (com.is_timedelta64_dtype('timedelta64')) - assert (com.is_timedelta64_dtype('timedelta64[ns]')) - assert (not com.is_timedelta64_ns_dtype('timedelta64')) - assert (com.is_timedelta64_ns_dtype('timedelta64[ns]')) - - tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') - assert (com.is_timedelta64_dtype(tdi)) - assert (com.is_timedelta64_ns_dtype(tdi)) - assert (com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) - # Conversion to Int64Index: - assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) - assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) - - -def test_array_equivalent_compat(): - # see gh-13388 - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - assert (com.array_equivalent(m, n, strict_nan=True)) - assert (com.array_equivalent(m, n, strict_nan=False)) - - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', 
float)]) - assert (not com.array_equivalent(m, n, strict_nan=True)) - assert (not com.array_equivalent(m, n, strict_nan=False)) - - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) - assert (not com.array_equivalent(m, n, strict_nan=True)) - assert (not com.array_equivalent(m, n, strict_nan=False)) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 2f4c2b414cc30..a53e79439b017 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -7,12 +7,12 @@ from numpy import nan import pandas as pd +from pandas.types.common import is_scalar from pandas import (Index, Series, DataFrame, Panel, isnull, date_range, period_range, Panel4D) from pandas.core.index import MultiIndex import pandas.formats.printing as printing -import pandas.lib as lib from pandas.compat import range, zip, PY3 from pandas import compat @@ -53,7 +53,7 @@ def _construct(self, shape, value=None, dtype=None, **kwargs): if isinstance(shape, int): shape = tuple([shape] * self._ndim) if value is not None: - if lib.isscalar(value): + if is_scalar(value): if value == 'empty': arr = None diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index bd19a83ce2b64..3a5b0117948b7 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -9,6 +9,7 @@ from datetime import datetime, date +from pandas.types.common import is_list_like import pandas as pd from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range, bdate_range) @@ -16,7 +17,6 @@ iteritems, OrderedDict, PY3) from pandas.util.decorators import cache_readonly from pandas.formats.printing import pprint_thing -import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_is_valid_plot_return_object, slow) @@ -157,7 +157,7 @@ def _check_visible(self, collections, visible=True): """ from matplotlib.collections import Collection if not isinstance(collections, - Collection) and not com.is_list_like(collections): + Collection) and not is_list_like(collections): collections = [collections] for patch in collections: @@ -242,7 +242,7 @@ def _check_text_labels(self, texts, expected): expected : str or list-like which has the same length as texts expected text label, or its list """ - if not com.is_list_like(texts): + if not is_list_like(texts): self.assertEqual(texts.get_text(), expected) else: labels = [t.get_text() for t in texts] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a52f22fe2032a..57d43f22757ea 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -5,7 +5,8 @@ from datetime import datetime from numpy import nan -from pandas import date_range, bdate_range, Timestamp +from pandas.types.common import _ensure_platform_int +from pandas import date_range, bdate_range, Timestamp, isnull from pandas.core.index import Index, MultiIndex, CategoricalIndex from pandas.core.api import Categorical, DataFrame from pandas.core.common import UnsupportedFunctionCall @@ -163,9 +164,9 @@ def test_first_last_nth(self): grouped['B'].nth(0) self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan - self.assertTrue(com.isnull(grouped['B'].first()['foo'])) - self.assertTrue(com.isnull(grouped['B'].last()['foo'])) - self.assertTrue(com.isnull(grouped['B'].nth(0)['foo'])) + 
self.assertTrue(isnull(grouped['B'].first()['foo'])) + self.assertTrue(isnull(grouped['B'].last()['foo'])) + self.assertTrue(isnull(grouped['B'].nth(0)['foo'])) # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) @@ -1079,8 +1080,9 @@ def test_transform_fast(self): grp = df.groupby('id')['val'] values = np.repeat(grp.mean().values, - com._ensure_platform_int(grp.count().values)) + _ensure_platform_int(grp.count().values)) expected = pd.Series(values, index=df.index, name='val') + result = grp.transform(np.mean) assert_series_equal(result, expected) diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py deleted file mode 100644 index 5f016322f101f..0000000000000 --- a/pandas/tests/test_infer_and_convert.py +++ /dev/null @@ -1,653 +0,0 @@ -# -*- coding: utf-8 -*- - -from datetime import datetime, timedelta, date, time - -import numpy as np -import pandas as pd -import pandas.lib as lib -import pandas.util.testing as tm -from pandas import Index - -from pandas.compat import long, u, PY2 - - -class TestInference(tm.TestCase): - - def test_infer_dtype_bytes(self): - compare = 'string' if PY2 else 'bytes' - - # string array of bytes - arr = np.array(list('abc'), dtype='S1') - self.assertEqual(pd.lib.infer_dtype(arr), compare) - - # object array of bytes - arr = arr.astype(object) - self.assertEqual(pd.lib.infer_dtype(arr), compare) - - def test_isinf_scalar(self): - # GH 11352 - self.assertTrue(lib.isposinf_scalar(float('inf'))) - self.assertTrue(lib.isposinf_scalar(np.inf)) - self.assertFalse(lib.isposinf_scalar(-np.inf)) - self.assertFalse(lib.isposinf_scalar(1)) - self.assertFalse(lib.isposinf_scalar('a')) - - self.assertTrue(lib.isneginf_scalar(float('-inf'))) - self.assertTrue(lib.isneginf_scalar(-np.inf)) - self.assertFalse(lib.isneginf_scalar(np.inf)) - self.assertFalse(lib.isneginf_scalar(1)) - self.assertFalse(lib.isneginf_scalar('a')) - - def test_maybe_convert_numeric_infinities(self): - # see gh-13274 - infinities = ['inf', 'inF', 'iNf', 'Inf', - 'iNF', 'InF', 'INf', 'INF'] - na_values = set(['', 'NULL', 'nan']) - - pos = np.array(['inf'], dtype=np.float64) - neg = np.array(['-inf'], dtype=np.float64) - - msg = "Unable to parse string" - - for infinity in infinities: - for maybe_int in (True, False): - out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['-' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, neg) - - out = lib.maybe_convert_numeric( - np.array([u(infinity)], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['+' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - # too many characters - with tm.assertRaisesRegexp(ValueError, msg): - lib.maybe_convert_numeric( - np.array(['foo_' + infinity], dtype=object), - na_values, maybe_int) - - def test_maybe_convert_numeric_post_floatify_nan(self): - # see gh-13314 - data = np.array(['1.200', '-999.000', '4.500'], dtype=object) - expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) - nan_values = set([-999, -999.0]) - - for coerce_type in (True, False): - out = lib.maybe_convert_numeric(data, nan_values, coerce_type) - tm.assert_numpy_array_equal(out, expected) - - def test_convert_infs(self): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') - result = 
lib.maybe_convert_numeric(arr, set(), False) - self.assertTrue(result.dtype == np.float64) - - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - self.assertTrue(result.dtype == np.float64) - - def test_scientific_no_exponent(self): - # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False, True) - self.assertTrue(np.all(np.isnan(result))) - - def test_convert_non_hashable(self): - # GH13324 - # make sure that we are handing non-hashables - arr = np.array([[10.0, 2], 1.0, 'apple']) - result = lib.maybe_convert_numeric(arr, set(), False, True) - tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) - - -class TestTypeInference(tm.TestCase): - _multiprocess_can_split_ = True - - def test_length_zero(self): - result = lib.infer_dtype(np.array([], dtype='i4')) - self.assertEqual(result, 'integer') - - result = lib.infer_dtype([]) - self.assertEqual(result, 'empty') - - def test_integers(self): - arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - def test_string(self): - pass - - def test_unicode(self): - pass - - def test_datetime(self): - - dates = [datetime(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'datetime64') - - def test_infer_dtype_datetime(self): - - arr = np.array([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([np.datetime64('2011-01-01'), - np.datetime64('2011-01-01')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # starts with nan - for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([n, np.datetime64('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([n, datetime(2011, 1, 1)]) - 
self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([n, pd.Timestamp('2011-01-02'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([n, np.datetime64('2011-01-02'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([n, datetime(2011, 1, 1), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # different type of nat - arr = np.array([np.timedelta64('nat'), - np.datetime64('2011-01-02')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.datetime64('2011-01-02'), - np.timedelta64('nat')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - # mixed datetime - arr = np.array([datetime(2011, 1, 1), - pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # should be datetime? - arr = np.array([np.datetime64('2011-01-01'), - pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([pd.Timestamp('2011-01-02'), - np.datetime64('2011-01-01')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed-integer') - - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - def test_infer_dtype_timedelta(self): - - arr = np.array([pd.Timedelta('1 days'), - pd.Timedelta('2 days')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([np.timedelta64(1, 'D'), - np.timedelta64(2, 'D')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([timedelta(1), timedelta(2)]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - # starts with nan - for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Timedelta('1 days')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, np.timedelta64(1, 'D')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, timedelta(1)]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, pd.Timedelta('1 days'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, np.timedelta64(1, 'D'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([n, timedelta(1), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - # different type of nat - arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], - dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], - dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - def test_infer_dtype_all_nan_nat_like(self): - arr = np.array([np.nan, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'floating') - - # nan and None mix are result in mixed - arr = np.array([np.nan, np.nan, None]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([None, np.nan, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - # pd.NaT - arr = np.array([pd.NaT]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([pd.NaT, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([np.nan, pd.NaT]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = 
np.array([np.nan, pd.NaT, np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - arr = np.array([None, pd.NaT, None]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') - - # np.datetime64(nat) - arr = np.array([np.datetime64('nat')]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.datetime64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') - - arr = np.array([np.timedelta64('nat')], dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.timedelta64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) - self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') - - # datetime / timedelta mixed - arr = np.array([pd.NaT, np.datetime64('nat'), - np.timedelta64('nat'), np.nan]) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], - dtype=object) - self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') - - def test_is_datetimelike_array_all_nan_nat_like(self): - arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) - self.assertTrue(pd.lib.is_datetime_array(arr)) - self.assertTrue(pd.lib.is_datetime64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_array(arr)) - self.assertFalse(pd.lib.is_timedelta64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) - self.assertFalse(pd.lib.is_datetime_array(arr)) - self.assertFalse(pd.lib.is_datetime64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_array(arr)) - self.assertTrue(pd.lib.is_timedelta64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), - np.timedelta64('nat')]) - self.assertFalse(pd.lib.is_datetime_array(arr)) - self.assertFalse(pd.lib.is_datetime64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_array(arr)) - self.assertFalse(pd.lib.is_timedelta64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, pd.NaT]) - self.assertTrue(pd.lib.is_datetime_array(arr)) - self.assertTrue(pd.lib.is_datetime64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_array(arr)) - self.assertTrue(pd.lib.is_timedelta64_array(arr)) - self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - arr = np.array([np.nan, np.nan], dtype=object) - self.assertFalse(pd.lib.is_datetime_array(arr)) - self.assertFalse(pd.lib.is_datetime64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_array(arr)) - self.assertFalse(pd.lib.is_timedelta64_array(arr)) - self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) - - def test_date(self): - - dates = [date(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'date') - - def test_to_object_array_tuples(self): - r = (5, 6) - values = [r] - result = lib.to_object_array_tuples(values) - - try: - # make sure record array works - from collections import namedtuple - record = namedtuple('record', 'x y') - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass - - def test_to_object_array_width(self): - # see gh-13320 - rows = [[1, 2, 3], [4, 5, 6]] - - expected = 
np.array(rows, dtype=object) - out = lib.to_object_array(rows) - tm.assert_numpy_array_equal(out, expected) - - expected = np.array(rows, dtype=object) - out = lib.to_object_array(rows, min_width=1) - tm.assert_numpy_array_equal(out, expected) - - expected = np.array([[1, 2, 3, None, None], - [4, 5, 6, None, None]], dtype=object) - out = lib.to_object_array(rows, min_width=5) - tm.assert_numpy_array_equal(out, expected) - - def test_object(self): - - # GH 7431 - # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - def test_categorical(self): - - # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list('abc')) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - def test_is_period(self): - self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) - self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) - self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) - self.assertFalse(lib.is_period(1)) - self.assertFalse(lib.is_period(np.nan)) - - -class TestConvert(tm.TestCase): - - def test_convert_objects(self): - arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O') - result = lib.maybe_convert_objects(arr) - self.assertTrue(result.dtype == np.object_) - - def test_convert_objects_ints(self): - # test that we can detect many kinds of integers - dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - - for dtype_str in dtypes: - arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') - self.assertTrue(arr[0].dtype == np.dtype(dtype_str)) - result = lib.maybe_convert_objects(arr) - self.assertTrue(issubclass(result.dtype.type, np.integer)) - - def test_convert_objects_complex_number(self): - for dtype in np.sctypes['complex']: - arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') - self.assertTrue(arr[0].dtype == np.dtype(dtype)) - result = lib.maybe_convert_objects(arr) - self.assertTrue(issubclass(result.dtype.type, np.complexfloating)) - - -class Testisscalar(tm.TestCase): - - def test_isscalar_builtin_scalars(self): - self.assertTrue(lib.isscalar(None)) - self.assertTrue(lib.isscalar(True)) - self.assertTrue(lib.isscalar(False)) - self.assertTrue(lib.isscalar(0.)) - self.assertTrue(lib.isscalar(np.nan)) - self.assertTrue(lib.isscalar('foobar')) - self.assertTrue(lib.isscalar(b'foobar')) - self.assertTrue(lib.isscalar(u('efoobar'))) - self.assertTrue(lib.isscalar(datetime(2014, 1, 1))) - self.assertTrue(lib.isscalar(date(2014, 1, 1))) - self.assertTrue(lib.isscalar(time(12, 0))) - self.assertTrue(lib.isscalar(timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.NaT)) - - def test_isscalar_builtin_nonscalars(self): - self.assertFalse(lib.isscalar({})) - self.assertFalse(lib.isscalar([])) - self.assertFalse(lib.isscalar([1])) - self.assertFalse(lib.isscalar(())) - self.assertFalse(lib.isscalar((1, ))) - self.assertFalse(lib.isscalar(slice(None))) - self.assertFalse(lib.isscalar(Ellipsis)) - - def test_isscalar_numpy_array_scalars(self): - self.assertTrue(lib.isscalar(np.int64(1))) - self.assertTrue(lib.isscalar(np.float64(1.))) - self.assertTrue(lib.isscalar(np.int32(1))) 
- self.assertTrue(lib.isscalar(np.object_('foobar'))) - self.assertTrue(lib.isscalar(np.str_('foobar'))) - self.assertTrue(lib.isscalar(np.unicode_(u('foobar')))) - self.assertTrue(lib.isscalar(np.bytes_(b'foobar'))) - self.assertTrue(lib.isscalar(np.datetime64('2014-01-01'))) - self.assertTrue(lib.isscalar(np.timedelta64(1, 'h'))) - - def test_isscalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: - self.assertFalse(lib.isscalar(zerodim)) - self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim))) - - def test_isscalar_numpy_arrays(self): - self.assertFalse(lib.isscalar(np.array([]))) - self.assertFalse(lib.isscalar(np.array([[]]))) - self.assertFalse(lib.isscalar(np.matrix('1; 2'))) - - def test_isscalar_pandas_scalars(self): - self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01'))) - self.assertTrue(lib.isscalar(pd.Timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.Period('2014-01-01'))) - - def test_lisscalar_pandas_containers(self): - self.assertFalse(lib.isscalar(pd.Series())) - self.assertFalse(lib.isscalar(pd.Series([1]))) - self.assertFalse(lib.isscalar(pd.DataFrame())) - self.assertFalse(lib.isscalar(pd.DataFrame([[1]]))) - self.assertFalse(lib.isscalar(pd.Panel())) - self.assertFalse(lib.isscalar(pd.Panel([[[1]]]))) - self.assertFalse(lib.isscalar(pd.Index([]))) - self.assertFalse(lib.isscalar(pd.Index([1]))) - - -class TestParseSQL(tm.TestCase): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - self.assert_numpy_array_equal(result, expected) - self.assert_numpy_array_equal(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - self.assert_numpy_array_equal(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) 
- self.assert_numpy_array_equal(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_downcast_int64(self): - from pandas.parser import na_values - - arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) - expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) - - # default argument - result = lib.downcast_int64(arr, na_values) - self.assert_numpy_array_equal(result, expected) - - result = lib.downcast_int64(arr, na_values, use_unsigned=False) - self.assert_numpy_array_equal(result, expected) - - expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) - result = lib.downcast_int64(arr, na_values, use_unsigned=True) - self.assert_numpy_array_equal(result, expected) - - # still cast to int8 despite use_unsigned=True - # because of the negative number as an element - arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) - expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) - result = lib.downcast_int64(arr, na_values, use_unsigned=True) - self.assert_numpy_array_equal(result, expected) - - arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) - expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) - result = lib.downcast_int64(arr, na_values) - self.assert_numpy_array_equal(result, expected) - - int8_na = na_values[np.int8] - int64_na = na_values[np.int64] - arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) - expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) - result = lib.downcast_int64(arr, na_values) - self.assert_numpy_array_equal(result, expected) - - -if __name__ == '__main__': - import nose - - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 10a6bb5c75b01..84d7226f1b2f5 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -222,6 +222,7 @@ def test_duplicated_with_nas(): expected = trues + trues assert (np.array_equal(result, expected)) + if __name__ == '__main__': import nose diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1b1db90ea713d..f3b0becccf596 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -10,6 +10,7 @@ from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp +from pandas.types.common import is_float_dtype, is_integer_dtype from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assertRaisesRegexp) import pandas.core.common as com @@ -787,8 +788,8 @@ def test_delevel_infer_dtype(self): df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'], index=index) deleveled = df.reset_index() - self.assertTrue(com.is_integer_dtype(deleveled['prm1'])) - self.assertTrue(com.is_float_dtype(deleveled['prm2'])) + self.assertTrue(is_integer_dtype(deleveled['prm1'])) + self.assertTrue(is_float_dtype(deleveled['prm2'])) def test_reset_index_with_drop(self): deleveled = self.ymd.reset_index(drop=True) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 
904bedde03312..eeeddc278c714 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,8 +5,8 @@ import warnings import numpy as np -from pandas import Series -from pandas.core.common import isnull, is_integer_dtype +from pandas import Series, isnull +from pandas.types.common import is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index b1f09ad2685e3..f2e13867d3bf0 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -10,12 +10,13 @@ import numpy as np import pandas as pd +from pandas.types.common import is_float_dtype from pandas import Series, DataFrame, Index, isnull, notnull, pivot, MultiIndex from pandas.core.datetools import bday from pandas.core.nanops import nanall, nanany from pandas.core.panel import Panel from pandas.core.series import remove_na -import pandas.core.common as com + from pandas.formats.printing import pprint_thing from pandas import compat from pandas.compat import range, lrange, StringIO, OrderedDict, signature @@ -903,7 +904,7 @@ def test_set_value(self): self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['ItemE'].values)) + self.assertTrue(is_float_dtype(res3['ItemE'].values)) with tm.assertRaisesRegexp(TypeError, "There must be an argument for each axis" " plus the value provided"): diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 607048df29faa..16a55c7ec4aeb 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -6,12 +6,12 @@ import numpy as np +from pandas.types.common import is_float_dtype from pandas import Series, Index, isnull, notnull from pandas.core.datetools import bday from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D from pandas.core.series import remove_na -import pandas.core.common as com from pandas.util.testing import (assert_panel_equal, assert_panel4d_equal, @@ -595,7 +595,7 @@ def test_set_value(self): self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['l4'].values)) + self.assertTrue(is_float_dtype(res3['l4'].values)) class TestPanel4d(tm.TestCase, CheckIndexing, SafeForSparse, diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 67d171bb8efda..4d23bed620265 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -12,8 +12,7 @@ from pandas.compat import range, u import pandas.compat as compat -from pandas import (Index, Series, DataFrame, isnull, MultiIndex) -import pandas.core.common as com +from pandas import (Index, Series, DataFrame, isnull, MultiIndex, notnull) from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -1350,7 +1349,7 @@ def test_len(self): values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) result = values.str.len() - exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + exp = values.map(lambda x: len(x) if notnull(x) else NA) tm.assert_series_equal(result, exp) # mixed @@ -1368,7 +1367,7 @@ def test_len(self): 'fooooooo')]) result = values.str.len() - exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + exp = values.map(lambda x: len(x) if notnull(x) else NA) tm.assert_series_equal(result, exp) def test_findall(self): diff --git 
a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py new file mode 100644 index 0000000000000..dd3f07ea8157f --- /dev/null +++ b/pandas/tests/types/test_cast.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +""" +These test the private routines in types/cast.py + +""" + + +import nose +from datetime import datetime +import numpy as np + +from pandas import Timedelta, Timestamp +from pandas.types.cast import (_possibly_downcast_to_dtype, + _possibly_convert_objects, + _infer_dtype_from_scalar, + _maybe_convert_string_to_object, + _maybe_convert_scalar) +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +def test_downcast_conv(): + # test downcasting + + arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + assert (np.array_equal(result, arr)) + + arr = np.array([8., 8., 8., 8., 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + arr = np.array([8., 8., 8., 8., 9.0000000000005]) + result = _possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + # conversions + + expected = np.array([1, 2]) + for dtype in [np.float64, object, np.int64]: + arr = np.array([1.0, 2.0], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected, check_dtype=False) + + for dtype in [np.float64, object]: + expected = np.array([1.0, 2.0, np.nan], dtype=dtype) + arr = np.array([1.0, 2.0, np.nan], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected) + + # empties + for dtype in [np.int32, np.float64, np.float32, np.bool_, + np.int64, object]: + arr = np.array([], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'int64') + tm.assert_almost_equal(result, np.array([], dtype=np.int64)) + assert result.dtype == np.int64 + + +class TestInferDtype(tm.TestCase): + + def test_infer_dtype_from_scalar(self): + # Test that _infer_dtype_from_scalar is returning correct dtype for int + # and float. 
+ + for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, + np.int32, np.uint64, np.int64]: + data = dtypec(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, type(data)) + + data = 12 + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.int64) + + for dtypec in [np.float16, np.float32, np.float64]: + data = dtypec(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, dtypec) + + data = np.float(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.float64) + + for data in [True, False]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.bool_) + + for data in [np.complex64(1), np.complex128(1)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.complex_) + + import datetime + for data in [np.datetime64(1, 'ns'), Timestamp(1), + datetime.datetime(2000, 1, 1, 0, 0)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, 'M8[ns]') + + for data in [np.timedelta64(1, 'ns'), Timedelta(1), + datetime.timedelta(1)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, 'm8[ns]') + + for data in [datetime.date(2000, 1, 1), + Timestamp(1, tz='US/Eastern'), 'foo']: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.object_) + + +class TestMaybe(tm.TestCase): + + def test_maybe_convert_string_to_array(self): + result = _maybe_convert_string_to_object('x') + tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) + self.assertTrue(result.dtype == object) + + result = _maybe_convert_string_to_object(1) + self.assertEqual(result, 1) + + arr = np.array(['x', 'y'], dtype=str) + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # unicode + arr = np.array(['x', 'y']).astype('U') + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # object + arr = np.array(['x', 2], dtype=object) + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) + self.assertTrue(result.dtype == object) + + def test_maybe_convert_scalar(self): + + # pass thru + result = _maybe_convert_scalar('x') + self.assertEqual(result, 'x') + result = _maybe_convert_scalar(np.array([1])) + self.assertEqual(result, np.array([1])) + + # leave scalar dtype + result = _maybe_convert_scalar(np.int64(1)) + self.assertEqual(result, np.int64(1)) + result = _maybe_convert_scalar(np.int32(1)) + self.assertEqual(result, np.int32(1)) + result = _maybe_convert_scalar(np.float32(1)) + self.assertEqual(result, np.float32(1)) + result = _maybe_convert_scalar(np.int64(1)) + self.assertEqual(result, np.float64(1)) + + # coerce + result = _maybe_convert_scalar(1) + self.assertEqual(result, np.int64(1)) + result = _maybe_convert_scalar(1.0) + self.assertEqual(result, np.float64(1)) + result = _maybe_convert_scalar(Timestamp('20130101')) + self.assertEqual(result, Timestamp('20130101').value) + result = _maybe_convert_scalar(datetime(2013, 1, 1)) + self.assertEqual(result, Timestamp('20130101').value) + result = _maybe_convert_scalar(Timedelta('1 day 1 min')) + self.assertEqual(result, Timedelta('1 day 1 min').value) + + +class TestConvert(tm.TestCase): + + def test_possibly_convert_objects_copy(self): + values = np.array([1, 2]) + + out = 
_possibly_convert_objects(values, copy=False) + self.assertTrue(values is out) + + out = _possibly_convert_objects(values, copy=True) + self.assertTrue(values is not out) + + values = np.array(['apply', 'banana']) + out = _possibly_convert_objects(values, copy=False) + self.assertTrue(values is out) + + out = _possibly_convert_objects(values, copy=True) + self.assertTrue(values is not out) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py new file mode 100644 index 0000000000000..0a586410ad5a0 --- /dev/null +++ b/pandas/tests/types/test_common.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np + +from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype +from pandas.types.common import pandas_dtype + +_multiprocess_can_split_ = True + + +def test_pandas_dtype(): + + assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( + 'datetime64[ns, US/Eastern]') + assert pandas_dtype('category') == CategoricalDtype() + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + assert pandas_dtype(dtype) == np.dtype(dtype) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index d48b9baf64777..1743e80ae01a9 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -4,13 +4,14 @@ import nose import numpy as np from pandas import Series, Categorical, date_range -import pandas.core.common as com -from pandas.types.api import CategoricalDtype -from pandas.core.common import (is_categorical_dtype, - is_categorical, DatetimeTZDtype, - is_datetime64tz_dtype, is_datetimetz, - is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype) + +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (is_categorical_dtype, + is_categorical, DatetimeTZDtype, + is_datetime64tz_dtype, is_datetimetz, + is_dtype_equal, is_datetime64_ns_dtype, + is_datetime64_dtype, + _coerce_to_dtype) import pandas.util.testing as tm _multiprocess_can_split_ = True @@ -124,9 +125,9 @@ def test_subclass(self): self.assertTrue(issubclass(type(a), type(b))) def test_coerce_to_dtype(self): - self.assertEqual(com._coerce_to_dtype('datetime64[ns, US/Eastern]'), + self.assertEqual(_coerce_to_dtype('datetime64[ns, US/Eastern]'), DatetimeTZDtype('ns', 'US/Eastern')) - self.assertEqual(com._coerce_to_dtype('datetime64[ns, Asia/Tokyo]'), + self.assertEqual(_coerce_to_dtype('datetime64[ns, Asia/Tokyo]'), DatetimeTZDtype('ns', 'Asia/Tokyo')) def test_compat(self): diff --git a/pandas/tests/types/test_generic.py b/pandas/tests/types/test_generic.py index 5549a3a376992..89913de6f6069 100644 --- a/pandas/tests/types/test_generic.py +++ b/pandas/tests/types/test_generic.py @@ -3,8 +3,8 @@ import nose import numpy as np import pandas as pd -import pandas.core.common as com import pandas.util.testing as tm +from pandas.types import generic as gt _multiprocess_can_split_ = True @@ -22,24 +22,24 @@ class TestABCClasses(tm.TestCase): sparse_array = pd.SparseArray(np.random.randn(10)) def test_abc_types(self): - self.assertIsInstance(pd.Index(['a', 'b', 'c']), com.ABCIndex) - self.assertIsInstance(pd.Int64Index([1, 2, 3]), com.ABCInt64Index) - self.assertIsInstance(pd.Float64Index([1, 2, 3]), com.ABCFloat64Index) - 
self.assertIsInstance(self.multi_index, com.ABCMultiIndex) - self.assertIsInstance(self.datetime_index, com.ABCDatetimeIndex) - self.assertIsInstance(self.timedelta_index, com.ABCTimedeltaIndex) - self.assertIsInstance(self.period_index, com.ABCPeriodIndex) + self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex) + self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index) + self.assertIsInstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index) + self.assertIsInstance(self.multi_index, gt.ABCMultiIndex) + self.assertIsInstance(self.datetime_index, gt.ABCDatetimeIndex) + self.assertIsInstance(self.timedelta_index, gt.ABCTimedeltaIndex) + self.assertIsInstance(self.period_index, gt.ABCPeriodIndex) self.assertIsInstance(self.categorical_df.index, - com.ABCCategoricalIndex) - self.assertIsInstance(pd.Index(['a', 'b', 'c']), com.ABCIndexClass) - self.assertIsInstance(pd.Int64Index([1, 2, 3]), com.ABCIndexClass) - self.assertIsInstance(pd.Series([1, 2, 3]), com.ABCSeries) - self.assertIsInstance(self.df, com.ABCDataFrame) - self.assertIsInstance(self.df.to_panel(), com.ABCPanel) - self.assertIsInstance(self.sparse_series, com.ABCSparseSeries) - self.assertIsInstance(self.sparse_array, com.ABCSparseArray) - self.assertIsInstance(self.categorical, com.ABCCategorical) - self.assertIsInstance(pd.Period('2012', freq='A-DEC'), com.ABCPeriod) + gt.ABCCategoricalIndex) + self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndexClass) + self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass) + self.assertIsInstance(pd.Series([1, 2, 3]), gt.ABCSeries) + self.assertIsInstance(self.df, gt.ABCDataFrame) + self.assertIsInstance(self.df.to_panel(), gt.ABCPanel) + self.assertIsInstance(self.sparse_series, gt.ABCSparseSeries) + self.assertIsInstance(self.sparse_array, gt.ABCSparseArray) + self.assertIsInstance(self.categorical, gt.ABCCategorical) + self.assertIsInstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod) if __name__ == '__main__': diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py new file mode 100644 index 0000000000000..34d10ee9dfa42 --- /dev/null +++ b/pandas/tests/types/test_inference.py @@ -0,0 +1,820 @@ +# -*- coding: utf-8 -*- + +""" +These the test the public routines exposed in types/common.py +related to inference and not otherwise tested in types/test_common.py + +""" + +import nose +import collections +import re +from datetime import datetime, date, timedelta, time +import numpy as np + +import pandas as pd +from pandas import lib, tslib +from pandas import (Series, Index, DataFrame, Timedelta, + DatetimeIndex, TimedeltaIndex, Timestamp, + Panel, Period) +from pandas.compat import u, PY2, lrange +from pandas.types import inference +from pandas.types.common import (is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_number, + is_integer, + is_float, + is_bool, + is_scalar, + _ensure_int32) +from pandas.types.missing import isnull +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +def test_is_sequence(): + is_seq = inference.is_sequence + assert (is_seq((1, 2))) + assert (is_seq([1, 2])) + assert (not is_seq("abcd")) + assert (not is_seq(u("abcd"))) + assert (not is_seq(np.int64)) + + class A(object): + + def __getitem__(self): + return 1 + + assert (not is_seq(A())) + + +def test_is_list_like(): + passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), + Series([]), Series(['a']).str) + fails = (1, '2', object()) + + for p in passes: + assert inference.is_list_like(p) + + for f in 
fails: + assert not inference.is_list_like(f) + + +def test_is_dict_like(): + passes = [{}, {'A': 1}, Series([1])] + fails = ['1', 1, [1, 2], (1, 2), range(2), Index([1])] + + for p in passes: + assert inference.is_dict_like(p) + + for f in fails: + assert not inference.is_dict_like(f) + + +def test_is_named_tuple(): + passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) + fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) + + for p in passes: + assert inference.is_named_tuple(p) + + for f in fails: + assert not inference.is_named_tuple(f) + + +def test_is_hashable(): + + # all new-style classes are hashable by default + class HashableClass(object): + pass + + class UnhashableClass1(object): + __hash__ = None + + class UnhashableClass2(object): + + def __hash__(self): + raise TypeError("Not hashable") + + hashable = (1, + 3.14, + np.float64(3.14), + 'a', + tuple(), + (1, ), + HashableClass(), ) + not_hashable = ([], UnhashableClass1(), ) + abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) + + for i in hashable: + assert inference.is_hashable(i) + for i in not_hashable: + assert not inference.is_hashable(i) + for i in abc_hashable_not_really_hashable: + assert not inference.is_hashable(i) + + # numpy.array is no longer collections.Hashable as of + # https://github.com/numpy/numpy/pull/5326, just test + # is_hashable() + assert not inference.is_hashable(np.array([])) + + # old-style classes in Python 2 don't appear hashable to + # collections.Hashable but also seem to support hash() by default + if PY2: + + class OldStyleClass(): + pass + + c = OldStyleClass() + assert not isinstance(c, collections.Hashable) + assert inference.is_hashable(c) + hash(c) # this will not raise + + +def test_is_re(): + passes = re.compile('ad'), + fails = 'x', 2, 3, object() + + for p in passes: + assert inference.is_re(p) + + for f in fails: + assert not inference.is_re(f) + + +def test_is_recompilable(): + passes = (r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'), + re.compile(r'')) + fails = 1, [], object() + + for p in passes: + assert inference.is_re_compilable(p) + + for f in fails: + assert not inference.is_re_compilable(f) + + +class TestInference(tm.TestCase): + + def test_infer_dtype_bytes(self): + compare = 'string' if PY2 else 'bytes' + + # string array of bytes + arr = np.array(list('abc'), dtype='S1') + self.assertEqual(lib.infer_dtype(arr), compare) + + # object array of bytes + arr = arr.astype(object) + self.assertEqual(lib.infer_dtype(arr), compare) + + def test_isinf_scalar(self): + # GH 11352 + self.assertTrue(lib.isposinf_scalar(float('inf'))) + self.assertTrue(lib.isposinf_scalar(np.inf)) + self.assertFalse(lib.isposinf_scalar(-np.inf)) + self.assertFalse(lib.isposinf_scalar(1)) + self.assertFalse(lib.isposinf_scalar('a')) + + self.assertTrue(lib.isneginf_scalar(float('-inf'))) + self.assertTrue(lib.isneginf_scalar(-np.inf)) + self.assertFalse(lib.isneginf_scalar(np.inf)) + self.assertFalse(lib.isneginf_scalar(1)) + self.assertFalse(lib.isneginf_scalar('a')) + + def test_maybe_convert_numeric_infinities(self): + # see gh-13274 + infinities = ['inf', 'inF', 'iNf', 'Inf', + 'iNF', 'InF', 'INf', 'INF'] + na_values = set(['', 'NULL', 'nan']) + + pos = np.array(['inf'], dtype=np.float64) + neg = np.array(['-inf'], dtype=np.float64) + + msg = "Unable to parse string" + + for infinity in infinities: + for maybe_int in (True, False): + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), + na_values, maybe_int) + 
tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['-' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([u(infinity)], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['+' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric( + np.array(['foo_' + infinity], dtype=object), + na_values, maybe_int) + + def test_maybe_convert_numeric_post_floatify_nan(self): + # see gh-13314 + data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) + nan_values = set([-999, -999.0]) + + for coerce_type in (True, False): + out = lib.maybe_convert_numeric(data, nan_values, coerce_type) + tm.assert_numpy_array_equal(out, expected) + + def test_convert_infs(self): + arr = np.array(['inf', 'inf', 'inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + def test_scientific_no_exponent(self): + # See PR 12215 + arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False, True) + self.assertTrue(np.all(np.isnan(result))) + + def test_convert_non_hashable(self): + # GH13324 + # make sure that we are handing non-hashables + arr = np.array([[10.0, 2], 1.0, 'apple']) + result = lib.maybe_convert_numeric(arr, set(), False, True) + tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + + +class TestTypeInference(tm.TestCase): + _multiprocess_can_split_ = True + + def test_length_zero(self): + result = lib.infer_dtype(np.array([], dtype='i4')) + self.assertEqual(result, 'integer') + + result = lib.infer_dtype([]) + self.assertEqual(result, 'empty') + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='i4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([True, False, True, 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + def test_floats(self): + arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='f4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 
'floating') + + arr = np.array([1, 2, 3, 4, 5], dtype='f8') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + def test_string(self): + pass + + def test_unicode(self): + pass + + def test_datetime(self): + + dates = [datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'datetime64') + + def test_infer_dtype_datetime(self): + + arr = np.array([Timestamp('2011-01-01'), + Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.datetime64('2011-01-01'), + np.datetime64('2011-01-01')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1)]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, pd.Timestamp('2011-01-02'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # different type of nat + arr = np.array([np.timedelta64('nat'), + np.datetime64('2011-01-02')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.datetime64('2011-01-02'), + np.timedelta64('nat')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + # mixed datetime + arr = np.array([datetime(2011, 1, 1), + pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # should be datetime? 
+ arr = np.array([np.datetime64('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Timestamp('2011-01-02'), + np.datetime64('2011-01-01')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) + self.assertEqual(lib.infer_dtype(arr), 'mixed-integer') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_timedelta(self): + + arr = np.array([pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([np.timedelta64(1, 'D'), + np.timedelta64(2, 'D')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([timedelta(1), timedelta(2)]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, Timedelta('1 days')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1)]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, pd.Timedelta('1 days'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # different type of nat + arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'floating') + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([None, np.nan, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + # pd.NaT + arr = np.array([pd.NaT]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([pd.NaT, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([None, pd.NaT, None]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # np.datetime64(nat) + arr = np.array([np.datetime64('nat')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([np.timedelta64('nat')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # datetime / timedelta mixed 
+ arr = np.array([pd.NaT, np.datetime64('nat'), + np.timedelta64('nat'), np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + self.assertTrue(lib.is_datetime_array(arr)) + self.assertTrue(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertTrue(lib.is_timedelta_array(arr)) + self.assertTrue(lib.is_timedelta64_array(arr)) + self.assertTrue(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), + np.timedelta64('nat')]) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT]) + self.assertTrue(lib.is_datetime_array(arr)) + self.assertTrue(lib.is_datetime64_array(arr)) + self.assertTrue(lib.is_timedelta_array(arr)) + self.assertTrue(lib.is_timedelta64_array(arr)) + self.assertTrue(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, np.nan], dtype=object) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + def test_date(self): + + dates = [date(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'date') + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + + try: + # make sure record array works + from collections import namedtuple + record = namedtuple('record', 'x y') + r = record(5, 6) + values = [r] + result = lib.to_object_array_tuples(values) # noqa + except ImportError: + pass + + def test_object(self): + + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + def test_to_object_array_width(self): + # see gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array([[1, 2, 3, None, None], + [4, 5, 6, None, None]], dtype=object) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + + def test_is_period(self): + self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) + self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) + self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) + self.assertFalse(lib.is_period(1)) + self.assertFalse(lib.is_period(np.nan)) + + def test_categorical(self): + + # GH 8974 + from pandas import Categorical, Series + arr = Categorical(list('abc')) + result 
= lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + +class TestNumberScalar(tm.TestCase): + + def test_is_number(self): + + self.assertTrue(is_number(True)) + self.assertTrue(is_number(1)) + self.assertTrue(is_number(1.1)) + self.assertTrue(is_number(1 + 3j)) + self.assertTrue(is_number(np.bool(False))) + self.assertTrue(is_number(np.int64(1))) + self.assertTrue(is_number(np.float64(1.1))) + self.assertTrue(is_number(np.complex128(1 + 3j))) + self.assertTrue(is_number(np.nan)) + + self.assertFalse(is_number(None)) + self.assertFalse(is_number('x')) + self.assertFalse(is_number(datetime(2011, 1, 1))) + self.assertFalse(is_number(np.datetime64('2011-01-01'))) + self.assertFalse(is_number(Timestamp('2011-01-01'))) + self.assertFalse(is_number(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_number(timedelta(1000))) + self.assertFalse(is_number(Timedelta('1 days'))) + + # questionable + self.assertFalse(is_number(np.bool_(False))) + self.assertTrue(is_number(np.timedelta64(1, 'D'))) + + def test_is_bool(self): + self.assertTrue(is_bool(True)) + self.assertTrue(is_bool(np.bool(False))) + self.assertTrue(is_bool(np.bool_(False))) + + self.assertFalse(is_bool(1)) + self.assertFalse(is_bool(1.1)) + self.assertFalse(is_bool(1 + 3j)) + self.assertFalse(is_bool(np.int64(1))) + self.assertFalse(is_bool(np.float64(1.1))) + self.assertFalse(is_bool(np.complex128(1 + 3j))) + self.assertFalse(is_bool(np.nan)) + self.assertFalse(is_bool(None)) + self.assertFalse(is_bool('x')) + self.assertFalse(is_bool(datetime(2011, 1, 1))) + self.assertFalse(is_bool(np.datetime64('2011-01-01'))) + self.assertFalse(is_bool(Timestamp('2011-01-01'))) + self.assertFalse(is_bool(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_bool(timedelta(1000))) + self.assertFalse(is_bool(np.timedelta64(1, 'D'))) + self.assertFalse(is_bool(Timedelta('1 days'))) + + def test_is_integer(self): + self.assertTrue(is_integer(1)) + self.assertTrue(is_integer(np.int64(1))) + + self.assertFalse(is_integer(True)) + self.assertFalse(is_integer(1.1)) + self.assertFalse(is_integer(1 + 3j)) + self.assertFalse(is_integer(np.bool(False))) + self.assertFalse(is_integer(np.bool_(False))) + self.assertFalse(is_integer(np.float64(1.1))) + self.assertFalse(is_integer(np.complex128(1 + 3j))) + self.assertFalse(is_integer(np.nan)) + self.assertFalse(is_integer(None)) + self.assertFalse(is_integer('x')) + self.assertFalse(is_integer(datetime(2011, 1, 1))) + self.assertFalse(is_integer(np.datetime64('2011-01-01'))) + self.assertFalse(is_integer(Timestamp('2011-01-01'))) + self.assertFalse(is_integer(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_integer(timedelta(1000))) + self.assertFalse(is_integer(Timedelta('1 days'))) + + # questionable + self.assertTrue(is_integer(np.timedelta64(1, 'D'))) + + def test_is_float(self): + self.assertTrue(is_float(1.1)) + self.assertTrue(is_float(np.float64(1.1))) + self.assertTrue(is_float(np.nan)) + + self.assertFalse(is_float(True)) + self.assertFalse(is_float(1)) + self.assertFalse(is_float(1 + 3j)) + self.assertFalse(is_float(np.bool(False))) + self.assertFalse(is_float(np.bool_(False))) + self.assertFalse(is_float(np.int64(1))) + 
self.assertFalse(is_float(np.complex128(1 + 3j))) + self.assertFalse(is_float(None)) + self.assertFalse(is_float('x')) + self.assertFalse(is_float(datetime(2011, 1, 1))) + self.assertFalse(is_float(np.datetime64('2011-01-01'))) + self.assertFalse(is_float(Timestamp('2011-01-01'))) + self.assertFalse(is_float(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_float(timedelta(1000))) + self.assertFalse(is_float(np.timedelta64(1, 'D'))) + self.assertFalse(is_float(Timedelta('1 days'))) + + def test_is_timedelta(self): + self.assertTrue(is_timedelta64_dtype('timedelta64')) + self.assertTrue(is_timedelta64_dtype('timedelta64[ns]')) + self.assertFalse(is_timedelta64_ns_dtype('timedelta64')) + self.assertTrue(is_timedelta64_ns_dtype('timedelta64[ns]')) + + tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') + self.assertTrue(is_timedelta64_dtype(tdi)) + self.assertTrue(is_timedelta64_ns_dtype(tdi)) + self.assertTrue(is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) + + # Conversion to Int64Index: + self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) + self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) + + +class Testisscalar(tm.TestCase): + + def test_isscalar_builtin_scalars(self): + self.assertTrue(is_scalar(None)) + self.assertTrue(is_scalar(True)) + self.assertTrue(is_scalar(False)) + self.assertTrue(is_scalar(0.)) + self.assertTrue(is_scalar(np.nan)) + self.assertTrue(is_scalar('foobar')) + self.assertTrue(is_scalar(b'foobar')) + self.assertTrue(is_scalar(u('efoobar'))) + self.assertTrue(is_scalar(datetime(2014, 1, 1))) + self.assertTrue(is_scalar(date(2014, 1, 1))) + self.assertTrue(is_scalar(time(12, 0))) + self.assertTrue(is_scalar(timedelta(hours=1))) + self.assertTrue(is_scalar(pd.NaT)) + + def test_isscalar_builtin_nonscalars(self): + self.assertFalse(is_scalar({})) + self.assertFalse(is_scalar([])) + self.assertFalse(is_scalar([1])) + self.assertFalse(is_scalar(())) + self.assertFalse(is_scalar((1, ))) + self.assertFalse(is_scalar(slice(None))) + self.assertFalse(is_scalar(Ellipsis)) + + def test_isscalar_numpy_array_scalars(self): + self.assertTrue(is_scalar(np.int64(1))) + self.assertTrue(is_scalar(np.float64(1.))) + self.assertTrue(is_scalar(np.int32(1))) + self.assertTrue(is_scalar(np.object_('foobar'))) + self.assertTrue(is_scalar(np.str_('foobar'))) + self.assertTrue(is_scalar(np.unicode_(u('foobar')))) + self.assertTrue(is_scalar(np.bytes_(b'foobar'))) + self.assertTrue(is_scalar(np.datetime64('2014-01-01'))) + self.assertTrue(is_scalar(np.timedelta64(1, 'h'))) + + def test_isscalar_numpy_zerodim_arrays(self): + for zerodim in [np.array(1), np.array('foobar'), + np.array(np.datetime64('2014-01-01')), + np.array(np.timedelta64(1, 'h')), + np.array(np.datetime64('NaT'))]: + self.assertFalse(is_scalar(zerodim)) + self.assertTrue(is_scalar(lib.item_from_zerodim(zerodim))) + + def test_isscalar_numpy_arrays(self): + self.assertFalse(is_scalar(np.array([]))) + self.assertFalse(is_scalar(np.array([[]]))) + self.assertFalse(is_scalar(np.matrix('1; 2'))) + + def test_isscalar_pandas_scalars(self): + self.assertTrue(is_scalar(Timestamp('2014-01-01'))) + self.assertTrue(is_scalar(Timedelta(hours=1))) + self.assertTrue(is_scalar(Period('2014-01-01'))) + + def test_lisscalar_pandas_containers(self): + self.assertFalse(is_scalar(Series())) + self.assertFalse(is_scalar(Series([1]))) + self.assertFalse(is_scalar(DataFrame())) + self.assertFalse(is_scalar(DataFrame([[1]]))) + self.assertFalse(is_scalar(Panel())) + 
self.assertFalse(is_scalar(Panel([[[1]]]))) + self.assertFalse(is_scalar(Index([]))) + self.assertFalse(is_scalar(Index([1]))) + + +def test_datetimeindex_from_empty_datetime64_array(): + for unit in ['ms', 'us', 'ns']: + idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) + assert (len(idx) == 0) + + +def test_nan_to_nat_conversions(): + + df = DataFrame(dict({ + 'A': np.asarray( + lrange(10), dtype='float64'), + 'B': Timestamp('20010101') + })) + df.iloc[3:6, :] = np.nan + result = df.loc[4, 'B'].value + assert (result == tslib.iNaT) + + s = df['B'].copy() + s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) + assert (isnull(s[8])) + + # numpy < 1.7.0 is wrong + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.7.0': + assert (s[8].value == np.datetime64('NaT').astype(np.int64)) + + +def test_ensure_int32(): + values = np.arange(10, dtype=np.int32) + result = _ensure_int32(values) + assert (result.dtype == np.int32) + + values = np.arange(10, dtype=np.int64) + result = _ensure_int32(values) + assert (result.dtype == np.int32) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_io.py b/pandas/tests/types/test_io.py new file mode 100644 index 0000000000000..545edf8f1386c --- /dev/null +++ b/pandas/tests/types/test_io.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pandas.lib as lib +import pandas.util.testing as tm + +from pandas.compat import long, u + + +class TestParseSQL(tm.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u('1.5'), None, u('3'), u('4.2')], + dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], + dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + self.assert_numpy_array_equal(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + + arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = 
lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_downcast_int64(self): + from pandas.parser import na_values + + arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) + + # default argument + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + result = lib.downcast_int64(arr, na_values, use_unsigned=False) + self.assert_numpy_array_equal(result, expected) + + expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + # still cast to int8 despite use_unsigned=True + # because of the negative number as an element + arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + int8_na = na_values[np.int8] + int64_na = na_values[np.int64] + arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) + expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py new file mode 100644 index 0000000000000..edcb69de7bfad --- /dev/null +++ b/pandas/tests/types/test_missing.py @@ -0,0 +1,243 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np +from datetime import datetime +from pandas.util import testing as tm + +from pandas.core import config as cf +from pandas.compat import u +from pandas.tslib import iNaT +from pandas import (NaT, Float64Index, Series, + DatetimeIndex, TimedeltaIndex, date_range) +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.missing import (array_equivalent, isnull, notnull, + na_value_for_dtype) + +_multiprocess_can_split_ = True + + +def test_notnull(): + assert notnull(1.) 
+ assert not notnull(None) + assert not notnull(np.NaN) + + with cf.option_context("mode.use_inf_as_null", False): + assert notnull(np.inf) + assert notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.all() + + with cf.option_context("mode.use_inf_as_null", True): + assert not notnull(np.inf) + assert not notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.sum() == 2 + + with cf.option_context("mode.use_inf_as_null", False): + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + assert (isinstance(isnull(s), Series)) + + +def test_isnull(): + assert not isnull(1.) + assert isnull(None) + assert isnull(np.NaN) + assert not isnull(np.inf) + assert not isnull(-np.inf) + + # series + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + assert (isinstance(isnull(s), Series)) + + # frame + for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), + tm.makeMixedDataFrame()]: + result = isnull(df) + expected = df.apply(isnull) + tm.assert_frame_equal(result, expected) + + # panel + for p in [tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) + ]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel_equal(result, expected) + + # panel 4d + for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel4d_equal(result, expected) + + +def test_isnull_lists(): + result = isnull([[False]]) + exp = np.array([[False]]) + assert (np.array_equal(result, exp)) + + result = isnull([[1], [2]]) + exp = np.array([[False], [False]]) + assert (np.array_equal(result, exp)) + + # list of strings / unicode + result = isnull(['foo', 'bar']) + assert (not result.any()) + + result = isnull([u('foo'), u('bar')]) + assert (not result.any()) + + +def test_isnull_nat(): + result = isnull([NaT]) + exp = np.array([True]) + assert (np.array_equal(result, exp)) + + result = isnull(np.array([NaT], dtype=object)) + exp = np.array([True]) + assert (np.array_equal(result, exp)) + + +def test_isnull_numpy_nat(): + arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), + np.datetime64('NaT', 's')]) + result = isnull(arr) + expected = np.array([True] * 4) + tm.assert_numpy_array_equal(result, expected) + + +def test_isnull_datetime(): + assert (not isnull(datetime.now())) + assert notnull(datetime.now()) + + idx = date_range('1/1/1990', periods=20) + assert (notnull(idx).all()) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isnull(idx) + assert (mask[0]) + assert (not mask[1:].any()) + + # GH 9129 + pidx = idx.to_period(freq='M') + mask = isnull(pidx) + assert (mask[0]) + assert (not mask[1:].any()) + + mask = isnull(pidx[1:]) + assert (not mask.any()) + + +class TestIsNull(tm.TestCase): + + def test_0d_array(self): + self.assertTrue(isnull(np.array(np.nan))) + self.assertFalse(isnull(np.array(0.0))) + self.assertFalse(isnull(np.array(0))) + # test object dtype + self.assertTrue(isnull(np.array(np.nan, dtype=object))) + self.assertFalse(isnull(np.array(0.0, dtype=object))) + self.assertFalse(isnull(np.array(0, dtype=object))) + + +def test_array_equivalent(): + assert array_equivalent(np.array([np.nan, np.nan]), + np.array([np.nan, np.nan])) + assert array_equivalent(np.array([np.nan, 1, np.nan]), + np.array([np.nan, 1, np.nan])) + assert 
array_equivalent(np.array([np.nan, None], dtype='object'), + np.array([np.nan, None], dtype='object')) + assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), + np.array([np.nan, 1 + 1j], dtype='complex')) + assert not array_equivalent( + np.array([np.nan, 1 + 1j], dtype='complex'), np.array( + [np.nan, 1 + 2j], dtype='complex')) + assert not array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) + assert not array_equivalent( + np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) + assert array_equivalent(Float64Index([0, np.nan]), + Float64Index([0, np.nan])) + assert not array_equivalent( + Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), + DatetimeIndex([0, np.nan])) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + assert array_equivalent(TimedeltaIndex([0, np.nan]), + TimedeltaIndex([0, np.nan])) + assert not array_equivalent( + TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), + DatetimeIndex([0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( + [1, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex( + [0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( + [0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + + +def test_array_equivalent_compat(): + # see gh-13388 + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + assert (array_equivalent(m, n, strict_nan=True)) + assert (array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)]) + assert (not array_equivalent(m, n, strict_nan=True)) + assert (not array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) + assert (not array_equivalent(m, n, strict_nan=True)) + assert (not array_equivalent(m, n, strict_nan=False)) + + +def test_array_equivalent_str(): + for dtype in ['O', 'S', 'U']: + assert array_equivalent(np.array(['A', 'B'], dtype=dtype), + np.array(['A', 'B'], dtype=dtype)) + assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), + np.array(['A', 'X'], dtype=dtype)) + + +def test_na_value_for_dtype(): + for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), + DatetimeTZDtype('datetime64[ns, US/Eastern]')]: + assert na_value_for_dtype(dtype) is NaT + + for dtype in ['u1', 'u2', 'u4', 'u8', + 'i1', 'i2', 'i4', 'i8']: + assert na_value_for_dtype(np.dtype(dtype)) == 0 + + for dtype in ['bool']: + assert na_value_for_dtype(np.dtype(dtype)) is False + + for dtype in ['f2', 'f4', 'f8']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + for dtype in ['O']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_types.py b/pandas/tests/types/test_types.py deleted file mode 100644 index b9f6006cab731..0000000000000 --- a/pandas/tests/types/test_types.py +++ /dev/null @@ -1,40 +0,0 @@ 
-# -*- coding: utf-8 -*- -import nose -import numpy as np - -from pandas import NaT -from pandas.types.api import (DatetimeTZDtype, CategoricalDtype, - na_value_for_dtype, pandas_dtype) - - -def test_pandas_dtype(): - - assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( - 'datetime64[ns, US/Eastern]') - assert pandas_dtype('category') == CategoricalDtype() - for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: - assert pandas_dtype(dtype) == np.dtype(dtype) - - -def test_na_value_for_dtype(): - for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), - DatetimeTZDtype('datetime64[ns, US/Eastern]')]: - assert na_value_for_dtype(dtype) is NaT - - for dtype in ['u1', 'u2', 'u4', 'u8', - 'i1', 'i2', 'i4', 'i8']: - assert na_value_for_dtype(np.dtype(dtype)) == 0 - - for dtype in ['bool']: - assert na_value_for_dtype(np.dtype(dtype)) is False - - for dtype in ['f2', 'f4', 'f8']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - for dtype in ['O']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 075dff9cf6c38..5b66e55eb60b6 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -12,6 +12,21 @@ from pandas import (Categorical, DataFrame, Series, Index, MultiIndex, Timedelta) from pandas.core.frame import _merge_doc +from pandas.types.generic import ABCSeries +from pandas.types.common import (is_datetime64tz_dtype, + is_datetime64_dtype, + needs_i8_conversion, + is_int64_dtype, + is_integer, + is_int_or_datetime_dtype, + is_dtype_equal, + is_bool, + is_list_like, + _ensure_int64, + _ensure_platform_int, + _ensure_object) +from pandas.types.missing import na_value_for_dtype + from pandas.core.generic import NDFrame from pandas.core.index import (_get_combined_index, _ensure_index, _get_consensus_names, @@ -19,18 +34,10 @@ from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util.decorators import Appender, Substitution -from pandas.core.common import (ABCSeries, is_dtype_equal, - is_datetime64_dtype, - is_int64_dtype, - is_integer, - is_bool, - is_list_like, - needs_i8_conversion) import pandas.core.algorithms as algos import pandas.core.common as com import pandas.types.concat as _concat -from pandas.types.api import na_value_for_dtype import pandas.algos as _algos import pandas.hashtable as _hash @@ -436,7 +443,7 @@ def _merger(x, y): # if we DO have duplicates, then # we cannot guarantee order - sorter = com._ensure_platform_int( + sorter = _ensure_platform_int( np.concatenate([groupby.indices[g] for g, _ in groupby])) if len(result) != len(sorter): if check_duplicates: @@ -1111,8 +1118,8 @@ def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = _algos.left_outer_join( - com._ensure_int64(left_key), - com._ensure_int64(right_key), + _ensure_int64(left_key), + _ensure_int64(right_key), count, sort=sort) return left_indexer, right_indexer @@ -1158,18 +1165,17 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): - if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk): + if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values - - if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk): + if is_int_or_datetime_dtype(lk) and 
is_int_or_datetime_dtype(rk): klass = _hash.Int64Factorizer - lk = com._ensure_int64(com._values_from_object(lk)) - rk = com._ensure_int64(com._values_from_object(rk)) + lk = _ensure_int64(com._values_from_object(lk)) + rk = _ensure_int64(com._values_from_object(rk)) else: klass = _hash.Factorizer - lk = com._ensure_object(lk) - rk = com._ensure_object(rk) + lk = _ensure_object(lk) + rk = _ensure_object(rk) rizer = klass(max(len(lk), len(rk))) @@ -1208,10 +1214,10 @@ def _sort_labels(uniques, left, right): reverse_indexer = np.empty(len(sorter), dtype=np.int64) reverse_indexer.put(sorter, np.arange(len(sorter))) - new_left = reverse_indexer.take(com._ensure_platform_int(left)) + new_left = reverse_indexer.take(_ensure_platform_int(left)) np.putmask(new_left, left == -1, -1) - new_right = reverse_indexer.take(com._ensure_platform_int(right)) + new_right = reverse_indexer.take(_ensure_platform_int(right)) np.putmask(new_right, right == -1, -1) return new_left, new_right diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index e1405bc9e6add..3e2b7c3af460e 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -1,6 +1,7 @@ # pylint: disable=E1103 +from pandas.types.common import is_list_like, is_scalar from pandas import Series, DataFrame from pandas.core.index import MultiIndex, Index from pandas.core.groupby import Grouper @@ -9,7 +10,6 @@ from pandas.compat import range, lrange, zip from pandas import compat import pandas.core.common as com -import pandas.lib as lib import numpy as np @@ -95,7 +95,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', values_passed = values is not None if values_passed: - if com.is_list_like(values): + if is_list_like(values): values_multi = True values = list(values) else: @@ -361,7 +361,7 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (lib.isscalar(by) or + elif (is_scalar(by) or isinstance(by, (np.ndarray, Index, Series, Grouper)) or hasattr(by, '__call__')): by = [by] diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index b6c1926c1e7fc..4cf3364a03056 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -11,10 +11,17 @@ import numpy as np +from pandas.types.common import (is_list_like, + is_integer, + is_number, + is_hashable, + is_iterator) +from pandas.types.missing import isnull, notnull + from pandas.util.decorators import cache_readonly, deprecate_kwarg from pandas.core.base import PandasObject -import pandas.core.common as com -from pandas.core.common import AbstractMethodError + +from pandas.core.common import AbstractMethodError, _try_sort from pandas.core.generic import _shared_docs, _shared_doc_kwargs from pandas.core.index import Index, MultiIndex from pandas.core.series import Series, remove_na @@ -161,7 +168,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', if colormap is not None: warnings.warn("'color' and 'colormap' cannot be used " "simultaneously. Using 'color'") - colors = list(color) if com.is_list_like(color) else color + colors = list(color) if is_list_like(color) else color else: if color_type == 'default': # need to call list() on the result to copy so we don't @@ -336,7 +343,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) - mask = com.notnull(df) + mask = notnull(df) marker = _get_marker_compat(marker) @@ -980,7 +987,7 @@ def _validate_color_args(self): "simultaneously. 
Using 'color'") if 'color' in self.kwds and self.style is not None: - if com.is_list_like(self.style): + if is_list_like(self.style): styles = self.style else: styles = [self.style] @@ -1001,7 +1008,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): # TODO: unused? # if self.sort_columns: - # columns = com._try_sort(data.columns) + # columns = _try_sort(data.columns) # else: # columns = data.columns @@ -1099,13 +1106,13 @@ def result(self): Return result axes """ if self.subplots: - if self.layout is not None and not com.is_list_like(self.ax): + if self.layout is not None and not is_list_like(self.ax): return self.axes.reshape(*self.layout) else: return self.axes else: sec_true = isinstance(self.secondary_y, bool) and self.secondary_y - all_sec = (com.is_list_like(self.secondary_y) and + all_sec = (is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries) if (sec_true or all_sec): # if all data is plotted on secondary, return right axes @@ -1322,7 +1329,7 @@ def _get_xticks(self, convert_period=False): @classmethod def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): - mask = com.isnull(y) + mask = isnull(y) if mask.any(): y = np.ma.array(y) y = np.ma.masked_where(mask, y) @@ -1463,8 +1470,8 @@ def match_labels(data, e): err = np.atleast_2d(evalues) err = np.tile(err, (self.nseries, 1)) - elif com.is_list_like(err): - if com.is_iterator(err): + elif is_list_like(err): + if is_iterator(err): err = np.atleast_2d(list(err)) else: # raw error values @@ -1486,7 +1493,7 @@ def match_labels(data, e): if len(err) == 1: err = np.tile(err, (self.nseries, 1)) - elif com.is_number(err): + elif is_number(err): err = np.tile([err], (self.nseries, len(self.data))) else: @@ -1543,9 +1550,9 @@ def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: raise ValueError(self._kind + ' requires and x and y column') - if com.is_integer(x) and not self.data.columns.holds_integer(): + if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] - if com.is_integer(y) and not self.data.columns.holds_integer(): + if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] self.x = x self.y = y @@ -1569,7 +1576,7 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs): # the handling of this argument later s = 20 super(ScatterPlot, self).__init__(data, x, y, s=s, **kwargs) - if com.is_integer(c) and not self.data.columns.holds_integer(): + if is_integer(c) and not self.data.columns.holds_integer(): c = self.data.columns[c] self.c = c @@ -1577,7 +1584,7 @@ def _make_plot(self): x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] - c_is_column = com.is_hashable(c) and c in self.data.columns + c_is_column = is_hashable(c) and c in self.data.columns # plot a colorbar only if a colormap is provided or necessary cb = self.kwds.pop('colorbar', self.colormap or c_is_column) @@ -1629,7 +1636,7 @@ class HexBinPlot(PlanePlot): def __init__(self, data, x, y, C=None, **kwargs): super(HexBinPlot, self).__init__(data, x, y, **kwargs) - if com.is_integer(C) and not self.data.columns.holds_integer(): + if is_integer(C) and not self.data.columns.holds_integer(): C = self.data.columns[C] self.C = C @@ -1912,9 +1919,9 @@ def __init__(self, data, **kwargs): self.ax_pos = self.tick_pos - self.tickoffset def _args_adjust(self): - if com.is_list_like(self.bottom): + if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - if com.is_list_like(self.left): + if 
is_list_like(self.left): self.left = np.array(self.left) @classmethod @@ -2027,18 +2034,18 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - if com.is_integer(self.bins): + if is_integer(self.bins): # create common bin edge values = (self.data._convert(datetime=True)._get_numeric_data()) values = np.ravel(values) - values = values[~com.isnull(values)] + values = values[~isnull(values)] hist, self.bins = np.histogram( values, bins=self.bins, range=self.kwds.get('range', None), weights=self.kwds.get('weights', None)) - if com.is_list_like(self.bottom): + if is_list_like(self.bottom): self.bottom = np.array(self.bottom) @classmethod @@ -2046,7 +2053,7 @@ def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) - y = y[~com.isnull(y)] + y = y[~isnull(y)] base = np.zeros(len(bins) - 1) bottom = bottom + \ @@ -2411,7 +2418,7 @@ def _plot(data, x=None, y=None, subplots=False, msg = "{0} requires either y column or 'subplots=True'" raise ValueError(msg.format(kind)) elif y is not None: - if com.is_integer(y) and not data.columns.holds_integer(): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] # converted to series actually. copy to not modify data = data[y].copy() @@ -2420,12 +2427,12 @@ def _plot(data, x=None, y=None, subplots=False, else: if isinstance(data, DataFrame): if x is not None: - if com.is_integer(x) and not data.columns.holds_integer(): + if is_integer(x) and not data.columns.holds_integer(): x = data.columns[x] data = data.set_index(x) if y is not None: - if com.is_integer(y) and not data.columns.holds_integer(): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] label = kwds['label'] if 'label' in kwds else y series = data[y].copy() # Don't modify @@ -2434,7 +2441,7 @@ def _plot(data, x=None, y=None, subplots=False, for kw in ['xerr', 'yerr']: if (kw in kwds) and \ (isinstance(kwds[kw], string_types) or - com.is_integer(kwds[kw])): + is_integer(kwds[kw])): try: kwds[kw] = data[kwds[kw]] except (IndexError, KeyError, TypeError): @@ -2897,7 +2904,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, layout=layout) _axes = _flatten(axes) - for i, col in enumerate(com._try_sort(data.columns)): + for i, col in enumerate(_try_sort(data.columns)): ax = _axes[i] ax.hist(data[col].dropna().values, bins=bins, **kwds) ax.set_title(col) @@ -3345,7 +3352,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if ax is None: fig = plt.figure(**fig_kw) else: - if com.is_list_like(ax): + if is_list_like(ax): ax = _flatten(ax) if layout is not None: warnings.warn("When passing multiple axes, layout keyword is " @@ -3487,7 +3494,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): def _flatten(axes): - if not com.is_list_like(axes): + if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, Index)): return axes.ravel() diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index b0bbf8ba70354..62bbfc2f630a5 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -2,12 +2,14 @@ Quantilization functions and related stuff """ +from pandas.types.missing import isnull +from pandas.types.common import (is_float, is_integer, + is_scalar) + from pandas.core.api import Series from pandas.core.categorical import Categorical import pandas.core.algorithms as algos 
-import pandas.core.common as com import pandas.core.nanops as nanops -import pandas.lib as lib from pandas.compat import zip import numpy as np @@ -80,7 +82,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 if not np.iterable(bins): - if lib.isscalar(bins) and bins < 1: + if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") try: # for array-like sz = x.size @@ -164,7 +166,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ - if com.is_integer(q): + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q @@ -194,7 +196,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, if include_lowest: ids[x == bins[0]] = 1 - na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0) + na_mask = isnull(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -264,7 +266,7 @@ def _format_label(x, precision=3): fmt_str = '%%.%dg' % precision if np.isinf(x): return str(x) - elif com.is_float(x): + elif is_float(x): frac, whole = np.modf(x) sgn = '-' if x < 0 else '' whole = abs(whole) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index d70904e1bf286..b8b28663387cc 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,6 +1,12 @@ import numpy as np import pandas.lib as lib +from pandas.types.common import (is_number, + is_numeric_dtype, + is_datetime_or_timedelta_dtype, + _ensure_object) +from pandas.types.cast import _possibly_downcast_to_dtype + import pandas as pd from pandas.compat import reduce from pandas.core.index import Index @@ -141,7 +147,7 @@ def to_numeric(arg, errors='raise', downcast=None): elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif np.isscalar(arg): - if com.is_number(arg): + if is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') @@ -151,14 +157,13 @@ def to_numeric(arg, errors='raise', downcast=None): values = arg try: - if com.is_numeric_dtype(values): + if is_numeric_dtype(values): pass - elif com.is_datetime_or_timedelta_dtype(values): + elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: - values = com._ensure_object(values) + values = _ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True - values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) @@ -168,7 +173,7 @@ def to_numeric(arg, errors='raise', downcast=None): # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified - if downcast is not None and com.is_numeric_dtype(values): + if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): @@ -189,7 +194,7 @@ def to_numeric(arg, errors='raise', downcast=None): # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize < values.dtype.itemsize: - values = com._possibly_downcast_to_dtype( + values = _possibly_downcast_to_dtype( values, dtype) # successful conversion diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 4bafac873ea09..fe0440170383b 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -9,10 +9,16 @@ from pandas.compat.numpy import function as nv import numpy as np - +from pandas.types.common import (is_integer, is_float, + 
is_bool_dtype, _ensure_int64, + is_scalar, + is_list_like) +from pandas.types.generic import (ABCIndex, ABCSeries, + ABCPeriodIndex, ABCIndexClass) +from pandas.types.missing import isnull from pandas.core import common as com, algorithms -from pandas.core.common import (is_integer, is_float, is_bool_dtype, - AbstractMethodError) +from pandas.core.common import AbstractMethodError + import pandas.formats.printing as printing import pandas.tslib as tslib import pandas._period as prlib @@ -111,9 +117,9 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True): @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, com.ABCIndex, com.ABCSeries)): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries)): left = left.view('i8') - if isinstance(right, (np.ndarray, com.ABCIndex, com.ABCSeries)): + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries)): right = right.view('i8') results = joinf(left, right) if with_indexers: @@ -133,10 +139,10 @@ def _evaluate_compare(self, other, op): # coerce to a similar object if not isinstance(other, type(self)): - if not com.is_list_like(other): + if not is_list_like(other): # scalar other = [other] - elif lib.isscalar(lib.item_from_zerodim(other)): + elif is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) @@ -174,7 +180,7 @@ def _ensure_localized(self, result): # reconvert to local tz if getattr(self, 'tz', None) is not None: - if not isinstance(result, com.ABCIndexClass): + if not isinstance(result, ABCIndexClass): result = self._simple_new(result) result = result.tz_localize(self.tz) return result @@ -202,7 +208,7 @@ def _format_with_header(self, header, **kwargs): def __contains__(self, key): try: res = self.get_loc(key) - return lib.isscalar(res) or type(res) == slice or np.any(res) + return is_scalar(res) or type(res) == slice or np.any(res) except (KeyError, TypeError, ValueError): return False @@ -213,7 +219,7 @@ def __getitem__(self, key): """ is_int = is_integer(key) - if lib.isscalar(key) and not is_int: + if is_scalar(key) and not is_int: raise ValueError getitem = self._data.__getitem__ @@ -282,7 +288,7 @@ def _nat_new(self, box=True): return result attribs = self._get_attributes_dict() - if not isinstance(self, com.ABCPeriodIndex): + if not isinstance(self, ABCPeriodIndex): attribs['freq'] = None return self._simple_new(result, **attribs) @@ -312,7 +318,7 @@ def sort_values(self, return_indexer=False, ascending=True): attribs = self._get_attributes_dict() freq = attribs['freq'] - if freq is not None and not isinstance(self, com.ABCPeriodIndex): + if freq is not None and not isinstance(self, ABCPeriodIndex): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: @@ -328,7 +334,7 @@ def sort_values(self, return_indexer=False, ascending=True): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_int64(indices) + indices = _ensure_int64(indices) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) if isinstance(maybe_slice, slice): @@ -340,7 +346,7 @@ def take(self, indices, axis=0, allow_fill=True, na_value=tslib.iNaT) # keep freq in PeriodIndex, reset otherwise - freq = self.freq if isinstance(self, com.ABCPeriodIndex) else None + freq = self.freq if isinstance(self, ABCPeriodIndex) else None return self._shallow_copy(taken, freq=freq) def get_duplicates(self): @@ -545,7 +551,7 @@ def _convert_scalar_indexer(self, key, kind=None): # we don't allow 
integer/float indexing for loc # we don't allow float indexing for ix/getitem - if lib.isscalar(key): + if is_scalar(key): is_int = is_integer(key) is_flt = is_float(key) if kind in ['loc'] and (is_int or is_flt): @@ -591,7 +597,7 @@ def __add__(self, other): elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(other) - elif com.is_integer(other): + elif is_integer(other): return self.shift(other) elif isinstance(other, (tslib.Timestamp, datetime)): return self._add_datelike(other) @@ -619,7 +625,7 @@ def __sub__(self, other): elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(-other) - elif com.is_integer(other): + elif is_integer(other): return self.shift(-other) elif isinstance(other, (tslib.Timestamp, datetime)): return self._sub_datelike(other) @@ -791,9 +797,9 @@ def summary(self, name=None): def _ensure_datetimelike_to_i8(other): """ helper for coercing an input scalar or array to i8 """ - if lib.isscalar(other) and com.isnull(other): + if lib.isscalar(other) and isnull(other): other = tslib.iNaT - elif isinstance(other, com.ABCIndexClass): + elif isinstance(other, ABCIndexClass): # convert tz if needed if getattr(other, 'tz', None) is not None: diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 8937e83c7009a..46e8bd43e8ff8 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -3,19 +3,21 @@ """ import numpy as np + +from pandas.types.common import (_NS_DTYPE, _TD_DTYPE, + is_period_arraylike, + is_datetime_arraylike, is_integer_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, is_categorical_dtype, + is_list_like) + from pandas.core.base import PandasDelegate, NoNewAttributesMixin -from pandas.core import common as com from pandas.tseries.index import DatetimeIndex from pandas._period import IncompatibleFrequency # flake8: noqa from pandas.tseries.period import PeriodIndex from pandas.tseries.tdi import TimedeltaIndex from pandas import tslib from pandas.core.algorithms import take_1d -from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, - is_datetime_arraylike, is_integer_dtype, - is_list_like, - is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_categorical_dtype) def is_datetimelike(data): @@ -129,7 +131,7 @@ def _delegate_method(self, name, *args, **kwargs): method = getattr(self.values, name) result = method(*args, **kwargs) - if not com.is_list_like(result): + if not is_list_like(result): return result result = Series(result, index=self.index, name=self.name) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 78b185ae8cf31..fc23f4f99449b 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -10,6 +10,14 @@ from matplotlib.ticker import Formatter, AutoLocator, Locator from matplotlib.transforms import nonsingular + +from pandas.types.common import (is_float, is_integer, + is_integer_dtype, + is_float_dtype, + is_datetime64_ns_dtype, + is_period_arraylike, + ) + from pandas.compat import lrange import pandas.compat as compat import pandas.lib as lib @@ -73,8 +81,8 @@ class TimeConverter(units.ConversionInterface): @staticmethod def convert(value, unit, axis): valid_types = (str, pydt.time) - if (isinstance(value, valid_types) or com.is_integer(value) or - com.is_float(value)): + if (isinstance(value, valid_types) or is_integer(value) or + is_float(value)): return time2num(value) if isinstance(value, Index): return 
value.map(time2num) @@ -129,14 +137,14 @@ def convert(values, units, axis): raise TypeError('Axis must have `freq` set to convert to Periods') valid_types = (compat.string_types, datetime, Period, pydt.date, pydt.time) - if (isinstance(values, valid_types) or com.is_integer(values) or - com.is_float(values)): + if (isinstance(values, valid_types) or is_integer(values) or + is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): return values.asfreq(axis.freq).values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - if com.is_period_arraylike(values): + if is_period_arraylike(values): return PeriodIndex(values, freq=axis.freq).values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] @@ -149,7 +157,7 @@ def get_datevalue(date, freq): elif isinstance(date, (compat.string_types, datetime, pydt.date, pydt.time)): return Period(date, freq).ordinal - elif (com.is_integer(date) or com.is_float(date) or + elif (is_integer(date) or is_float(date) or (isinstance(date, (np.ndarray, Index)) and (date.size == 1))): return date elif date is None: @@ -163,8 +171,8 @@ def _dt_to_float_ordinal(dt): preserving hours, minutes, seconds and microseconds. Return value is a :func:`float`. """ - if (isinstance(dt, (np.ndarray, Index, Series)) and - com.is_datetime64_ns_dtype(dt)): + if (isinstance(dt, (np.ndarray, Index, Series) + ) and is_datetime64_ns_dtype(dt)): base = dates.epoch2num(dt.asi8 / 1.0E9) else: base = dates.date2num(dt) @@ -188,7 +196,7 @@ def try_parse(values): return _dt_to_float_ordinal(lib.Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) - elif (com.is_integer(values) or com.is_float(values)): + elif (is_integer(values) or is_float(values)): return values elif isinstance(values, compat.string_types): return try_parse(values) @@ -198,7 +206,7 @@ def try_parse(values): if not isinstance(values, np.ndarray): values = com._asarray_tuplesafe(values) - if com.is_integer_dtype(values) or com.is_float_dtype(values): + if is_integer_dtype(values) or is_float_dtype(values): return values try: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3f1d0c6d969a6..e2132deb97d64 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -6,12 +6,17 @@ import numpy as np +from pandas.types.generic import ABCSeries +from pandas.types.common import (is_integer, + is_period_arraylike, + is_timedelta64_dtype, + is_datetime64_dtype) + import pandas.core.algorithms as algos from pandas.core.algorithms import unique from pandas.tseries.offsets import DateOffset from pandas.util.decorators import cache_readonly import pandas.tseries.offsets as offsets -import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib from pandas.tslib import Timedelta @@ -255,8 +260,8 @@ def get_freq_code(freqstr): freqstr = (freqstr.rule_code, freqstr.n) if isinstance(freqstr, tuple): - if (com.is_integer(freqstr[0]) and - com.is_integer(freqstr[1])): + if (is_integer(freqstr[0]) and + is_integer(freqstr[1])): # e.g., freqstr = (2000, 1) return freqstr else: @@ -265,13 +270,13 @@ def get_freq_code(freqstr): code = _period_str_to_code(freqstr[0]) stride = freqstr[1] except: - if com.is_integer(freqstr[1]): + if is_integer(freqstr[1]): raise code = _period_str_to_code(freqstr[1]) stride = freqstr[0] return code, stride - if com.is_integer(freqstr): + if is_integer(freqstr): return (freqstr, 1) 
base, stride = _base_and_stride(freqstr) @@ -843,16 +848,16 @@ def infer_freq(index, warn=True): """ import pandas as pd - if isinstance(index, com.ABCSeries): + if isinstance(index, ABCSeries): values = index._values - if not (com.is_datetime64_dtype(values) or - com.is_timedelta64_dtype(values) or + if not (is_datetime64_dtype(values) or + is_timedelta64_dtype(values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible " "dtype on a Series of {0}".format(index.dtype)) index = values - if com.is_period_arraylike(index): + if is_period_arraylike(index): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif isinstance(index, pd.TimedeltaIndex): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 9b36bc5907066..47bb69b8d7ad6 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,13 +6,25 @@ from datetime import timedelta import numpy as np from pandas.core.base import _shared_docs -from pandas.core.common import (_INT64_DTYPE, _NS_DTYPE, _maybe_box, - _values_from_object, ABCSeries, - DatetimeTZDtype, PerformanceWarning, - is_datetimetz, is_datetime64_dtype, - is_datetime64_ns_dtype, is_dtype_equal, - is_float, is_integer, is_integer_dtype, - is_object_dtype, is_string_dtype) + +from pandas.types.common import (_NS_DTYPE, _INT64_DTYPE, + is_object_dtype, is_datetime64_dtype, + is_datetimetz, is_dtype_equal, + is_integer, is_float, + is_integer_dtype, + is_datetime64_ns_dtype, + is_bool_dtype, + is_string_dtype, + is_list_like, + is_scalar, + _ensure_int64) +from pandas.types.generic import ABCSeries +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.missing import isnull + +import pandas.types.concat as _concat +from pandas.core.common import (_values_from_object, _maybe_box, + PerformanceWarning) from pandas.core.index import Index, Int64Index, Float64Index from pandas.indexes.base import _index_shared_docs @@ -27,7 +39,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) import pandas.core.common as com -import pandas.types.concat as _concat import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -87,7 +98,7 @@ def wrapper(self, other): isinstance(other, compat.string_types)): other = _to_m8(other, tz=self.tz) result = func(other) - if com.isnull(other): + if isnull(other): result.fill(nat_result) else: if isinstance(other, list): @@ -109,7 +120,7 @@ def wrapper(self, other): result[self._isnan] = nat_result # support of bool dtype indexers - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return Index(result) @@ -277,7 +288,7 @@ def __new__(cls, data=None, ambiguous=ambiguous) if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if lib.isscalar(data): + if is_scalar(data): raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -537,7 +548,7 @@ def _generate(cls, start, end, periods, name, offset, index = _generate_regular_range(start, end, periods, offset) if tz is not None and getattr(index, 'tz', None) is None: - index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz, + index = tslib.tz_localize_to_utc(_ensure_int64(index), tz, ambiguous=ambiguous) index = index.view(_NS_DTYPE) @@ -601,7 +612,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, return cls(values, name=name, freq=freq, tz=tz, dtype=dtype, **kwargs).values elif not is_datetime64_dtype(values): - values = 
com._ensure_int64(values).view(_NS_DTYPE) + values = _ensure_int64(values).view(_NS_DTYPE) result = object.__new__(cls) result._data = values @@ -1683,7 +1694,7 @@ def inferred_type(self): def dtype(self): if self.tz is None: return _NS_DTYPE - return com.DatetimeTZDtype('ns', self.tz) + return DatetimeTZDtype('ns', self.tz) @property def is_all_dates(self): @@ -1787,9 +1798,9 @@ def delete(self, loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: - if com.is_list_like(loc): + if is_list_like(loc): loc = lib.maybe_indices_to_slice( - com._ensure_int64(np.array(loc)), len(self)) + _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index d0b1fd746d0d5..f12ba8083f545 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -3,9 +3,9 @@ from pandas import compat import numpy as np +from pandas.types.generic import ABCSeries, ABCDatetimeIndex, ABCPeriod from pandas.tseries.tools import to_datetime, normalize_date -from pandas.core.common import (ABCSeries, ABCDatetimeIndex, ABCPeriod, - AbstractMethodError) +from pandas.core.common import AbstractMethodError # import after tools, dateutil check from dateutil.relativedelta import relativedelta, weekday diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 750e7a5553ef6..45f634050a5d8 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,6 +1,24 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta import numpy as np + + +from pandas.core import common as com +from pandas.types.common import (is_integer, + is_float, + is_object_dtype, + is_integer_dtype, + is_float_dtype, + is_scalar, + is_timedelta64_dtype, + is_bool_dtype, + _ensure_int64, + _ensure_object) + +from pandas.types.generic import ABCSeries +from pandas.types.missing import isnull + + import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries.index import DatetimeIndex, Int64Index, Index @@ -17,15 +35,10 @@ from pandas.core.base import _shared_docs from pandas.indexes.base import _index_shared_docs -import pandas.core.common as com -from pandas.core.common import ( - _maybe_box, _values_from_object, ABCSeries, is_float, is_integer, - is_integer_dtype, is_object_dtype, isnull) from pandas import compat from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution from pandas.lib import Timedelta -import pandas.lib as lib import pandas.tslib as tslib import pandas.core.missing as missing from pandas.compat import zip, u @@ -209,7 +222,7 @@ def _generate_range(cls, start, end, periods, freq, fields): def _from_arraylike(cls, data, freq, tz): if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): - if lib.isscalar(data) or isinstance(data, Period): + if is_scalar(data) or isinstance(data, Period): raise ValueError('PeriodIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -219,13 +232,13 @@ def _from_arraylike(cls, data, freq, tz): data = list(data) try: - data = com._ensure_int64(data) + data = _ensure_int64(data) if freq is None: raise ValueError('freq not specified') data = np.array([Period(x, freq=freq).ordinal for x in data], dtype=np.int64) except (TypeError, ValueError): - data = com._ensure_object(data) 
+ data = _ensure_object(data) if freq is None: freq = period.extract_freq(data) @@ -242,7 +255,7 @@ def _from_arraylike(cls, data, freq, tz): base1, base2, 1) else: - if freq is None and com.is_object_dtype(data): + if freq is None and is_object_dtype(data): # must contain Period instance and thus extract ordinals freq = period.extract_freq(data) data = period.extract_ordinals(data, freq) @@ -256,9 +269,9 @@ def _from_arraylike(cls, data, freq, tz): data = dt64arr_to_periodarr(data, freq, tz) else: try: - data = com._ensure_int64(data) + data = _ensure_int64(data) except (TypeError, ValueError): - data = com._ensure_object(data) + data = _ensure_object(data) data = period.extract_ordinals(data, freq) return data, freq @@ -266,9 +279,9 @@ def _from_arraylike(cls, data, freq, tz): @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): - if not com.is_integer_dtype(values): + if not is_integer_dtype(values): values = np.array(values, copy=False) - if (len(values) > 0 and com.is_float_dtype(values)): + if (len(values) > 0 and is_float_dtype(values)): raise TypeError("PeriodIndex can't take floats") else: return PeriodIndex(values, name=name, freq=freq, **kwargs) @@ -339,7 +352,7 @@ def __array_wrap__(self, result, context=None): # from here because numpy catches. raise ValueError(msg.format(func.__name__)) - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return PeriodIndex(result, freq=self.freq, name=self.name) @@ -580,9 +593,9 @@ def _maybe_convert_timedelta(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): - if com.is_integer_dtype(other): + if is_integer_dtype(other): return other - elif com.is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -657,10 +670,11 @@ def get_value(self, series, key): Fast lookup of value from 1-dimensional ndarray. 
Only use this if you know what you're doing """ - s = _values_from_object(series) + s = com._values_from_object(series) try: - return _maybe_box(self, super(PeriodIndex, self).get_value(s, key), - series, key) + return com._maybe_box(self, + super(PeriodIndex, self).get_value(s, key), + series, key) except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) @@ -683,16 +697,16 @@ def get_value(self, series, key): return series[key] elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal - return _maybe_box(self, self._engine.get_value(s, key), - series, key) + return com._maybe_box(self, self._engine.get_value(s, key), + series, key) else: raise KeyError(key) except TypeError: pass key = Period(key, self.freq).ordinal - return _maybe_box(self, self._engine.get_value(s, key), - series, key) + return com._maybe_box(self, self._engine.get_value(s, key), + series, key) def get_indexer(self, target, method=None, limit=None, tolerance=None): if hasattr(target, 'freq') and target.freq != self.freq: @@ -849,7 +863,7 @@ def _apply_meta(self, rawarr): def __getitem__(self, key): getitem = self._data.__getitem__ - if lib.isscalar(key): + if is_scalar(key): val = getitem(key) return Period(ordinal=val, freq=self.freq) else: diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index dbc0078b67ae7..f9fb51ebf710c 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -2,11 +2,20 @@ from datetime import timedelta import numpy as np -from pandas.core.common import (ABCSeries, _TD_DTYPE, _maybe_box, - _values_from_object, isnull, - is_integer, is_float, is_integer_dtype, - is_object_dtype, is_timedelta64_dtype, - is_timedelta64_ns_dtype) +from pandas.types.common import (_TD_DTYPE, + is_integer, is_float, + is_bool_dtype, + is_list_like, + is_scalar, + is_integer_dtype, + is_object_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + _ensure_int64) +from pandas.types.missing import isnull +from pandas.types.generic import ABCSeries +from pandas.core.common import _maybe_box, _values_from_object + from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u @@ -44,10 +53,10 @@ def wrapper(self, other): # failed to parse as timedelta raise TypeError(msg.format(type(other))) result = func(other) - if com.isnull(other): + if isnull(other): result.fill(nat_result) else: - if not com.is_list_like(other): + if not is_list_like(other): raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values @@ -66,7 +75,7 @@ def wrapper(self, other): result[self._isnan] = nat_result # support of bool dtype indexers - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return Index(result) @@ -175,7 +184,7 @@ def __new__(cls, data=None, unit=None, data = to_timedelta(data, unit=unit, box=False) if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if lib.isscalar(data): + if is_scalar(data): raise ValueError('TimedeltaIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -261,7 +270,7 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): if values.dtype == np.object_: values = tslib.array_to_timedelta64(values) if values.dtype != _TD_DTYPE: - values = com._ensure_int64(values).view(_TD_DTYPE) + values = _ensure_int64(values).view(_TD_DTYPE) result = object.__new__(cls) result._data = values @@ -905,9 +914,9 @@ def delete(self, loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: - if com.is_list_like(loc): + if 
is_list_like(loc): loc = lib.maybe_indices_to_slice( - com._ensure_int64(np.array(loc)), len(self)) + _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tseries/tests/test_bin_groupby.py index 6b6c468b7c391..08c0833be0cd6 100644 --- a/pandas/tseries/tests/test_bin_groupby.py +++ b/pandas/tseries/tests/test_bin_groupby.py @@ -3,12 +3,12 @@ from numpy import nan import numpy as np +from pandas.types.common import _ensure_int64 from pandas import Index, isnull from pandas.util.testing import assert_almost_equal import pandas.util.testing as tm import pandas.lib as lib import pandas.algos as algos -from pandas.core import common as com def test_series_grouper(): @@ -90,8 +90,8 @@ def _check(dtype): bins = np.array([6, 12, 20]) out = np.zeros((3, 4), dtype) counts = np.zeros(len(out), dtype=np.int64) - labels = com._ensure_int64(np.repeat(np.arange(3), - np.diff(np.r_[0, bins]))) + labels = _ensure_int64(np.repeat(np.arange(3), + np.diff(np.r_[0, bins]))) func = getattr(algos, 'group_ohlc_%s' % dtype) func(out, counts, obj[:, None], labels) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 807fb86b1b4da..591fa19aad585 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -4326,10 +4326,10 @@ def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], dtype='period[D]') val = series[3] - self.assertTrue(com.isnull(val)) + self.assertTrue(isnull(val)) series[2] = val - self.assertTrue(com.isnull(series[2])) + self.assertTrue(isnull(series[2])) def test_NaT_cast(self): result = Series([np.nan]).astype('period[D]') diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 2236d20975eee..518f69485004c 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -11,10 +11,11 @@ import pandas.util.testing as tm from pandas import (Series, DataFrame, Panel, Index, isnull, notnull, Timestamp) + +from pandas.types.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict from pandas.core.base import SpecificationError -from pandas.core.common import (ABCSeries, ABCDataFrame, - UnsupportedFunctionCall) +from pandas.core.common import UnsupportedFunctionCall from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS from pandas.tseries.frequencies import to_offset diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index e594d31e57296..299ec374567e7 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -12,6 +12,7 @@ import pandas.lib as lib import pandas.tslib as tslib +from pandas.types.common import is_datetime64_ns_dtype import pandas as pd import pandas.compat as compat import pandas.core.common as com @@ -2282,7 +2283,7 @@ def test_to_datetime_tz_psycopg2(self): i = pd.DatetimeIndex([ '2000-01-01 08:00:00+00:00' ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) - self.assertFalse(com.is_datetime64_ns_dtype(i)) + self.assertFalse(is_datetime64_ns_dtype(i)) # tz coerceion result = pd.to_datetime(i, errors='coerce') diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 71a041d5139a2..470aafafec547 100644 --- 
a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -5,6 +5,7 @@ import numpy as np import pytz +from pandas.types.dtypes import DatetimeTZDtype from pandas import (Index, Series, DataFrame, isnull, Timestamp) from pandas import DatetimeIndex, to_datetime, NaT @@ -17,7 +18,6 @@ from pytz import NonExistentTimeError import pandas.util.testing as tm -from pandas.types.api import DatetimeTZDtype from pandas.util.testing import assert_frame_equal, set_timezone from pandas.compat import lrange, zip diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 5a28218500858..7f28ec86ec40d 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -4,9 +4,11 @@ import numpy as np import pandas.tslib as tslib -from pandas.core.common import (ABCSeries, is_integer_dtype, - is_timedelta64_dtype, is_list_like, - _ensure_object, ABCIndexClass) +from pandas.types.common import (_ensure_object, + is_integer_dtype, + is_timedelta64_dtype, + is_list_like) +from pandas.types.generic import ABCSeries, ABCIndexClass from pandas.util.decorators import deprecate_kwarg diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index efb8590dfccf4..067e8ec19f644 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -4,8 +4,17 @@ import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.common as com -from pandas.core.common import ABCIndexClass, ABCSeries, ABCDataFrame + +from pandas.types.common import (_ensure_object, + is_datetime64_ns_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_list_like) +from pandas.types.generic import (ABCIndexClass, ABCSeries, + ABCDataFrame) +from pandas.types.missing import notnull + import pandas.compat as compat from pandas.util.decorators import deprecate_kwarg @@ -161,7 +170,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element - non_nan_elements = com.notnull(arr).nonzero()[0] + non_nan_elements = notnull(arr).nonzero()[0] if len(non_nan_elements): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) @@ -307,7 +316,7 @@ def _convert_listlike(arg, box, format, name=None): arg = np.array(arg, dtype='O') # these are shortcutable - if com.is_datetime64_ns_dtype(arg): + if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, @@ -317,7 +326,7 @@ def _convert_listlike(arg, box, format, name=None): return arg - elif com.is_datetime64tz_dtype(arg): + elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz='utc' if utc else None) if utc: @@ -342,7 +351,7 @@ def _convert_listlike(arg, box, format, name=None): raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') - arg = com._ensure_object(arg) + arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: @@ -399,7 +408,7 @@ def _convert_listlike(arg, box, format, name=None): require_iso8601=require_iso8601 ) - if com.is_datetime64_dtype(result) and box: + if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) @@ -424,7 +433,7 @@ def _convert_listlike(arg, box, format, name=None): return _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, 
name=arg.name) - elif com.is_list_like(arg): + elif is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0] @@ -511,7 +520,7 @@ def coerce(values): values = to_numeric(values, errors=errors) # prevent overflow in case of int8 or int16 - if com.is_integer_dtype(values): + if is_integer_dtype(values): values = values.astype('int64', copy=False) return values @@ -574,7 +583,7 @@ def calc_with_mask(carg, mask): # a float with actual np.nan try: carg = arg.astype(np.float64) - return calc_with_mask(carg, com.notnull(carg)) + return calc_with_mask(carg, notnull(carg)) except: pass @@ -654,7 +663,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): def _guess_time_format_for_array(arr): # Try to guess the format based on the first non-NaN element - non_nan_elements = com.notnull(arr).nonzero()[0] + non_nan_elements = notnull(arr).nonzero()[0] if len(non_nan_elements): element = arr[non_nan_elements[0]] for time_format in _time_formats: @@ -705,7 +714,7 @@ def _convert_listlike(arg, format): raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') - arg = com._ensure_object(arg) + arg = _ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) @@ -762,7 +771,7 @@ def _convert_listlike(arg, format): return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) - elif com.is_list_like(arg): + elif is_list_like(arg): return _convert_listlike(arg, format) return _convert_listlike(np.array([arg]), format)[0] diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 7e314657cb25c..98a93d22b09a6 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -1,6 +1,6 @@ from pandas.compat import lrange import numpy as np -import pandas.core.common as com +from pandas.types.common import _ensure_platform_int from pandas.core.frame import DataFrame import pandas.core.nanops as nanops @@ -69,7 +69,7 @@ def pivot_annual(series, freq=None): raise NotImplementedError(freq) flat_index = (year - years.min()) * width + offset - flat_index = com._ensure_platform_int(flat_index) + flat_index = _ensure_platform_int(flat_index) values = np.empty((len(years), width)) values.fill(np.nan) diff --git a/pandas/types/api.py b/pandas/types/api.py index 721d8d29bba8b..2d68e041f632e 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -1,75 +1,54 @@ # flake8: noqa import numpy as np -from pandas.compat import string_types -from .dtypes import (CategoricalDtype, CategoricalDtypeType, - DatetimeTZDtype, DatetimeTZDtypeType) -from .generic import (ABCIndex, ABCInt64Index, ABCRangeIndex, - ABCFloat64Index, ABCMultiIndex, - ABCDatetimeIndex, - ABCTimedeltaIndex, ABCPeriodIndex, - ABCCategoricalIndex, - ABCIndexClass, - ABCSeries, ABCDataFrame, ABCPanel, - ABCSparseSeries, ABCSparseArray, - ABCCategorical, ABCPeriod, - ABCGeneric) - -def pandas_dtype(dtype): - """ - Converts input into a pandas only dtype object or a numpy dtype object. 
- - Parameters - ---------- - dtype : object to be converted - - Returns - ------- - np.dtype or a pandas dtype - """ - if isinstance(dtype, DatetimeTZDtype): - return dtype - elif isinstance(dtype, CategoricalDtype): - return dtype - elif isinstance(dtype, string_types): - try: - return DatetimeTZDtype.construct_from_string(dtype) - except TypeError: - pass - - try: - return CategoricalDtype.construct_from_string(dtype) - except TypeError: - pass - - return np.dtype(dtype) - -def na_value_for_dtype(dtype): - """ - Return a dtype compat na value - - Parameters - ---------- - dtype : string / dtype - - Returns - ------- - dtype compat na value - """ - - from pandas.core import common as com - from pandas import NaT - dtype = pandas_dtype(dtype) - - if (com.is_datetime64_dtype(dtype) or - com.is_datetime64tz_dtype(dtype) or - com.is_timedelta64_dtype(dtype)): - return NaT - elif com.is_float_dtype(dtype): - return np.nan - elif com.is_integer_dtype(dtype): - return 0 - elif com.is_bool_dtype(dtype): - return False - return np.nan +from .common import (pandas_dtype, + is_dtype_equal, + is_extension_type, + + # categorical + is_categorical, + is_categorical_dtype, + + # datetimelike + is_datetimetz, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime64_any_dtype, + is_datetime64_ns_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + + # string-like + is_string_dtype, + is_object_dtype, + + # sparse + is_sparse, + + # numeric types + is_scalar, + is_sparse, + is_bool, + is_integer, + is_float, + is_complex, + is_number, + is_any_int_dtype, + is_integer_dtype, + is_int64_dtype, + is_numeric_dtype, + is_float_dtype, + is_floating_dtype, + is_bool_dtype, + is_complex_dtype, + + # like + is_re, + is_re_compilable, + is_dict_like, + is_iterator, + is_list_like, + is_hashable, + is_named_tuple, + is_sequence) diff --git a/pandas/types/cast.py b/pandas/types/cast.py new file mode 100644 index 0000000000000..e55cb91d36430 --- /dev/null +++ b/pandas/types/cast.py @@ -0,0 +1,860 @@ +""" routings for casting """ + +from datetime import datetime, timedelta +import numpy as np +from pandas import lib, tslib +from pandas.tslib import iNaT +from pandas.compat import string_types, text_type, PY3 +from .common import (_ensure_object, is_bool, is_integer, is_float, + is_complex, is_datetimetz, is_categorical_dtype, + is_extension_type, is_object_dtype, + is_datetime64tz_dtype, is_datetime64_dtype, + is_timedelta64_dtype, is_dtype_equal, + is_float_dtype, is_complex_dtype, + is_integer_dtype, is_datetime_or_timedelta_dtype, + is_scalar, + _string_dtypes, + _coerce_to_dtype, + _ensure_int8, _ensure_int16, + _ensure_int32, _ensure_int64, + _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, + _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) +from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries +from .missing import isnull, notnull +from .inference import is_list_like + +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max +_int64_max = np.iinfo(np.int64).max + + +def _possibly_convert_platform(values): + """ try to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list, tuple)): + values = lib.list_to_object_array(values) + if getattr(values, 'dtype', None) == np.object_: + if hasattr(values, '_values'): + values = values._values + values = lib.maybe_convert_objects(values) + + return values + + +def _possibly_downcast_to_dtype(result, dtype): + """ try to cast to the specified dtype (e.g. 
convert back to bool/int + or could be an astype of float64->float32 + """ + + if is_scalar(result): + return result + + def trans(x): + return x + + if isinstance(dtype, string_types): + if dtype == 'infer': + inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) + if inferred_type == 'boolean': + dtype = 'bool' + elif inferred_type == 'integer': + dtype = 'int64' + elif inferred_type == 'datetime64': + dtype = 'datetime64[ns]' + elif inferred_type == 'timedelta64': + dtype = 'timedelta64[ns]' + + # try to upcast here + elif inferred_type == 'floating': + dtype = 'int64' + if issubclass(result.dtype.type, np.number): + + def trans(x): # noqa + return x.round() + else: + dtype = 'object' + + if isinstance(dtype, string_types): + dtype = np.dtype(dtype) + + try: + + # don't allow upcasts here (except if empty) + if dtype.kind == result.dtype.kind: + if (result.dtype.itemsize <= dtype.itemsize and + np.prod(result.shape)): + return result + + if issubclass(dtype.type, np.floating): + return result.astype(dtype) + elif dtype == np.bool_ or issubclass(dtype.type, np.integer): + + # if we don't have any elements, just astype it + if not np.prod(result.shape): + return trans(result).astype(dtype) + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([r[0]]) + + # if we have any nulls, then we are done + if isnull(arr).any() or not np.allclose(arr, + trans(arr).astype(dtype)): + return result + + # a comparable, e.g. a Decimal may slip in here + elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, + float, bool)): + return result + + if (issubclass(result.dtype.type, (np.object_, np.number)) and + notnull(result).all()): + new_result = trans(result).astype(dtype) + try: + if np.allclose(new_result, result): + return new_result + except: + + # comparison of an object dtype with a number type could + # hit here + if (new_result == result).all(): + return new_result + + # a datetimelike + elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: + try: + result = result.astype(dtype) + except: + if dtype.tz: + # convert to datetime and change timezone + from pandas import to_datetime + result = to_datetime(result).tz_localize(dtype.tz) + + except: + pass + + return result + + +def _maybe_upcast_putmask(result, mask, other): + """ + A safe version of putmask that potentially upcasts the result + + Parameters + ---------- + result : ndarray + The destination array. This will be mutated in-place if no upcasting is + necessary. 
+ mask : boolean ndarray + other : ndarray or scalar + The source array or value + + Returns + ------- + result : ndarray + changed : boolean + Set to true if the result array was upcasted + """ + + if mask.any(): + # Two conversions for date-like dtypes that can't be done automatically + # in np.place: + # NaN -> NaT + # integer or integer array -> date-like array + if result.dtype in _DATELIKE_DTYPES: + if is_scalar(other): + if isnull(other): + other = result.dtype.type('nat') + elif is_integer(other): + other = np.array(other, dtype=result.dtype) + elif is_integer_dtype(other): + other = np.array(other, dtype=result.dtype) + + def changeit(): + + # try to directly set by expanding our array to full + # length of the boolean + try: + om = other[mask] + om_at = om.astype(result.dtype) + if (om == om_at).all(): + new_result = result.values.copy() + new_result[mask] = om_at + result[:] = new_result + return result, False + except: + pass + + # we are forced to change the dtype of the result as the input + # isn't compatible + r, _ = _maybe_upcast(result, fill_value=other, copy=True) + np.place(r, mask, other) + + return r, True + + # we want to decide whether place will work + # if we have nans in the False portion of our mask then we need to + # upcast (possibly), otherwise we DON't want to upcast (e.g. if we + # have values, say integers, in the success portion then it's ok to not + # upcast) + new_dtype, _ = _maybe_promote(result.dtype, other) + if new_dtype != result.dtype: + + # we have a scalar or len 0 ndarray + # and its nan and we are changing some values + if (is_scalar(other) or + (isinstance(other, np.ndarray) and other.ndim < 1)): + if isnull(other): + return changeit() + + # we have an ndarray and the masking has nans in it + else: + + if isnull(other[mask]).any(): + return changeit() + + try: + np.place(result, mask, other) + except: + return changeit() + + return result, False + + +def _maybe_promote(dtype, fill_value=np.nan): + + # if we passed an array here, determine the fill value by dtype + if isinstance(fill_value, np.ndarray): + if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): + fill_value = iNaT + else: + + # we need to change to object type as our + # fill_value is of object type + if fill_value.dtype == np.object_: + dtype = np.dtype(np.object_) + fill_value = np.nan + + # returns tuple of (dtype, fill_value) + if issubclass(dtype.type, (np.datetime64, np.timedelta64)): + # for now: refuse to upcast datetime64 + # (this is because datetime64 will not implicitly upconvert + # to object correctly as of numpy 1.6.1) + if isnull(fill_value): + fill_value = iNaT + else: + if issubclass(dtype.type, np.datetime64): + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast + # to object (but numpy 1.6.1 doesn't do this properly) + fill_value = iNaT + elif issubclass(dtype.type, np.timedelta64): + try: + fill_value = lib.Timedelta(fill_value).value + except: + # as for datetimes, cannot upcast to object + fill_value = iNaT + else: + fill_value = iNaT + elif is_datetimetz(dtype): + if isnull(fill_value): + fill_value = iNaT + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, np.integer): + dtype = np.float64 + elif is_bool(fill_value): + if not issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, 
np.integer): + # upcast to prevent overflow + arr = np.asarray(fill_value) + if arr != arr.astype(dtype): + dtype = arr.dtype + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, (np.integer, np.floating)): + dtype = np.complex128 + elif fill_value is None: + if is_float_dtype(dtype) or is_complex_dtype(dtype): + fill_value = np.nan + elif is_integer_dtype(dtype): + dtype = np.float64 + fill_value = np.nan + elif is_datetime_or_timedelta_dtype(dtype): + fill_value = iNaT + else: + dtype = np.object_ + else: + dtype = np.object_ + + # in case we have a string that looked like a number + if is_categorical_dtype(dtype): + pass + elif is_datetimetz(dtype): + pass + elif issubclass(np.dtype(dtype).type, string_types): + dtype = np.object_ + + return dtype, fill_value + + +def _infer_dtype_from_scalar(val): + """ interpret the dtype from a scalar """ + + dtype = np.object_ + + # a 1-element ndarray + if isinstance(val, np.ndarray): + if val.ndim != 0: + raise ValueError( + "invalid ndarray passed to _infer_dtype_from_scalar") + + dtype = val.dtype + val = val.item() + + elif isinstance(val, string_types): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + + dtype = np.object_ + + elif isinstance(val, (np.datetime64, + datetime)) and getattr(val, 'tzinfo', None) is None: + val = lib.Timestamp(val).value + dtype = np.dtype('M8[ns]') + + elif isinstance(val, (np.timedelta64, timedelta)): + val = lib.Timedelta(val).value + dtype = np.dtype('m8[ns]') + + elif is_bool(val): + dtype = np.bool_ + + elif is_integer(val): + if isinstance(val, np.integer): + dtype = type(val) + else: + dtype = np.int64 + + elif is_float(val): + if isinstance(val, np.floating): + dtype = type(val) + else: + dtype = np.float64 + + elif is_complex(val): + dtype = np.complex_ + + return dtype, val + + +def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): + """ provide explict type promotion and coercion + + Parameters + ---------- + values : the ndarray that we want to maybe upcast + fill_value : what we want to fill with + dtype : if None, then use the dtype of the values, else coerce to this type + copy : if True always make a copy even if no upcast is required + """ + + if is_extension_type(values): + if copy: + values = values.copy() + else: + if dtype is None: + dtype = values.dtype + new_dtype, fill_value = _maybe_promote(dtype, fill_value) + if new_dtype != values.dtype: + values = values.astype(new_dtype) + elif copy: + values = values.copy() + + return values, fill_value + + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) + + +def _invalidate_string_dtypes(dtype_set): + """Change string like dtypes to object for + ``DataFrame.select_dtypes()``. + """ + non_string_dtypes = dtype_set - _string_dtypes + if non_string_dtypes != dtype_set: + raise TypeError("string dtypes are not allowed, use 'object' instead") + + +def _maybe_convert_string_to_object(values): + """ + + Convert string-like and string-like array to convert object dtype. 
+ This is to avoid numpy to handle the array as str dtype. + """ + if isinstance(values, string_types): + values = np.array([values], dtype=object) + elif (isinstance(values, np.ndarray) and + issubclass(values.dtype.type, (np.string_, np.unicode_))): + values = values.astype(object) + return values + + +def _maybe_convert_scalar(values): + """ + Convert a python scalar to the appropriate numpy dtype if possible + This avoids numpy directly converting according to platform preferences + """ + if is_scalar(values): + dtype, values = _infer_dtype_from_scalar(values) + try: + values = dtype(values) + except TypeError: + pass + return values + + +def _coerce_indexer_dtype(indexer, categories): + """ coerce the indexer input array to the smallest dtype possible """ + l = len(categories) + if l < _int8_max: + return _ensure_int8(indexer) + elif l < _int16_max: + return _ensure_int16(indexer) + elif l < _int32_max: + return _ensure_int32(indexer) + return _ensure_int64(indexer) + + +def _coerce_to_dtypes(result, dtypes): + """ + given a dtypes and a result set, coerce the result elements to the + dtypes + """ + if len(result) != len(dtypes): + raise AssertionError("_coerce_to_dtypes requires equal len arrays") + + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + def conv(r, dtype): + try: + if isnull(r): + pass + elif dtype == _NS_DTYPE: + r = lib.Timestamp(r) + elif dtype == _TD_DTYPE: + r = _coerce_scalar_to_timedelta_type(r) + elif dtype == np.bool_: + # messy. non 0/1 integers do not get converted. + if is_integer(r) and r not in [0, 1]: + return int(r) + r = bool(r) + elif dtype.kind == 'f': + r = float(r) + elif dtype.kind == 'i': + r = int(r) + except: + pass + + return r + + return [conv(r, dtype) for r, dtype in zip(result, dtypes)] + + +def _astype_nansafe(arr, dtype, copy=True): + """ return a view if copy is False, but + need to be very careful as the result shape could change! 
""" + if not isinstance(dtype, np.dtype): + dtype = _coerce_to_dtype(dtype) + + if issubclass(dtype.type, text_type): + # in Py3 that's str, in Py2 that's unicode + return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + elif issubclass(dtype.type, string_types): + return lib.astype_str(arr.ravel()).reshape(arr.shape) + elif is_datetime64_dtype(arr): + if dtype == object: + return tslib.ints_to_pydatetime(arr.view(np.int64)) + elif dtype == np.int64: + return arr.view(dtype) + elif dtype != _NS_DTYPE: + raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % + (arr.dtype, dtype)) + return arr.astype(_NS_DTYPE) + elif is_timedelta64_dtype(arr): + if dtype == np.int64: + return arr.view(dtype) + elif dtype == object: + return tslib.ints_to_pytimedelta(arr.view(np.int64)) + + # in py3, timedelta64[ns] are int64 + elif ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or + (not PY3 and dtype != _TD_DTYPE)): + + # allow frequency conversions + if dtype.kind == 'm': + mask = isnull(arr) + result = arr.astype(dtype).astype(np.float64) + result[mask] = np.nan + return result + + raise TypeError("cannot astype a timedelta from [%s] to [%s]" % + (arr.dtype, dtype)) + + return arr.astype(_TD_DTYPE) + elif (np.issubdtype(arr.dtype, np.floating) and + np.issubdtype(dtype, np.integer)): + + if np.isnan(arr).any(): + raise ValueError('Cannot convert NA to integer') + elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): + # work around NumPy brokenness, #1987 + return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) + + if copy: + return arr.astype(dtype) + return arr.view(dtype) + + +def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, + convert_timedeltas=True, copy=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + # if we have passed in a list or scalar + if isinstance(values, (list, tuple)): + values = np.array(values, dtype=np.object_) + if not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + + # convert dates + if convert_dates and values.dtype == np.object_: + + # we take an aggressive stance and convert to datetime64[ns] + if convert_dates == 'coerce': + new_values = _possibly_cast_to_datetime(values, 'M8[ns]', + errors='coerce') + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects(values, + convert_datetime=convert_dates) + + # convert timedeltas + if convert_timedeltas and values.dtype == np.object_: + + if convert_timedeltas == 'coerce': + from pandas.tseries.timedeltas import to_timedelta + new_values = to_timedelta(values, coerce=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects( + values, convert_timedelta=convert_timedeltas) + + # convert to numeric + if values.dtype == np.object_: + if convert_numeric: + try: + new_values = lib.maybe_convert_numeric(values, set(), + coerce_numeric=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + except: + pass + else: + # soft-conversion + values = lib.maybe_convert_objects(values) + + values = values.copy() if copy else values + + return values + + +def _soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, + coerce=False, copy=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + conversion_count = sum((datetime, 
numeric, timedelta)) + if conversion_count == 0: + raise ValueError('At least one of datetime, numeric or timedelta must ' + 'be True.') + elif conversion_count > 1 and coerce: + raise ValueError("Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True.") + + if isinstance(values, (list, tuple)): + # List or scalar + values = np.array(values, dtype=np.object_) + elif not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + elif not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + values = values.copy() if copy else values + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + # Immediate return if coerce + if datetime: + from pandas import to_datetime + return to_datetime(values, errors='coerce', box=False) + elif timedelta: + from pandas import to_timedelta + return to_timedelta(values, errors='coerce', box=False) + elif numeric: + from pandas import to_numeric + return to_numeric(values, errors='coerce') + + # Soft conversions + if datetime: + values = lib.maybe_convert_objects(values, convert_datetime=datetime) + + if timedelta and is_object_dtype(values.dtype): + # Object check to ensure only run if previous did not convert + values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) + + if numeric and is_object_dtype(values.dtype): + try: + converted = lib.maybe_convert_numeric(values, set(), + coerce_numeric=True) + # If all NaNs, then do not-alter + values = converted if not isnull(converted).all() else values + values = values.copy() if copy else values + except: + pass + + return values + + +def _possibly_castable(arr): + # return False to force a non-fastpath + + # check datetime64[ns]/timedelta64[ns] are valid + # otherwise try to coerce + kind = arr.dtype.kind + if kind == 'M' or kind == 'm': + return arr.dtype in _DATELIKE_DTYPES + + return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + + +def _possibly_infer_to_datetimelike(value, convert_dates=False): + """ + we might have a array (or single object) that is datetime like, + and no dtype is passed don't change the value unless we find a + datetime/timedelta set + + this is pretty strict in that a datetime/timedelta is REQUIRED + in addition to possible nulls/string likes + + ONLY strings are NOT datetimelike + + Parameters + ---------- + value : np.array / Series / Index / list-like + convert_dates : boolean, default False + if True try really hard to convert dates (such as datetime.date), other + leave inferred dtype 'date' alone + + """ + + if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex)): + return value + elif isinstance(value, ABCSeries): + if isinstance(value._values, ABCDatetimeIndex): + return value._values + + v = value + + if not is_list_like(v): + v = [v] + v = np.array(v, copy=False) + shape = v.shape + if not v.ndim == 1: + v = v.ravel() + + if len(v): + + def _try_datetime(v): + # safe coerce to datetime64 + try: + v = tslib.array_to_datetime(v, errors='raise') + except ValueError: + + # we might have a sequence of the same-datetimes with tz's + # if so coerce to a DatetimeIndex; if they are not the same, + # then these stay as object dtype + try: + from pandas import to_datetime + return to_datetime(v) + except: + pass + + except: + pass + + return v.reshape(shape) + + def _try_timedelta(v): + # safe coerce to timedelta64 + + # will try first with a string & object conversion + from pandas import to_timedelta + try: + return to_timedelta(v)._values.reshape(shape) + except: + return 
v + + # do a quick inference for perf + sample = v[:min(3, len(v))] + inferred_type = lib.infer_dtype(sample) + + if (inferred_type in ['datetime', 'datetime64'] or + (convert_dates and inferred_type in ['date'])): + value = _try_datetime(v) + elif inferred_type in ['timedelta', 'timedelta64']: + value = _try_timedelta(v) + + # It's possible to have nulls intermixed within the datetime or + # timedelta. These will in general have an inferred_type of 'mixed', + # so have to try both datetime and timedelta. + + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but technically is also a datetime + elif inferred_type in ['mixed']: + + if lib.is_possible_datetimelike_array(_ensure_object(v)): + value = _try_timedelta(v) + if lib.infer_dtype(value) in ['mixed']: + value = _try_datetime(v) + + return value + + +def _possibly_cast_to_datetime(value, dtype, errors='raise'): + """ try to cast the array/value to a datetimelike dtype, converting float + nan to iNaT + """ + from pandas.tseries.timedeltas import to_timedelta + from pandas.tseries.tools import to_datetime + + if dtype is not None: + if isinstance(dtype, string_types): + dtype = np.dtype(dtype) + + is_datetime64 = is_datetime64_dtype(dtype) + is_datetime64tz = is_datetime64tz_dtype(dtype) + is_timedelta64 = is_timedelta64_dtype(dtype) + + if is_datetime64 or is_datetime64tz or is_timedelta64: + + # force the dtype if needed + if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): + if dtype.name == 'datetime64[ns]': + dtype = _NS_DTYPE + else: + raise TypeError("cannot convert datetimelike to " + "dtype [%s]" % dtype) + elif is_datetime64tz: + + # our NaT doesn't support tz's + # this will coerce to DatetimeIndex with + # a matching dtype below + if is_scalar(value) and isnull(value): + value = [value] + + elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): + if dtype.name == 'timedelta64[ns]': + dtype = _TD_DTYPE + else: + raise TypeError("cannot convert timedeltalike to " + "dtype [%s]" % dtype) + + if is_scalar(value): + if value == tslib.iNaT or isnull(value): + value = tslib.iNaT + else: + value = np.array(value, copy=False) + + # have a scalar array-like (e.g. 
NaT) + if value.ndim == 0: + value = tslib.iNaT + + # we have an array of datetime or timedeltas & nulls + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, + dtype): + try: + if is_datetime64: + value = to_datetime(value, errors=errors)._values + elif is_datetime64tz: + # input has to be UTC at this point, so just + # localize + value = to_datetime( + value, + errors=errors).tz_localize(dtype.tz) + elif is_timedelta64: + value = to_timedelta(value, errors=errors)._values + except (AttributeError, ValueError, TypeError): + pass + + # coerce datetimelike to object + elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): + if is_object_dtype(dtype): + ints = np.asarray(value).view('i8') + return tslib.ints_to_pydatetime(ints) + + # we have a non-castable dtype that was passed + raise TypeError('Cannot cast datetime64 to %s' % dtype) + + else: + + is_array = isinstance(value, np.ndarray) + + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified + if is_array and value.dtype.kind in ['M', 'm']: + dtype = value.dtype + + if dtype.kind == 'M' and dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) + + elif dtype.kind == 'm' and dtype != _TD_DTYPE: + value = to_timedelta(value) + + # only do this if we have an array and the dtype of the array is not + # setup already we are not an integer/object, so don't bother with this + # conversion + elif not (is_array and not (issubclass(value.dtype.type, np.integer) or + value.dtype == np.object_)): + value = _possibly_infer_to_datetimelike(value) + + return value diff --git a/pandas/types/common.py b/pandas/types/common.py new file mode 100644 index 0000000000000..9d0ccaac843ef --- /dev/null +++ b/pandas/types/common.py @@ -0,0 +1,448 @@ +""" common type operations """ + +import numpy as np +from pandas.compat import string_types, text_type, binary_type +from pandas import lib, algos +from .dtypes import (CategoricalDtype, CategoricalDtypeType, + DatetimeTZDtype, DatetimeTZDtypeType, + ExtensionDtype) +from .generic import (ABCCategorical, ABCPeriodIndex, + ABCDatetimeIndex, ABCSeries, + ABCSparseArray, ABCSparseSeries) +from .inference import is_integer, is_string_like +from .inference import * # noqa + + +_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name + for t in ['O', 'int8', 'uint8', 'int16', 'uint16', + 'int32', 'uint32', 'int64', 'uint64']]) + +_NS_DTYPE = np.dtype('M8[ns]') +_TD_DTYPE = np.dtype('m8[ns]') +_INT64_DTYPE = np.dtype(np.int64) +_DATELIKE_DTYPES = set([np.dtype(t) + for t in ['M8[ns]', 'M8[ns]', + 'm8[ns]', 'm8[ns]']]) + +_ensure_float64 = algos.ensure_float64 +_ensure_float32 = algos.ensure_float32 + + +def _ensure_float(arr): + if issubclass(arr.dtype.type, (np.integer, np.bool_)): + arr = arr.astype(float) + return arr + +_ensure_int64 = algos.ensure_int64 +_ensure_int32 = algos.ensure_int32 +_ensure_int16 = algos.ensure_int16 +_ensure_int8 = algos.ensure_int8 +_ensure_platform_int = algos.ensure_platform_int +_ensure_object = algos.ensure_object + + +def is_object_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.object_) + + +def is_sparse(array): + """ return if we are a sparse array """ + return isinstance(array, (ABCSparseArray, ABCSparseSeries)) + + +def is_categorical(array): + """ return if we are a categorical possibility """ + return isinstance(array, ABCCategorical) or is_categorical_dtype(array) + + +def is_datetimetz(array): + """ return if we are a datetime with tz array """ + return ((isinstance(array, ABCDatetimeIndex) and + 
getattr(array, 'tz', None) is not None) or + is_datetime64tz_dtype(array)) + + +def is_datetime64_dtype(arr_or_dtype): + try: + tipo = _get_dtype_type(arr_or_dtype) + except TypeError: + return False + return issubclass(tipo, np.datetime64) + + +def is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtype.is_dtype(arr_or_dtype) + + +def is_timedelta64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.timedelta64) + + +def is_categorical_dtype(arr_or_dtype): + return CategoricalDtype.is_dtype(arr_or_dtype) + + +def is_string_dtype(arr_or_dtype): + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('O', 'S', 'U') + + +def is_period_arraylike(arr): + """ return if we are period arraylike / PeriodIndex """ + if isinstance(arr, ABCPeriodIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'period' + return getattr(arr, 'inferred_type', None) == 'period' + + +def is_datetime_arraylike(arr): + """ return if we are datetime arraylike / DatetimeIndex """ + if isinstance(arr, ABCDatetimeIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' + return getattr(arr, 'inferred_type', None) == 'datetime' + + +def is_datetimelike(arr): + return (arr.dtype in _DATELIKE_DTYPES or + isinstance(arr, ABCPeriodIndex) or + is_datetimetz(arr)) + + +def is_dtype_equal(source, target): + """ return a boolean if the dtypes are equal """ + try: + source = _get_dtype(source) + target = _get_dtype(target) + return source == target + except (TypeError, AttributeError): + + # invalid comparison + # object == category will hit this + return False + + +def is_any_int_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.integer) + + +def is_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + +def is_int_or_datetime_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) or + issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_datetime64_any_dtype(arr_or_dtype): + return (is_datetime64_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) + + +def is_datetime64_ns_dtype(arr_or_dtype): + try: + tipo = _get_dtype(arr_or_dtype) + except TypeError: + return False + return tipo == _NS_DTYPE + + +def is_timedelta64_ns_dtype(arr_or_dtype): + tipo = _get_dtype(arr_or_dtype) + return tipo == _TD_DTYPE + + +def is_datetime_or_timedelta_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, (np.datetime64, np.timedelta64)) + + +def is_numeric_v_string_like(a, b): + """ + numpy doesn't like to compare numeric arrays vs scalar string-likes + + return a boolean result if this is the case for a,b or b,a + + """ + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + is_a_numeric_array = is_a_array and is_numeric_dtype(a) + is_b_numeric_array = is_b_array and is_numeric_dtype(b) + is_a_string_array = is_a_array and is_string_like_dtype(a) + is_b_string_array = is_b_array and is_string_like_dtype(b) + + is_a_scalar_string_like = not is_a_array and is_string_like(a) + is_b_scalar_string_like = not is_b_array and is_string_like(b) + + return 
((is_a_numeric_array and is_b_scalar_string_like) or + (is_b_numeric_array and is_a_scalar_string_like) or + (is_a_numeric_array and is_b_string_array) or + (is_b_numeric_array and is_a_string_array)) + + +def is_datetimelike_v_numeric(a, b): + # return if we have an i8 convertible and numeric comparison + if not hasattr(a, 'dtype'): + a = np.asarray(a) + if not hasattr(b, 'dtype'): + b = np.asarray(b) + + def is_numeric(x): + return is_integer_dtype(x) or is_float_dtype(x) + + is_datetimelike = needs_i8_conversion + return ((is_datetimelike(a) and is_numeric(b)) or + (is_datetimelike(b) and is_numeric(a))) + + +def is_datetimelike_v_object(a, b): + # return if we have an i8 convertible and object comparsion + if not hasattr(a, 'dtype'): + a = np.asarray(a) + if not hasattr(b, 'dtype'): + b = np.asarray(b) + + def f(x): + return is_object_dtype(x) + + def is_object(x): + return is_integer_dtype(x) or is_float_dtype(x) + + is_datetimelike = needs_i8_conversion + return ((is_datetimelike(a) and is_object(b)) or + (is_datetimelike(b) and is_object(a))) + + +def needs_i8_conversion(arr_or_dtype): + return (is_datetime_or_timedelta_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) + + +def is_numeric_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, (np.number, np.bool_)) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_string_like_dtype(arr_or_dtype): + # exclude object as its a mixed dtype + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('S', 'U') + + +def is_float_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.floating) + + +def is_floating_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return isinstance(tipo, np.floating) + + +def is_bool_dtype(arr_or_dtype): + try: + tipo = _get_dtype_type(arr_or_dtype) + except ValueError: + # this isn't even a dtype + return False + return issubclass(tipo, np.bool_) + + +def is_extension_type(value): + """ + if we are a klass that is preserved by the internals + these are internal klasses that we represent (and don't use a np.array) + """ + if is_categorical(value): + return True + elif is_sparse(value): + return True + elif is_datetimetz(value): + return True + return False + + +def is_complex_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.complexfloating) + + +def _coerce_to_dtype(dtype): + """ coerce a string / np.dtype to a dtype """ + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + elif is_datetime64tz_dtype(dtype): + dtype = DatetimeTZDtype(dtype) + else: + dtype = np.dtype(dtype) + return dtype + + +def _get_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype) + elif isinstance(arr_or_dtype, CategoricalDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, DatetimeTZDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtype.construct_from_string(arr_or_dtype) + elif is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtype.construct_from_string(arr_or_dtype) + + if hasattr(arr_or_dtype, 'dtype'): + arr_or_dtype = arr_or_dtype.dtype + return np.dtype(arr_or_dtype) + + +def _get_dtype_type(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype.type + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype).type + elif 
isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtypeType + elif isinstance(arr_or_dtype, DatetimeTZDtype): + return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtypeType + elif is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtypeType + return _get_dtype_type(np.dtype(arr_or_dtype)) + try: + return arr_or_dtype.dtype.type + except AttributeError: + return type(None) + + +def _get_dtype_from_object(dtype): + """Get a numpy dtype.type-style object. This handles the datetime64[ns] + and datetime64[ns, TZ] compat + + Notes + ----- + If nothing can be found, returns ``object``. + """ + + # type object from a dtype + if isinstance(dtype, type) and issubclass(dtype, np.generic): + return dtype + elif is_categorical(dtype): + return CategoricalDtype().type + elif is_datetimetz(dtype): + return DatetimeTZDtype(dtype).type + elif isinstance(dtype, np.dtype): # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # should still pass if we don't have a datelike + pass + return dtype.type + elif isinstance(dtype, string_types): + if dtype == 'datetime' or dtype == 'timedelta': + dtype += '64' + + try: + return _get_dtype_from_object(getattr(np, dtype)) + except (AttributeError, TypeError): + # handles cases like _get_dtype(int) + # i.e., python objects that are valid dtypes (unlike user-defined + # types, in general) + # TypeError handles the float16 typecode of 'e' + # further handle internal types + pass + + return _get_dtype_from_object(np.dtype(dtype)) + + +def _validate_date_like_dtype(dtype): + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError('%s' % e) + if typ != 'generic' and typ != 'ns': + raise ValueError('%r is too specific of a frequency, try passing %r' % + (dtype.name, dtype.type.__name__)) + + +def _lcd_dtypes(a_dtype, b_dtype): + """ return the lcd dtype to hold these types """ + + if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): + return _NS_DTYPE + elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): + return _TD_DTYPE + elif is_complex_dtype(a_dtype): + if is_complex_dtype(b_dtype): + return a_dtype + return np.float64 + elif is_integer_dtype(a_dtype): + if is_integer_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + return np.int64 + return np.float64 + elif is_float_dtype(a_dtype): + if is_float_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + else: + return np.float64 + elif is_integer(b_dtype): + return np.float64 + return np.object + +_string_dtypes = frozenset(map(_get_dtype_from_object, (binary_type, + text_type))) + + +def pandas_dtype(dtype): + """ + Converts input into a pandas only dtype object or a numpy dtype object. 
+ + Parameters + ---------- + dtype : object to be converted + + Returns + ------- + np.dtype or a pandas dtype + """ + if isinstance(dtype, DatetimeTZDtype): + return dtype + elif isinstance(dtype, CategoricalDtype): + return dtype + elif isinstance(dtype, string_types): + try: + return DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + pass + + try: + return CategoricalDtype.construct_from_string(dtype) + except TypeError: + pass + elif isinstance(dtype, ExtensionDtype): + return dtype + + return np.dtype(dtype) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 44338f26eb2e8..3b30531fb30ac 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -3,10 +3,19 @@ """ import numpy as np -import pandas.core.common as com import pandas.tslib as tslib from pandas import compat from pandas.compat import map +from .common import (is_categorical_dtype, + is_sparse, + is_datetimetz, + is_datetime64_dtype, + is_timedelta64_dtype, + is_object_dtype, + is_bool_dtype, + is_dtype_equal, + _NS_DTYPE, + _TD_DTYPE) def get_dtype_kinds(l): @@ -24,19 +33,19 @@ def get_dtype_kinds(l): for arr in l: dtype = arr.dtype - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): typ = 'category' - elif com.is_sparse(arr): + elif is_sparse(arr): typ = 'sparse' - elif com.is_datetimetz(arr): + elif is_datetimetz(arr): typ = 'datetimetz' - elif com.is_datetime64_dtype(dtype): + elif is_datetime64_dtype(dtype): typ = 'datetime' - elif com.is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(dtype): typ = 'timedelta' - elif com.is_object_dtype(dtype): + elif is_object_dtype(dtype): typ = 'object' - elif com.is_bool_dtype(dtype): + elif is_bool_dtype(dtype): typ = 'bool' else: typ = dtype.kind @@ -51,14 +60,14 @@ def _get_series_result_type(result): """ if isinstance(result, dict): # concat Series with axis 1 - if all(com.is_sparse(c) for c in compat.itervalues(result)): + if all(is_sparse(c) for c in compat.itervalues(result)): from pandas.sparse.api import SparseDataFrame return SparseDataFrame else: from pandas.core.frame import DataFrame return DataFrame - elif com.is_sparse(result): + elif is_sparse(result): # concat Series with axis 1 from pandas.sparse.api import SparseSeries return SparseSeries @@ -165,7 +174,7 @@ def _concat_categorical(to_concat, axis=0): def convert_categorical(x): # coerce to object dtype - if com.is_categorical_dtype(x.dtype): + if is_categorical_dtype(x.dtype): return x.get_values() return x.ravel() @@ -177,7 +186,7 @@ def convert_categorical(x): # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything # else its a non-compat categorical - categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)] + categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] # validate the categories categories = categoricals[0] @@ -235,7 +244,7 @@ def union_categoricals(to_union): if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") - if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) + if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") @@ -272,7 +281,7 @@ def convert_to_pydatetime(x, axis): # coerce to an object dtype # if dtype is of datetimetz or timezone - if x.dtype.kind == com._NS_DTYPE.kind: + if x.dtype.kind == _NS_DTYPE.kind: if getattr(x, 'tz', None) is not None: x = x.asobject.values else: @@ 
-280,7 +289,7 @@ def convert_to_pydatetime(x, axis): x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) x = x.reshape(shape) - elif x.dtype == com._TD_DTYPE: + elif x.dtype == _TD_DTYPE: shape = x.shape x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) x = x.reshape(shape) @@ -310,12 +319,12 @@ def convert_to_pydatetime(x, axis): elif 'datetime' in typs: new_values = np.concatenate([x.view(np.int64) for x in to_concat], axis=axis) - return new_values.view(com._NS_DTYPE) + return new_values.view(_NS_DTYPE) elif 'timedelta' in typs: new_values = np.concatenate([x.view(np.int64) for x in to_concat], axis=axis) - return new_values.view(com._TD_DTYPE) + return new_values.view(_TD_DTYPE) # need to coerce to object to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] @@ -350,7 +359,7 @@ def convert_sparse(x, axis): return x if typs is None: - typs = com.get_dtype_kinds(to_concat) + typs = get_dtype_kinds(to_concat) if len(typs) == 1: # concat input as it is if all inputs are sparse @@ -374,7 +383,7 @@ def convert_sparse(x, axis): # input may be sparse / dense mixed and may have different fill_value # input must contain sparse at least 1 - sparses = [c for c in to_concat if com.is_sparse(c)] + sparses = [c for c in to_concat if is_sparse(c)] fill_values = [c.fill_value for c in sparses] sp_indexes = [c.sp_index for c in sparses] diff --git a/pandas/types/inference.py b/pandas/types/inference.py new file mode 100644 index 0000000000000..35a2dc2fb831b --- /dev/null +++ b/pandas/types/inference.py @@ -0,0 +1,104 @@ +""" basic inference routines """ + +import collections +import re +import numpy as np +from numbers import Number +from pandas.compat import (string_types, text_type, + string_and_binary_types) +from pandas import lib + +is_bool = lib.is_bool + +is_integer = lib.is_integer + +is_float = lib.is_float + +is_complex = lib.is_complex + +is_scalar = lib.isscalar + + +def is_number(obj): + return isinstance(obj, (Number, np.number)) + + +def is_string_like(obj): + return isinstance(obj, (text_type, string_types)) + + +def _iterable_not_string(x): + return (isinstance(x, collections.Iterable) and + not isinstance(x, string_types)) + + +def is_iterator(obj): + # python 3 generators have __next__ instead of next + return hasattr(obj, 'next') or hasattr(obj, '__next__') + + +def is_re(obj): + return isinstance(obj, re._pattern_type) + + +def is_re_compilable(obj): + try: + re.compile(obj) + except TypeError: + return False + else: + return True + + +def is_list_like(arg): + return (hasattr(arg, '__iter__') and + not isinstance(arg, string_and_binary_types)) + + +def is_dict_like(arg): + return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') + + +def is_named_tuple(arg): + return isinstance(arg, tuple) and hasattr(arg, '_fields') + + +def is_hashable(arg): + """Return True if hash(arg) will succeed, False otherwise. + + Some types will pass a test against collections.Hashable but fail when they + are actually hashed with hash(). + + Distinguish between these and other types by trying the call to hash() and + seeing if they raise TypeError. 
+ + Examples + -------- + >>> a = ([],) + >>> isinstance(a, collections.Hashable) + True + >>> is_hashable(a) + False + """ + # unfortunately, we can't use isinstance(arg, collections.Hashable), which + # can be faster than calling hash, because numpy scalars on Python 3 fail + # this test + + # reconsider this decision once this numpy bug is fixed: + # https://github.com/numpy/numpy/issues/5562 + + try: + hash(arg) + except TypeError: + return False + else: + return True + + +def is_sequence(x): + try: + iter(x) + len(x) # it has a length + return not isinstance(x, string_and_binary_types) + except (TypeError, AttributeError): + return False diff --git a/pandas/types/missing.py b/pandas/types/missing.py new file mode 100644 index 0000000000000..8b4193d02beb7 --- /dev/null +++ b/pandas/types/missing.py @@ -0,0 +1,394 @@ +""" +missing types & inference +""" +import numpy as np +from pandas import lib +from pandas.tslib import NaT, iNaT +from .generic import (ABCMultiIndex, ABCSeries, + ABCIndexClass, ABCGeneric) +from .common import (is_string_dtype, is_datetimelike, + is_datetimelike_v_numeric, is_float_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, + is_complex_dtype, is_categorical_dtype, + is_string_like_dtype, is_bool_dtype, + is_integer_dtype, is_dtype_equal, + needs_i8_conversion, _ensure_object, + pandas_dtype, + is_scalar, + is_object_dtype, + is_integer, + _TD_DTYPE, + _NS_DTYPE, + _DATELIKE_DTYPES) +from .inference import is_list_like + + +def isnull(obj): + """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) + + Parameters + ---------- + arr : ndarray or object value + Object to check for null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is null or if an array is + given which of the element is null. + + See also + -------- + pandas.notnull: boolean inverse of pandas.isnull + """ + return _isnull(obj) + + +def _isnull_new(obj): + if is_scalar(obj): + return lib.checknull(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + return _isnull_ndarraylike(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=isnull)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike(np.asarray(obj)) + else: + return obj is None + + +def _isnull_old(obj): + """Detect missing values. Treat None, NaN, INF, -INF as null. + + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + """ + if is_scalar(obj): + return lib.checknull_old(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + return _isnull_ndarraylike_old(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=_isnull_old)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike_old(np.asarray(obj)) + else: + return obj is None + + +_isnull = _isnull_new + + +def _use_inf_as_null(key): + """Option change callback for null/inf behaviour + Choose which replacement for numpy.isnan / -numpy.isfinite is used. 
+ + Parameters + ---------- + flag: bool + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). + + Notes + ----- + This approach to setting global module values is discussed and + approved here: + + * http://stackoverflow.com/questions/4859217/ + programmatically-creating-variables-in-python/4859312#4859312 + """ + from pandas.core.config import get_option + flag = get_option(key) + if flag: + globals()['_isnull'] = _isnull_old + else: + globals()['_isnull'] = _isnull_new + + +def _isnull_ndarraylike(obj): + + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if is_string_dtype(dtype): + if is_categorical_dtype(values): + from pandas import Categorical + if not isinstance(values, Categorical): + values = values.values + result = values.isnull() + else: + + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] = vec.reshape(shape) + + elif is_datetimelike(obj): + # this is the NaT pattern + result = values.view('i8') == iNaT + else: + result = np.isnan(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def _isnull_ndarraylike_old(obj): + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj_old(values.ravel()) + result[:] = vec.reshape(shape) + + elif dtype in _DATELIKE_DTYPES: + # this is the NaT pattern + result = values.view('i8') == iNaT + else: + result = ~np.isfinite(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def notnull(obj): + """Replacement for numpy.isfinite / -numpy.isnan which is suitable for use + on object arrays. + + Parameters + ---------- + arr : ndarray or object value + Object to check for *not*-null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is *not* null or if an array + is given which of the element is *not* null. + + See also + -------- + pandas.isnull : boolean inverse of pandas.notnull + """ + res = isnull(obj) + if is_scalar(res): + return not res + return ~res + + +def is_null_datelike_scalar(other): + """ test whether the object is a null datelike, e.g. 
Nat + but guard against passing a non-scalar """ + if other is NaT or other is None: + return True + elif is_scalar(other): + + # a timedelta + if hasattr(other, 'dtype'): + return other.view('i8') == iNaT + elif is_integer(other) and other == iNaT: + return True + return isnull(other) + return False + + +def _is_na_compat(arr, fill_value=np.nan): + """ + Parameters + ---------- + arr: a numpy array + fill_value: fill value, default to np.nan + + Returns + ------- + True if we can fill using this fill_value + """ + dtype = arr.dtype + if isnull(fill_value): + return not (is_bool_dtype(dtype) or + is_integer_dtype(dtype)) + return True + + +def array_equivalent(left, right, strict_nan=False): + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs + in corresponding locations. False otherwise. It is assumed that left and + right are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : ndarrays + strict_nan : bool, default False + If True, consider NaN and None to be different. + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent( + ... np.array([1, 2, np.nan]), + ... np.array([1, 2, np.nan])) + True + >>> array_equivalent( + ... np.array([1, np.nan, 2]), + ... np.array([1, 2, np.nan])) + False + """ + + left, right = np.asarray(left), np.asarray(right) + + # shape compat + if left.shape != right.shape: + return False + + # Object arrays can contain None, NaN and NaT. + # string dtypes must be come to this path for NumPy 1.7.1 compat + if is_string_dtype(left) or is_string_dtype(right): + + if not strict_nan: + # isnull considers NaN and None to be equivalent. + return lib.array_equivalent_object( + _ensure_object(left.ravel()), _ensure_object(right.ravel())) + + for left_value, right_value in zip(left, right): + if left_value is NaT and right_value is not NaT: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if (not isinstance(right_value, float) or + not np.isnan(right_value)): + return False + else: + if left_value != right_value: + return False + return True + + # NaNs can occur in float and complex arrays. + if is_float_dtype(left) or is_complex_dtype(left): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + # numpy will will not allow this type of datetimelike vs integer comparison + elif is_datetimelike_v_numeric(left, right): + return False + + # M8/m8 + elif needs_i8_conversion(left) and needs_i8_conversion(right): + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.view('i8') + right = right.view('i8') + + # NaNs cannot occur otherwise. + try: + return np.array_equal(left, right) + except AttributeError: + # see gh-13388 + # + # NumPy v1.7.1 has a bug in its array_equal + # function that prevents it from correctly + # comparing two arrays with complex dtypes. 
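+        # The fallback below sidesteps np.array_equal entirely: it compares
+        # the two arrays as plain Python lists after checking dtype equality.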
+ # This bug is corrected in v1.8.0, so remove + # this try-except block as soon as we stop + # supporting NumPy versions < 1.8.0 + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.tolist() + right = right.tolist() + + return left == right + + +def _infer_fill_value(val): + """ + infer the fill value for the nan/NaT from the provided + scalar/ndarray/list-like if we are a NaT, return the correct dtyped + element to provide proper block construction + """ + + if not is_list_like(val): + val = [val] + val = np.array(val, copy=False) + if is_datetimelike(val): + return np.array('NaT', dtype=val.dtype) + elif is_object_dtype(val.dtype): + dtype = lib.infer_dtype(_ensure_object(val)) + if dtype in ['datetime', 'datetime64']: + return np.array('NaT', dtype=_NS_DTYPE) + elif dtype in ['timedelta', 'timedelta64']: + return np.array('NaT', dtype=_TD_DTYPE) + return np.nan + + +def _maybe_fill(arr, fill_value=np.nan): + """ + if we have a compatiable fill_value and arr dtype, then fill + """ + if _is_na_compat(arr, fill_value): + arr.fill(fill_value) + return arr + + +def na_value_for_dtype(dtype): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + + Returns + ------- + np.dtype or a pandas dtype + """ + dtype = pandas_dtype(dtype) + + if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or + is_timedelta64_dtype(dtype)): + return NaT + elif is_float_dtype(dtype): + return np.nan + elif is_integer_dtype(dtype): + return 0 + elif is_bool_dtype(dtype): + return False + return np.nan diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2961b2fb2241f..4442eed898b60 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -23,11 +23,14 @@ import numpy as np import pandas as pd -from pandas.core.common import (is_sequence, array_equivalent, - is_list_like, is_datetimelike_v_numeric, - is_datetimelike_v_object, - is_number, is_bool, - needs_i8_conversion, is_categorical_dtype) +from pandas.types.missing import array_equivalent +from pandas.types.common import (is_datetimelike_v_numeric, + is_datetimelike_v_object, + is_number, is_bool, + needs_i8_conversion, + is_categorical_dtype, + is_sequence, + is_list_like) from pandas.formats.printing import pprint_thing from pandas.core.algorithms import take_1d @@ -1001,17 +1004,20 @@ def assert_categorical_equal(left, right, check_dtype=True, assert_attr_equal('ordered', left, right, obj=obj) -def raise_assert_detail(obj, message, left, right): +def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) if isinstance(right, np.ndarray): right = pprint_thing(right) + if diff is not None: + diff = "\n[diff]: {diff}".format(diff=diff) + msg = """{0} are different {1} [left]: {2} -[right]: {3}""".format(obj, message, left, right) +[right]: {3}{4}""".format(obj, message, left, right, diff) raise AssertionError(msg) diff --git a/pandas/util/validators.py b/pandas/util/validators.py index bbfd24df9c13e..964fa9d9b38d5 100644 --- a/pandas/util/validators.py +++ b/pandas/util/validators.py @@ -3,6 +3,8 @@ for validating data or function arguments """ +from pandas.types.common import is_bool + def _check_arg_length(fname, args, max_fname_arg_count, compat_args): """ @@ -35,8 +37,6 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): checked that arg_val_dict.keys() is a subset of compat_args """ - from pandas.core.common import is_bool - for key in arg_val_dict: # try checking equality 
directly with '=' operator, # as comparison may have been overriden for the left From 20de2661c8eff66e465248cbe28062eae0e0e3bb Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 13 Jul 2016 10:38:09 -0400 Subject: [PATCH 096/359] BLD: included pandas.api.* in setup.py (#13640) --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 8f8865ecc3b7a..650357588570a 100755 --- a/setup.py +++ b/setup.py @@ -547,6 +547,9 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=['pandas', + 'pandas.api', + 'pandas.api.tests', + 'pandas.api.types', 'pandas.compat', 'pandas.compat.numpy', 'pandas.computation', From 44f3229709d40241917267f4cfa7b28f9a92678b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Jul 2016 09:12:52 +0200 Subject: [PATCH 097/359] DOC/BLD: pin IPython version to 4.2.0 (#13639) (#13647) --- ci/requirements-2.7_DOC_BUILD.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index a07721c75cf34..cde0719aa027e 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,4 +1,4 @@ -ipython=4 +ipython=4.2.0 ipykernel sphinx nbconvert From 6f0a020e0929d53b2341f58f970806c85facef91 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Thu, 14 Jul 2016 17:15:23 +0900 Subject: [PATCH 098/359] TST: reorganize tools.tests (#13619) --- pandas/tools/tests/test_concat.py | 432 +++++++++----- pandas/tools/tests/test_join.py | 787 ++++++++++++++++++++++++++ pandas/tools/tests/test_merge.py | 900 +----------------------------- 3 files changed, 1082 insertions(+), 1037 deletions(-) create mode 100644 pandas/tools/tests/test_join.py diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index a8c86657a48cc..568cf63c02e30 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -17,7 +17,7 @@ assert_almost_equal) -class TestConcatenate(tm.TestCase): +class ConcatenateBase(tm.TestCase): _multiprocess_can_split_ = True @@ -26,6 +26,9 @@ def setUp(self): self.mixed_frame = self.frame.copy() self.mixed_frame['foo'] = 'bar' + +class TestAppend(ConcatenateBase): + def test_append(self): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -142,42 +145,32 @@ def test_append_preserve_index_name(self): result = df1.append(df2) self.assertEqual(result.index.name, 'A') - def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] - - joined = df_list[0].join(df_list[1:]) - tm.assert_frame_equal(joined, df) - - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] - - def _check_diff_index(df_list, result, exp_index): - reindexed = [x.reindex(exp_index) for x in df_list] - expected = reindexed[0].join(reindexed[1:]) - tm.assert_frame_equal(result, expected) - - # different join types - joined = df_list[0].join(df_list[1:], how='outer') - _check_diff_index(df_list, joined, df.index) - - joined = df_list[0].join(df_list[1:]) - _check_diff_index(df_list, joined, df_list[0].index) - - joined = df_list[0].join(df_list[1:], how='inner') - _check_diff_index(df_list, joined, df.index[2:8]) - - self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') - - def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 
'D']] - df3 = df.ix[:, ['key']] - - result = df1.join([df2, df3]) - assert_frame_equal(result, df) + def test_append_dtype_coerce(self): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + import datetime as dt + from pandas import NaT + + df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0)], + columns=['start_time']) + df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), + dt.datetime(2013, 1, 4, 7, 10)]], + columns=['start_time', 'end_time']) + + expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10)], + name='end_time'), + Series([dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0)], + name='start_time')], axis=1) + result = df1.append(df2, ignore_index=True) + assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self): df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) @@ -188,6 +181,9 @@ def test_append_missing_column_proper_upcast(self): self.assertEqual(appended['A'].dtype, 'f8') self.assertEqual(appended['B'].dtype, 'O') + +class TestConcatenate(ConcatenateBase): + def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) @@ -524,35 +520,6 @@ def test_with_mixed_tuples(self): # it works concat([df1, df2]) - def test_join_dups(self): - - # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') - result.columns = expected.columns - assert_frame_equal(result, expected) - - # GH 4975, invalid join on dups - w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - - dta = x.merge(y, left_index=True, right_index=True).merge( - z, left_index=True, right_index=True, how="outer") - dta = dta.merge(w, left_index=True, right_index=True) - expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] - assert_frame_equal(dta, expected) - def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) @@ -649,86 +616,40 @@ def test_concat_mixed_objs(self): panel = tm.makePanel() self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) - def test_panel_join(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[:2, :10, :3] - p2 = panel.ix[2:, 5:, 2:] - - # left join - result = p1.join(p2) - expected = p1.copy() - expected['ItemC'] = p2['ItemC'] - tm.assert_panel_equal(result, expected) - - # right join - result = p1.join(p2, how='right') - expected = p2.copy() - expected['ItemA'] = p1['ItemA'] - expected['ItemB'] = p1['ItemB'] - expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) - tm.assert_panel_equal(result, expected) - - # inner join - result = p1.join(p2, how='inner') - expected = panel.ix[:, 5:10, 2:3] - tm.assert_panel_equal(result, expected) - - # outer join - result = p1.join(p2, how='outer') - expected = p1.reindex(major=panel.major_axis, - minor=panel.minor_axis) - expected = expected.join(p2.reindex(major=panel.major_axis, - 
minor=panel.minor_axis)) - tm.assert_panel_equal(result, expected) - - def test_panel_join_overlap(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] - p2 = panel.ix[['ItemB', 'ItemC']] - - # Expected index is - # - # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 - joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') - p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') - p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') - no_overlap = panel.ix[['ItemA']] - expected = no_overlap.join(p1_suf.join(p2_suf)) - tm.assert_panel_equal(joined, expected) - - def test_panel_join_many(self): - tm.K = 10 - panel = tm.makePanel() - tm.K = 4 + def test_empty_dtype_coerce(self): - panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + # xref to #12411 + # xref to #12045 + # xref to #11594 + # see below - joined = panels[0].join(panels[1:]) - tm.assert_panel_equal(joined, panel) + # 10571 + df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) + df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) + result = concat([df1, df2]) + expected = df1.dtypes + tm.assert_series_equal(result.dtypes, expected) - panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + def test_dtype_coerceion(self): - data_dict = {} - for p in panels: - data_dict.update(p.iteritems()) + # 12411 + df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) - joined = panels[0].join(panels[1:], how='inner') - expected = Panel.from_dict(data_dict, intersect=True) - tm.assert_panel_equal(joined, expected) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) - joined = panels[0].join(panels[1:], how='outer') - expected = Panel.from_dict(data_dict, intersect=False) - tm.assert_panel_equal(joined, expected) + # 12045 + import datetime + df = DataFrame({'date': [datetime.datetime(2012, 1, 1), + datetime.datetime(1012, 1, 2)]}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) - # edge cases - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='outer', lsuffix='foo', rsuffix='bar') - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='right') + # 11594 + df = DataFrame({'text': ['some words'] + [None] * 9}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) def test_panel_concat_other_axes(self): panel = tm.makePanel() @@ -1080,6 +1001,239 @@ def test_concat_invalid_first_argument(self): expected = read_csv(StringIO(data)) assert_frame_equal(result, expected) + def test_concat_NaT_series(self): + # GH 11693 + # test for merging NaT series with datetime series. 
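+        # Concatenating a tz-aware datetime Series with an all-NaT Series of
+        # the same tz dtype should simply append the NaT values after the
+        # datetime values.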
+ x = Series(date_range('20151124 08:00', '20151124 09:00', + freq='1h', tz='US/Eastern')) + y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT with tz + expected = Series(pd.NaT, index=range(4), + dtype='datetime64[ns, US/Eastern]') + result = pd.concat([y, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # without tz + x = pd.Series(pd.date_range('20151124 08:00', + '20151124 09:00', freq='1h')) + y = pd.Series(pd.date_range('20151124 10:00', + '20151124 11:00', freq='1h')) + y[:] = pd.NaT + expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT without tz + x[:] = pd.NaT + expected = pd.Series(pd.NaT, index=range(4), + dtype='datetime64[ns]') + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_tz_frame(self): + df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), + B=pd.Timestamp('20130603', tz='CET')), + index=range(5)) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + assert_frame_equal(df2, df3) + + def test_concat_tz_series(self): + # GH 11755 + # tz and no tz + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(date_range('2012-01-01', '2012-01-02')) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # GH 11887 + # concat tz and object + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(['a', 'b']) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # 12217 + # 12306 fixed I think + + # Concat'ing two UTC times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('UTC') + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize('UTC') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, UTC]') + + # Concat'ing two London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + # Concat'ing 2+1 London times + first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + # Concat'ing 1+2 London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + def test_concat_tz_series_with_datetimelike(self): + # GH 12620 + # tz and timedelta + x = 
[pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-02-01', tz='US/Eastern')] + y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + + # tz and period + y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + + def test_concat_tz_series_tzlocal(self): + # GH 13583 + tm._skip_if_no_dateutil() + import dateutil + x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] + y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y)) + self.assertEqual(result.dtype, 'datetime64[ns, tzlocal()]') + + def test_concat_period_series(self): + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + # different freq + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + # non-period + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(['A', 'B']) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + def test_concat_empty_series(self): + # GH 11082 + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(res, exp) + + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = pd.Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with no name + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name=None) + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=['x', 0]) + tm.assert_frame_equal(res, exp) + + def test_default_index(self): + # is_series and ignore_index + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series([4, 5, 6], name='y') 
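+        # With axis=1 and ignore_index=True the Series names are discarded
+        # and the resulting columns fall back to the default RangeIndex.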
+ res = pd.concat([s1, s2], axis=1, ignore_index=True) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + # use check_index_type=True to check the result have + # RangeIndex (default index) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_series and all inputs have no names + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + res = pd.concat([s1, s2], axis=1, ignore_index=False) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + exp.columns = pd.RangeIndex(2) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_dataframe and ignore_index + df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) + df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) + + res = pd.concat([df1, df2], axis=0, ignore_index=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], + columns=['A', 'B']) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + res = pd.concat([df1, df2], axis=1, ignore_index=True) + exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py new file mode 100644 index 0000000000000..86aee0b4a01c9 --- /dev/null +++ b/pandas/tools/tests/test_join.py @@ -0,0 +1,787 @@ +# pylint: disable=E1103 + +import nose + +from numpy.random import randn +import numpy as np + +import pandas as pd +from pandas.compat import lrange +import pandas.compat as compat +from pandas.tools.merge import merge, concat +from pandas.util.testing import assert_frame_equal +from pandas import DataFrame, MultiIndex, Series + +import pandas.algos as algos +import pandas.util.testing as tm +from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS + + +a_ = np.array + + +class TestJoin(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + # aggregate multiple columns + self.df = DataFrame({'key1': get_test_data(), + 'key2': get_test_data(), + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + + # exclude a couple keys for fun + self.df = self.df[self.df['key2'] > 1] + + self.df2 = DataFrame({'key1': get_test_data(n=N // 5), + 'key2': get_test_data(ngroups=NGROUPS // 2, + n=N // 5), + 'value': np.random.randn(N // 5)}) + + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, + index=data['C']) + + def test_cython_left_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = algos.left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5, -1, -1]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_right_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], 
dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = algos.left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + # 0 1 1 1 + exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, + # 2 2 4 + 6, 7, 8, 6, 7, 8, -1]) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_inner_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = algos.inner_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_left_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + + joined_both = merge(self.df, self.df2) + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='left') + + def test_right_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='right') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + + joined_both = merge(self.df, self.df2, how='right') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='right') + + def test_full_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='outer') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + + joined_both = merge(self.df, self.df2, how='outer') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='outer') + + def test_inner_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='inner') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + + joined_both = merge(self.df, self.df2, how='inner') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='inner') + + def test_handle_overlap(self): + joined = merge(self.df, self.df2, on='key2', + suffixes=['.foo', '.bar']) + + self.assertIn('key1.foo', joined) + self.assertIn('key1.bar', joined) + + def test_handle_overlap_arbitrary_key(self): + joined = merge(self.df, self.df2, + left_on='key2', right_on='key1', + suffixes=['.foo', '.bar']) + self.assertIn('key1.foo', joined) + self.assertIn('key2.bar', joined) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + self.assert_series_equal(merged['MergedA'], target['A'], + check_names=False) + self.assert_series_equal(merged['MergedD'], target['D'], + check_names=False) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], + 'value': [0, 0, 1, 1, 2]}) + 
assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assertTrue(np.isnan(joined['two']['c'])) + self.assertTrue(np.isnan(joined['three']['c'])) + + # merge column not p resent + self.assertRaises(KeyError, target.join, source, on='E') + + # overlap + source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(ValueError, target.join, source_copy, on='A') + + def test_join_on_fails_with_different_right_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, left_on='a', right_index=True) + + def test_join_on_fails_with_different_left_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}, + index=tm.makeCustomIndex(10, 2)) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}) + merge(df, df2, right_on='b', left_index=True) + + def test_join_on_fails_with_different_column_counts(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, right_on='a', left_on=['a', 'b']) + + def test_join_on_fails_with_wrong_object_type(self): + # GH12081 + wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] + df = DataFrame({'a': [1, 1]}) + + for obj in wrongly_typed: + with tm.assertRaisesRegexp(ValueError, str(type(obj))): + merge(obj, df, left_on='a', right_on='a') + with tm.assertRaisesRegexp(ValueError, str(type(obj))): + merge(df, obj, left_on='a', right_on='a') + + def test_join_on_pass_vector(self): + expected = self.target.join(self.source, on='C') + del expected['C'] + + join_col = self.target.pop('C') + result = self.target.join(self.source, on=join_col) + assert_frame_equal(result, expected) + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assertIn(col, merged) + self.assertTrue(merged[col].isnull().all()) + + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assert_index_equal(merged2.columns, merged.columns) + self.assertEqual(len(merged2), 0) + + def test_join_on_inner(self): + df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected = expected[expected['value'].notnull()] + self.assert_series_equal(joined['key'], expected['key'], + check_dtype=False) + self.assert_series_equal(joined['value'], expected['value'], + check_dtype=False) + self.assert_index_equal(joined.index, expected.index) + + def test_join_on_singlekey_list(self): + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, 
on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + + def test_join_on_series_buglet(self): + # GH #638 + df = DataFrame({'a': [1, 1]}) + ds = Series([2], index=[1], name='b') + result = df.join(ds, on='a') + expected = DataFrame({'a': [1, 1], + 'b': [2, 2]}, index=df.index) + tm.assert_frame_equal(result, expected) + + def test_join_index_mixed(self): + df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assertEqual(df1['B'].dtype, np.int64) + self.assertEqual(df1['D'].dtype, np.bool_) + + df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. + + for kind in ['inner', 'outer', 'left', 'right']: + + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + + def test_join_unconsolidated(self): + # GH #331 + a = DataFrame(randn(30, 2), columns=['a', 'b']) + c = Series(randn(30)) + a['c'] = c + d = DataFrame(randn(30, 1), columns=['q']) + + # it works! 
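+        # 'a' is left with unconsolidated blocks because the 'c' column was
+        # added after construction; the joins below are a smoke test that
+        # this case works.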
+ a.join(d) + d.join(a) + + def test_join_multiindex(self): + index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + df1 = DataFrame(data=np.random.randn(6), index=index1, + columns=['var X']) + df2 = DataFrame(data=np.random.randn(6), index=index2, + columns=['var Y']) + + df1 = df1.sortlevel(0) + df2 = df2.sortlevel(0) + + joined = df1.join(df2, how='outer') + ex_index = index1._tuple_index.union(index2._tuple_index) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + df1 = df1.sortlevel(1) + df2 = df2.sortlevel(1) + + joined = df1.join(df2, how='outer').sortlevel(0) + ex_index = index1._tuple_index.union(index2._tuple_index) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + def test_join_inner_multiindex(self): + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + joined = data.join(to_join, on=['key1', 'key2'], how='inner') + expected = merge(data, to_join.reset_index(), + left_on=['key1', 'key2'], + right_on=['first', 'second'], how='inner', + sort=False) + + expected2 = merge(to_join, data, + right_on=['key1', 'key2'], left_index=True, + how='inner', sort=False) + assert_frame_equal(joined, expected2.reindex_like(joined)) + + expected2 = merge(to_join, data, right_on=['key1', 'key2'], + left_index=True, how='inner', sort=False) + + expected = expected.drop(['first', 'second'], axis=1) + expected.index = joined.index + + self.assertTrue(joined.index.is_monotonic) + assert_frame_equal(joined, expected) + + # _assert_same_contents(expected, expected2.ix[:, expected.columns]) + + def test_join_hierarchical_mixed(self): + # GH 2024 + df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) + new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) + other_df = DataFrame( + [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) + other_df.set_index('a', inplace=True) + # GH 9455, 12219 + with tm.assert_produces_warning(UserWarning): + result = merge(new_df, other_df, left_index=True, right_index=True) + self.assertTrue(('b', 'mean') in result) + self.assertTrue('b' in result) + + def test_join_float64_float32(self): + + a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) + b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) + joined = a.join(b) + self.assertEqual(joined.dtypes['a'], 'float64') + self.assertEqual(joined.dtypes['b'], 'float64') + self.assertEqual(joined.dtypes['c'], 'float32') + + a = np.random.randint(0, 5, 100).astype('int64') + b = np.random.random(100).astype('float64') + c = np.random.random(100).astype('float32') + df = DataFrame({'a': a, 'b': b, 'c': c}) + xpdf = 
DataFrame({'a': a, 'b': b, 'c': c}) + s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) + rs = df.merge(s, left_on='a', right_index=True) + self.assertEqual(rs.dtypes['a'], 'int64') + self.assertEqual(rs.dtypes['b'], 'float64') + self.assertEqual(rs.dtypes['c'], 'float32') + self.assertEqual(rs.dtypes['md'], 'float32') + + xp = xpdf.merge(s, left_on='a', right_index=True) + assert_frame_equal(rs, xp) + + def test_join_many_non_unique_index(self): + df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) + df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) + df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + + result = idf1.join([idf2, idf3], how='outer') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + + result = result.reset_index() + expected = expected[result.columns] + expected['a'] = expected.a.astype('int64') + expected['b'] = expected.b.astype('int64') + assert_frame_equal(result, expected) + + df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) + df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) + df3 = DataFrame( + {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + result = idf1.join([idf2, idf3], how='inner') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + + result = result.reset_index() + + assert_frame_equal(result, expected.ix[:, result.columns]) + + # GH 11519 + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + s = Series(np.repeat(np.arange(8), 2), + index=np.repeat(np.arange(8), 2), name='TEST') + inner = df.join(s, how='inner') + outer = df.join(s, how='outer') + left = df.join(s, how='left') + right = df.join(s, how='right') + assert_frame_equal(inner, outer) + assert_frame_equal(inner, left) + assert_frame_equal(inner, right) + + def test_join_sort(self): + left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], + 'value': [1, 2, 3, 4]}) + right = DataFrame({'value2': ['a', 'b', 'c']}, + index=['bar', 'baz', 'foo']) + + joined = left.join(right, on='key', sort=True) + expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], + 'value': [2, 3, 1, 4], + 'value2': ['a', 'b', 'c', 'c']}, + index=[1, 2, 0, 3]) + assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on='key', sort=False) + self.assert_index_equal(joined.index, pd.Index(lrange(4))) + + def test_mixed_type_join_with_suffix(self): + # GH #916 + df = DataFrame(np.random.randn(20, 6), + columns=['a', 'b', 'c', 'd', 'e', 'f']) + df.insert(0, 'id', 0) + df.insert(5, 'dt', 'foo') + + grouped = df.groupby('id') + mn = grouped.mean() + cn = grouped.count() + + # it works! 
+ mn.join(cn, rsuffix='_right') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_join_dups(self): + + # joining dups + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + expected = concat([df, df], axis=1) + result = df.join(df, rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge( + z, left_index=True, right_index=True, how="outer") + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x, y, z, w], axis=1) + expected.columns = ['x_x', 'y_x', 'x_y', + 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + assert_frame_equal(dta, expected) + + def test_panel_join(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[:2, :10, :3] + p2 = panel.ix[2:, 5:, 2:] + + # left join + result = p1.join(p2) + expected = p1.copy() + expected['ItemC'] = p2['ItemC'] + tm.assert_panel_equal(result, expected) + + # right join + result = p1.join(p2, how='right') + expected = p2.copy() + expected['ItemA'] = p1['ItemA'] + expected['ItemB'] = p1['ItemB'] + expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) + tm.assert_panel_equal(result, expected) + + # inner join + result = p1.join(p2, how='inner') + expected = panel.ix[:, 5:10, 2:3] + tm.assert_panel_equal(result, expected) + + # outer join + result = p1.join(p2, how='outer') + expected = p1.reindex(major=panel.major_axis, + minor=panel.minor_axis) + expected = expected.join(p2.reindex(major=panel.major_axis, + minor=panel.minor_axis)) + tm.assert_panel_equal(result, expected) + + def test_panel_join_overlap(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.ix[['ItemB', 'ItemC']] + + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 + joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') + p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap 
= panel.ix[['ItemA']] + expected = no_overlap.join(p1_suf.join(p2_suf)) + tm.assert_panel_equal(joined, expected) + + def test_panel_join_many(self): + tm.K = 10 + panel = tm.makePanel() + tm.K = 4 + + panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + + joined = panels[0].join(panels[1:]) + tm.assert_panel_equal(joined, panel) + + panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + + data_dict = {} + for p in panels: + data_dict.update(p.iteritems()) + + joined = panels[0].join(panels[1:], how='inner') + expected = pd.Panel.from_dict(data_dict, intersect=True) + tm.assert_panel_equal(joined, expected) + + joined = panels[0].join(panels[1:], how='outer') + expected = pd.Panel.from_dict(data_dict, intersect=False) + tm.assert_panel_equal(joined, expected) + + # edge cases + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='outer', lsuffix='foo', rsuffix='bar') + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='right') + + +def _check_join(left, right, result, join_col, how='left', + lsuffix='_x', rsuffix='_y'): + + # some smoke tests + for c in join_col: + assert(result[c].notnull().all()) + + left_grouped = left.groupby(join_col) + right_grouped = right.groupby(join_col) + + for group_key, group in result.groupby(join_col): + l_joined = _restrict_to_columns(group, left.columns, lsuffix) + r_joined = _restrict_to_columns(group, right.columns, rsuffix) + + try: + lgroup = left_grouped.get_group(group_key) + except KeyError: + if how in ('left', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(l_joined, left.columns, join_col) + else: + _assert_same_contents(l_joined, lgroup) + + try: + rgroup = right_grouped.get_group(group_key) + except KeyError: + if how in ('right', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(r_joined, right.columns, join_col) + else: + _assert_same_contents(r_joined, rgroup) + + +def _restrict_to_columns(group, columns, suffix): + found = [c for c in group.columns + if c in columns or c.replace(suffix, '') in columns] + + # filter + group = group.ix[:, found] + + # get rid of suffixes, if any + group = group.rename(columns=lambda x: x.replace(suffix, '')) + + # put in the right order... + group = group.ix[:, columns] + + return group + + +def _assert_same_contents(join_chunk, source): + NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
+ + jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values + svalues = source.fillna(NA_SENTINEL).drop_duplicates().values + + rows = set(tuple(row) for row in jvalues) + assert(len(rows) == len(source)) + assert(all(tuple(row) in rows for row in svalues)) + + +def _assert_all_na(join_chunk, source_columns, join_col): + for c in source_columns: + if c in join_col: + continue + assert(join_chunk[c].isnull().all()) + + +def _join_by_hand(a, b, how='left'): + join_index = a.index.join(b.index, how=how) + + a_re = a.reindex(join_index) + b_re = b.reindex(join_index) + + result_columns = a.columns.append(b.columns) + + for col, s in compat.iteritems(b_re): + a_re[col] = s + return a_re.reindex(columns=result_columns) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6c448de741e0c..396b095fabbd6 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -9,23 +9,17 @@ import random import pandas as pd -from pandas.compat import range, lrange, lzip +from pandas.compat import lrange, lzip from pandas.tools.merge import merge, concat, MergeError from pandas.util.testing import (assert_frame_equal, assert_series_equal, slow) -from pandas import (DataFrame, Index, MultiIndex, - Series, date_range, Categorical, - compat) -import pandas.algos as algos +from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm -a_ = np.array - N = 50 NGROUPS = 8 -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] def get_test_data(ngroups=NGROUPS, n=N): @@ -58,496 +52,16 @@ def setUp(self): n=N // 5), 'value': np.random.randn(N // 5)}) - index, data = tm.getMixedTypeDict() - self.target = DataFrame(data, index=index) - - # Join on string value - self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, - index=data['C']) - self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) self.right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) - def test_cython_left_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - ls, rs = algos.left_outer_join(left, right, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5, -1, -1]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_right_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - rs, ls = algos.left_outer_join(right, left, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - # 0 1 1 1 - exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, - # 2 2 4 - 6, 7, 8, 6, 7, 8, -1]) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, - 4, 4, 4, 5, 5, 5, 6]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls, 
check_dtype=False) - self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_inner_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) - max_group = 5 - - ls, rs = algos.inner_join(left, right, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_left_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') - - joined_both = merge(self.df, self.df2) - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='left') - - def test_right_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='right') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') - - joined_both = merge(self.df, self.df2, how='right') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='right') - - def test_full_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='outer') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') - - joined_both = merge(self.df, self.df2, how='outer') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='outer') - - def test_inner_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='inner') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') - - joined_both = merge(self.df, self.df2, how='inner') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='inner') - - def test_handle_overlap(self): - joined = merge(self.df, self.df2, on='key2', - suffixes=['.foo', '.bar']) - - self.assertIn('key1.foo', joined) - self.assertIn('key1.bar', joined) - - def test_handle_overlap_arbitrary_key(self): - joined = merge(self.df, self.df2, - left_on='key2', right_on='key1', - suffixes=['.foo', '.bar']) - self.assertIn('key1.foo', joined) - self.assertIn('key2.bar', joined) - def test_merge_common(self): joined = merge(self.df, self.df2) exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) - def test_join_on(self): - target = self.target - source = self.source - - merged = target.join(source, on='C') - self.assert_series_equal(merged['MergedA'], target['A'], - check_names=False) - self.assert_series_equal(merged['MergedD'], target['D'], - check_names=False) - - # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], - 'value': [0, 0, 1, 1, 2]}) - assert_frame_equal(joined, expected) - - # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - self.assertTrue(np.isnan(joined['two']['c'])) - 
self.assertTrue(np.isnan(joined['three']['c'])) - - # merge column not p resent - self.assertRaises(KeyError, target.join, source, on='E') - - # overlap - source_copy = source.copy() - source_copy['A'] = 0 - self.assertRaises(ValueError, target.join, source_copy, on='A') - - def test_join_on_fails_with_different_right_index(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - merge(df, df2, left_on='a', right_index=True) - - def test_join_on_fails_with_different_left_index(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}, - index=tm.makeCustomIndex(10, 2)) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}) - merge(df, df2, right_on='b', left_index=True) - - def test_join_on_fails_with_different_column_counts(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - merge(df, df2, right_on='a', left_on=['a', 'b']) - - def test_join_on_fails_with_wrong_object_type(self): - # GH12081 - wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] - df = DataFrame({'a': [1, 1]}) - - for obj in wrongly_typed: - with tm.assertRaisesRegexp(ValueError, str(type(obj))): - merge(obj, df, left_on='a', right_on='a') - with tm.assertRaisesRegexp(ValueError, str(type(obj))): - merge(df, obj, left_on='a', right_on='a') - - def test_join_on_pass_vector(self): - expected = self.target.join(self.source, on='C') - del expected['C'] - - join_col = self.target.pop('C') - result = self.target.join(self.source, on=join_col) - assert_frame_equal(result, expected) - - def test_join_with_len0(self): - # nothing to merge - merged = self.target.join(self.source.reindex([]), on='C') - for col in self.source: - self.assertIn(col, merged) - self.assertTrue(merged[col].isnull().all()) - - merged2 = self.target.join(self.source.reindex([]), on='C', - how='inner') - self.assert_index_equal(merged2.columns, merged.columns) - self.assertEqual(len(merged2), 0) - - def test_join_on_inner(self): - df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) - - joined = df.join(df2, on='key', how='inner') - - expected = df.join(df2, on='key') - expected = expected[expected['value'].notnull()] - self.assert_series_equal(joined['key'], expected['key'], - check_dtype=False) - self.assert_series_equal(joined['value'], expected['value'], - check_dtype=False) - self.assert_index_equal(joined.index, expected.index) - - def test_join_on_singlekey_list(self): - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - - # corner cases - joined = df.join(df2, on=['key']) - expected = df.join(df2, on='key') - - assert_frame_equal(joined, expected) - - def test_join_on_series(self): - result = self.target.join(self.source['MergedA'], on='C') - expected = self.target.join(self.source[['MergedA']], on='C') - assert_frame_equal(result, expected) - - def test_join_on_series_buglet(self): - # GH #638 - df = DataFrame({'a': [1, 1]}) - ds = Series([2], index=[1], name='b') - result = df.join(ds, on='a') - 
expected = DataFrame({'a': [1, 1], - 'b': [2, 2]}, index=df.index) - tm.assert_frame_equal(result, expected) - - def test_join_index_mixed(self): - df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - self.assertEqual(df1['B'].dtype, np.int64) - self.assertEqual(df1['D'].dtype, np.bool_) - - df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) - - # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] - df1.columns = expected_columns[:4] - df2.columns = expected_columns[4:] - expected = _join_by_hand(df1, df2) - assert_frame_equal(joined, expected) - - # no overlapping blocks - df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' - - df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. - - for kind in JOIN_TYPES: - - joined = df1.join(df2, how=kind) - expected = _join_by_hand(df1, df2, how=kind) - assert_frame_equal(joined, expected) - - joined = df2.join(df1, how=kind) - expected = _join_by_hand(df2, df1, how=kind) - assert_frame_equal(joined, expected) - - def test_join_empty_bug(self): - # generated an exception in 0.4.3 - x = DataFrame() - x.join(DataFrame([3], index=[0], columns=['A']), how='outer') - - def test_join_unconsolidated(self): - # GH #331 - a = DataFrame(randn(30, 2), columns=['a', 'b']) - c = Series(randn(30)) - a['c'] = c - d = DataFrame(randn(30, 1), columns=['q']) - - # it works! - a.join(d) - d.join(a) - - def test_join_multiindex(self): - index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) - - index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) - - df1 = DataFrame(data=np.random.randn(6), index=index1, - columns=['var X']) - df2 = DataFrame(data=np.random.randn(6), index=index2, - columns=['var Y']) - - df1 = df1.sortlevel(0) - df2 = df2.sortlevel(0) - - joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index.union(index2._tuple_index) - expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) - expected.index.names = index1.names - assert_frame_equal(joined, expected) - self.assertEqual(joined.index.names, index1.names) - - df1 = df1.sortlevel(1) - df2 = df2.sortlevel(1) - - joined = df1.join(df2, how='outer').sortlevel(0) - ex_index = index1._tuple_index.union(index2._tuple_index) - expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) - expected.index.names = index1.names - - assert_frame_equal(joined, expected) - self.assertEqual(joined.index.names, index1.names) - - def test_join_inner_multiindex(self): - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] - - data = np.random.randn(len(key1)) - data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - joined = data.join(to_join, on=['key1', 'key2'], how='inner') - expected = merge(data, to_join.reset_index(), - left_on=['key1', 'key2'], - right_on=['first', 
'second'], how='inner', - sort=False) - - expected2 = merge(to_join, data, - right_on=['key1', 'key2'], left_index=True, - how='inner', sort=False) - assert_frame_equal(joined, expected2.reindex_like(joined)) - - expected2 = merge(to_join, data, right_on=['key1', 'key2'], - left_index=True, how='inner', sort=False) - - expected = expected.drop(['first', 'second'], axis=1) - expected.index = joined.index - - self.assertTrue(joined.index.is_monotonic) - assert_frame_equal(joined, expected) - - # _assert_same_contents(expected, expected2.ix[:, expected.columns]) - - def test_join_hierarchical_mixed(self): - # GH 2024 - df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) - new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) - other_df = DataFrame( - [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) - other_df.set_index('a', inplace=True) - # GH 9455, 12219 - with tm.assert_produces_warning(UserWarning): - result = merge(new_df, other_df, left_index=True, right_index=True) - self.assertTrue(('b', 'mean') in result) - self.assertTrue('b' in result) - - def test_join_float64_float32(self): - - a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) - b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) - joined = a.join(b) - self.assertEqual(joined.dtypes['a'], 'float64') - self.assertEqual(joined.dtypes['b'], 'float64') - self.assertEqual(joined.dtypes['c'], 'float32') - - a = np.random.randint(0, 5, 100).astype('int64') - b = np.random.random(100).astype('float64') - c = np.random.random(100).astype('float32') - df = DataFrame({'a': a, 'b': b, 'c': c}) - xpdf = DataFrame({'a': a, 'b': b, 'c': c}) - s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) - rs = df.merge(s, left_on='a', right_index=True) - self.assertEqual(rs.dtypes['a'], 'int64') - self.assertEqual(rs.dtypes['b'], 'float64') - self.assertEqual(rs.dtypes['c'], 'float32') - self.assertEqual(rs.dtypes['md'], 'float32') - - xp = xpdf.merge(s, left_on='a', right_index=True) - assert_frame_equal(rs, xp) - - def test_join_many_non_unique_index(self): - df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) - df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) - df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) - idf1 = df1.set_index(["a", "b"]) - idf2 = df2.set_index(["a", "b"]) - idf3 = df3.set_index(["a", "b"]) - - result = idf1.join([idf2, idf3], how='outer') - - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') - - result = result.reset_index() - expected = expected[result.columns] - expected['a'] = expected.a.astype('int64') - expected['b'] = expected.b.astype('int64') - assert_frame_equal(result, expected) - - df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) - df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) - df3 = DataFrame( - {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) - idf1 = df1.set_index(["a", "b"]) - idf2 = df2.set_index(["a", "b"]) - idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='inner') - - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') - - result = result.reset_index() - - assert_frame_equal(result, expected.ix[:, result.columns]) - - # GH 11519 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 
'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - s = Series(np.repeat(np.arange(8), 2), - index=np.repeat(np.arange(8), 2), name='TEST') - inner = df.join(s, how='inner') - outer = df.join(s, how='outer') - left = df.join(s, how='left') - right = df.join(s, how='right') - assert_frame_equal(inner, outer) - assert_frame_equal(inner, left) - assert_frame_equal(inner, right) - def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) @@ -651,23 +165,6 @@ def test_merge_nocopy(self): merged['d'] = 'peekaboo' self.assertTrue((right['d'] == 'peekaboo').all()) - def test_join_sort(self): - left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'value2': ['a', 'b', 'c']}, - index=['bar', 'baz', 'foo']) - - joined = left.join(right, on='key', sort=True) - expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], - 'value': [2, 3, 1, 4], - 'value2': ['a', 'b', 'c', 'c']}, - index=[1, 2, 0, 3]) - assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on='key', sort=False) - self.assert_index_equal(joined.index, pd.Index(lrange(4))) - def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -737,20 +234,6 @@ def test_handle_join_key_pass_array(self): merged = merge(left, right, left_index=True, right_on=key, how='outer') self.assert_series_equal(merged['key_0'], Series(key, name='key_0')) - def test_mixed_type_join_with_suffix(self): - # GH #916 - df = DataFrame(np.random.randn(20, 6), - columns=['a', 'b', 'c', 'd', 'e', 'f']) - df.insert(0, 'id', 0) - df.insert(5, 'dt', 'foo') - - grouped = df.groupby('id') - mn = grouped.mean() - cn = grouped.count() - - # it works! 
- mn.join(cn, rsuffix='_right') - def test_no_overlap_more_informative_error(self): dt = datetime.now() df1 = DataFrame({'x': ['a']}, index=[dt]) @@ -963,68 +446,6 @@ def _constructor(self): tm.assertIsInstance(result, NotADataFrame) - def test_empty_dtype_coerce(self): - - # xref to #12411 - # xref to #12045 - # xref to #11594 - # see below - - # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) - df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) - result = concat([df1, df2]) - expected = df1.dtypes - assert_series_equal(result.dtypes, expected) - - def test_dtype_coerceion(self): - - # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) - - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - # 12045 - import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - def test_append_dtype_coerce(self): - - # GH 4993 - # appending with datetime will incorrectly convert datetime64 - import datetime as dt - from pandas import NaT - - df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0)], - columns=['start_time']) - df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), - dt.datetime(2013, 1, 4, 7, 10)]], - columns=['start_time', 'end_time']) - - expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10)], - name='end_time'), - Series([dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) - assert_frame_equal(result, expected) - def test_join_append_timedeltas(self): import datetime as dt @@ -1140,239 +561,6 @@ def test_merge_on_periods(self): self.assertEqual(result['value_x'].dtype, 'object') self.assertEqual(result['value_y'].dtype, 'object') - def test_concat_NaT_series(self): - # GH 11693 - # test for merging NaT series with datetime series. 
- x = Series(date_range('20151124 08:00', '20151124 09:00', - freq='1h', tz='US/Eastern')) - y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') - expected = Series([x[0], x[1], pd.NaT, pd.NaT]) - - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT with tz - expected = Series(pd.NaT, index=range(4), - dtype='datetime64[ns, US/Eastern]') - result = pd.concat([y, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # without tz - x = pd.Series(pd.date_range('20151124 08:00', - '20151124 09:00', freq='1h')) - y = pd.Series(pd.date_range('20151124 10:00', - '20151124 11:00', freq='1h')) - y[:] = pd.NaT - expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT without tz - x[:] = pd.NaT - expected = pd.Series(pd.NaT, index=range(4), - dtype='datetime64[ns]') - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - def test_concat_tz_frame(self): - df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), - B=pd.Timestamp('20130603', tz='CET')), - index=range(5)) - - # concat - df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) - assert_frame_equal(df2, df3) - - def test_concat_tz_series(self): - # GH 11755 - # tz and no tz - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(date_range('2012-01-01', '2012-01-02')) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # GH 11887 - # concat tz and object - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(['a', 'b']) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # 12217 - # 12306 fixed I think - - # Concat'ing two UTC times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('UTC') - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('UTC') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, UTC]') - - # Concat'ing two London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - # Concat'ing 2+1 London times - first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - # Concat'ing 1+2 London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - def test_concat_tz_series_with_datetimelike(self): - # GH 12620 - # tz and timedelta - x = 
[pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-02-01', tz='US/Eastern')] - y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) - - # tz and period - y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) - - def test_concat_tz_series_tzlocal(self): - # GH 13583 - tm._skip_if_no_dateutil() - import dateutil - x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] - y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y)) - self.assertEqual(result.dtype, 'datetime64[ns, tzlocal()]') - - def test_concat_period_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - # different freq - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - # non-period - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(['A', 'B']) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - def test_concat_empty_series(self): - # GH 11082 - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(res, exp) - - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') - res = pd.concat([s1, s2], axis=0) - # name will be reset - exp = pd.Series([1, 2, 3]) - tm.assert_series_equal(res, exp) - - # empty Series with no name - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name=None) - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0]) - tm.assert_frame_equal(res, exp) - - def test_default_index(self): - # is_series and ignore_index - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series([4, 5, 6], name='y') 
- res = pd.concat([s1, s2], axis=1, ignore_index=True) - self.assertIsInstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - # use check_index_type=True to check the result have - # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - # is_series and all inputs have no names - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - res = pd.concat([s1, s2], axis=1, ignore_index=False) - self.assertIsInstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - # is_dataframe and ignore_index - df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) - df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) - - res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], - columns=['A', 'B']) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - res = pd.concat([df1, df2], axis=1, ignore_index=True) - exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - def test_indicator(self): # PR #10054. xref #7412 and closes #8790. df1 = DataFrame({'col1': [0, 1], 'col_left': [ @@ -2134,90 +1322,6 @@ def f(): self.assertRaises(NotImplementedError, f) -def _check_join(left, right, result, join_col, how='left', - lsuffix='_x', rsuffix='_y'): - - # some smoke tests - for c in join_col: - assert(result[c].notnull().all()) - - left_grouped = left.groupby(join_col) - right_grouped = right.groupby(join_col) - - for group_key, group in result.groupby(join_col): - l_joined = _restrict_to_columns(group, left.columns, lsuffix) - r_joined = _restrict_to_columns(group, right.columns, rsuffix) - - try: - lgroup = left_grouped.get_group(group_key) - except KeyError: - if how in ('left', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) - - _assert_all_na(l_joined, left.columns, join_col) - else: - _assert_same_contents(l_joined, lgroup) - - try: - rgroup = right_grouped.get_group(group_key) - except KeyError: - if how in ('right', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) - - _assert_all_na(r_joined, right.columns, join_col) - else: - _assert_same_contents(r_joined, rgroup) - - -def _restrict_to_columns(group, columns, suffix): - found = [c for c in group.columns - if c in columns or c.replace(suffix, '') in columns] - - # filter - group = group.ix[:, found] - - # get rid of suffixes, if any - group = group.rename(columns=lambda x: x.replace(suffix, '')) - - # put in the right order... - group = group.ix[:, columns] - - return group - - -def _assert_same_contents(join_chunk, source): - NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
- - jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values - svalues = source.fillna(NA_SENTINEL).drop_duplicates().values - - rows = set(tuple(row) for row in jvalues) - assert(len(rows) == len(source)) - assert(all(tuple(row) in rows for row in svalues)) - - -def _assert_all_na(join_chunk, source_columns, join_col): - for c in source_columns: - if c in join_col: - continue - assert(join_chunk[c].isnull().all()) - - -def _join_by_hand(a, b, how='left'): - join_index = a.index.join(b.index, how=how) - - a_re = a.reindex(join_index) - b_re = b.reindex(join_index) - - result_columns = a.columns.append(b.columns) - - for col, s in compat.iteritems(b_re): - a_re[col] = s - return a_re.reindex(columns=result_columns) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From a711b4251c765c0c4b9d1c8deb985162dfaf09ae Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 14 Jul 2016 04:44:18 -0400 Subject: [PATCH 099/359] BF(TST): allow AttributeError being raised (in addition to TypeError) from mpl (#13641) Closes #13570 --- pandas/tests/test_graphics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 3a5b0117948b7..5493eb37c358b 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -1330,7 +1330,8 @@ def test_plot(self): self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({'x': [1, 2], 'y': [3, 4]}) - with tm.assertRaises(TypeError): + # mpl >= 1.5.2 (or slightly below) throw AttributError + with tm.assertRaises((TypeError, AttributeError)): df.plot.line(blarg=True) df = DataFrame(np.random.rand(10, 3), From 084ceaee135627680f4dd00115c3d6c7d930a22d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 14 Jul 2016 06:20:50 -0400 Subject: [PATCH 100/359] API, DEPR: Raise and Deprecate Reshape for Pandas Objects Author: gfyoung Closes #13012 from gfyoung/categorical-reshape-validate and squashes the following commits: 3ad161d [gfyoung] API: Prevent invalid arguments to Categorical.reshape --- doc/source/whatsnew/v0.19.0.txt | 3 ++ pandas/core/categorical.py | 23 +++++++-- pandas/core/internals.py | 26 +++++++++- pandas/core/series.py | 14 ++++-- pandas/indexes/base.py | 10 ++++ pandas/io/packers.py | 7 +-- pandas/tests/indexes/test_base.py | 6 +++ pandas/tests/series/test_analytics.py | 68 ++++++++++++++++----------- pandas/tests/test_categorical.py | 37 +++++++++++++-- 9 files changed, 151 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index bef02a06135de..688f3b7ff6ada 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -256,6 +256,7 @@ API changes ~~~~~~~~~~~ +- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue: `12882`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. 
(:issue:`12388`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) @@ -449,6 +450,8 @@ Furthermore: Deprecations ^^^^^^^^^^^^ +- ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 79d8bfbf57f12..1d1a9f990e61a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -383,11 +383,28 @@ def itemsize(self): def reshape(self, new_shape, *args, **kwargs): """ - An ndarray-compatible method that returns - `self` because categorical instances cannot - actually be reshaped. + DEPRECATED: calling this method will raise an error in a + future release. + + An ndarray-compatible method that returns `self` because + `Categorical` instances cannot actually be reshaped. + + Parameters + ---------- + new_shape : int or tuple of ints + A 1-D array of integers that correspond to the new + shape of the `Categorical`. For more information on + the parameter, please refer to `np.reshape`. """ + warn("reshape is deprecated and will raise " + "in a subsequent release", FutureWarning, stacklevel=2) + nv.validate_reshape(args, kwargs) + + # while the 'new_shape' parameter has no effect, + # we should still enforce valid shape parameters + np.reshape(self.codes, new_shape) + return self @property diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 363ac8249eb06..ff12cfddbe9cd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1839,7 +1839,7 @@ def convert(self, *args, **kwargs): try: values = values.reshape(shape) values = _block_shape(values, ndim=self.ndim) - except AttributeError: + except (AttributeError, NotImplementedError): pass newb = make_block(values, ndim=self.ndim, placement=[rl]) blocks.append(newb) @@ -3616,7 +3616,7 @@ def value_getitem(placement): return value else: if value.ndim == self.ndim - 1: - value = value.reshape((1,) + value.shape) + value = _safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -4686,6 +4686,28 @@ def rrenamer(x): _transform_index(right, rrenamer)) +def _safe_reshape(arr, new_shape): + """ + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a Categorical or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. + + Parameters + ---------- + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape + """ + if isinstance(arr, ABCSeries): + arr = arr._values + if not isinstance(arr, Categorical): + arr = arr.reshape(new_shape) + return arr + + def _transform_index(index, func): """ Apply function to all values found in index. diff --git a/pandas/core/series.py b/pandas/core/series.py index 2c7f298dde2ec..b933f68cfad62 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -843,14 +843,22 @@ def repeat(self, reps, *args, **kwargs): def reshape(self, *args, **kwargs): """ - Return the values attribute of `self` with shape `args`. 
- However, if the specified shape matches exactly the current - shape, `self` is returned for compatibility reasons. + DEPRECATED: calling this method will raise an error in a + future release. Please call ``.values.reshape(...)`` instead. + + return an ndarray with the values shape + if the specified shape matches exactly the current shape, then + return self (for compat) See also -------- numpy.ndarray.reshape """ + warnings.warn("reshape is deprecated and will raise " + "in a subsequent release. Please use " + ".values.reshape(...) instead", FutureWarning, + stacklevel=2) + if len(args) == 1 and hasattr(args[0], '__iter__'): shape = args[0] else: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 5c9938c932da2..b013d6ccb0b8e 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -957,6 +957,16 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + def reshape(self, *args, **kwargs): + """ + NOT IMPLEMENTED: do not call this method, as reshaping is not + supported for Index objects and will raise an error. + + Reshape an Index. + """ + raise NotImplementedError("reshaping is not supported " + "for Index objects") + @property def _has_complex_internals(self): # to disable groupby tricks in MultiIndex diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 14e2c9b371296..94f390955dddd 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -61,7 +61,7 @@ from pandas.core.generic import NDFrame from pandas.core.common import PerformanceWarning from pandas.io.common import get_filepath_or_buffer -from pandas.core.internals import BlockManager, make_block +from pandas.core.internals import BlockManager, make_block, _safe_reshape import pandas.core.internals as internals from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType @@ -622,8 +622,9 @@ def decode(obj): axes = obj[u'axes'] def create_block(b): - values = unconvert(b[u'values'], dtype_for(b[u'dtype']), - b[u'compress']).reshape(b[u'shape']) + values = _safe_reshape(unconvert( + b[u'values'], dtype_for(b[u'dtype']), + b[u'compress']), b[u'shape']) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 67869901b068e..06662e52e3a6f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1413,6 +1413,12 @@ def test_take_fill_value(self): with tm.assertRaises(IndexError): idx.take(np.array([1, -5])) + def test_reshape_raise(self): + msg = "reshaping is not supported" + idx = pd.Index([0, 1, 2]) + tm.assertRaisesRegexp(NotImplementedError, msg, + idx.reshape, idx.shape) + def test_reindex_preserves_name_if_target_is_list_or_ndarray(self): # GH6552 idx = pd.Index([0, 1, 2]) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d9e2d8096c8d7..34cfb2f0c1529 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1554,49 +1554,63 @@ def test_shift_categorical(self): assert_index_equal(s.values.categories, sp1.values.categories) assert_index_equal(s.values.categories, sn2.values.categories) - def test_reshape_non_2d(self): - # GH 4554 - x = Series(np.random.random(201), name='x') - self.assertTrue(x.reshape(x.shape, ) is x) + def test_reshape_deprecate(self): + x = Series(np.random.random(10), name='x') + tm.assert_produces_warning(FutureWarning, x.reshape, x.shape) - # GH 2719 - a = Series([1, 2, 3, 
4]) - result = a.reshape(2, 2) - expected = a.values.reshape(2, 2) - tm.assert_numpy_array_equal(result, expected) - self.assertIsInstance(result, type(expected)) + def test_reshape_non_2d(self): + # see gh-4554 + with tm.assert_produces_warning(FutureWarning): + x = Series(np.random.random(201), name='x') + self.assertTrue(x.reshape(x.shape, ) is x) + + # see gh-2719 + with tm.assert_produces_warning(FutureWarning): + a = Series([1, 2, 3, 4]) + result = a.reshape(2, 2) + expected = a.values.reshape(2, 2) + tm.assert_numpy_array_equal(result, expected) + self.assertIsInstance(result, type(expected)) def test_reshape_2d_return_array(self): x = Series(np.random.random(201), name='x') - result = x.reshape((-1, 1)) - self.assertNotIsInstance(result, Series) - result2 = np.reshape(x, (-1, 1)) - self.assertNotIsInstance(result2, Series) + with tm.assert_produces_warning(FutureWarning): + result = x.reshape((-1, 1)) + self.assertNotIsInstance(result, Series) - result = x[:, None] - expected = x.reshape((-1, 1)) - assert_almost_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result2 = np.reshape(x, (-1, 1)) + self.assertNotIsInstance(result2, Series) + + with tm.assert_produces_warning(FutureWarning): + result = x[:, None] + expected = x.reshape((-1, 1)) + assert_almost_equal(result, expected) def test_reshape_bad_kwarg(self): a = Series([1, 2, 3, 4]) - msg = "'foo' is an invalid keyword argument for this function" - tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "'foo' is an invalid keyword argument for this function" + tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) - msg = "reshape\(\) got an unexpected keyword argument 'foo'" - tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "reshape\(\) got an unexpected keyword argument 'foo'" + tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) def test_numpy_reshape(self): a = Series([1, 2, 3, 4]) - result = np.reshape(a, (2, 2)) - expected = a.values.reshape(2, 2) - tm.assert_numpy_array_equal(result, expected) - self.assertIsInstance(result, type(expected)) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = np.reshape(a, (2, 2)) + expected = a.values.reshape(2, 2) + tm.assert_numpy_array_equal(result, expected) + self.assertIsInstance(result, type(expected)) - result = np.reshape(a, a.shape) - tm.assert_series_equal(result, a) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = np.reshape(a, a.shape) + tm.assert_series_equal(result, a) def test_unstack(self): from numpy import nan diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2ca1fc71df20a..dd39861ac3114 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4058,13 +4058,40 @@ def test_numpy_repeat(self): msg = "the 'axis' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.repeat, cat, 2, axis=1) + def test_reshape(self): + cat = pd.Categorical([], categories=["a", "b"]) + tm.assert_produces_warning(FutureWarning, cat.reshape, 0) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(0), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([], 
categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape((5, -1)), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(cat.shape), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(cat.size), cat) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "can only specify one unknown dimension" + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + tm.assertRaisesRegexp(ValueError, msg, cat.reshape, (-2, -1)) + def test_numpy_reshape(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - self.assert_categorical_equal(np.reshape(cat, cat.shape), cat) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(np.reshape(cat, cat.shape), cat) - msg = "the 'order' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.reshape, - cat, cat.shape, order='F') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "the 'order' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.reshape, + cat, cat.shape, order='F') def test_na_actions(self): From 3f6d4bdd63d9a1ae27e587bd033e507f7a5e1109 Mon Sep 17 00:00:00 2001 From: yui-knk Date: Thu, 14 Jul 2016 06:47:32 -0400 Subject: [PATCH 101/359] CLN: Fix compile time warnings Author: yui-knk Closes #13643 from yui-knk/warning2 and squashes the following commits: ee3a4fb [yui-knk] CLN: Fix compile time warnings --- pandas/src/datetime/np_datetime.c | 2 +- pandas/src/ujson/python/objToJSON.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c index c30b404d2b8b2..80703c8b08de6 100644 --- a/pandas/src/datetime/np_datetime.c +++ b/pandas/src/datetime/np_datetime.c @@ -576,7 +576,7 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, } PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { - return ((PyDatetimeScalarObject *) obj)->obmeta.base; + return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *) obj)->obmeta.base; } diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 1080e9548ba56..75de63acbd7d6 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -493,7 +493,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outV PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) _obj; PRINTMARK(); - pandas_datetime_to_datetimestruct(obj->obval, obj->obmeta.base, &dts); + pandas_datetime_to_datetimestruct(obj->obval, (PANDAS_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } From c9a27ede0925ddbaa8d3ec9efd3c332a636505cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Jul 2016 16:26:07 +0200 Subject: [PATCH 102/359] CLN: fix some issues in asv benchmark suite (#13630) * CLN: fix params list * Fix issue in asv.conf.json for win32+other environment Fix mistaken exclusion of virtualenv or existing:same on win32 in the config. 
Credits: @pv * CLN: remove DataMatrix * ASV: fix exlusion of tables package for non-conda environments --- asv_bench/asv.conf.json | 6 +++--- asv_bench/benchmarks/indexing.py | 20 -------------------- asv_bench/benchmarks/inference.py | 10 +++++----- asv_bench/benchmarks/join_merge.py | 16 ---------------- 4 files changed, 8 insertions(+), 44 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 7b9fe353df2e3..f5fa849464881 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -77,11 +77,11 @@ // On conda install pytables, otherwise tables {"environment_type": "conda", "tables": ""}, {"environment_type": "conda", "pytables": null}, - {"environment_type": "virtualenv", "tables": null}, - {"environment_type": "virtualenv", "pytables": ""}, + {"environment_type": "(?!conda).*", "tables": null}, + {"environment_type": "(?!conda).*", "pytables": ""}, // On conda&win32, install libpython {"sys_platform": "(?!win32).*", "libpython": ""}, - {"sys_platform": "win32", "libpython": null}, + {"environment_type": "conda", "sys_platform": "win32", "libpython": null}, {"environment_type": "(?!conda).*", "libpython": ""} ], "include": [], diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 53d37a8161f43..094ae23a92fad 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -19,24 +19,6 @@ def time_dataframe_getitem_scalar(self): self.df[self.col][self.idx] -class datamatrix_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - try: - self.klass = DataMatrix - except: - self.klass = DataFrame - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - def time_datamatrix_getitem_scalar(self): - self.df[self.col][self.idx] - - class series_get_value(object): goal_time = 0.2 @@ -498,5 +480,3 @@ def setup(self): def time_float_loc(self): self.ind.get_loc(0) - - diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 6809c351beade..ee9d3104be4b1 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -143,12 +143,12 @@ class to_numeric(object): param_names = ['data', 'downcast'] params = [ - [(['1'] * N / 2) + ([2] * N / 2), - (['-1'] * N / 2) + ([2] * N / 2), - np.repeat(np.array('1970-01-01', '1970-01-02', + [(['1'] * (N / 2)) + ([2] * (N / 2)), + (['-1'] * (N / 2)) + ([2] * (N / 2)), + np.repeat(np.array(['1970-01-01', '1970-01-02'], dtype='datetime64[D]'), N), - (['1.1'] * N / 2) + ([2] * N / 2), - ([1] * N / 2) + ([2] * N / 2), + (['1.1'] * (N / 2)) + ([2] * (N / 2)), + ([1] * (N / 2)) + ([2] * (N / 2)), np.repeat(np.int32(1), N)], [None, 'integer', 'signed', 'unsigned', 'float'], ] diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 39ebd9cb1cb73..dcd07911f2ff0 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -179,10 +179,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) 
self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -210,10 +206,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -241,10 +233,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -272,10 +260,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) From 05b976c9339bad84f488c8d6813ed19232c9255c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 14 Jul 2016 20:06:52 -0400 Subject: [PATCH 103/359] TST: add tests for Timestamp.toordinal/fromordinal follow-up for #13593 Author: sinhrks Closes #13610 from sinhrks/depr_timestamp_offset2 and squashes the following commits: 28f8d41 [sinhrks] TST: add tests for Timestamp.toordinal --- pandas/tseries/tests/test_tslib.py | 27 +++++++++++++++++++++++++++ pandas/tslib.pyx | 21 ++++++++++++++++++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index ce88edcf4249b..31d6393c1c26e 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -255,6 +255,18 @@ def test_constructor_keyword(self): hour=1, minute=2, second=3, microsecond=999999)), repr(Timestamp('2015-11-12 01:02:03.999999'))) + def test_constructor_fromordinal(self): + base = datetime.datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal(), freq='D') + self.assertEqual(base, ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') + self.assertEqual(pd.Timestamp('2000-01-01', tz='US/Eastern'), ts) + self.assertEqual(base.toordinal(), ts.toordinal()) + def test_constructor_offset_depr(self): # GH 12160 with tm.assert_produces_warning(FutureWarning, @@ -270,6 +282,21 @@ def test_constructor_offset_depr(self): with tm.assertRaisesRegexp(TypeError, msg): Timestamp('2011-01-01', offset='D', freq='D') + def test_constructor_offset_depr_fromordinal(self): + # GH 12160 + base = 
datetime.datetime(2000, 1, 1) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp.fromordinal(base.toordinal(), offset='D') + self.assertEqual(pd.Timestamp('2000-01-01'), ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 650b4c7979d8d..2af08f2713262 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -235,12 +235,14 @@ class Timestamp(_Timestamp): ---------- ts_input : datetime-like, str, int, float Value to be converted to Timestamp - offset : str, DateOffset + freq : str, DateOffset Offset which Timestamp will have tz : string, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : string numpy unit used for conversion, if ts_input is int or float + offset : str, DateOffset + Deprecated, use freq The other two forms mimic the parameters from ``datetime.datetime``. They can be passed by either position or keyword, but not both mixed together. @@ -262,8 +264,21 @@ class Timestamp(_Timestamp): @classmethod def fromordinal(cls, ordinal, freq=None, tz=None, offset=None): - """ passed an ordinal, translate and convert to a ts - note: by definition there cannot be any tz info on the ordinal itself """ + """ + passed an ordinal, translate and convert to a ts + note: by definition there cannot be any tz info on the ordinal itself + + Parameters + ---------- + ordinal : int + date corresponding to a proleptic Gregorian ordinal + freq : str, DateOffset + Offset which Timestamp will have + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will have. + offset : str, DateOffset + Deprecated, use freq + """ return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz, offset=offset) @classmethod From 71a06752a7040a75402f3e30a82b96e10816b492 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 14 Jul 2016 20:12:33 -0400 Subject: [PATCH 104/359] CLN: Initialization coincides with mapping, hence with uniqueness check - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` Rebased version of https://github.com/pydata/pandas/pull/10229 which was [actually not](h ttps://github.com/pydata/pandas/pull/10229#issuecomment-131470116) fixed by https://github.com/pydata/pandas/pull/10199. Nothing particular relevant, just wanted to delete this branch locally and noticed it still applies: you'll judge what to do of it. 
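A minimal sketch of the invariant relied on here (plain pandas usage, nothing below is specific to this patch): uniqueness is known as soon as the engine's mapping is populated, so `is_unique` keeps behaving as before:

    import pandas as pd

    idx = pd.Index([1, 2, 2, 3])
    assert not idx.is_unique      # duplicate labels, engine reports non-unique

    idx = pd.Index(['a', 'b', 'c'])
    assert idx.is_unique
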
Author: Pietro Battiston Closes #13594 from toobaz/fix_checkunique and squashes the following commits: a63bd12 [Pietro Battiston] CLN: Initialization coincides with mapping, hence with uniqueness check --- pandas/index.pyx | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/index.pyx b/pandas/index.pyx index 71717dd2d771b..bc985100692fc 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -80,7 +80,7 @@ cdef class IndexEngine: cdef: bint unique, monotonic_inc, monotonic_dec - bint initialized, monotonic_check, unique_check + bint initialized, monotonic_check def __init__(self, vgetter, n): self.vgetter = vgetter @@ -91,7 +91,6 @@ cdef class IndexEngine: self.monotonic_check = 0 self.unique = 0 - self.unique_check = 0 self.monotonic_inc = 0 self.monotonic_dec = 0 @@ -211,8 +210,8 @@ cdef class IndexEngine: property is_unique: def __get__(self): - if not self.unique_check: - self._do_unique_check() + if not self.initialized: + self.initialize() return self.unique == 1 @@ -246,9 +245,6 @@ cdef class IndexEngine: cdef _get_index_values(self): return self.vgetter() - cdef inline _do_unique_check(self): - self._ensure_mapping_populated() - def _call_monotonic(self, values): raise NotImplementedError @@ -270,7 +266,6 @@ cdef class IndexEngine: if len(self.mapping) == len(values): self.unique = 1 - self.unique_check = 1 self.initialized = 1 From 0a70b5fef3ae2363fea040ea47dd52247811c8c8 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 14 Jul 2016 20:26:01 -0400 Subject: [PATCH 105/359] API: Change Period('NAT') to return NaT closes #12759 closes #13582 Author: sinhrks Closes #13609 from sinhrks/period_nat and squashes the following commits: 9305c36 [sinhrks] COMPAT: Period(NaT) now returns pd.NaT --- doc/source/whatsnew/v0.19.0.txt | 39 +++ pandas/src/period.pyx | 269 ++++++++++--------- pandas/tests/indexes/test_datetimelike.py | 9 +- pandas/tseries/period.py | 49 ++-- pandas/tseries/tests/test_base.py | 26 +- pandas/tseries/tests/test_period.py | 305 +++++++++++++--------- pandas/tseries/tests/test_tslib.py | 7 + pandas/tslib.pyx | 5 +- 8 files changed, 407 insertions(+), 302 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 688f3b7ff6ada..c9f501c682a18 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -446,6 +446,45 @@ Furthermore: - Passing duplicated ``percentiles`` will now raise a ``ValueError``. - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) +.. _whatsnew_0190.api.periodnat: + +``Period('NaT')`` now returns ``pd.NaT`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, ``Period`` has its own ``Period('NaT')`` representation different from ``pd.NaT``. Now ``Period('NaT')`` has been changed to return ``pd.NaT``. (:issue:`12759`, :issue:`13582`) + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.Period('NaT', freq='D') + Out[5]: Period('NaT', 'D') + +New Behavior: + +.. ipython:: python + + pd.Period('NaT') + + +To be compat with ``Period`` addition and subtraction, ``pd.NaT`` now supports addition and subtraction with ``int``. Previously it raises ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.NaT + 1 + ... + ValueError: Cannot add integral value to Timestamp without freq. + +New Behavior: + +.. ipython:: python + + pd.NaT + 1 + pd.NaT - 1 + + .. 
_whatsnew_0190.deprecations: Deprecations diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index af2e295ae0cfc..37f265ede07e7 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -472,7 +472,11 @@ def extract_ordinals(ndarray[object] values, freq): except AttributeError: p = Period(p, freq=freq) - ordinals[i] = p.ordinal + if p is tslib.NaT: + # input may contain NaT-like string + ordinals[i] = tslib.iNaT + else: + ordinals[i] = p.ordinal return ordinals @@ -665,24 +669,8 @@ class IncompatibleFrequency(ValueError): pass -cdef class Period(object): - """ - Represents an period of time +cdef class _Period(object): - Parameters - ---------- - value : Period or compat.string_types, default None - The time period represented (e.g., '4Q2005') - freq : str, default None - One of pandas period strings or corresponding objects - year : int, default None - month : int, default 1 - quarter : int, default None - day : int, default 1 - hour : int, default 0 - minute : int, default 0 - second : int, default 0 - """ cdef public: int64_t ordinal object freq @@ -711,97 +699,22 @@ cdef class Period(object): @classmethod def _from_ordinal(cls, ordinal, freq): """ fast creation from an ordinal and freq that are already validated! """ - self = Period.__new__(cls) - self.ordinal = ordinal - self.freq = cls._maybe_convert_freq(freq) - return self - - def __init__(self, value=None, freq=None, ordinal=None, - year=None, month=1, quarter=None, day=1, - hour=0, minute=0, second=0): - # freq points to a tuple (base, mult); base is one of the defined - # periods such as A, Q, etc. Every five minutes would be, e.g., - # ('T', 5) but may be passed in as a string like '5T' - - # ordinal is the period offset from the gregorian proleptic epoch - - if ordinal is not None and value is not None: - raise ValueError(("Only value or ordinal but not both should be " - "given but not both")) - elif ordinal is not None: - if not lib.is_integer(ordinal): - raise ValueError("Ordinal must be an integer") - if freq is None: - raise ValueError('Must supply freq for ordinal value') - - elif value is None: - if freq is None: - raise ValueError("If value is None, freq cannot be None") - ordinal = _ordinal_from_fields(year, month, quarter, day, - hour, minute, second, freq) - - elif isinstance(value, Period): - other = value - if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): - ordinal = other.ordinal - freq = other.freq - else: - converted = other.asfreq(freq) - ordinal = converted.ordinal - - elif is_null_datetimelike(value) or value in tslib._nat_strings: - ordinal = tslib.iNaT - if freq is None: - raise ValueError("If value is NaT, freq cannot be None " - "because it cannot be inferred") - - elif isinstance(value, compat.string_types) or lib.is_integer(value): - if lib.is_integer(value): - value = str(value) - value = value.upper() - dt, _, reso = parse_time_string(value, freq) - - if freq is None: - try: - freq = frequencies.Resolution.get_freq(reso) - except KeyError: - raise ValueError("Invalid frequency or could not infer: %s" % reso) - - elif isinstance(value, datetime): - dt = value - if freq is None: - raise ValueError('Must supply freq for datetime value') - elif isinstance(value, np.datetime64): - dt = Timestamp(value) - if freq is None: - raise ValueError('Must supply freq for datetime value') - elif isinstance(value, date): - dt = datetime(year=value.year, month=value.month, day=value.day) - if freq is None: - raise ValueError('Must supply freq for datetime 
value') - else: - msg = "Value must be Period, string, integer, or datetime" - raise ValueError(msg) - - base, mult = frequencies.get_freq_code(freq) - - if ordinal is None: - self.ordinal = get_period_ordinal(dt.year, dt.month, dt.day, - dt.hour, dt.minute, dt.second, - dt.microsecond, 0, base) + if ordinal == tslib.iNaT: + return tslib.NaT else: + self = _Period.__new__(cls) self.ordinal = ordinal - - self.freq = self._maybe_convert_freq(freq) + self.freq = cls._maybe_convert_freq(freq) + return self def __richcmp__(self, other, op): if isinstance(other, Period): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: - return _nat_scalar_rules[op] return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) + elif other is tslib.NaT: + return _nat_scalar_rules[op] # index/series like elif hasattr(other, '_typ'): return NotImplemented @@ -824,10 +737,7 @@ cdef class Period(object): offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + (nanos // offset_nanos) + ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) msg = 'Input cannot be converted to Period(freq={0})' raise IncompatibleFrequency(msg.format(self.freqstr)) @@ -835,10 +745,7 @@ cdef class Period(object): freqstr = frequencies.get_standard_freq(other) base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + other.n + ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) @@ -853,10 +760,7 @@ cdef class Period(object): elif other is tslib.NaT: return tslib.NaT elif lib.is_integer(other): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + other * self.freq.n + ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) else: # pragma: no cover return NotImplemented @@ -872,17 +776,12 @@ cdef class Period(object): neg_other = -other return self + neg_other elif lib.is_integer(other): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal - other * self.freq.n + ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif isinstance(other, Period): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: - return Period(ordinal=tslib.iNaT, freq=self.freq) return self.ordinal - other.ordinal elif getattr(other, '_typ', None) == 'periodindex': return -other.__sub__(self) @@ -914,16 +813,13 @@ cdef class Period(object): base1, mult1 = frequencies.get_freq_code(self.freq) base2, mult2 = frequencies.get_freq_code(freq) - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal + # mult1 can't be negative or 0 + end = how == 'E' + if end: + ordinal = self.ordinal + mult1 - 1 else: - # mult1 can't be negative or 0 - end = how == 'E' - if end: - ordinal = self.ordinal + mult1 - 1 - else: - ordinal = self.ordinal - ordinal = period_asfreq(ordinal, base1, base2, end) + ordinal = self.ordinal + ordinal = period_asfreq(ordinal, base1, base2, end) return 
Period(ordinal=ordinal, freq=freq) @@ -933,12 +829,9 @@ cdef class Period(object): @property def end_time(self): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - # freq.n can't be negative or 0 - # ordinal = (self + self.freq.n).start_time.value - 1 - ordinal = (self + 1).start_time.value - 1 + # freq.n can't be negative or 0 + # ordinal = (self + self.freq.n).start_time.value - 1 + ordinal = (self + 1).start_time.value - 1 return Timestamp(ordinal) def to_timestamp(self, freq=None, how='start', tz=None): @@ -1199,8 +1092,114 @@ cdef class Period(object): return period_format(self.ordinal, base, fmt) -def _ordinal_from_fields(year, month, quarter, day, hour, minute, - second, freq): +class Period(_Period): + """ + Represents an period of time + + Parameters + ---------- + value : Period or compat.string_types, default None + The time period represented (e.g., '4Q2005') + freq : str, default None + One of pandas period strings or corresponding objects + year : int, default None + month : int, default 1 + quarter : int, default None + day : int, default 1 + hour : int, default 0 + minute : int, default 0 + second : int, default 0 + """ + + def __new__(cls, value=None, freq=None, ordinal=None, + year=None, month=None, quarter=None, day=None, + hour=None, minute=None, second=None): + # freq points to a tuple (base, mult); base is one of the defined + # periods such as A, Q, etc. Every five minutes would be, e.g., + # ('T', 5) but may be passed in as a string like '5T' + + # ordinal is the period offset from the gregorian proleptic epoch + + cdef _Period self + + if ordinal is not None and value is not None: + raise ValueError(("Only value or ordinal but not both should be " + "given but not both")) + elif ordinal is not None: + if not lib.is_integer(ordinal): + raise ValueError("Ordinal must be an integer") + if freq is None: + raise ValueError('Must supply freq for ordinal value') + + elif value is None: + if (year is None and month is None and quarter is None and + day is None and hour is None and minute is None and second is None): + ordinal = tslib.iNaT + else: + if freq is None: + raise ValueError("If value is None, freq cannot be None") + + # set defaults + month = 1 if month is None else month + day = 1 if day is None else day + hour = 0 if hour is None else hour + minute = 0 if minute is None else minute + second = 0 if second is None else second + + ordinal = _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq) + + elif isinstance(value, Period): + other = value + if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): + ordinal = other.ordinal + freq = other.freq + else: + converted = other.asfreq(freq) + ordinal = converted.ordinal + + elif is_null_datetimelike(value) or value in tslib._nat_strings: + ordinal = tslib.iNaT + + elif isinstance(value, compat.string_types) or lib.is_integer(value): + if lib.is_integer(value): + value = str(value) + value = value.upper() + dt, _, reso = parse_time_string(value, freq) + + if freq is None: + try: + freq = frequencies.Resolution.get_freq(reso) + except KeyError: + raise ValueError("Invalid frequency or could not infer: %s" % reso) + + elif isinstance(value, datetime): + dt = value + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, np.datetime64): + dt = Timestamp(value) + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, 
month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') + else: + msg = "Value must be Period, string, integer, or datetime" + raise ValueError(msg) + + if ordinal is None: + base, mult = frequencies.get_freq_code(freq) + ordinal = get_period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + dt.microsecond, 0, base) + + return cls._from_ordinal(ordinal, freq) + + +def _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq): base, mult = frequencies.get_freq_code(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 9eba481a66685..5c21f71d64660 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -741,14 +741,7 @@ def test_astype(self): result = idx.astype(object) expected = Index([Period('2016-05-16', freq='D')] + [Period(NaT, freq='D')] * 3, dtype='object') - # Hack because of lack of support for Period null checking (GH12759) - tm.assert_index_equal(result[:1], expected[:1]) - result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64) - expected_arr = np.asarray([p.ordinal for p in expected], - dtype=np.int64) - tm.assert_numpy_array_equal(result_arr, expected_arr) - # TODO: When GH12759 is resolved, change the above hack to: - # tm.assert_index_equal(result, expected) # now, it raises. + tm.assert_index_equal(result, expected) result = idx.astype(int) expected = Int64Index([16937] + [-9223372036854775808] * 3, diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 45f634050a5d8..dffb71cff526a 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -92,13 +92,14 @@ def wrapper(self, other): result[mask] = nat_result return result + elif other is tslib.NaT: + result = np.empty(len(self.values), dtype=bool) + result.fill(nat_result) else: other = Period(other, freq=self.freq) func = getattr(self.values, opname) result = func(other.ordinal) - if other.ordinal == tslib.iNaT: - result.fill(nat_result) mask = self.values == tslib.iNaT if mask.any(): result[mask] = nat_result @@ -235,7 +236,7 @@ def _from_arraylike(cls, data, freq, tz): data = _ensure_int64(data) if freq is None: raise ValueError('freq not specified') - data = np.array([Period(x, freq=freq).ordinal for x in data], + data = np.array([Period(x, freq=freq) for x in data], dtype=np.int64) except (TypeError, ValueError): data = _ensure_object(data) @@ -322,15 +323,18 @@ def _na_value(self): return self._box_func(tslib.iNaT) def __contains__(self, key): - if not isinstance(key, Period) or key.freq != self.freq: - if isinstance(key, compat.string_types): - try: - self.get_loc(key) - return True - except Exception: - return False + if isinstance(key, Period): + if key.freq != self.freq: + return False + else: + return key.ordinal in self._engine + else: + try: + self.get_loc(key) + return True + except Exception: + return False return False - return key.ordinal in self._engine def __array_wrap__(self, result, context=None): """ @@ -622,17 +626,13 @@ def _sub_period(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if other.ordinal == tslib.iNaT: - new_data = np.empty(len(self)) - new_data.fill(np.nan) - else: - asi8 = self.asi8 - new_data = asi8 - other.ordinal + asi8 = self.asi8 + new_data = asi8 - other.ordinal - if self.hasnans: - mask = asi8 == 
tslib.iNaT - new_data = new_data.astype(np.float64) - new_data[mask] = np.nan + if self.hasnans: + mask = asi8 == tslib.iNaT + new_data = new_data.astype(np.float64) + new_data[mask] = np.nan # result must be Int64Index or Float64Index return Index(new_data, name=self.name) @@ -740,8 +740,10 @@ def get_loc(self, key, method=None, tolerance=None): # we cannot construct the Period # as we have an invalid type raise KeyError(key) + try: - return Index.get_loc(self, key.ordinal, method, tolerance) + ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal + return Index.get_loc(self, ordinal, method, tolerance) except KeyError: raise KeyError(key) @@ -1044,8 +1046,7 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): if is_start_per and is_end_per and start.freq != end.freq: raise ValueError('Start and end must have same freq') - if ((is_start_per and start.ordinal == tslib.iNaT) or - (is_end_per and end.ordinal == tslib.iNaT)): + if (start is tslib.NaT or end is tslib.NaT): raise ValueError('Start and end must not be NaT') if freq is None: diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 68cea17ba3fc9..958a10c329a46 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1587,17 +1587,16 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) + tm.assert_index_equal(result, expected) for i in [0, 1, 3]: - self.assertTrue(result[i], expected[i]) - self.assertTrue(result[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result[2].freq, 'D') + self.assertEqual(result[i], expected[i]) + self.assertIs(result[2], pd.NaT) self.assertEqual(result.name, expected.name) result_list = idx.tolist() for i in [0, 1, 3]: - self.assertTrue(result_list[i], expected_list[i]) - self.assertTrue(result_list[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result_list[2].freq, 'D') + self.assertEqual(result_list[i], expected_list[i]) + self.assertIs(result_list[2], pd.NaT) def test_minmax(self): @@ -1623,18 +1622,15 @@ def test_minmax(self): # Return NaT obj = PeriodIndex([], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) obj = PeriodIndex([pd.NaT], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) obj = PeriodIndex([pd.NaT, pd.NaT, pd.NaT], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) def test_numpy_minmax(self): pr = pd.period_range(start='2016-01-15', end='2016-01-20') @@ -1735,9 +1731,9 @@ def test_representation_to_series(self): 2 2013 dtype: object""" - exp6 = """0 2011-01-01 09:00 -1 2012-02-01 10:00 -2 NaT + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT dtype: object""" exp7 = """0 2013Q1 diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 591fa19aad585..8d217ff0753a6 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -36,14 +36,17 @@ def test_quarterly_negative_ordinals(self): p = Period(ordinal=-1, freq='Q-DEC') self.assertEqual(p.year, 1969) self.assertEqual(p.quarter, 4) + self.assertIsInstance(p, Period) p = Period(ordinal=-2, freq='Q-DEC') self.assertEqual(p.year, 1969) self.assertEqual(p.quarter, 3) + self.assertIsInstance(p, 
Period) p = Period(ordinal=-2, freq='M') self.assertEqual(p.year, 1969) self.assertEqual(p.month, 11) + self.assertIsInstance(p, Period) def test_period_cons_quarterly(self): # bugs in scikits.timeseries @@ -67,6 +70,7 @@ def test_period_cons_annual(self): stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) p = Period(stamp, freq=freq) self.assertEqual(p, exp + 1) + self.assertIsInstance(p, Period) def test_period_cons_weekly(self): for num in range(10, 17): @@ -77,34 +81,46 @@ def test_period_cons_weekly(self): result = Period(daystr, freq=freq) expected = Period(daystr, freq='D').asfreq(freq) self.assertEqual(result, expected) + self.assertIsInstance(result, Period) + + def test_period_from_ordinal(self): + p = pd.Period('2011-01', freq='M') + res = pd.Period._from_ordinal(p.ordinal, freq='M') + self.assertEqual(p, res) + self.assertIsInstance(res, Period) def test_period_cons_nat(self): p = Period('NaT', freq='M') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'M') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period('nat', freq='W-SUN') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'W-SUN') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period(tslib.iNaT, freq='D') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'D') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period(tslib.iNaT, freq='3D') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, offsets.Day(3)) - self.assertEqual(p.freqstr, '3D') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) + + p = Period('NaT') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT) + self.assertIs(p, pd.NaT) + + def test_cons_null_like(self): + # check Timestamp compat + self.assertIs(Timestamp('NaT'), pd.NaT) + self.assertIs(Period('NaT'), pd.NaT) + + self.assertIs(Timestamp(None), pd.NaT) + self.assertIs(Period(None), pd.NaT) - self.assertRaises(ValueError, Period, 'NaT') + self.assertIs(Timestamp(float('nan')), pd.NaT) + self.assertIs(Period(float('nan')), pd.NaT) + + self.assertIs(Timestamp(np.nan), pd.NaT) + self.assertIs(Period(np.nan), pd.NaT) def test_period_cons_mult(self): p1 = Period('2011-01', freq='3M') @@ -197,13 +213,6 @@ def test_timestamp_tz_arg_dateutil_from_string(self): freq='M').to_timestamp(tz='dateutil/Europe/Brussels') self.assertEqual(p.tz, gettz('Europe/Brussels')) - def test_timestamp_nat_tz(self): - t = Period('NaT', freq='M').to_timestamp() - self.assertTrue(t is tslib.NaT) - - t = Period('NaT', freq='M').to_timestamp(tz='Asia/Tokyo') - self.assertTrue(t is tslib.NaT) - def test_timestamp_mult(self): p = pd.Period('2011-01', freq='M') self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) @@ -213,12 +222,6 @@ def test_timestamp_mult(self): self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-03-31')) - def test_timestamp_nat_mult(self): - for freq in ['M', '3M']: - p = pd.Period('NaT', freq=freq) - self.assertTrue(p.to_timestamp(how='S') is pd.NaT) - self.assertTrue(p.to_timestamp(how='E') is pd.NaT) - def test_period_constructor(self): i1 = Period('1/1/2005', freq='M') i2 = Period('Jan 2005') @@ -552,9 +555,6 @@ def _ex(p): 
result = p.to_timestamp('5S', how='start') self.assertEqual(result, expected) - p = Period('NaT', freq='W') - self.assertTrue(p.to_timestamp() is tslib.NaT) - def test_start_time(self): freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] xp = datetime(2012, 1, 1) @@ -566,9 +566,6 @@ def test_start_time(self): self.assertEqual(Period('2012', freq='W').start_time, datetime(2011, 12, 26)) - p = Period('NaT', freq='W') - self.assertTrue(p.start_time is tslib.NaT) - def test_end_time(self): p = Period('2012', freq='A') @@ -607,9 +604,6 @@ def _ex(*args): xp = _ex(2012, 1, 16) self.assertEqual(xp, p.end_time) - p = Period('NaT', freq='W') - self.assertTrue(p.end_time is tslib.NaT) - def test_anchor_week_end_time(self): def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) @@ -758,15 +752,14 @@ def test_properties_secondly(self): def test_properties_nat(self): p_nat = Period('NaT', freq='M') t_nat = pd.Timestamp('NaT') + self.assertIs(p_nat, t_nat) + # confirm Period('NaT') work identical with Timestamp('NaT') for f in ['year', 'month', 'day', 'hour', 'minute', 'second', 'week', 'dayofyear', 'quarter', 'days_in_month']: self.assertTrue(np.isnan(getattr(p_nat, f))) self.assertTrue(np.isnan(getattr(t_nat, f))) - for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']: - self.assertTrue(np.isnan(getattr(p_nat, f))) - def test_pnow(self): dt = datetime.now() @@ -789,7 +782,7 @@ def test_constructor_corner(self): self.assertRaises(ValueError, Period, 1.6, freq='D') self.assertRaises(ValueError, Period, ordinal=1.6, freq='D') self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D') - self.assertRaises(ValueError, Period) + self.assertIs(Period(None), pd.NaT) self.assertRaises(ValueError, Period, month=1) p = Period('2007-01-01', freq='D') @@ -1526,12 +1519,6 @@ def test_conv_secondly(self): self.assertEqual(ival_S.asfreq('S'), ival_S) - def test_asfreq_nat(self): - p = Period('NaT', freq='A') - result = p.asfreq('M') - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') - def test_asfreq_mult(self): # normal freq to mult freq p = Period(freq='A', year=2007) @@ -1603,21 +1590,6 @@ def test_asfreq_mult(self): self.assertEqual(result.ordinal, expected.ordinal) self.assertEqual(result.freq, expected.freq) - def test_asfreq_mult_nat(self): - # normal freq to mult freq - for p in [Period('NaT', freq='A'), Period('NaT', freq='3A'), - Period('NaT', freq='2M'), Period('NaT', freq='3D')]: - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq) - expected = Period('NaT', freq='3A') - self.assertEqual(result.ordinal, pd.tslib.iNaT) - self.assertEqual(result.freq, expected.freq) - - result = p.asfreq(freq, how='S') - expected = Period('NaT', freq='3A') - self.assertEqual(result.ordinal, pd.tslib.iNaT) - self.assertEqual(result.freq, expected.freq) - class TestPeriodIndex(tm.TestCase): def setUp(self): @@ -1995,6 +1967,19 @@ def test_getitem_datetime(self): rs = ts[dt1:dt4] tm.assert_series_equal(rs, ts) + def test_getitem_nat(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) + self.assertIs(idx[1], tslib.NaT) + + s = pd.Series([0, 1, 2], index=idx) + self.assertEqual(s[pd.NaT], 1) + + s = pd.Series(idx, index=idx) + self.assertEqual(s[pd.Period('2011-01', freq='M')], + pd.Period('2011-01', freq='M')) + self.assertIs(s[pd.NaT], tslib.NaT) + def test_slice_with_negative_step(self): ts = Series(np.arange(20), period_range('2014-01', periods=20, freq='M')) @@ -2038,6 +2023,20 
@@ def test_contains(self): self.assertFalse(Period('2007-01', freq='D') in rng) self.assertFalse(Period('2007-01', freq='2M') in rng) + def test_contains_nat(self): + # GH13582 + idx = period_range('2007-01', freq='M', periods=10) + self.assertFalse(pd.NaT in idx) + self.assertFalse(None in idx) + self.assertFalse(float('nan') in idx) + self.assertFalse(np.nan in idx) + + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertTrue(pd.NaT in idx) + self.assertTrue(None in idx) + self.assertTrue(float('nan') in idx) + self.assertTrue(np.nan in idx) + def test_sub(self): rng = period_range('2007-01', periods=50) @@ -3292,6 +3291,17 @@ def test_get_loc_msg(self): except KeyError as inst: self.assertEqual(inst.args[0], bad_period) + def test_get_loc_nat(self): + didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) + pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + + # check DatetimeIndex compat + for idx in [didx, pidx]: + self.assertEqual(idx.get_loc(pd.NaT), 1) + self.assertEqual(idx.get_loc(None), 1) + self.assertEqual(idx.get_loc(float('nan')), 1) + self.assertEqual(idx.get_loc(np.nan), 1) + def test_append_concat(self): # #1815 d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') @@ -3576,95 +3586,87 @@ def test_add_offset_nat(self): for freq in ['A', '2A', '3A']: p = Period('NaT', freq=freq) for o in [offsets.YearEnd(2)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) + # freq is Tick for freq in ['D', '2D', '3D']: p = Period('NaT', freq=freq) for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), np.timedelta64(3600 * 24, 's'), timedelta(-2), timedelta(hours=48)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) for freq in 
['H', '2H', '3H']: p = Period('NaT', freq=freq) for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), np.timedelta64(3600, 's'), timedelta(minutes=120), timedelta(days=4, minutes=180)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if not isinstance(o, np.timedelta64): - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) def test_sub_pdnat(self): # GH 13071 @@ -3749,24 +3751,22 @@ def test_sub_offset_nat(self): for freq in ['A', '2A', '3A']: p = Period('NaT', freq=freq) for o in [offsets.YearEnd(2)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) # freq is Tick for freq in ['D', '2D', '3D']: @@ -3774,37 +3774,33 @@ def test_sub_offset_nat(self): for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), np.timedelta64(3600 * 24, 's'), timedelta(-2), timedelta(hours=48)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) for freq in ['H', '2H', '3H']: p = Period('NaT', freq=freq) for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), np.timedelta64(3600, 's'), timedelta(minutes=120), timedelta(days=4, minutes=180)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) def test_nat_ops(self): for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) - self.assertEqual((p - 1).ordinal, tslib.iNaT) - self.assertEqual((p - Period('2011-01', freq=freq)).ordinal, - tslib.iNaT) - self.assertEqual((Period('2011-01', freq=freq) - p).ordinal, - tslib.iNaT) + self.assertIs(p + 1, tslib.NaT) + self.assertIs(1 + p, tslib.NaT) + self.assertIs(p - 1, tslib.NaT) + self.assertIs(p - Period('2011-01', freq=freq), tslib.NaT) + self.assertIs(Period('2011-01', freq=freq) - p, tslib.NaT) def test_period_ops_offset(self): p = Period('2011-04-01', freq='D') @@ -3830,18 +3826,17 @@ class TestPeriodIndexSeriesMethods(tm.TestCase): def _check(self, values, func, expected): idx = 
pd.PeriodIndex(values) result = func(idx) - tm.assert_index_equal(result, pd.PeriodIndex(expected)) + if isinstance(expected, pd.Index): + tm.assert_index_equal(result, expected) + else: + # comp op results in bool + tm.assert_numpy_array_equal(result, expected) s = pd.Series(values) result = func(s) - exp = pd.Series(expected) - # Period(NaT) != Period(NaT) - - lmask = result.map(lambda x: x.ordinal != tslib.iNaT) - rmask = exp.map(lambda x: x.ordinal != tslib.iNaT) - tm.assert_series_equal(lmask, rmask) - tm.assert_series_equal(result[lmask], exp[rmask]) + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) def test_pi_ops(self): idx = PeriodIndex(['2011-01', '2011-02', '2011-03', @@ -3962,7 +3957,7 @@ def test_pi_sub_period(self): exp = pd.Index([12, 11, 10, 9], name='idx') tm.assert_index_equal(result, exp) - exp = pd.Index([np.nan, np.nan, np.nan, np.nan], name='idx') + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) @@ -3987,10 +3982,82 @@ def test_pi_sub_period_nat(self): exp = pd.Index([12, np.nan, 10, 9], name='idx') tm.assert_index_equal(result, exp) - exp = pd.Index([np.nan, np.nan, np.nan, np.nan], name='idx') + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + def test_pi_comp_period(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > pd.Period('2011-03', freq='M') + exp = np.array([False, False, False, True], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + def test_pi_comp_period_nat(self): + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x == tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: x != tslib.NaT + exp = np.array([True, True, True, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, False, True, False], dtype=np.bool) + self._check(idx, f, exp) 
+ + f = lambda x: x < pd.Period('2011-03', freq='M') + exp = np.array([True, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: tslib.NaT >= x + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + class TestPeriodRepresentation(tm.TestCase): """ diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 31d6393c1c26e..6696c03a070f7 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1224,6 +1224,13 @@ def test_nat_arithmetic(self): self.assertIs(left - right, pd.NaT) self.assertIs(right - left, pd.NaT) + # int addition / subtraction + for (left, right) in [(pd.NaT, 2), (pd.NaT, 0), (pd.NaT, -3)]: + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(left - right, pd.NaT) + self.assertIs(right - left, pd.NaT) + def test_nat_arithmetic_index(self): # GH 11718 diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 2af08f2713262..c681cebd84836 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1097,7 +1097,10 @@ cdef class _Timestamp(datetime): return Timestamp(self.value + other_int, tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): - if self.freq is None: + if self is NaT: + # to be compat with Period + return NaT + elif self.freq is None: raise ValueError("Cannot add integral value to Timestamp " "without freq.") return Timestamp((self.freq * other).apply(self), freq=self.freq) From 1bee56ed9aa96ffe99aa62d5e8c0212d6dc947ee Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 15 Jul 2016 06:20:39 -0400 Subject: [PATCH 106/359] BUG: construction of Series with integers on windows not default to int64 closes #13646 Author: Jeff Reback Closes #13661 from jreback/foo and squashes the following commits: e26f9bf [Jeff Reback] BUG: construction of Series with integers on windows not defaulting to int64 --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/series.py | 2 +- pandas/tests/frame/test_operators.py | 2 +- pandas/tests/series/test_constructors.py | 11 +++++++++++ pandas/types/cast.py | 2 +- 5 files changed, 15 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c9f501c682a18..747fc70f858b4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -534,7 +534,7 @@ Bug Fixes - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - +- Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) diff --git a/pandas/core/series.py b/pandas/core/series.py index b933f68cfad62..3c1f834c3d479 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2820,7 +2820,7 @@ def _try_cast(arr, take_fast_path): subarr = data.copy() return subarr - elif isinstance(data, list) and len(data) > 0: + elif 
isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: try: subarr = _try_cast(data, False) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index e2e0f568e4098..c91585a28d867 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1196,7 +1196,7 @@ def test_alignment_non_pandas(self): align = pd.core.ops._align_method_FRAME - for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3])]: + for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.intp)]: tm.assert_series_equal(align(df, val, 'index'), Series([1, 2, 3], index=df.index)) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index b7ec4d570f18b..c8e04f1ffd75f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -109,6 +109,17 @@ def test_constructor_iterator(self): result = Series(range(10), dtype='int64') assert_series_equal(result, expected) + def test_constructor_list_like(self): + + # make sure that we are coercing different + # list-likes to standard dtypes and not + # platform specific + expected = Series([1, 2, 3], dtype='int64') + for obj in [[1, 2, 3], (1, 2, 3), + np.array([1, 2, 3], dtype='int64')]: + result = Series(obj, index=[0, 1, 2]) + assert_series_equal(result, expected) + def test_constructor_generator(self): gen = (i for i in range(10)) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index e55cb91d36430..ca23d8d26a426 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -33,7 +33,7 @@ def _possibly_convert_platform(values): """ try to do platform conversion, allow ndarray or list here """ if isinstance(values, (list, tuple)): - values = lib.list_to_object_array(values) + values = lib.list_to_object_array(list(values)) if getattr(values, 'dtype', None) == np.object_: if hasattr(values, '_values'): values = values._values From d7c028d4965932160fa3b69f56c716b1454c42a5 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 15 Jul 2016 06:25:54 -0400 Subject: [PATCH 107/359] CLN: Removed levels attribute from Categorical Deprecated back in `0.15.0` and therefore long overdue. Closes #8376. Author: gfyoung Closes #13612 from gfyoung/categorical-levels-remove and squashes the following commits: f1254df [gfyoung] MAINT: Relocated backwards compat categorical pickle tests f3321cb [gfyoung] CLN: Removed levels attribute from Categorical --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 30 +---------- .../tests/data/categorical_0_14_1.pickle | 0 .../tests/data/categorical_0_15_2.pickle | Bin pandas/io/tests/test_pickle.py | 38 +++++++++++++ pandas/tests/test_categorical.py | 50 ------------------ setup.py | 4 +- 7 files changed, 43 insertions(+), 80 deletions(-) rename pandas/{ => io}/tests/data/categorical_0_14_1.pickle (100%) rename pandas/{ => io}/tests/data/categorical_0_15_2.pickle (100%) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 747fc70f858b4..0b9695125c0a9 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -506,6 +506,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) +- ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) .. 
_whatsnew_0190.performance: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1d1a9f990e61a..a26cc5125db78 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -228,8 +228,8 @@ class Categorical(PandasObject): __array_priority__ = 1000 _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, name=None, - fastpath=False, levels=None): + def __init__(self, values, categories=None, ordered=False, + name=None, fastpath=False): if fastpath: # fast path @@ -245,17 +245,6 @@ def __init__(self, values, categories=None, ordered=False, name=None, "name=\"something\")'") warn(msg, UserWarning, stacklevel=2) - # TODO: Remove after deprecation period in 2017/ after 0.18 - if levels is not None: - warn("Creating a 'Categorical' with 'levels' is deprecated, use " - "'categories' instead", FutureWarning, stacklevel=2) - if categories is None: - categories = levels - else: - raise ValueError("Cannot pass in both 'categories' and " - "(deprecated) 'levels', use only " - "'categories'", stacklevel=2) - # sanitize input if is_categorical_dtype(values): @@ -580,21 +569,6 @@ def _get_categories(self): categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) - def _set_levels(self, levels): - """ set new levels (deprecated, use "categories") """ - warn("Assigning to 'levels' is deprecated, use 'categories'", - FutureWarning, stacklevel=2) - self.categories = levels - - def _get_levels(self): - """ Gets the levels (deprecated, use "categories") """ - warn("Accessing 'levels' is deprecated, use 'categories'", - FutureWarning, stacklevel=2) - return self.categories - - # TODO: Remove after deprecation period in 2017/ after 0.18 - levels = property(fget=_get_levels, fset=_set_levels) - _ordered = None def _set_ordered(self, value): diff --git a/pandas/tests/data/categorical_0_14_1.pickle b/pandas/io/tests/data/categorical_0_14_1.pickle similarity index 100% rename from pandas/tests/data/categorical_0_14_1.pickle rename to pandas/io/tests/data/categorical_0_14_1.pickle diff --git a/pandas/tests/data/categorical_0_15_2.pickle b/pandas/io/tests/data/categorical_0_15_2.pickle similarity index 100% rename from pandas/tests/data/categorical_0_15_2.pickle rename to pandas/io/tests/data/categorical_0_15_2.pickle diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 55c14fee9e3ed..6019144d59698 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -231,6 +231,44 @@ def python_unpickler(path): result = python_unpickler(path) self.compare_element(result, expected, typ) + def test_pickle_v0_14_1(self): + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + def test_pickle_v0_15_2(self): + # ordered -> _ordered + # GH 9347 + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = 
os.path.join(tm.get_data_path(), + 'categorical_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index dd39861ac3114..1edd9443fe356 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1559,18 +1559,6 @@ def test_deprecated_labels(self): res = cat.labels self.assert_numpy_array_equal(res, exp) - def test_deprecated_levels(self): - # TODO: levels is deprecated and should be removed in 0.18 or 2017, - # whatever is earlier - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - exp = cat.categories - with tm.assert_produces_warning(FutureWarning): - res = cat.levels - self.assert_index_equal(res, exp) - with tm.assert_produces_warning(FutureWarning): - res = pd.Categorical([1, 2, 3, np.nan], levels=[1, 2, 3]) - self.assert_index_equal(res.categories, exp) - def test_removed_names_produces_warning(self): # 10482 @@ -4431,44 +4419,6 @@ def test_dt_accessor_api_for_categorical(self): invalid.dt self.assertFalse(hasattr(invalid, 'str')) - def test_pickle_v0_14_1(self): - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') - # This code was executed once on v0.14.1 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - - def test_pickle_v0_15_2(self): - # ordered -> _ordered - # GH 9347 - - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') - # This code was executed once on v0.15.2 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - def test_concat_categorical(self): # See GH 10177 df1 = pd.DataFrame( diff --git a/setup.py b/setup.py index 650357588570a..c77ca4d9e60fe 100755 --- a/setup.py +++ b/setup.py @@ -589,6 +589,7 @@ def pxd(name): 'tests/data/legacy_msgpack/*/*.msgpack', 'tests/data/*.csv*', 'tests/data/*.dta', + 'tests/data/*.pickle', 'tests/data/*.txt', 'tests/data/*.xls', 'tests/data/*.xlsx', @@ -605,8 +606,7 @@ def pxd(name): 'tests/data/html_encoding/*.html', 'tests/json/data/*.json'], 'pandas.tools': ['tests/data/*.csv'], - 'pandas.tests': ['data/*.pickle', - 'data/*.csv'], + 'pandas.tests': ['data/*.csv'], 'pandas.tests.formats': ['data/*.csv'], 'pandas.tests.indexes': ['data/*.pickle'], 'pandas.tseries.tests': ['data/*.pickle', From 043879fbb7de71605eed87991eb037c1917bace1 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Sun, 17 Jul 2016 05:40:27 -0700 Subject: [PATCH 
108/359] DOC: Add reference of DataFrame.rename_axis and Series.rename_axis to api.rst (#13678) --- doc/source/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index 0dde341d820e3..e8fe26e8a525d 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -380,6 +380,7 @@ Reindexing / Selection / Label manipulation Series.reindex Series.reindex_like Series.rename + Series.rename_axis Series.reset_index Series.sample Series.select @@ -889,6 +890,7 @@ Reindexing / Selection / Label manipulation DataFrame.reindex_axis DataFrame.reindex_like DataFrame.rename + DataFrame.rename_axis DataFrame.reset_index DataFrame.sample DataFrame.select From 76d7e779e82c12f73c08704ea44c3b802e914ce7 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Sun, 17 Jul 2016 11:30:19 -0700 Subject: [PATCH 109/359] DOC: correct template for .cum* descriptions (#13683) Closes #13682 --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d6e6f571be53a..6c1676fbdd7f4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5504,7 +5504,7 @@ def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, mask_a, mask_b): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) - @Appender("Return cumulative {0} over requested axis.".format(name) + + @Appender("Return {0} over requested axis.".format(desc) + _cnum_doc) def cum_func(self, axis=None, skipna=True, *args, **kwargs): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) From ada6bf350f7fd4daaf2b80188ca165fb9543a252 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Mon, 18 Jul 2016 03:32:25 +0900 Subject: [PATCH 110/359] DOC: fix a keyword coerce in array_to_timedelta64 (#13686) --- pandas/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c681cebd84836..5624b84523705 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3028,7 +3028,7 @@ cdef inline bint is_timedelta(object o): def array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): """ convert an ndarray to an array of ints that are timedeltas - force conversion if coerce = True, + force conversion if errors = 'coerce', else will raise if cannot convert """ cdef: Py_ssize_t i, n From 6b9cd15f6a655b1ade2c571e32e142bf56dde769 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Tue, 19 Jul 2016 07:08:26 +0900 Subject: [PATCH 111/359] TST: assert message shows unnecessary diff (#13676) --- pandas/util/testing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4442eed898b60..402613d3f1728 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1010,14 +1010,15 @@ def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(right, np.ndarray): right = pprint_thing(right) - if diff is not None: - diff = "\n[diff]: {diff}".format(diff=diff) - msg = """{0} are different {1} [left]: {2} -[right]: {3}{4}""".format(obj, message, left, right, diff) +[right]: {3}""".format(obj, message, left, right) + + if diff is not None: + msg = msg + "\n[diff]: {diff}".format(diff=diff) + raise AssertionError(msg) From 694fe61f931e1c0f034f93f3e0f1084a8974a1f3 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 18 Jul 2016 21:03:58 -0400 Subject: [PATCH 112/359] ENH: Series.append now has ignore_index kw Author: sinhrks Closes #13677 from 
sinhrks/append_series and squashes the following commits: 4bc7b54 [sinhrks] ENH: Series.append now has ignore_index kw --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/series.py | 20 ++++++++++++++++++-- pandas/tests/series/test_combine_concat.py | 21 +++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0b9695125c0a9..a69617bfbec55 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -249,6 +249,7 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) +- ``Series.append`` now supports ``ignore_index`` option (:issue:`13677`) .. _whatsnew_0190.api: diff --git a/pandas/core/series.py b/pandas/core/series.py index 3c1f834c3d479..c3f5b1b8e641c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1511,13 +1511,18 @@ def searchsorted(self, v, side='left', sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False): """ Concatenate two or more Series. Parameters ---------- to_append : Series or list/tuple of Series + ignore_index : boolean, default False + If True, do not use the index labels. + + .. versionadded: 0.19.0 + verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates @@ -1548,6 +1553,17 @@ def append(self, to_append, verify_integrity=False): 5 6 dtype: int64 + With `ignore_index` set to True: + + >>> s1.append(s2, ignore_index=True) + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + dtype: int64 + With `verify_integrity` set to True: >>> s1.append(s2, verify_integrity=True) @@ -1561,7 +1577,7 @@ def append(self, to_append, verify_integrity=False): to_concat = [self] + to_append else: to_concat = [self, to_append] - return concat(to_concat, ignore_index=False, + return concat(to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity) def _binop(self, other, func, level=None, fill_value=None): diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index eb560d4a17055..fd6fd90cd631f 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -39,6 +39,27 @@ def test_append_many(self): result = pieces[0].append(pieces[1:]) assert_series_equal(result, self.ts) + def test_append_duplicates(self): + # GH 13677 + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(s1.append(s2), exp) + tm.assert_series_equal(pd.concat([s1, s2]), exp) + + # the result must have RangeIndex + exp = pd.Series([1, 2, 3, 4, 5, 6]) + tm.assert_series_equal(s1.append(s2, ignore_index=True), + exp, check_index_type=True) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), + exp, check_index_type=True) + + msg = 'Indexes have overlapping values:' + with tm.assertRaisesRegexp(ValueError, msg): + s1.append(s2, verify_integrity=True) + with tm.assertRaisesRegexp(ValueError, msg): + pd.concat([s1, s2], verify_integrity=True) + def 
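A minimal sketch of the new keyword in use (assuming pandas 0.19.0, where this change lands; the series are only illustrative):

    import pandas as pd

    s1 = pd.Series([1, 2, 3])
    s2 = pd.Series([4, 5, 6])

    # default behaviour keeps the original, overlapping index labels
    s1.append(s2)                     # index: 0 1 2 0 1 2

    # ignore_index=True resets the result to a fresh RangeIndex
    s1.append(s2, ignore_index=True)  # index: 0 1 2 3 4 5

    # verify_integrity=True still raises on overlapping labels
    # s1.append(s2, verify_integrity=True)  -> ValueError: Indexes have overlapping values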
test_combine_first(self): values = tm.makeIntIndex(20).values.astype(float) series = Series(values, index=tm.makeIntIndex(20)) From 5a521713f3892539b648bc2735d3cc502feb2b48 Mon Sep 17 00:00:00 2001 From: wcwagner Date: Mon, 18 Jul 2016 21:06:40 -0400 Subject: [PATCH 113/359] BUG: Add type check for width parameter in str.pad method GH13598 closes #13598 Author: wcwagner Closes #13690 from wcwagner/bug/13598 and squashes the following commits: 9669f3f [wcwagner] BUG: "Replaced isinstance with is_integer, and changed test_pad_width to use getattr" 40a3188 [wcwagner] BUG: "Switched to single test method asserting functions that use pad raise correctly." 06795db [wcwagner] BUG: "Added tests for width parameter on center, ljust, rjust, zfill." 468df3a [wcwagner] BUG: Add type check for width parameter in str.pad method GH13598 --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/strings.py | 7 ++++++- pandas/tests/test_strings.py | 9 +++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a69617bfbec55..99396f6cfbc89 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -583,7 +583,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - +- Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6ec28f9735850..3150fc5d0143a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,7 +8,8 @@ is_object_dtype, is_string_like, is_list_like, - is_scalar) + is_scalar, + is_integer) from pandas.core.common import _values_from_object from pandas.core.algorithms import take_1d @@ -914,6 +915,10 @@ def str_pad(arr, width, side='left', fillchar=' '): if len(fillchar) != 1: raise TypeError('fillchar must be a character, not str') + if not is_integer(width): + msg = 'width must be of integer type, not {0}' + raise TypeError(msg.format(type(width).__name__)) + if side == 'left': f = lambda x: x.rjust(width, fillchar) elif side == 'right': diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4d23bed620265..fcdbec8fbc5c4 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1603,6 +1603,15 @@ def test_pad_fillchar(self): "fillchar must be a character, not int"): result = values.str.pad(5, fillchar=5) + def test_pad_width(self): + # GH 13598 + s = Series(['1', '22', 'a', 'bb']) + + for f in ['center', 'ljust', 'rjust', 'zfill', 'pad']: + with tm.assertRaisesRegexp(TypeError, + "width must be of integer type, not*"): + getattr(s.str, f)('f') + def test_translate(self): def _check(result, expected): From 9f635cd74316d26110809bf1bb2a5525ac4d23fe Mon Sep 17 00:00:00 2001 From: yui-knk Date: Mon, 18 Jul 2016 21:12:07 -0400 Subject: [PATCH 114/359] BUG: Cast a key to NaT before get loc from Index closes #13603 Author: yui-knk Closes #13687 from yui-knk/fix_13603 and squashes the 
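A minimal sketch of the stricter width check added to the padding methods above (assuming pandas 0.19.0; the data is only illustrative):

    import pandas as pd

    s = pd.Series(['1', '22', 'a', 'bb'])

    s.str.pad(5, fillchar='*')   # integer widths behave as before
    s.str.center(5)
    s.str.zfill(5)

    # a non-integer width now fails up front with a clear message
    # s.str.pad('5')  ->  TypeError: width must be of integer type, not str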
following commits: 0960395 [yui-knk] BUG: Cast a key to NaT before get loc from Index --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/tdi.py | 6 +++++- pandas/tseries/tests/test_timedeltas.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 99396f6cfbc89..fd58eb1b00171 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -584,6 +584,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) +- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index f9fb51ebf710c..78ab333be8ea5 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -697,6 +697,10 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ + + if isnull(key): + key = tslib.NaT + if tolerance is not None: # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below @@ -754,7 +758,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - if is_integer(key) or is_float(key): + if is_integer(key) or is_float(key) or key is tslib.NaT: self._invalid_indexer('slice', key) loc = self._partial_td_slice(key, freq, use_lhs=use_lhs, use_rhs=use_rhs) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 4f985998d5e20..36ae479c3dfcc 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -30,6 +30,25 @@ class TestTimedeltas(tm.TestCase): def setUp(self): pass + def test_get_loc_nat(self): + tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) + + self.assertEqual(tidx.get_loc(pd.NaT), 1) + self.assertEqual(tidx.get_loc(None), 1) + self.assertEqual(tidx.get_loc(float('nan')), 1) + self.assertEqual(tidx.get_loc(np.nan), 1) + + def test_contains(self): + # Checking for any NaT-like objects + # GH 13603 + td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + for v in [pd.NaT, None, float('nan'), np.nan]: + self.assertFalse((v in td)) + + td = to_timedelta([pd.NaT]) + for v in [pd.NaT, None, float('nan'), np.nan]: + self.assertTrue((v in td)) + def test_construction(self): expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') From b05453631270d4b78f79dc272222d5f3fe499ad7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 18 Jul 2016 21:15:04 -0400 Subject: [PATCH 115/359] BUG: merge_asof not handling allow_exact_matches and tolerance on first entry closes #13695 Author: Jeff Reback Closes #13698 from jreback/merge_asof and squashes the following commits: c46dcfa [Jeff Reback] BUG: merge_asof not handling allow_exact_matches and tolerance on first entry --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/src/join.pyx | 18 ++++++++------- pandas/tools/tests/test_merge_asof.py | 33 
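A minimal sketch of the NaT handling this fix gives TimedeltaIndex lookups (assuming pandas 0.19.0):

    import numpy as np
    import pandas as pd

    tdi = pd.TimedeltaIndex(['1 days 01:00:00', pd.NaT, '2 days 01:00:00'])

    # any NaT-like key (pd.NaT, None, float('nan'), np.nan) is coerced to NaT first
    tdi.get_loc(pd.NaT)   # 1
    tdi.get_loc(np.nan)   # 1

    # membership checks no longer report a missing NaT as present
    td = pd.to_timedelta(range(5), unit='d')
    pd.NaT in td          # False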
+++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index fd58eb1b00171..e728cb7910134 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -46,7 +46,7 @@ The following are now part of this API: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to -support asof style joining of time-series. (:issue:`1870`). Full documentation is +support asof style joining of time-series. (:issue:`1870`, :issue:`13695`). Full documentation is :ref:`here ` The :func:`merge_asof` performs an asof merge, which is similar to a left-join diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index a81ac0aa35d4e..ad3b1d4e4a90e 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -193,11 +193,12 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, diff = left_val - right_val # do we allow exact matches - if allow_exact_matches and diff > tol: - right_indexer[indexer] = -1 - continue + if allow_exact_matches: + if diff > tol: + right_indexer[indexer] = -1 + continue elif not allow_exact_matches: - if diff >= tol: + if diff >= tol or lc == rc: right_indexer[indexer] = -1 continue @@ -220,13 +221,14 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, diff = left_val - right_val # do we allow exact matches - if allow_exact_matches and diff > tol: - right_indexer[indexer] = -1 - continue + if allow_exact_matches: + if diff > tol: + right_indexer[indexer] = -1 + continue # we don't allow exact matches elif not allow_exact_matches: - if diff >= tol or not right_pos: + if diff >= tol or lc == rc: right_indexer[indexer] = -1 else: right_indexer[indexer] = right_pos - 1 diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index 5d78ccf199ed3..bcbb0f0fadb49 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -347,6 +347,39 @@ def test_allow_exact_matches_and_tolerance(self): expected = self.allow_exact_matches_and_tolerance assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance2(self): + # GH 13695 + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob']}) + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.000', + '2016-07-15 13:30:00.030']), + 'version': [1, 2]}) + + result = pd.merge_asof(df1, df2, on='time') + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [2]}) + assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [1]}) + assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, + tolerance=pd.Timedelta('10ms')) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [np.nan]}) + assert_frame_equal(result, expected) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 361a2b4f90e8c536934c6bd652830ef4950b43aa Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 18 Jul 2016 21:17:50 -0400 Subject: [PATCH 116/359] CLN: removed pandas.sandbox xref #9615 
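A minimal sketch of the corrected merge_asof behaviour on the first entry (assuming pandas 0.19.0; the frames mirror the test case above):

    import pandas as pd

    trades = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
        'username': ['bob']})
    quotes = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.000',
                                '2016-07-15 13:30:00.030']),
        'version': [1, 2]})

    pd.merge_asof(trades, quotes, on='time')
    # exact match allowed -> version 2

    pd.merge_asof(trades, quotes, on='time', allow_exact_matches=False)
    # the first entry now falls back to version 1 instead of matching exactly

    pd.merge_asof(trades, quotes, on='time', allow_exact_matches=False,
                  tolerance=pd.Timedelta('10ms'))
    # and with a tolerance it is left as NaN rather than a spurious match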
Author: gfyoung Closes #13670 from gfyoung/sandbox-removal and squashes the following commits: 2a014aa [gfyoung] CLN: removed pandas.sandbox --- doc/source/ecosystem.rst | 6 ++ doc/source/faq.rst | 78 +---------------- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/api/tests/test_api.py | 2 +- pandas/sandbox/__init__.py | 0 pandas/sandbox/qtpandas.py | 145 -------------------------------- setup.py | 1 - 7 files changed, 11 insertions(+), 222 deletions(-) delete mode 100644 pandas/sandbox/__init__.py delete mode 100644 pandas/sandbox/qtpandas.py diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 8fafe8ec9eaa2..0d010b47f393a 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -93,6 +93,12 @@ targets the IPython Notebook environment. `Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Pandas-Qt `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spun off from the main pandas library, the `Pandas-Qt `__ +library enables DataFrame visualization and manipulation in PyQt4 and PySide applications. + .. _ecosystem.ide: IDE diff --git a/doc/source/faq.rst b/doc/source/faq.rst index e5d659cc31606..d23e0ca59254d 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -110,78 +110,6 @@ details. Visualizing Data in Qt applications ----------------------------------- -.. warning:: - - The ``qt`` support is **deprecated and will be removed in a future version**. - We refer users to the external package `pandas-qt `_. - -There is experimental support for visualizing DataFrames in PyQt4 and PySide -applications. At the moment you can display and edit the values of the cells -in the DataFrame. Qt will take care of displaying just the portion of the -DataFrame that is currently visible and the edits will be immediately saved to -the underlying DataFrame - -To demonstrate this we will create a simple PySide application that will switch -between two editable DataFrames. For this will use the ``DataFrameModel`` class -that handles the access to the DataFrame, and the ``DataFrameWidget``, which is -just a thin layer around the ``QTableView``. - -.. 
code-block:: python - - import numpy as np - import pandas as pd - from pandas.sandbox.qtpandas import DataFrameModel, DataFrameWidget - from PySide import QtGui, QtCore - - # Or if you use PyQt4: - # from PyQt4 import QtGui, QtCore - - class MainWidget(QtGui.QWidget): - def __init__(self, parent=None): - super(MainWidget, self).__init__(parent) - - # Create two DataFrames - self.df1 = pd.DataFrame(np.arange(9).reshape(3, 3), - columns=['foo', 'bar', 'baz']) - self.df2 = pd.DataFrame({ - 'int': [1, 2, 3], - 'float': [1.5, 2.5, 3.5], - 'string': ['a', 'b', 'c'], - 'nan': [np.nan, np.nan, np.nan] - }, index=['AAA', 'BBB', 'CCC'], - columns=['int', 'float', 'string', 'nan']) - - # Create the widget and set the first DataFrame - self.widget = DataFrameWidget(self.df1) - - # Create the buttons for changing DataFrames - self.button_first = QtGui.QPushButton('First') - self.button_first.clicked.connect(self.on_first_click) - self.button_second = QtGui.QPushButton('Second') - self.button_second.clicked.connect(self.on_second_click) - - # Set the layout - vbox = QtGui.QVBoxLayout() - vbox.addWidget(self.widget) - hbox = QtGui.QHBoxLayout() - hbox.addWidget(self.button_first) - hbox.addWidget(self.button_second) - vbox.addLayout(hbox) - self.setLayout(vbox) - - def on_first_click(self): - '''Sets the first DataFrame''' - self.widget.setDataFrame(self.df1) - - def on_second_click(self): - '''Sets the second DataFrame''' - self.widget.setDataFrame(self.df2) - - if __name__ == '__main__': - import sys - - # Initialize the application - app = QtGui.QApplication(sys.argv) - mw = MainWidget() - mw.show() - app.exec_() +There is no support for such visualization in pandas. However, the external +package `pandas-qt `_ does +provide this functionality. diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e728cb7910134..0107bdea542d6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -505,6 +505,7 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 3f6c97441d659..0aefdbeae0518 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -28,7 +28,7 @@ class TestPDApi(Base, tm.TestCase): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'rpy', 'sandbox', 'locale'] + ignored = ['tests', 'rpy', 'locale'] # top-level sub-packages lib = ['api', 'compat', 'computation', 'core', diff --git a/pandas/sandbox/__init__.py b/pandas/sandbox/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/sandbox/qtpandas.py b/pandas/sandbox/qtpandas.py deleted file mode 100644 index b6af40a0e2156..0000000000000 --- a/pandas/sandbox/qtpandas.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Easy integration of DataFrame into pyqt framework - -@author: Jev Kuznetsov -""" - -# flake8: noqa - -# GH9615 - -import warnings -warnings.warn("The pandas.sandbox.qtpandas module is deprecated and will be " - 
"removed in a future version. We refer users to the external package " - "here: https://github.com/datalyze-solutions/pandas-qt") - -try: - from PyQt4.QtCore import QAbstractTableModel, Qt, QVariant, QModelIndex - from PyQt4.QtGui import ( - QApplication, QDialog, QVBoxLayout, QTableView, QWidget) -except ImportError: - from PySide.QtCore import QAbstractTableModel, Qt, QModelIndex - from PySide.QtGui import ( - QApplication, QDialog, QVBoxLayout, QTableView, QWidget) - QVariant = lambda value=None: value - -from pandas import DataFrame, Index - - -class DataFrameModel(QAbstractTableModel): - """ data model for a DataFrame class """ - def __init__(self): - super(DataFrameModel, self).__init__() - self.df = DataFrame() - - def setDataFrame(self, dataFrame): - self.df = dataFrame - - def signalUpdate(self): - """ tell viewers to update their data (this is full update, not - efficient)""" - self.layoutChanged.emit() - - #------------- table display functions ----------------- - def headerData(self, section, orientation, role=Qt.DisplayRole): - if role != Qt.DisplayRole: - return QVariant() - - if orientation == Qt.Horizontal: - try: - return self.df.columns.tolist()[section] - except (IndexError, ): - return QVariant() - elif orientation == Qt.Vertical: - try: - # return self.df.index.tolist() - return self.df.index.tolist()[section] - except (IndexError, ): - return QVariant() - - def data(self, index, role=Qt.DisplayRole): - if role != Qt.DisplayRole: - return QVariant() - - if not index.isValid(): - return QVariant() - - return QVariant(str(self.df.ix[index.row(), index.column()])) - - def flags(self, index): - flags = super(DataFrameModel, self).flags(index) - flags |= Qt.ItemIsEditable - return flags - - def setData(self, index, value, role): - row = self.df.index[index.row()] - col = self.df.columns[index.column()] - if hasattr(value, 'toPyObject'): - # PyQt4 gets a QVariant - value = value.toPyObject() - else: - # PySide gets an unicode - dtype = self.df[col].dtype - if dtype != object: - value = None if value == '' else dtype.type(value) - self.df.set_value(row, col, value) - return True - - def rowCount(self, index=QModelIndex()): - return self.df.shape[0] - - def columnCount(self, index=QModelIndex()): - return self.df.shape[1] - - -class DataFrameWidget(QWidget): - """ a simple widget for using DataFrames in a gui """ - def __init__(self, dataFrame, parent=None): - super(DataFrameWidget, self).__init__(parent) - - self.dataModel = DataFrameModel() - self.dataTable = QTableView() - self.dataTable.setModel(self.dataModel) - - layout = QVBoxLayout() - layout.addWidget(self.dataTable) - self.setLayout(layout) - # Set DataFrame - self.setDataFrame(dataFrame) - - def setDataFrame(self, dataFrame): - self.dataModel.setDataFrame(dataFrame) - self.dataModel.signalUpdate() - self.dataTable.resizeColumnsToContents() - -#-----------------stand alone test code - - -def testDf(): - """ creates test dataframe """ - data = {'int': [1, 2, 3], 'float': [1.5, 2.5, 3.5], - 'string': ['a', 'b', 'c'], 'nan': [np.nan, np.nan, np.nan]} - return DataFrame(data, index=Index(['AAA', 'BBB', 'CCC']), - columns=['int', 'float', 'string', 'nan']) - - -class Form(QDialog): - def __init__(self, parent=None): - super(Form, self).__init__(parent) - - df = testDf() # make up some data - widget = DataFrameWidget(df) - widget.resizeColumnsToContents() - - layout = QVBoxLayout() - layout.addWidget(widget) - self.setLayout(layout) - -if __name__ == '__main__': - import sys - import numpy as np - - app = 
QApplication(sys.argv) - form = Form() - form.show() - app.exec_() diff --git a/setup.py b/setup.py index c77ca4d9e60fe..0bff49c4976b8 100755 --- a/setup.py +++ b/setup.py @@ -560,7 +560,6 @@ def pxd(name): 'pandas.io.sas', 'pandas.formats', 'pandas.rpy', - 'pandas.sandbox', 'pandas.sparse', 'pandas.sparse.tests', 'pandas.stats', From 1e1e9b348bb3f256c2b4997db090a1d35da9938b Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 18 Jul 2016 21:28:01 -0400 Subject: [PATCH 117/359] DEPR: Remove legacy offsets Follow-up of #10951. Remove legacy offsets deprecated in 0.17.0. Author: sinhrks Closes #13590 from sinhrks/depr_legacy_offset and squashes the following commits: 2593b1f [sinhrks] DEPR: Remove legacy offsets --- doc/source/timeseries.rst | 46 +----- doc/source/whatsnew/v0.19.0.txt | 11 ++ pandas/tseries/frequencies.py | 108 ++------------ pandas/tseries/tests/test_base.py | 16 ++- pandas/tseries/tests/test_frequencies.py | 39 +++--- pandas/tseries/tests/test_offsets.py | 25 ++-- pandas/tseries/tests/test_period.py | 170 ++++++----------------- pandas/tseries/tests/test_tslib.py | 14 +- 8 files changed, 118 insertions(+), 311 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 7e832af14c051..f6a1e169afe9d 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -752,7 +752,7 @@ calculate significantly slower and will raise a ``PerformanceWarning`` rng + BQuarterEnd() -.. _timeseries.alias: +.. _timeseries.custombusinessdays: Custom Business Days (Experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -953,6 +953,8 @@ You can use keyword arguments suported by either ``BusinessHour`` and ``CustomBu # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 +.. _timeseries.alias: + Offset Aliases ~~~~~~~~~~~~~~ @@ -1103,48 +1105,6 @@ it is rolled forward to the next anchor point. pd.Timestamp('2014-01-01') + MonthBegin(n=0) pd.Timestamp('2014-01-31') + MonthEnd(n=0) -.. _timeseries.legacyaliases: - -Legacy Aliases -~~~~~~~~~~~~~~ -Note that prior to v0.8.0, time rules had a slightly different look. These are -deprecated in v0.17.0, and removed in future version. - -.. csv-table:: - :header: "Legacy Time Rule", "Offset Alias" - :widths: 15, 65 - - "WEEKDAY", "B" - "EOM", "BM" - "W\@MON", "W\-MON" - "W\@TUE", "W\-TUE" - "W\@WED", "W\-WED" - "W\@THU", "W\-THU" - "W\@FRI", "W\-FRI" - "W\@SAT", "W\-SAT" - "W\@SUN", "W\-SUN" - "Q\@JAN", "BQ\-JAN" - "Q\@FEB", "BQ\-FEB" - "Q\@MAR", "BQ\-MAR" - "A\@JAN", "BA\-JAN" - "A\@FEB", "BA\-FEB" - "A\@MAR", "BA\-MAR" - "A\@APR", "BA\-APR" - "A\@MAY", "BA\-MAY" - "A\@JUN", "BA\-JUN" - "A\@JUL", "BA\-JUL" - "A\@AUG", "BA\-AUG" - "A\@SEP", "BA\-SEP" - "A\@OCT", "BA\-OCT" - "A\@NOV", "BA\-NOV" - "A\@DEC", "BA\-DEC" - - -As you can see, legacy quarterly and annual frequencies are business quarters -and business year ends. Please also note the legacy time rule for milliseconds -``ms`` versus the new offset alias for month start ``MS``. This means that -offset alias parsing is case sensitive. - .. 
_timeseries.holiday: Holidays / Holiday Calendars diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0107bdea542d6..5a1b5041cb521 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -510,6 +510,17 @@ Removal of prior version deprecations/changes - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) +- Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) + + Previous Behavior: + + .. code-block:: ipython + + In [2]: pd.date_range('2016-07-01', freq='W@MON', periods=3) + pandas/tseries/frequencies.py:465: FutureWarning: Freq "W@MON" is deprecated, use "W-MON" as alternative. + Out[2]: DatetimeIndex(['2016-07-04', '2016-07-11', '2016-07-18'], dtype='datetime64[ns]', freq='W-MON') + + Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` .. _whatsnew_0190.performance: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e2132deb97d64..8b3785d78d260 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1,5 +1,5 @@ from datetime import timedelta -from pandas.compat import range, long, zip +from pandas.compat import long, zip from pandas import compat import re import warnings @@ -356,34 +356,6 @@ def get_period_alias(offset_str): """ alias to closest period strings BQ->Q etc""" return _offset_to_period_map.get(offset_str, None) -_rule_aliases = { - # Legacy rules that will continue to map to their original values - # essentially for the rest of time - 'WEEKDAY': 'B', - 'EOM': 'BM', - 'W@MON': 'W-MON', - 'W@TUE': 'W-TUE', - 'W@WED': 'W-WED', - 'W@THU': 'W-THU', - 'W@FRI': 'W-FRI', - 'W@SAT': 'W-SAT', - 'W@SUN': 'W-SUN', - 'Q@JAN': 'BQ-JAN', - 'Q@FEB': 'BQ-FEB', - 'Q@MAR': 'BQ-MAR', - 'A@JAN': 'BA-JAN', - 'A@FEB': 'BA-FEB', - 'A@MAR': 'BA-MAR', - 'A@APR': 'BA-APR', - 'A@MAY': 'BA-MAY', - 'A@JUN': 'BA-JUN', - 'A@JUL': 'BA-JUL', - 'A@AUG': 'BA-AUG', - 'A@SEP': 'BA-SEP', - 'A@OCT': 'BA-OCT', - 'A@NOV': 'BA-NOV', - 'A@DEC': 'BA-DEC', -} _lite_rule_alias = { 'W': 'W-SUN', @@ -401,17 +373,6 @@ def get_period_alias(offset_str): 'ns': 'N' } -# TODO: Can this be killed? -for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): - for _iweek in range(4): - _name = 'WOM-%d%s' % (_iweek + 1, _weekday) - _rule_aliases[_name.replace('-', '@')] = _name - -# Note that _rule_aliases is not 1:1 (d[BA]==d[A@DEC]), and so traversal -# order matters when constructing an inverse. we pick one. 
#2331 -# Used in get_legacy_offset_name -_legacy_reverse_map = dict((v, k) for k, v in - reversed(sorted(compat.iteritems(_rule_aliases)))) _name_to_offset_map = {'days': Day(1), 'hours': Hour(1), @@ -422,6 +383,9 @@ def get_period_alias(offset_str): 'nanoseconds': Nano(1)} +_INVALID_FREQ_ERROR = "Invalid frequency: {0}" + + def to_offset(freqstr): """ Return DateOffset object from string representation or @@ -460,7 +424,7 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) else: delta = None @@ -479,10 +443,10 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) if delta is None: - raise ValueError('Unable to understand %s as a frequency' % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) return delta @@ -526,9 +490,6 @@ def get_base_alias(freqstr): _dont_uppercase = set(('MS', 'ms')) -_LEGACY_FREQ_WARNING = 'Freq "{0}" is deprecated, use "{1}" as alternative.' - - def get_offset(name): """ Return DateOffset object associated with rule name @@ -539,27 +500,9 @@ def get_offset(name): """ if name not in _dont_uppercase: name = name.upper() - - if name in _rule_aliases: - new = _rule_aliases[name] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new - elif name.lower() in _rule_aliases: - new = _rule_aliases[name.lower()] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new - name = _lite_rule_alias.get(name, name) name = _lite_rule_alias.get(name.lower(), name) - else: - if name in _rule_aliases: - new = _rule_aliases[name] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new name = _lite_rule_alias.get(name, name) if name not in _offset_map: @@ -571,7 +514,7 @@ def get_offset(name): offset = klass._from_name(*split[1:]) except (ValueError, TypeError, KeyError): # bad prefix or suffix - raise ValueError('Bad rule name requested: %s.' 
% name) + raise ValueError(_INVALID_FREQ_ERROR.format(name)) # cache _offset_map[name] = offset # do not return cache because it's mutable @@ -595,17 +538,6 @@ def get_offset_name(offset): return offset.freqstr -def get_legacy_offset_name(offset): - """ - Return the pre pandas 0.8.0 name for the date offset - """ - - # This only used in test_timeseries_legacy.py - - name = offset.name - return _legacy_reverse_map.get(name, name) - - def get_standard_freq(freq): """ Return the standardized frequency string @@ -796,36 +728,18 @@ def _period_alias_dictionary(): def _period_str_to_code(freqstr): - # hack - if freqstr in _rule_aliases: - new = _rule_aliases[freqstr] - warnings.warn(_LEGACY_FREQ_WARNING.format(freqstr, new), - FutureWarning, stacklevel=3) - freqstr = new freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: lower = freqstr.lower() - if lower in _rule_aliases: - new = _rule_aliases[lower] - warnings.warn(_LEGACY_FREQ_WARNING.format(lower, new), - FutureWarning, stacklevel=3) - freqstr = new freqstr = _lite_rule_alias.get(lower, freqstr) + if freqstr not in _dont_uppercase: + freqstr = freqstr.upper() try: - if freqstr not in _dont_uppercase: - freqstr = freqstr.upper() return _period_code_map[freqstr] except KeyError: - try: - alias = _period_alias_dict[freqstr] - warnings.warn(_LEGACY_FREQ_WARNING.format(freqstr, alias), - FutureWarning, stacklevel=3) - except KeyError: - raise ValueError("Unknown freqstr: %s" % freqstr) - - return _period_code_map[alias] + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) def infer_freq(index, warn=True): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 958a10c329a46..6c996285369b8 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -160,9 +160,11 @@ def test_round(self): tm.assert_index_equal(rng.round(freq='H'), expected_rng) self.assertEqual(elt.round(freq='H'), expected_elt) - msg = "Could not evaluate foo" - tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='foo') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): + rng.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') msg = " is a non-fixed frequency" tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') @@ -847,9 +849,11 @@ def test_round(self): tm.assert_index_equal(td.round(freq='H'), expected_rng) self.assertEqual(elt.round(freq='H'), expected_elt) - msg = "Could not evaluate foo" - tm.assertRaisesRegexp(ValueError, msg, td.round, freq='foo') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + td.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') msg = " is a non-fixed frequency" tm.assertRaisesRegexp(ValueError, msg, td.round, freq='M') diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 1f06b7ad4361b..268933fada7a2 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -245,10 +245,10 @@ def _assert_depr(freq, expected, aliases): assert isinstance(aliases, list) assert (frequencies._period_str_to_code(freq) == expected) + msg = frequencies._INVALID_FREQ_ERROR for alias in aliases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert 
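A minimal sketch of the stricter alias handling (assuming pandas 0.19.0, where the legacy rules are removed):

    import pandas as pd

    # the current offset aliases are unchanged
    pd.date_range('2016-07-01', periods=3, freq='W-MON')
    pd.Period('2016-03', freq='M')

    # the pre-0.8.0 legacy rules now raise instead of warning
    try:
        pd.date_range('2016-07-01', periods=3, freq='W@MON')
    except ValueError as err:
        print(err)   # Invalid frequency: W@MON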
(frequencies._period_str_to_code(alias) == expected) + with tm.assertRaisesRegexp(ValueError, msg): + frequencies._period_str_to_code(alias) _assert_depr("M", 3000, ["MTH", "MONTH", "MONTHLY"]) @@ -699,8 +699,9 @@ def test_series(self): s = Series(period_range('2013', periods=10, freq=freq)) self.assertRaises(TypeError, lambda: frequencies.infer_freq(s)) for freq in ['Y']: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + + msg = frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): s = Series(period_range('2013', periods=10, freq=freq)) self.assertRaises(TypeError, lambda: frequencies.infer_freq(s)) @@ -715,17 +716,23 @@ def test_series(self): self.assertEqual(inferred, 'D') def test_legacy_offset_warnings(self): - for k, v in compat.iteritems(frequencies._rule_aliases): - with tm.assert_produces_warning(FutureWarning): - result = frequencies.get_offset(k) - exp = frequencies.get_offset(v) - self.assertEqual(result, exp) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - idx = date_range('2011-01-01', periods=5, freq=k) - exp = date_range('2011-01-01', periods=5, freq=v) - self.assert_index_equal(idx, exp) + freqs = ['WEEKDAY', 'EOM', 'W@MON', 'W@TUE', 'W@WED', 'W@THU', + 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', + 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN', + 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', + 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', 'WOM@4MON', + 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', 'WOM@4TUE', + 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', 'WOM@4WED', + 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', 'WOM@4THU' + 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', 'WOM@4FRI'] + + msg = frequencies._INVALID_FREQ_ERROR + for freq in freqs: + with tm.assertRaisesRegexp(ValueError, msg): + frequencies.get_offset(freq) + + with tm.assertRaisesRegexp(ValueError, msg): + date_range('2011-01-01', periods=5, freq=freq) MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 5965a661699a6..b31e4d54c551f 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -23,7 +23,7 @@ from pandas.core.series import Series from pandas.tseries.frequencies import (_offset_map, get_freq_code, - _get_freq_str) + _get_freq_str, _INVALID_FREQ_ERROR) from pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache from pandas.tseries.tools import parse_time_string, DateParseError import pandas.tseries.offsets as offsets @@ -4531,8 +4531,11 @@ def test_get_offset_name(self): def test_get_offset(): - assertRaisesRegexp(ValueError, "rule.*GIBBERISH", get_offset, 'gibberish') - assertRaisesRegexp(ValueError, "rule.*QS-JAN-B", get_offset, 'QS-JAN-B') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset('gibberish') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset('QS-JAN-B') + pairs = [ ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), @@ -4558,10 +4561,8 @@ def test_get_offset(): def test_get_offset_legacy(): pairs = [('w@Sat', Week(weekday=5))] for name, expected in pairs: - with tm.assert_produces_warning(FutureWarning): - offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset(name) class TestParseTimeString(tm.TestCase): @@ 
-4595,16 +4596,14 @@ def test_get_standard_freq(): assert fstr == get_standard_freq('1w') assert fstr == get_standard_freq(('W', 1)) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = get_standard_freq('WeEk') - assert fstr == result + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_standard_freq('WeEk') fstr = get_standard_freq('5Q') assert fstr == get_standard_freq('5q') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = get_standard_freq('5QuarTer') - assert fstr == result + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_standard_freq('5QuarTer') assert fstr == get_standard_freq(('q', 5)) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 8d217ff0753a6..c90cbbf80086a 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -451,13 +451,16 @@ def test_period_deprecated_freq(self): "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR for exp, freqs in iteritems(cases): for freq in freqs: + with self.assertRaisesRegexp(ValueError, msg): + Period('2016-03-01 09:00', freq=freq) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = pd.Period('2016-03-01 09:00', freq=freq) - self.assertEqual(res, Period('2016-03-01 09:00', freq=exp)) + # check supported freq-aliases still works + p = Period('2016-03-01 09:00', freq=exp) + tm.assertIsInstance(p, Period) def test_repr(self): p = Period('Jan-2000') @@ -659,19 +662,21 @@ def test_properties_weekly(self): def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. - with tm.assert_produces_warning(FutureWarning): - w_date = Period(freq='WK', year=2007, month=1, day=7) - # + w_date = Period(freq='W', year=2007, month=1, day=7) self.assertEqual(w_date.year, 2007) self.assertEqual(w_date.quarter, 1) self.assertEqual(w_date.month, 1) self.assertEqual(w_date.week, 1) self.assertEqual((w_date - 1).week, 52) self.assertEqual(w_date.days_in_month, 31) - with tm.assert_produces_warning(FutureWarning): - exp = Period(freq='WK', year=2012, month=2, day=1) + + exp = Period(freq='W', year=2012, month=2, day=1) self.assertEqual(exp.days_in_month, 29) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=7) + def test_properties_daily(self): # Test properties on Periods with daily frequency. 
b_date = Period(freq='B', year=2007, month=1, day=1) @@ -819,10 +824,11 @@ def test_asfreq_MS(self): self.assertEqual(initial.asfreq(freq="M", how="S"), Period('2013-01', 'M')) - with self.assertRaisesRegexp(ValueError, "Unknown freqstr"): + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): initial.asfreq(freq="MS", how="S") - with tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS"): + with tm.assertRaisesRegexp(ValueError, msg): pd.Period('2013-01', 'MS') self.assertTrue(_period_code_map.get("MS") is None) @@ -1122,123 +1128,28 @@ def test_conv_weekly(self): self.assertEqual(ival_W.asfreq('W'), ival_W) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + ival_W.asfreq('WK') + def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency - - with tm.assert_produces_warning(FutureWarning): - ival_W = Period(freq='WK', year=2007, month=1, day=1) - - with tm.assert_produces_warning(FutureWarning): - ival_WSUN = Period(freq='WK', year=2007, month=1, day=7) - with tm.assert_produces_warning(FutureWarning): - ival_WSAT = Period(freq='WK-SAT', year=2007, month=1, day=6) - with tm.assert_produces_warning(FutureWarning): - ival_WFRI = Period(freq='WK-FRI', year=2007, month=1, day=5) - with tm.assert_produces_warning(FutureWarning): - ival_WTHU = Period(freq='WK-THU', year=2007, month=1, day=4) - with tm.assert_produces_warning(FutureWarning): - ival_WWED = Period(freq='WK-WED', year=2007, month=1, day=3) - with tm.assert_produces_warning(FutureWarning): - ival_WTUE = Period(freq='WK-TUE', year=2007, month=1, day=2) - with tm.assert_produces_warning(FutureWarning): - ival_WMON = Period(freq='WK-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_year = Period(freq='WK', year=2007, month=12, day=31) - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_quarter = Period(freq='WK', year=2007, month=3, - day=31) - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_month = Period(freq='WK', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) - else: - ival_W_to_A_end_of_year = Period(freq='A', year=2008) - - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = 
Period(freq='Q', year=2007, quarter=1) - else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) - - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) - else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) - self.assertEqual(ival_W_end_of_year.asfreq('A'), - ival_W_to_A_end_of_year) - self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) - self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) - self.assertEqual(ival_W_end_of_month.asfreq('M'), - ival_W_to_M_end_of_month) - - self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - with tm.assert_produces_warning(FutureWarning): - self.assertEqual(ival_W.asfreq('WK'), ival_W) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=1) + + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-SAT', year=2007, month=1, day=6) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-FRI', year=2007, month=1, day=5) + with 
self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-THU', year=2007, month=1, day=4) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-WED', year=2007, month=1, day=3) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-TUE', year=2007, month=1, day=2) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-MON', year=2007, month=1, day=1) def test_conv_business(self): # frequency conversion tests: from Business Frequency" @@ -2894,11 +2805,14 @@ def test_to_period_monthish(self): prng = rng.to_period() self.assertEqual(prng.freq, 'M') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rng = date_range('01-Jan-2012', periods=8, freq='EOM') + rng = date_range('01-Jan-2012', periods=8, freq='M') prng = rng.to_period() self.assertEqual(prng.freq, 'M') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + date_range('01-Jan-2012', periods=8, freq='EOM') + def test_multiples(self): result1 = Period('1989', freq='2A') result2 = Period('1989', freq='A') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 6696c03a070f7..f30f01e66cb0b 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1422,16 +1422,14 @@ def _check_round(freq, expected): result = stamp.round(freq=freq) self.assertEqual(result, expected) - for freq, expected in [ - ('D', Timestamp('2000-01-05 00:00:00')), - ('H', Timestamp('2000-01-05 05:00:00')), - ('S', Timestamp('2000-01-05 05:09:15')) - ]: + for freq, expected in [('D', Timestamp('2000-01-05 00:00:00')), + ('H', Timestamp('2000-01-05 05:00:00')), + ('S', Timestamp('2000-01-05 05:09:15'))]: _check_round(freq, expected) - msg = "Could not evaluate" - tm.assertRaisesRegexp(ValueError, msg, - stamp.round, 'foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + stamp.round('foo') class TestTimestampOps(tm.TestCase): From 006bd0b1c2f3ff183c1834a27305a1a3039011d8 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 18 Jul 2016 21:42:39 -0400 Subject: [PATCH 118/359] CLN: removed setter method of categorical's ordered attribute xref #9611 Author: gfyoung Closes #13671 from gfyoung/cat-set-order-removal and squashes the following commits: 58938e7 [gfyoung] CLN: removed setter method of categorical's ordered attribute --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 8 +------- pandas/tests/test_categorical.py | 11 +++++------ 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5a1b5041cb521..053028d896466 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -508,6 +508,7 @@ Removal of prior version deprecations/changes - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) +- ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 
0.8.0) (:issue:`13590`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a26cc5125db78..39e140e962821 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -571,12 +571,6 @@ def _get_categories(self): _ordered = None - def _set_ordered(self, value): - """ Sets the ordered attribute to the boolean value """ - warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", - FutureWarning, stacklevel=2) - self.set_ordered(value, inplace=True) - def set_ordered(self, value, inplace=False): """ Sets the ordered attribute to the boolean value @@ -624,7 +618,7 @@ def _get_ordered(self): """ Gets the ordered attribute """ return self._ordered - ordered = property(fget=_get_ordered, fset=_set_ordered) + ordered = property(fget=_get_ordered) def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 1edd9443fe356..35b1b8c1bf341 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -808,13 +808,12 @@ def test_set_ordered(self): cat2.set_ordered(False, inplace=True) self.assertFalse(cat2.ordered) - # deperecated in v0.16.0 - with tm.assert_produces_warning(FutureWarning): - cat.ordered = False - self.assertFalse(cat.ordered) - with tm.assert_produces_warning(FutureWarning): + # removed in 0.19.0 + msg = "can\'t set attribute" + with tm.assertRaisesRegexp(AttributeError, msg): cat.ordered = True - self.assertTrue(cat.ordered) + with tm.assertRaisesRegexp(AttributeError, msg): + cat.ordered = False def test_set_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) From b225cacb1d2a34e3c4041533a0590133098756fa Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Mon, 18 Jul 2016 21:46:17 -0400 Subject: [PATCH 119/359] BUG/PERF: Sort mixed-int in Py3, fix Index.difference fixes some issues from #13432 closes #12044 closes #12814 Author: Piotr Jucha Closes #13514 from pijucha/setop13432 and squashes the following commits: 3a96089 [Piotr Jucha] BUG/PERF: Sort mixed-int in Py3, fix Index.difference --- asv_bench/benchmarks/index_object.py | 55 ++++++++ doc/source/whatsnew/v0.19.0.txt | 34 ++++- pandas/core/algorithms.py | 125 ++++++++++++++---- pandas/indexes/base.py | 79 +++++++++-- pandas/tests/indexes/common.py | 39 ++++++ pandas/tests/indexes/test_base.py | 188 ++++++++++++++++++++++++--- pandas/tests/indexes/test_multi.py | 9 ++ pandas/tests/test_algos.py | 74 +++++++++++ pandas/tests/test_groupby.py | 12 ++ pandas/tools/merge.py | 14 +- pandas/tools/tests/test_join.py | 17 +++ 11 files changed, 583 insertions(+), 63 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 8c65f09937df4..a0a1b560d36f3 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -63,6 +63,27 @@ def time_index_datetime_union(self): self.rng.union(self.rng2) +class index_datetime_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.A = self.N - 20000 + self.B = self.N + 20000 + self.idx1 = DatetimeIndex(range(self.N)) + self.idx2 = DatetimeIndex(range(self.A, self.B)) + self.idx3 = DatetimeIndex(range(self.N, self.B)) + + def time_index_datetime_difference(self): + self.idx1.difference(self.idx2) + + def time_index_datetime_difference_disjoint(self): + self.idx1.difference(self.idx3) + + def time_index_datetime_symmetric_difference(self): + self.idx1.symmetric_difference(self.idx2) + + class 
index_float64_boolean_indexer(object): goal_time = 0.2 @@ -183,6 +204,40 @@ def time_index_int64_union(self): self.left.union(self.right) +class index_int64_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 500000 + self.options = np.arange(self.N) + self.left = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + self.right = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + + def time_index_int64_difference(self): + self.left.difference(self.right) + + def time_index_int64_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + +class index_str_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.strs = tm.rands_array(10, self.N) + self.left = Index(self.strs[:self.N * 2 // 3]) + self.right = Index(self.strs[self.N // 3:]) + + def time_str_difference(self): + self.left.difference(self.right) + + def time_str_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + class index_str_boolean_indexer(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 053028d896466..8d3fe84ab835e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -396,7 +396,7 @@ resulting dtype will be upcast, which is unchanged from previous. pd.merge(df1, df2, how='outer', on='key') pd.merge(df1, df2, how='outer', on='key').dtypes -.. _whatsnew_0190.describe: +.. _whatsnew_0190.api.describe: ``.describe()`` changes ^^^^^^^^^^^^^^^^^^^^^^^ @@ -485,6 +485,34 @@ New Behavior: pd.NaT + 1 pd.NaT - 1 +.. _whatsnew_0190.api.difference: + +``Index.difference`` and ``.symmetric_difference`` changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. (:issue:`13514`) + +.. ipython:: python + + idx1 = pd.Index([1, 2, 3, np.nan]) + idx2 = pd.Index([0, 1, np.nan]) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: idx1.difference(idx2) + Out[3]: Float64Index([nan, 2.0, 3.0], dtype='float64') + + In [4]: idx1.symmetric_difference(idx2) + Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64') + +New Behavior: + +.. ipython:: python + + idx1.difference(idx2) + idx1.symmetric_difference(idx2) .. _whatsnew_0190.deprecations: @@ -534,7 +562,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - +- Improved performance of ``Index.difference`` (:issue:`12044`) .. 
_whatsnew_0190.bug_fixes: @@ -629,3 +657,5 @@ Bug Fixes - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) +- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) +- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c3ba734353a8d..5cc54e61f6b2a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -163,6 +163,104 @@ def isin(comps, values): return f(comps, values) +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): + """ + Sort ``values`` and reorder corresponding ``labels``. + ``values`` should be unique if ``labels`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``labels`` is not None. + labels : list_like + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``labels`` to mark "not found". + Ignored when ``labels`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``labels`` is None. + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_labels : ndarray + Reordered ``labels``; returned when ``labels`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``labels`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``labels`` is not None and ``values`` contain duplicates. 
+ """ + if not is_list_like(values): + raise TypeError("Only list-like objects are allowed to be passed to" + "safe_sort as values") + values = np.array(values, copy=False) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, string_types) for x in values], + dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return _ensure_object(np.concatenate([nums, strs])) + + sorter = None + if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # labels: + + if labels is None: + return ordered + + if not is_list_like(labels): + raise TypeError("Only list-like objects or None are allowed to be" + "passed to safe_sort as labels") + labels = _ensure_platform_int(np.asarray(labels)) + + from pandas import Index + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if labels is not None") + + if sorter is None: + # mixed types + (hash_klass, _), values = _get_data_algo(values, _hashtables) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = _ensure_platform_int(t.lookup(ordered)) + + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = (labels < -len(values)) | (labels >= len(values)) | \ + (labels == na_sentinel) + + # (Out of bound indices will be masked with `na_sentinel` next, so we may + # deal with them here without performance loss using `mode='wrap'`.) + new_labels = reverse_indexer.take(labels, mode='wrap') + np.putmask(new_labels, mask, na_sentinel) + + return ordered, new_labels + + def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -210,33 +308,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.to_array() if sort and len(uniques) > 0: - try: - sorter = uniques.argsort() - except: - # unorderable in py3 if mixed str/int - t = hash_klass(len(uniques)) - t.map_locations(_ensure_object(uniques)) - - # order ints before strings - ordered = np.concatenate([ - np.sort(np.array([e for i, e in enumerate(uniques) if f(e)], - dtype=object)) for f in - [lambda x: not isinstance(x, string_types), - lambda x: isinstance(x, string_types)]]) - sorter = _ensure_platform_int(t.lookup( - _ensure_object(ordered))) - - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = labels < 0 - labels = reverse_indexer.take(labels) - np.putmask(labels, mask, -1) - - uniques = uniques.take(sorter) + uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, + assume_unique=True) if is_datetimetz_type: - # reset tz uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( values.tz) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b013d6ccb0b8e..71d5fdd17ee5c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1773,7 +1773,7 @@ def _get_consensus_name(self, other): else: name = None if self.name != name: - return other._shallow_copy(name=name) + return self._shallow_copy(name=name) return self def union(self, other): @@ -1920,7 +1920,8 @@ def difference(self, other): Return a new Index with elements from the index that are not 
in `other`. - This is the sorted set difference of two Index objects. + This is the set difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1946,14 +1947,27 @@ def difference(self, other): other, result_name = self._convert_can_do_setop(other) - theDiff = sorted(set(self) - set(other)) - return Index(theDiff, name=result_name) + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + + return this._shallow_copy(the_diff, name=result_name) diff = deprecate('diff', difference) def symmetric_difference(self, other, result_name=None): """ - Compute the sorted symmetric difference of two Index objects. + Compute the symmetric difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1970,9 +1984,6 @@ def symmetric_difference(self, other, result_name=None): ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. - The sorting of a result containing ``NaN`` values is not guaranteed - across Python versions. See GitHub issue #6444. - Examples -------- >>> idx1 = Index([1, 2, 3, 4]) @@ -1990,8 +2001,26 @@ def symmetric_difference(self, other, result_name=None): if result_name is None: result_name = result_name_update - the_diff = sorted(set((self.difference(other)). - union(other.difference(self)))) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) + + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + attribs = self._get_attributes_dict() attribs['name'] = result_name if 'freq' in attribs: @@ -2000,6 +2029,36 @@ def symmetric_difference(self, other, result_name=None): sym_diff = deprecate('sym_diff', symmetric_difference) + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. + + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. 
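Stepping back from the diff for a moment: the net effect of the reworked ``difference`` and ``symmetric_difference`` on a mixed-integer index can be sketched as follows. This is a minimal illustration under Python 3, reusing the index values from the tests added later in this patch; the commented results follow those tests, with integers ordered before strings as in ``safe_sort``.

.. code-block:: python

    import pandas as pd

    idx = pd.Index([0, 'a', 1, 'b', 2, 'c'])

    # no longer raises on Python 3; results are sorted with ints before strings
    idx[:4].difference(idx[3:])            # Index([0, 1, 'a'], dtype='object')
    idx[:4].symmetric_difference(idx[3:])  # Index([0, 1, 2, 'a', 'c'], dtype='object')
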
+ + Returns + ------- + uniques : index + """ + if self.is_unique and not dropna: + return self + + values = self.values + + if not self.is_unique: + values = self.unique() + + if dropna: + try: + if self.hasnans: + values = values[~isnull(values)] + except NotImplementedError: + pass + + return self._shallow_copy(values) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index d6f7493bb25f9..92560363be8fe 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -287,6 +287,45 @@ def test_duplicates(self): self.assertEqual(result.name, 'foo') self.assert_index_equal(result, Index([ind[0]], name='foo')) + def test_get_unique_index(self): + for ind in self.indices.values(): + + # MultiIndex tested separately + if not len(ind) or isinstance(ind, MultiIndex): + continue + + idx = ind[[0] * 5] + idx_unique = ind[[0]] + # We test against `idx_unique`, so first we make sure it's unique + # and doesn't contain nans. + self.assertTrue(idx_unique.is_unique) + try: + self.assertFalse(idx_unique.hasnans) + except NotImplementedError: + pass + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assert_index_equal(result, idx_unique) + + # nans: + + if not ind._can_hold_na: + continue + + vals = ind.values[[0] * 5] + vals[0] = np.nan + vals_unique = vals[:2] + idx_nan = ind._shallow_copy(vals) + idx_unique_nan = ind._shallow_copy(vals_unique) + self.assertTrue(idx_unique_nan.is_unique) + + for dropna, expected in zip([False, True], + [idx_unique_nan, idx_unique]): + for i in [idx_nan, idx_unique_nan]: + result = i._get_unique_index(dropna=dropna) + self.assert_index_equal(result, expected) + def test_sort(self): for ind in self.indices.values(): self.assertRaises(TypeError, ind.sort) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 06662e52e3a6f..cc5dd24292bb8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -640,47 +640,56 @@ def test_union(self): first = Index(list('ab'), name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([], name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index([]) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index([], name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + 
tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([]) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) def test_add(self): @@ -803,17 +812,19 @@ def test_symmetric_difference(self): self.assertTrue(tm.equalContents(result, expected)) # nans: - # GH #6444, sorting of nans. Make sure the number of nans is right - # and the correct non-nan values are there. punt on sorting. - idx1 = Index([1, 2, 3, np.nan]) + # GH 13514 change: {nan} - {nan} == {} + # (GH 6444, sorting of nans, is no longer an issue) + idx1 = Index([1, np.nan, 2, 3]) idx2 = Index([0, 1, np.nan]) + idx3 = Index([0, 1]) + result = idx1.symmetric_difference(idx2) - # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + expected = Index([0.0, 2.0, 3.0]) + tm.assert_index_equal(result, expected) - nans = pd.isnull(result) - self.assertEqual(nans.sum(), 1) - self.assertEqual((~nans).sum(), 3) - [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] + result = idx1.symmetric_difference(idx3) + expected = Index([0.0, 2.0, 3.0, np.nan]) + tm.assert_index_equal(result, expected) # other not an Index: idx1 = Index([1, 2, 3, 4], name='idx1') @@ -1665,6 +1676,149 @@ def test_string_index_repr(self): self.assertEqual(coerce(idx), expected) +class TestMixedIntIndex(Base, tm.TestCase): + # Mostly the tests from common.py for which the results differ + # in py2 and py3 because ints and strings are uncomparable in py3 + # (GH 13514) + + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) + self.setup_indices() + + def create_index(self): + return self.mixedIndex + + def test_order(self): + idx = self.create_index() + # 9816 deprecated + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + with tm.assert_produces_warning(FutureWarning): + idx.order() + else: + with tm.assert_produces_warning(FutureWarning): + idx.order() + + def test_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = idx.argsort() + else: + result = idx.argsort() + expected = np.array(idx).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_numpy_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = np.argsort(idx) + else: + result = np.argsort(idx) + expected = idx.argsort() + tm.assert_numpy_array_equal(result, expected) + + def test_copy_name(self): + # Check that "name" argument passed at initialization is honoured + # GH12309 + idx = self.create_index() + + first = idx.__class__(idx, copy=True, name='mario') + second = first.__class__(first, copy=False) + + # Even though "copy=False", we want a new object. 
+ self.assertIsNot(first, second) + # Not using tm.assert_index_equal() since names differ: + self.assertTrue(idx.equals(first)) + + self.assertEqual(first.name, 'mario') + self.assertEqual(second.name, 'mario') + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + s3 = s1 * s2 + else: + s3 = s1 * s2 + self.assertEqual(s3.index.name, 'mario') + + def test_union_base(self): + idx = self.create_index() + first = idx[3:] + second = idx[:5] + + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + else: + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + + def test_intersection_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:5] + second = idx[:3] + result = first.intersection(second) + expected = Index([0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + def test_difference_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.difference(second) + expected = Index([0, 1, 'a']) + self.assert_index_equal(result, expected) + + def test_symmetric_difference(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.symmetric_difference(second) + expected = Index([0, 1, 2, 'a', 'c']) + self.assert_index_equal(result, expected) + + def test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def test_get_combined_index(): from pandas.core.index import _get_combined_index result = _get_combined_index([]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e6a8aafc32be4..2734e90a1971b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1877,6 +1877,15 @@ def test_duplicate_meta_data(self): self.assertTrue(idx.has_duplicates) self.assertEqual(idx.drop_duplicates().names, idx.names) + def test_get_unique_index(self): + idx = self.index[[0, 1, 0, 1, 1, 0, 0]] + expected = self.index._shallow_copy(idx[[0, 1]]) + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assertTrue(result.unique) + self.assert_index_equal(result, expected) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cb90110c953c1..f18d869b3843d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -56,6 +56,80 @@ def test_strings(self): 
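Before the test diff below, a compact illustration of what the new helper computes. This is only a sketch: ``safe_sort`` is internal API living in ``pandas.core.algorithms``, and the commented results are derived from the docstring and the mixed-integer tests in this patch.

.. code-block:: python

    import numpy as np
    from pandas.core import algorithms as algos

    values = np.array(['b', 1, 0, 'a'], dtype=object)
    labels = [0, 1, 2, 3, 0, -1]

    # values are sorted (ints before strings on Python 3) and the labels,
    # which index into the original values, are remapped to the new order
    ordered, new_labels = algos.safe_sort(values, labels)
    # ordered    -> array([0, 1, 'a', 'b'], dtype=object)
    # new_labels -> array([3, 1, 0, 2, 3, -1])   # -1 is the na_sentinel
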
tm.assert_series_equal(result, expected) +class TestSafeSort(tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic_sort(self): + values = [3, 1, 2, 0, 4] + result = algos.safe_sort(values) + expected = np.array([0, 1, 2, 3, 4]) + tm.assert_numpy_array_equal(result, expected) + + values = list("baaacb") + result = algos.safe_sort(values) + expected = np.array(list("aaabbc")) + tm.assert_numpy_array_equal(result, expected) + + values = [] + result = algos.safe_sort(values) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected) + + def test_labels(self): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + labels = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # na_sentinel + labels = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels, + na_sentinel=99) + expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # out of bound indices + labels = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + labels = [] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([], dtype=np.int_) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_mixed_integer(self): + values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + result = algos.safe_sort(values) + expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(['b', 1, 0, 'a'], dtype=object) + labels = [0, 1, 2, 3, 0, -1, 1] + result, result_labels = algos.safe_sort(values, labels) + expected = np.array([0, 1, 'a', 'b'], dtype=object) + + def test_exceptions(self): + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects are allowed"): + algos.safe_sort(values=1) + + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects or None"): + algos.safe_sort(values=[0, 1, 2], labels=1) + + with tm.assertRaisesRegexp(ValueError, "values should be unique"): + algos.safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + + class TestFactorize(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 57d43f22757ea..258f36cb1b68f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3210,6 +3210,18 @@ def test_groupby_nonstring_columns(self): expected = df.groupby(df[0]).mean() assert_frame_equal(result, expected) + def test_groupby_mixed_type_columns(self): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) + expected = DataFrame([[1, 2]], columns=['B', 0], + index=Index([0], name='A')) + + result = df.groupby('A').first() + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').sum() + tm.assert_frame_equal(result, expected) + def test_cython_grouper_series_bug_noncontig(self): arr = np.empty((100, 100)) arr.fill(np.nan) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 5b66e55eb60b6..e7d165354ec6c 100644 --- 
a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1209,16 +1209,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - sorter = uniques.argsort() + l = len(left) + labels = np.concatenate([left, right]) - reverse_indexer = np.empty(len(sorter), dtype=np.int64) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - new_left = reverse_indexer.take(_ensure_platform_int(left)) - np.putmask(new_left, left == -1, -1) - - new_right = reverse_indexer.take(_ensure_platform_int(right)) - np.putmask(new_right, right == -1, -1) + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + new_labels = _ensure_int64(new_labels) + new_left, new_right = new_labels[:l], new_labels[l:] return new_left, new_right diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index 86aee0b4a01c9..cb84c1f06653b 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -536,6 +536,23 @@ def test_join_sort(self): joined = left.join(right, on='key', sort=False) self.assert_index_equal(joined.index, pd.Index(lrange(4))) + def test_join_mixed_non_unique_index(self): + # GH 12814, unorderable types in py3 with a non-unique index + df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) + df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + result = df1.join(df2) + expected = DataFrame({'a': [1, 2, 3, 3, 4], + 'b': [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, 'a']) + tm.assert_frame_equal(result, expected) + + df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) + df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + result = df3.join(df4) + expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, + index=[1, 2, 2, 'a']) + tm.assert_frame_equal(result, expected) + def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), From fafef5d91126d6a145f86f2ab4c4725039f3d739 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Mon, 18 Jul 2016 21:57:23 -0400 Subject: [PATCH 120/359] ENH: Add support for writing variable labels to Stata files closes #13536 closes #13535 Author: Kevin Sheppard Closes #13631 from bashtage/stata-data-labels and squashes the following commits: 1e1e1bf [Kevin Sheppard] ENH: Add support for writing variable labels --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/core/frame.py | 18 ++++++++-- pandas/io/stata.py | 51 +++++++++++++++++++------- pandas/io/tests/test_stata.py | 64 +++++++++++++++++++++++++++++---- 4 files changed, 114 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8d3fe84ab835e..df9f60fd499fa 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -249,7 +249,8 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) -- ``Series.append`` now supports ``ignore_index`` option (:issue:`13677`) +- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) +- ``.to_stata()`` and ```StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) .. 
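In practice the new keyword is used as below. This is a minimal sketch assuming pandas built from this branch; the frame, the labels, and the file name ``labeled.dta`` are made up for illustration.

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({'fips': [1001, 1003], 'pop': [54571, 182265]})

    # keys are column names; each label must be at most 80 characters
    # and encodable in latin-1, otherwise the writer raises ValueError
    labels = {'fips': 'County FIPS code', 'pop': 'Census population'}
    df.to_stata('labeled.dta', variable_labels=labels)
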
_whatsnew_0190.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 334526b424be5..4fe7b318b3a18 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1467,7 +1467,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None): + data_label=None, variable_labels=None): """ A class for writing Stata binary dta files from array-like objects @@ -1480,11 +1480,24 @@ def to_stata(self, fname, convert_dates=None, write_index=True, format that you want to use for the dates. Options are 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a number or a name. + write_index : bool + Write the index to Stata dataset. encoding : str Default is latin-1. Note that Stata does not support unicode. byteorder : str Can be ">", "<", "little", or "big". The default is None which uses `sys.byteorder` + time_stamp : datetime + A date time to use when writing the file. Can be None, in which + case the current time is used. + dataset_label : str + A label for the data set. Should be 80 characters or smaller. + + .. versionadded:: 0.19.0 + + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. Examples -------- @@ -1500,7 +1513,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True, writer = StataWriter(fname, self, convert_dates=convert_dates, encoding=encoding, byteorder=byteorder, time_stamp=time_stamp, data_label=data_label, - write_index=write_index) + write_index=write_index, + variable_labels=variable_labels) writer.write_file() @Appender(fmt.docstring_to_string, indents=1) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index bd19102c7f18c..d35466e8896ba 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1059,7 +1059,7 @@ def _read_new_header(self, first_char): self.lbllist = self._get_lbllist() self.path_or_buf.seek(self._seek_variable_labels) - self.vlblist = self._get_vlblist() + self._variable_labels = self._get_variable_labels() # Get data type information, works for versions 117-118. 
def _get_dtypes(self, seek_vartypes): @@ -1127,7 +1127,7 @@ def _get_lbllist(self): return [self._null_terminate(self.path_or_buf.read(b)) for i in range(self.nvar)] - def _get_vlblist(self): + def _get_variable_labels(self): if self.format_version == 118: vlblist = [self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)] @@ -1242,7 +1242,7 @@ def _read_old_header(self, first_char): self.lbllist = self._get_lbllist() - self.vlblist = self._get_vlblist() + self._variable_labels = self._get_variable_labels() # ignore expansion fields (Format 105 and later) # When reading, read five bytes; the last four bytes now tell you @@ -1306,11 +1306,11 @@ def _read_value_labels(self): while True: if self.format_version >= 117: if self.path_or_buf.read(5) == b' - break # end of variable label table + break # end of value label table slength = self.path_or_buf.read(4) if not slength: - break # end of variable label table (format < 117) + break # end of value label table (format < 117) if self.format_version <= 117: labname = self._null_terminate(self.path_or_buf.read(33)) else: @@ -1666,7 +1666,7 @@ def variable_labels(self): """Returns variable labels as a dict, associating each variable name with corresponding label """ - return dict(zip(self.varlist, self.vlblist)) + return dict(zip(self.varlist, self._variable_labels)) def value_labels(self): """Returns a dict, associating each variable name a dict, associating @@ -1696,7 +1696,7 @@ def _set_endianness(endianness): def _pad_bytes(name, length): """ - Takes a char string and pads it wih null bytes until it's length chars + Takes a char string and pads it with null bytes until it's length chars """ return name + "\x00" * (length - len(name)) @@ -1831,6 +1831,12 @@ class StataWriter(StataParser): dataset_label : str A label for the data set. Should be 80 characters or smaller. + .. versionadded:: 0.19.0 + + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. 
+ Returns ------- writer : StataWriter instance @@ -1853,12 +1859,13 @@ class StataWriter(StataParser): def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None): + data_label=None, variable_labels=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index self._time_stamp = time_stamp self._data_label = data_label + self._variable_labels = variable_labels # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -2135,11 +2142,29 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, else: # Default is empty label self._write(_pad_bytes("", 33)) - def _write_variable_labels(self, labels=None): - nvar = self.nvar - if labels is None: - for i in range(nvar): - self._write(_pad_bytes("", 81)) + def _write_variable_labels(self): + # Missing labels are 80 blank characters plus null termination + blank = _pad_bytes('', 81) + + if self._variable_labels is None: + for i in range(self.nvar): + self._write(blank) + return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError('Variable labels must be 80 characters ' + 'or fewer') + is_latin1 = all(ord(c) < 256 for c in label) + if not is_latin1: + raise ValueError('Variable labels must contain only ' + 'characters that can be encoded in ' + 'Latin-1') + self._write(_pad_bytes(label, 81)) + else: + self._write(blank) def _prepare_data(self): data = self.data diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 5f45d1b547e62..91850e6ffe9b9 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,27 +1,27 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101 -from datetime import datetime import datetime as dt import os -import warnings -import nose import struct import sys +import warnings +from datetime import datetime from distutils.version import LooseVersion +import nose import numpy as np import pandas as pd +import pandas.util.testing as tm +from pandas import compat from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series from pandas.types.common import is_categorical_dtype +from pandas.tslib import NaT from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) -import pandas.util.testing as tm -from pandas.tslib import NaT -from pandas import compat class TestStata(tm.TestCase): @@ -1113,6 +1113,58 @@ def test_read_chunks_columns(self): tm.assert_frame_equal(from_frame, chunk, check_dtype=False) pos += chunksize + def test_write_variable_labels(self): + # GH 13631, add support for writing variable labels + original = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [1.0, 3.0, 27.0, 81.0], + 'c': ['Atlanta', 'Birmingham', + 'Cincinnati', 'Detroit']}) + original.index.name = 'index' + variable_labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'} + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels) + with StataReader(path) as sr: + read_labels = sr.variable_labels() + expected_labels = {'index': '', + 'a': 'City Rank', + 'b': 'City Exponent', + 'c': 'City'} + tm.assert_equal(read_labels, expected_labels) + + variable_labels['index'] = 'The Index' + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels) + with StataReader(path) as sr: + 
read_labels = sr.variable_labels() + tm.assert_equal(read_labels, variable_labels) + + def test_write_variable_label_errors(self): + original = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [1.0, 3.0, 27.0, 81.0], + 'c': ['Atlanta', 'Birmingham', + 'Cincinnati', 'Detroit']}) + values = [u'\u03A1', u'\u0391', + u'\u039D', u'\u0394', + u'\u0391', u'\u03A3'] + + variable_labels_utf8 = {'a': 'City Rank', + 'b': 'City Exponent', + 'c': u''.join(values)} + + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels_utf8) + + variable_labels_long = {'a': 'City Rank', + 'b': 'City Exponent', + 'c': 'A very, very, very long variable label ' + 'that is too long for Stata which means ' + 'that it has more than 80 characters'} + + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels_long) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 506520bd35331aa82db50686c07d96594cac0c10 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 18 Jul 2016 22:06:18 -0400 Subject: [PATCH 121/359] API: Index doesn't results in PeriodIndex if Period contains NaT Author: sinhrks Closes #13664 from sinhrks/period_infer2 and squashes the following commits: b208a9e [sinhrks] API: Index doesn't results in PeriodIndex if Period contains NaT --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/core/ops.py | 11 ++- pandas/indexes/base.py | 31 +++++---- pandas/src/inference.pyx | 34 +++++++-- pandas/tests/indexes/test_datetimelike.py | 84 +++++++++++++++++------ pandas/tests/types/test_inference.py | 27 ++++++++ pandas/tseries/base.py | 7 +- pandas/tseries/tests/test_base.py | 6 +- 8 files changed, 153 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index df9f60fd499fa..f65f7d57d5d08 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -269,6 +269,8 @@ API changes - ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) +- Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) +- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) .. 
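Concretely, a small sketch mirroring the constructor tests added in this patch:

.. code-block:: python

    import pandas as pd

    # a single frequency (with or without NaT) still infers a PeriodIndex
    pd.Index([pd.Period('2011-01', freq='M'), pd.NaT, pd.Period('2011-03', freq='M')])

    # mixed frequencies now fall back to a plain object-dtype Index
    pd.Index([pd.Period('2011-01', freq='M'), pd.Period('2011-03', freq='D')])
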
_whatsnew_0190.api.tolist: @@ -645,7 +647,6 @@ Bug Fixes - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) - - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index d76f011df3dd8..44e3be32c23df 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -31,7 +31,7 @@ is_list_like, _ensure_object) from pandas.types.cast import _maybe_upcast_putmask -from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -773,6 +773,15 @@ def wrapper(self, other, axis=None): if (not lib.isscalar(lib.item_from_zerodim(other)) and len(self) != len(other)): raise ValueError('Lengths must match to compare') + + if isinstance(other, ABCPeriodIndex): + # temp workaround until fixing GH 13637 + # tested in test_nat_comparisons + # (pandas.tests.series.test_operators.TestSeriesOperators) + return self._constructor(na_op(self.values, + other.asobject.values), + index=self.index) + return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) elif isinstance(other, pd.Categorical): diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 71d5fdd17ee5c..567d2a458dafa 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -224,7 +224,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass # maybe coerce to a sub-class - from pandas.tseries.period import PeriodIndex + from pandas.tseries.period import (PeriodIndex, + IncompatibleFrequency) if isinstance(data, PeriodIndex): return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): @@ -265,13 +266,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - elif (inferred.startswith('timedelta') or - lib.is_timedelta_array(subarr)): + elif inferred.startswith('timedelta'): from pandas.tseries.tdi import TimedeltaIndex return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': - return PeriodIndex(subarr, name=name, **kwargs) + try: + return PeriodIndex(subarr, name=name, **kwargs) + except IncompatibleFrequency: + pass return cls._simple_new(subarr, name) elif hasattr(data, '__array__'): @@ -866,6 +869,16 @@ def _convert_can_do_setop(self, other): result_name = self.name if self.name == other.name else None return other, result_name + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + return value + + def _assert_can_do_op(self, value): + """ Check value is valid for scalar op """ + if not lib.isscalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + @property def nlevels(self): return 1 @@ -1508,16 +1521,6 @@ def hasnans(self): else: return False - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - return value - - def 
_assert_can_do_op(self, value): - """ Check value is valid for scalar op """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - def putmask(self, mask, value): """ return a new Index of the values set with the mask diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 9f96037c97c62..fe4748eb0eba0 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -270,7 +270,7 @@ cdef inline bint is_null_datetimelike(v): cdef inline bint is_null_datetime64(v): - # determine if we have a null for a datetime (or integer versions)x, + # determine if we have a null for a datetime (or integer versions), # excluding np.timedelta64('nat') if util._checknull(v): return True @@ -282,7 +282,7 @@ cdef inline bint is_null_datetime64(v): cdef inline bint is_null_timedelta64(v): - # determine if we have a null for a timedelta (or integer versions)x, + # determine if we have a null for a timedelta (or integer versions), # excluding np.datetime64('nat') if util._checknull(v): return True @@ -293,6 +293,16 @@ cdef inline bint is_null_timedelta64(v): return False +cdef inline bint is_null_period(v): + # determine if we have a null for a Period (or integer versions), + # excluding np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + return False + + cdef inline bint is_datetime(object o): return PyDateTime_Check(o) @@ -531,6 +541,7 @@ def is_timedelta_array(ndarray values): return False return null_count != n + def is_timedelta64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -546,6 +557,7 @@ def is_timedelta64_array(ndarray values): return False return null_count != n + def is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ cdef Py_ssize_t i, null_count = 0, n = len(values) @@ -562,6 +574,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False return null_count != n + def is_date_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -571,6 +584,7 @@ def is_date_array(ndarray[object] values): return False return True + def is_time_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -582,15 +596,21 @@ def is_time_array(ndarray[object] values): def is_period_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - from pandas.tseries.period import Period - + cdef Py_ssize_t i, null_count = 0, n = len(values) + cdef object v if n == 0: return False + + # return False for all nulls for i in range(n): - if not isinstance(values[i], Period): + v = values[i] + if is_null_period(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not is_period(v): return False - return True + return null_count != n cdef extern from "parse_helper.h": diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 5c21f71d64660..af44767ae5be5 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -119,10 +119,10 @@ def test_pickle_compat_construction(self): def test_construction_index_with_mixed_timezones(self): # GH 11488 # no tz results in DatetimeIndex - result = Index( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + result = Index([Timestamp('2011-01-01'), + 
Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNone(result.tz) @@ -295,9 +295,9 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00')], + tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) @@ -338,6 +338,17 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + def test_construction_base_constructor(self): + arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) @@ -699,12 +710,11 @@ def test_fillna_datetime64(self): pd.Timestamp('2011-01-01 11:00')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - idx = pd.DatetimeIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], tz=tz) + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], tz=tz) - exp = pd.DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], tz=tz) + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], tz=tz) self.assert_index_equal( idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) @@ -734,6 +744,26 @@ def setUp(self): def create_index(self): return period_range('20130101', periods=5, freq='D') + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='D')] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.Index(np.array(arr), dtype=object)) + def test_astype(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -874,7 +904,6 @@ def test_repeat(self): self.assertEqual(res.freqstr, 'D') def test_period_index_indexer(self): - # GH4125 idx = pd.period_range('2002-01', '2003-12', freq='M') df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) @@ -886,12 +915,11 @@ def test_period_index_indexer(self): def test_fillna_period(self): # GH 11343 - idx = pd.PeriodIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], freq='H') + idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], freq='H') - exp = pd.PeriodIndex( - 
['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], freq='H') + exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H') self.assert_index_equal( idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) @@ -899,10 +927,11 @@ def test_fillna_period(self): pd.Period('2011-01-01 11:00', freq='H')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - with tm.assertRaisesRegexp( - ValueError, - 'Input has different freq=D from PeriodIndex\\(freq=H\\)'): - idx.fillna(pd.Period('2011-01-01', freq='D')) + exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), + pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna(pd.Period('2011-01-01', freq='D')), + exp) def test_no_millisecond_field(self): with self.assertRaises(AttributeError): @@ -923,6 +952,17 @@ def setUp(self): def create_index(self): return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + def test_construction_base_constructor(self): + arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timedelta('1 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + def test_shift(self): # test shift for TimedeltaIndex # err8083 diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index 34d10ee9dfa42..9a12220f5b41d 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -431,6 +431,33 @@ def test_infer_dtype_timedelta(self): dtype=object) self.assertEqual(lib.infer_dtype(arr), 'mixed') + def test_infer_dtype_period(self): + # GH 13664 + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='M')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Period('2011-01', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([n, pd.Period('2011-01', freq='D'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # different type of nat + arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) self.assertEqual(lib.infer_dtype(arr), 'floating') diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index fe0440170383b..188f538372092 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -800,12 +800,15 @@ def _ensure_datetimelike_to_i8(other): if lib.isscalar(other) and isnull(other): other = tslib.iNaT elif isinstance(other, ABCIndexClass): - # convert tz if needed if getattr(other, 'tz', None) is not None: other = other.tz_localize(None).asi8 else: other = other.asi8 else: - other = np.array(other, copy=False).view('i8') + try: + other = np.array(other, copy=False).view('i8') + except TypeError: + # period array cannot be coerces to int + other = 
Index(other).asi8 return other diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 6c996285369b8..4aa1e2f5d33dd 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1735,9 +1735,9 @@ def test_representation_to_series(self): 2 2013 dtype: object""" - exp6 = """0 2011-01-01 09:00 -1 2012-02-01 10:00 -2 NaT + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT dtype: object""" exp7 = """0 2013Q1 From 31c2e5ffa9c8008e2d84dc5ffa02f2d938a32294 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 19 Jul 2016 08:47:19 -0400 Subject: [PATCH 122/359] PERF: improve DTI string parse closes #11169 closes #11287 Author: sinhrks Closes #13692 from sinhrks/dti_perf and squashes the following commits: 8774772 [sinhrks] PERF: improve DTI string parse --- doc/source/whatsnew/v0.19.0.txt | 4 + pandas/io/parsers.py | 2 +- pandas/tests/indexes/test_datetimelike.py | 31 +++----- pandas/tseries/index.py | 93 +++++------------------ pandas/tseries/resample.py | 7 +- pandas/tseries/tests/test_timeseries.py | 5 +- pandas/tseries/tests/test_tslib.py | 22 +++--- pandas/tseries/tools.py | 33 +++----- pandas/tslib.pyx | 55 +++----------- 9 files changed, 74 insertions(+), 178 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f65f7d57d5d08..69200d7142b9f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -566,6 +566,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index.difference`` (:issue:`12044`) +- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) .. 
_whatsnew_0190.bug_fixes: @@ -631,6 +632,7 @@ Bug Fixes - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) + - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) @@ -654,6 +656,8 @@ Bug Fixes - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) +- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) +- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84ea2a92b8026..f6a84ea9debaa 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2440,7 +2440,7 @@ def converter(*date_cols): strs = _concat_date_cols(date_cols) try: - return tools._to_datetime( + return tools.to_datetime( _ensure_object(strs), utc=None, box=False, diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index af44767ae5be5..378e8c545ec83 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 19:00'), - Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # length = 1 result = Index([Timestamp('2011-01-01')], name='idx') exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') @@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'), - pd.NaT, Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # all NaT result = Index([pd.NaT, pd.NaT], name='idx') exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') @@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self): self.assertTrue(isinstance(result, DatetimeIndex)) # tz mismatch affecting to tz-aware raises TypeError/ValueError + with tm.assertRaises(ValueError): DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], name='idx') - with tm.assertRaises(TypeError): + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): 
DatetimeIndex([Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='Asia/Tokyo', name='idx') @@ -338,6 +318,13 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): + # passing tz should results in DatetimeIndex, then mismatch raises + # TypeError + Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='Asia/Tokyo', name='idx') + def test_construction_base_constructor(self): arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 47bb69b8d7ad6..d448ca9878b99 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -292,55 +292,32 @@ def __new__(cls, data=None, raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) - # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) - data = np.asarray(data, dtype='O') + elif isinstance(data, ABCSeries): + data = data._values - # try a few ways to make it datetime64 - if lib.is_string_array(data): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - else: - data = tools.to_datetime(data, errors='raise') - data.offset = freq - if isinstance(data, DatetimeIndex): - if name is not None: - data.name = name - - if tz is not None: - - # we might already be localized to this tz - # so passing the same tz is ok - # however any other tz is a no-no - if data.tz is None: - return data.tz_localize(tz, ambiguous=ambiguous) - elif str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") - - return data._deepcopy_if_needed(ref_to_data, copy) - - if issubclass(data.dtype.type, compat.string_types): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) + # data must be Index or np.ndarray here + if not (is_datetime64_dtype(data) or is_datetimetz(data) or + is_integer_dtype(data)): + data = tools.to_datetime(data, dayfirst=dayfirst, + yearfirst=yearfirst) if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): - if isinstance(data, ABCSeries): - data = data._values + if isinstance(data, DatetimeIndex): if tz is None: tz = data.tz - + elif data.tz is None: + data = data.tz_localize(tz, ambiguous=ambiguous) else: # the tz's must match if str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") + msg = ('data is already tz-aware {0}, unable to ' + 'set specified tz: {1}') + raise TypeError(msg.format(data.tz, tz)) subarr = data.values @@ -356,35 +333,6 @@ def __new__(cls, data=None, if isinstance(data, Int64Index): raise TypeError('cannot convert Int64Index->DatetimeIndex') subarr = data.view(_NS_DTYPE) - else: - if isinstance(data, (ABCSeries, Index)): - values = data._values - else: - values = data - - if lib.is_string_array(values): - subarr = tslib.parse_str_array_to_datetime( - values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst) - else: - try: - subarr = tools.to_datetime(data, box=False) - - # make sure that we have a index/ndarray like (and not a - # Series) - if isinstance(subarr, ABCSeries): - subarr = subarr._values - if subarr.dtype == np.object_: - subarr = tools._to_datetime(subarr, 
box=False) - - except ValueError: - # tz aware - subarr = tools._to_datetime(data, box=False, utc=True) - - # we may not have been able to convert - if not (is_datetimetz(subarr) or - np.issubdtype(subarr.dtype, np.datetime64)): - raise ValueError('Unable to convert %s to datetime dtype' - % str(data)) if isinstance(subarr, DatetimeIndex): if tz is None: @@ -399,27 +347,21 @@ def __new__(cls, data=None, ints = subarr.view('i8') subarr = tslib.tz_localize_to_utc(ints, tz, ambiguous=ambiguous) - subarr = subarr.view(_NS_DTYPE) subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) - - # if dtype is provided, coerce here if dtype is not None: - if not is_dtype_equal(subarr.dtype, dtype): - + # dtype must be coerced to DatetimeTZDtype above if subarr.tz is not None: raise ValueError("cannot localize from non-UTC data") - dtype = DatetimeTZDtype.construct_from_string(dtype) - subarr = subarr.tz_localize(dtype.tz) if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: inferred = subarr.inferred_freq if inferred != freq.freqstr: - on_freq = cls._generate(subarr[0], None, len( - subarr), None, freq, tz=tz, ambiguous=ambiguous) + on_freq = cls._generate(subarr[0], None, len(subarr), None, + freq, tz=tz, ambiguous=ambiguous) if not np.array_equal(subarr.asi8, on_freq.asi8): raise ValueError('Inferred frequency {0} from passed ' 'dates does not conform to passed ' @@ -563,7 +505,6 @@ def _generate(cls, start, end, periods, name, offset, index = index[1:] if not right_closed and len(index) and index[-1] == end: index = index[:-1] - index = cls._simple_new(index, name=name, freq=offset, tz=tz) return index @@ -669,7 +610,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, xdr = generate_range(offset=offset, start=_CACHE_START, end=_CACHE_END) - arr = tools._to_datetime(list(xdr), box=False) + arr = tools.to_datetime(list(xdr), box=False) cachedRange = DatetimeIndex._simple_new(arr) cachedRange.offset = offset diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 8d6955ab43711..e493e9d936b02 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1046,7 +1046,12 @@ def _get_binner_for_grouping(self, obj): l = [] for key, group in grouper.get_iterator(self.ax): l.extend([key] * len(group)) - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + + if isinstance(self.ax, PeriodIndex): + grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + else: + # resampling causes duplicated values, specifying freq is invalid + grouper = binner.__class__(l, name=binner.name) # since we may have had to sort # may need to reorder groups here diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 299ec374567e7..59fc147ead4eb 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4087,8 +4087,9 @@ def test_dti_set_index_reindex(self): # 11314 # with tz - index = date_range(datetime(2015, 10, 1), datetime( - 2015, 10, 1, 23), freq='H', tz='US/Eastern') + index = date_range(datetime(2015, 10, 1), + datetime(2015, 10, 1, 23), + freq='H', tz='US/Eastern') df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) new_index = date_range(datetime(2015, 10, 2), datetime(2015, 10, 2, 23), diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index f30f01e66cb0b..22bb3bddbc742 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,7 +7,8 
@@ import datetime import pandas as pd -from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime +from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period, + to_datetime) from pandas.tslib import get_timezone from pandas._period import period_asfreq, period_ordinal from pandas.tseries.index import date_range, DatetimeIndex @@ -698,14 +699,19 @@ def test_parsers(self): yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below result4 = to_datetime(np.array([date_str], dtype=object), yearfirst=yearfirst) - result6 = DatetimeIndex([date_str], yearfirst=yearfirst)[0] - self.assertEqual(result1, expected) - self.assertEqual(result2, expected) - self.assertEqual(result3, expected) - self.assertEqual(result4, expected) - self.assertEqual(result6, expected) + result6 = DatetimeIndex([date_str], yearfirst=yearfirst) + # result7 is used below + result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) + result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst) + + for res in [result1, result2]: + self.assertEqual(res, expected) + for res in [result3, result4, result6, result8, result9]: + exp = DatetimeIndex([pd.Timestamp(expected)]) + tm.assert_index_equal(res, exp) # these really need to have yearfist, but we don't support if not yearfirst: @@ -893,9 +899,7 @@ def test_parsers_monthfreq(self): for date_str, expected in compat.iteritems(cases): result1, _, _ = tools.parse_time_string(date_str, freq='M') - result2 = tools._to_datetime(date_str, freq='M') self.assertEqual(result1, expected) - self.assertEqual(result2, expected) def test_parsers_quarterly_with_freq(self): msg = ('Incorrect quarterly string is given, quarter ' diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 067e8ec19f644..93d35ff964e69 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -295,22 +295,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 1 loop, best of 3: 471 ms per loop """ - return _to_datetime(arg, errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, box=box, format=format, exact=exact, - unit=unit, infer_datetime_format=infer_datetime_format) - -def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - utc=None, box=True, format=None, exact=True, - unit=None, freq=None, infer_datetime_format=False): - """ - Same as to_datetime, but accept freq for - DatetimeIndex internal construction - """ from pandas.tseries.index import DatetimeIndex - def _convert_listlike(arg, box, format, name=None): + tz = 'utc' if utc else None + + def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -319,8 +309,7 @@ def _convert_listlike(arg, box, format, name=None): if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: - return DatetimeIndex(arg, tz='utc' if utc else None, - name=name) + return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass @@ -328,7 +317,7 @@ def _convert_listlike(arg, box, format, name=None): elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): - return DatetimeIndex(arg, tz='utc' if utc else None) + return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg @@ -344,8 +333,7 @@ def _convert_listlike(arg, box, format, name=None): from pandas import Index return Index(result) - return 
DatetimeIndex(result, tz='utc' if utc else None, - name=name) + return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' @@ -382,8 +370,8 @@ def _convert_listlike(arg, box, format, name=None): # fallback if result is None: try: - result = tslib.array_strptime( - arg, format, exact=exact, errors=errors) + result = tslib.array_strptime(arg, format, exact=exact, + errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise @@ -404,14 +392,11 @@ def _convert_listlike(arg, box, format, name=None): utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - freq=freq, require_iso8601=require_iso8601 ) if is_datetime64_dtype(result) and box: - result = DatetimeIndex(result, - tz='utc' if utc else None, - name=name) + result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 5624b84523705..016c49ea2b859 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -852,13 +852,6 @@ cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1: return _nat_scalar_rules[op] -cdef _tz_format(object obj, object zone): - try: - return obj.strftime(' %%Z, tz=%s' % zone) - except: - return ', tz=%s' % zone - - cpdef object get_value_box(ndarray arr, object loc): cdef: Py_ssize_t i, sz @@ -1642,14 +1635,6 @@ cdef inline _check_dts_bounds(pandas_datetimestruct *dts): raise OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt) -# elif isinstance(ts, _Timestamp): -# tmp = ts -# obj.value = (<_Timestamp> ts).value -# obj.dtval = -# elif isinstance(ts, object): -# # If all else fails -# obj.value = _dtlike_to_datetime64(ts, &obj.dts) -# obj.dtval = _dts_to_pydatetime(&obj.dts) def datetime_to_datetime64(ndarray[object] values): cdef: @@ -1689,7 +1674,7 @@ def datetime_to_datetime64(ndarray[object] values): cdef: set _not_datelike_strings = set(['a','A','m','M','p','P','t','T']) -cpdef object _does_string_look_like_datetime(object date_string): +cpdef bint _does_string_look_like_datetime(object date_string): if date_string.startswith('0'): # Strings starting with 0 are more consistent with a # date-like string than a number @@ -1827,8 +1812,14 @@ def parse_datetime_string(object date_string, object freq=None, except ValueError: pass - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + try: + dt = parse_date(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + return dt @@ -2214,7 +2205,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): cpdef array_to_datetime(ndarray[object] values, errors='raise', - dayfirst=False, yearfirst=False, freq=None, + dayfirst=False, yearfirst=False, format=None, utc=None, require_iso8601=False): cdef: @@ -2343,7 +2334,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) except Exception: if is_coerce: iresult[i] = NPY_NAT @@ -2423,7 +2414,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) 
_pydatetime_to_dts(oresult[i], &dts) _check_dts_bounds(&dts) except Exception: @@ -2438,28 +2429,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -def parse_str_array_to_datetime(ndarray values, dayfirst=False, - yearfirst=False, object freq=None): - """Shortcut to parse str array for quicker DatetimeIndex construction""" - cdef: - Py_ssize_t i, n = len(values) - object val, py_dt - ndarray[int64_t] iresult - _TSObject _ts - - iresult = np.empty(n, dtype='i8') - - for i in range(n): - val = values[i] - try: - py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) - except Exception: - raise ValueError - _ts = convert_to_tsobject(py_dt, None, None, 0, 0) - iresult[i] = _ts.value - - return iresult # Similar to Timestamp/datetime, this is a construction requirement for timedeltas # we need to do object instantiation in python From 4c9ae94f1ee4d867e2d92735d5755d43daef618d Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 19 Jul 2016 09:11:08 -0400 Subject: [PATCH 123/359] DOC: resample warnings closes #13618 closes #13520 Author: Chris Closes #13675 from chris-b1/resample-warning and squashes the following commits: 2185c1f [Chris] whatsnew note c58c70c [Chris] DOC: resample warnings --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/resample.py | 52 ++++++++++++++++----------- pandas/tseries/tests/test_resample.py | 7 ++++ 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 69200d7142b9f..efa6e5575fa79 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -639,7 +639,7 @@ Bug Fixes - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) - Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - +- Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) - Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) - Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index e493e9d936b02..38c2e009a01f3 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -60,12 +60,15 @@ class Resampler(_GroupBy): 'loffset', 'base', 'kind'] # API compat of allowed attributes - _deprecated_valids = _attributes + ['_ipython_display_', '__doc__', - '_cache', '_attributes', 'binner', - 'grouper', 'groupby', 'keys', - 'sort', 'kind', 'squeeze', - 'group_keys', 'as_index', - 'exclusions', '_groupby'] + _deprecated_valids = _attributes + ['__doc__', '_cache', '_attributes', + 'binner', 'grouper', 'groupby', + 'sort', 'kind', 'squeeze', 'keys', + 'group_keys', 'as_index', 'exclusions', + '_groupby'] + + # don't raise deprecation warning on attributes starting with these + # patterns - prevents warnings caused by IPython introspection + _deprecated_valid_patterns = ['_ipython', '_repr'] # API compat of disallowed attributes _deprecated_invalids = ['iloc', 'loc', 'ix', 'iat', 'at'] @@ -109,9 +112,12 @@ def _typ(self): return 'series' return 'dataframe' - def _deprecated(self): - 
warnings.warn(".resample() is now a deferred operation\n" - "use .resample(...).mean() instead of .resample(...)", + def _deprecated(self, op): + warnings.warn(("\n.resample() is now a deferred operation\n" + "You called {op}(...) on this deferred object " + "which materialized it into a {klass}\nby implicitly " + "taking the mean. Use .resample(...).mean() " + "instead").format(op=op, klass=self._typ), FutureWarning, stacklevel=3) return self.mean() @@ -119,20 +125,20 @@ def _make_deprecated_binop(op): # op is a string def _evaluate_numeric_binop(self, other): - result = self._deprecated() + result = self._deprecated(op) return getattr(result, op)(other) return _evaluate_numeric_binop - def _make_deprecated_unary(op): + def _make_deprecated_unary(op, name): # op is a callable def _evaluate_numeric_unary(self): - result = self._deprecated() + result = self._deprecated(name) return op(result) return _evaluate_numeric_unary def __array__(self): - return self._deprecated().__array__() + return self._deprecated('__array__').__array__() __gt__ = _make_deprecated_binop('__gt__') __ge__ = _make_deprecated_binop('__ge__') @@ -148,10 +154,10 @@ def __array__(self): __truediv__ = __rtruediv__ = _make_deprecated_binop('__truediv__') if not compat.PY3: __div__ = __rdiv__ = _make_deprecated_binop('__div__') - __neg__ = _make_deprecated_unary(lambda x: -x) - __pos__ = _make_deprecated_unary(lambda x: x) - __abs__ = _make_deprecated_unary(lambda x: np.abs(x)) - __inv__ = _make_deprecated_unary(lambda x: -x) + __neg__ = _make_deprecated_unary(lambda x: -x, '__neg__') + __pos__ = _make_deprecated_unary(lambda x: x, '__pos__') + __abs__ = _make_deprecated_unary(lambda x: np.abs(x), '__abs__') + __inv__ = _make_deprecated_unary(lambda x: -x, '__inv__') def __getattr__(self, attr): if attr in self._internal_names_set: @@ -165,8 +171,12 @@ def __getattr__(self, attr): raise ValueError(".resample() is now a deferred operation\n" "\tuse .resample(...).mean() instead of " ".resample(...)") - if attr not in self._deprecated_valids: - self = self._deprecated() + + matches_pattern = any(attr.startswith(x) for x + in self._deprecated_valid_patterns) + if not matches_pattern and attr not in self._deprecated_valids: + self = self._deprecated(attr) + return object.__getattribute__(self, attr) def __setattr__(self, attr, value): @@ -182,7 +192,7 @@ def __getitem__(self, key): # compat for deprecated if isinstance(self.obj, com.ABCSeries): - return self._deprecated()[key] + return self._deprecated('__getitem__')[key] raise @@ -230,7 +240,7 @@ def _assure_grouper(self): def plot(self, *args, **kwargs): # for compat with prior versions, we want to # have the warnings shown here and just have this work - return self._deprecated().plot(*args, **kwargs) + return self._deprecated('plot').plot(*args, **kwargs) def aggregate(self, arg, *args, **kwargs): """ diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 518f69485004c..85d8cd52e1866 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -168,6 +168,13 @@ def f(): check_stacklevel=False): self.assertIsInstance(getattr(r, op)(2), pd.Series) + # IPython introspection shouldn't trigger warning GH 13618 + for op in ['_repr_json', '_repr_latex', + '_ipython_canary_method_should_not_exist_']: + r = self.series.resample('H') + with tm.assert_produces_warning(None): + getattr(r, op, None) + # getitem compat df = self.series.to_frame('foo') From 8acfad343c88760a6d09fea221996dd50393fa8a Mon Sep 17 
00:00:00 2001 From: gfyoung Date: Tue, 19 Jul 2016 16:31:08 -0400 Subject: [PATCH 124/359] CLN: Removed the flavor='mysql' option and deprecate flavor in DataFrame.to_sql (#13611) --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/generic.py | 13 +- pandas/io/sql.py | 165 +++++++------------- pandas/io/tests/test_sql.py | 262 ++++++++++---------------------- 4 files changed, 144 insertions(+), 298 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index efa6e5575fa79..57b0d8895f67b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -524,6 +524,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) @@ -541,6 +542,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) +- ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c1676fbdd7f4..e59bec2dbd7e0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1144,7 +1144,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs) - def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', + def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): """ Write records stored in a DataFrame to a SQL database. @@ -1155,12 +1155,11 @@ def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', Name of SQL table con : SQLAlchemy engine or DBAPI2 connection (legacy mode) Using SQLAlchemy makes it possible to use any DB supported by that - library. - If a DBAPI2 object, only sqlite3 is supported. - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy engine. - 'mysql' is deprecated and will be removed in future versions, but - it will be further supported through SQLAlchemy engines. + library. If a DBAPI2 object, only sqlite3 is supported. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version, + as 'sqlite' is the only supported option if SQLAlchemy is not + installed. schema : string, default None Specify the schema (if database flavor supports this). If None, use default schema. 
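As an illustrative sketch of the deprecation this patch introduces: with a plain DBAPI2 connection, omitting ``flavor`` is now the recommended call, passing ``flavor='sqlite'`` only warns, and any other value raises. The in-memory sqlite3 connection, table name, and frame contents below are hypothetical examples, not part of the patch.

    import sqlite3
    import pandas as pd

    conn = sqlite3.connect(':memory:')
    df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

    # preferred: no flavor argument (sqlite is the only fallback dialect)
    df.to_sql('example_table', conn, index=False)

    # still accepted, but emits a FutureWarning
    df.to_sql('example_table', conn, flavor='sqlite',
              if_exists='replace', index=False)

    # any other flavor now raises ValueError
    # df.to_sql('example_table', conn, flavor='mysql')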
diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8485a3f13f047..b9eaa0e4d657b 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -41,6 +41,24 @@ class DatabaseError(IOError): _SQLALCHEMY_INSTALLED = None +def _validate_flavor_parameter(flavor): + """ + Checks whether a database 'flavor' was specified. + If not None, produces FutureWarning if 'sqlite' and + raises a ValueError if anything else. + """ + if flavor is not None: + if flavor == 'sqlite': + warnings.warn("the 'flavor' parameter is deprecated " + "and will be removed in a future version, " + "as 'sqlite' is the only supported option " + "when SQLAlchemy is not installed.", + FutureWarning, stacklevel=2) + else: + raise ValueError("database flavor {flavor} is not " + "supported".format(flavor=flavor)) + + def _is_sqlalchemy_connectable(con): global _SQLALCHEMY_INSTALLED if _SQLALCHEMY_INSTALLED is None: @@ -517,7 +535,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, chunksize=chunksize) -def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', +def to_sql(frame, name, con, flavor=None, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): """ Write records stored in a DataFrame to a SQL database. @@ -532,10 +550,8 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. - 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy connectables. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). @@ -573,7 +589,7 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', chunksize=chunksize, dtype=dtype) -def has_table(table_name, con, flavor='sqlite', schema=None): +def has_table(table_name, con, flavor=None, schema=None): """ Check if DataBase has named table. @@ -585,10 +601,8 @@ def has_table(table_name, con, flavor='sqlite', schema=None): Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - flavor: {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. - 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy connectables. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). @@ -603,12 +617,6 @@ def has_table(table_name, con, flavor='sqlite', schema=None): table_exists = has_table -_MYSQL_WARNING = ("The 'mysql' flavor with DBAPI connection is deprecated " - "and will be removed in future versions. 
" - "MySQL will be further supported with SQLAlchemy " - "connectables.") - - def _engine_builder(con): """ Returns a SQLAlchemy engine from a URI (if con is a string) @@ -632,15 +640,15 @@ def pandasSQL_builder(con, flavor=None, schema=None, meta=None, Convenience function to return the correct PandasSQL subclass based on the provided parameters """ + _validate_flavor_parameter(flavor) + # When support for DBAPI connections is removed, # is_cursor should not be necessary. con = _engine_builder(con) if _is_sqlalchemy_connectable(con): return SQLDatabase(con, schema=schema, meta=meta) else: - if flavor == 'mysql': - warnings.warn(_MYSQL_WARNING, FutureWarning, stacklevel=3) - return SQLiteDatabase(con, flavor, is_cursor=is_cursor) + return SQLiteDatabase(con, is_cursor=is_cursor) class SQLTable(PandasObject): @@ -1035,11 +1043,11 @@ class PandasSQL(PandasObject): def read_sql(self, *args, **kwargs): raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or connection+sql flavor") + "connectable or sqlite connection") def to_sql(self, *args, **kwargs): raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or connection+sql flavor") + "connectable or sqlite connection") class SQLDatabase(PandasSQL): @@ -1308,38 +1316,16 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): # ---- SQL without SQLAlchemy --- -# Flavour specific sql strings and handler class for access to DBs without -# SQLAlchemy installed -# SQL type convertions for each DB +# sqlite-specific sql strings and handler class +# dictionary used for readability purposes _SQL_TYPES = { - 'string': { - 'mysql': 'VARCHAR (63)', - 'sqlite': 'TEXT', - }, - 'floating': { - 'mysql': 'DOUBLE', - 'sqlite': 'REAL', - }, - 'integer': { - 'mysql': 'BIGINT', - 'sqlite': 'INTEGER', - }, - 'datetime': { - 'mysql': 'DATETIME', - 'sqlite': 'TIMESTAMP', - }, - 'date': { - 'mysql': 'DATE', - 'sqlite': 'DATE', - }, - 'time': { - 'mysql': 'TIME', - 'sqlite': 'TIME', - }, - 'boolean': { - 'mysql': 'BOOLEAN', - 'sqlite': 'INTEGER', - } + 'string': 'TEXT', + 'floating': 'REAL', + 'integer': 'INTEGER', + 'datetime': 'TIMESTAMP', + 'date': 'DATE', + 'time': 'TIME', + 'boolean': 'INTEGER', } @@ -1351,22 +1337,6 @@ def _get_unicode_name(name): return uname -def _get_valid_mysql_name(name): - # Filter for unquoted identifiers - # See http://dev.mysql.com/doc/refman/5.0/en/identifiers.html - uname = _get_unicode_name(name) - if not len(uname): - raise ValueError("Empty table or column name specified") - - basere = r'[0-9,a-z,A-Z$_]' - for c in uname: - if not re.match(basere, c): - if not (0x80 < ord(c) < 0xFFFF): - raise ValueError("Invalid MySQL identifier '%s'" % uname) - - return '`' + uname + '`' - - def _get_valid_sqlite_name(name): # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python @@ -1385,19 +1355,6 @@ def _get_valid_sqlite_name(name): return '"' + uname.replace('"', '""') + '"' -# SQL enquote and wildcard symbols -_SQL_WILDCARD = { - 'mysql': '%s', - 'sqlite': '?' -} - -# Validate and return escaped identifier -_SQL_GET_IDENTIFIER = { - 'mysql': _get_valid_mysql_name, - 'sqlite': _get_valid_sqlite_name, -} - - _SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. 
" "In pandas versions < 0.14, spaces were converted to " "underscores.") @@ -1428,9 +1385,8 @@ def _execute_create(self): def insert_statement(self): names = list(map(text_type, self.frame.columns)) - flv = self.pd_sql.flavor - wld = _SQL_WILDCARD[flv] # wildcard char - escape = _SQL_GET_IDENTIFIER[flv] + wld = '?' # wildcard char + escape = _get_valid_sqlite_name if self.index is not None: [names.insert(0, idx) for idx in self.index[::-1]] @@ -1460,8 +1416,7 @@ def _create_table_setup(self): if any(map(pat.search, column_names)): warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) - flv = self.pd_sql.flavor - escape = _SQL_GET_IDENTIFIER[flv] + escape = _get_valid_sqlite_name create_tbl_stmts = [escape(cname) + ' ' + ctype for cname, ctype, _ in column_names_and_types] @@ -1514,7 +1469,7 @@ def _sql_type_name(self, col): if col_type not in _SQL_TYPES: col_type = "string" - return _SQL_TYPES[col_type][self.pd_sql.flavor] + return _SQL_TYPES[col_type] class SQLiteDatabase(PandasSQL): @@ -1522,25 +1477,17 @@ class SQLiteDatabase(PandasSQL): Version of SQLDatabase to support sqlite connections (fallback without sqlalchemy). This should only be used internally. - For now still supports `flavor` argument to deal with 'mysql' database - for backwards compatibility, but this will be removed in future versions. - Parameters ---------- con : sqlite connection object """ - def __init__(self, con, flavor, is_cursor=False): + def __init__(self, con, flavor=None, is_cursor=False): + _validate_flavor_parameter(flavor) + self.is_cursor = is_cursor self.con = con - if flavor is None: - flavor = 'sqlite' - if flavor not in ['sqlite', 'mysql']: - raise NotImplementedError("flavors other than SQLite and MySQL " - "are not supported") - else: - self.flavor = flavor @contextmanager def run_transaction(self): @@ -1665,15 +1612,12 @@ def to_sql(self, frame, name, if_exists='fail', index=True, def has_table(self, name, schema=None): # TODO(wesm): unused? - # escape = _SQL_GET_IDENTIFIER[self.flavor] + # escape = _get_valid_sqlite_name # esc_name = escape(name) - wld = _SQL_WILDCARD[self.flavor] - flavor_map = { - 'sqlite': ("SELECT name FROM sqlite_master " - "WHERE type='table' AND name=%s;") % wld, - 'mysql': "SHOW TABLES LIKE %s" % wld} - query = flavor_map.get(self.flavor) + wld = '?' + query = ("SELECT name FROM sqlite_master " + "WHERE type='table' AND name=%s;") % wld return len(self.execute(query, [name, ]).fetchall()) > 0 @@ -1681,8 +1625,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - escape = _SQL_GET_IDENTIFIER[self.flavor] - drop_sql = "DROP TABLE %s" % escape(name) + drop_sql = "DROP TABLE %s" % _get_valid_sqlite_name(name) self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): @@ -1691,7 +1634,7 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): return str(table.sql_schema()) -def get_schema(frame, name, flavor='sqlite', keys=None, con=None, dtype=None): +def get_schema(frame, name, flavor=None, keys=None, con=None, dtype=None): """ Get the SQL db table schema for the given frame. @@ -1700,16 +1643,14 @@ def get_schema(frame, name, flavor='sqlite', keys=None, con=None, dtype=None): frame : DataFrame name : string name of SQL table - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. 
- 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy engines. keys : string or sequence, default: None columns to use a primary key con: an open SQL database connection object or a SQLAlchemy connectable Using SQLAlchemy makes it possible to use any DB supported by that library, default: None If a DBAPI2 object, only sqlite3 is supported. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be a SQLAlchemy type, or a string for sqlite3 fallback connection. diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index e5a49c5213a48..41be39f9abaa6 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -13,7 +13,7 @@ common methods, `_TestSQLAlchemyConn` tests the API with a SQLAlchemy Connection object. The different tested flavors (sqlite3, MySQL, PostgreSQL) derive from the base class - - Tests for the fallback mode (`TestSQLiteFallback` and `TestMySQLLegacy`) + - Tests for the fallback mode (`TestSQLiteFallback`) """ @@ -526,30 +526,29 @@ def test_read_sql_view(self): self._check_iris_loaded_frame(iris_frame) def test_to_sql(self): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn, flavor='sqlite') + sql.to_sql(self.test_frame1, 'test_frame1', self.conn) self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='sqlite'), + sql.has_table('test_frame1', self.conn), 'Table not written to DB') def test_to_sql_fail(self): sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') self.assertTrue( - sql.has_table('test_frame2', self.conn, flavor='sqlite'), + sql.has_table('test_frame2', self.conn), 'Table not written to DB') self.assertRaises(ValueError, sql.to_sql, self.test_frame1, - 'test_frame2', self.conn, flavor='sqlite', - if_exists='fail') + 'test_frame2', self.conn, if_exists='fail') def test_to_sql_replace(self): sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') # Add to table again sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, flavor='sqlite', if_exists='replace') + self.conn, if_exists='replace') self.assertTrue( - sql.has_table('test_frame3', self.conn, flavor='sqlite'), + sql.has_table('test_frame3', self.conn), 'Table not written to DB') num_entries = len(self.test_frame1) @@ -560,13 +559,13 @@ def test_to_sql_replace(self): def test_to_sql_append(self): sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') # Add to table again sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, flavor='sqlite', if_exists='append') + self.conn, if_exists='append') self.assertTrue( - sql.has_table('test_frame4', self.conn, flavor='sqlite'), + sql.has_table('test_frame4', self.conn), 'Table not written to DB') num_entries = 2 * len(self.test_frame1) @@ -576,26 +575,25 @@ def test_to_sql_append(self): num_rows, num_entries, "not the same number of rows as entries") def test_to_sql_type_mapping(self): - sql.to_sql(self.test_frame3, 'test_frame5', - self.conn, flavor='sqlite', index=False) + sql.to_sql(self.test_frame3, 'test_frame5', self.conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", self.conn) tm.assert_frame_equal(self.test_frame3, result) def test_to_sql_series(self): s = 
Series(np.arange(5, dtype='int64'), name='series') - sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False) + sql.to_sql(s, "test_series", self.conn, index=False) s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) tm.assert_frame_equal(s.to_frame(), s2) def test_to_sql_panel(self): panel = tm.makePanel() self.assertRaises(NotImplementedError, sql.to_sql, panel, - 'test_panel', self.conn, flavor='sqlite') + 'test_panel', self.conn) def test_roundtrip(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', - con=self.conn, flavor='sqlite') + con=self.conn) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -609,7 +607,7 @@ def test_roundtrip(self): def test_roundtrip_chunksize(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, - index=False, flavor='sqlite', chunksize=2) + index=False, chunksize=2) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -764,27 +762,25 @@ def test_integer_col_names(self): if_exists='replace') def test_get_schema(self): - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', - con=self.conn) + create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn) self.assertTrue('CREATE' in create_sql) def test_get_schema_dtypes(self): float_frame = DataFrame({'a': [1.1, 1.2], 'b': [2.1, 2.2]}) dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER' - create_sql = sql.get_schema(float_frame, 'test', 'sqlite', + create_sql = sql.get_schema(float_frame, 'test', con=self.conn, dtype={'b': dtype}) self.assertTrue('CREATE' in create_sql) self.assertTrue('INTEGER' in create_sql) def test_get_schema_keys(self): frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]}) - create_sql = sql.get_schema(frame, 'test', 'sqlite', - con=self.conn, keys='Col1') + create_sql = sql.get_schema(frame, 'test', con=self.conn, keys='Col1') constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' self.assertTrue(constraint_sentence in create_sql) # multiple columns as key (GH10385) - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', + create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn, keys=['A', 'B']) constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' self.assertTrue(constraint_sentence in create_sql) @@ -1044,8 +1040,8 @@ def test_sql_open_close(self): with tm.ensure_clean() as name: conn = self.connect(name) - sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, - flavor="sqlite", index=False) + sql.to_sql(self.test_frame3, "test_frame3_legacy", + conn, index=False) conn.close() conn = self.connect(name) @@ -1067,12 +1063,11 @@ def test_safe_names_warning(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space # warns on create table with spaces in names with tm.assert_produces_warning(): - sql.to_sql(df, "test_frame3_legacy", self.conn, - flavor="sqlite", index=False) + sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) def test_get_schema2(self): # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite') + create_sql = sql.get_schema(self.test_frame1, 'test') self.assertTrue('CREATE' in create_sql) def test_tquery(self): @@ -1098,7 +1093,7 @@ def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], utc=True)}) - db = sql.SQLiteDatabase(self.conn, self.flavor) 
+ db = sql.SQLiteDatabase(self.conn) table = sql.SQLiteTable("test_type", db, frame=df) schema = table.sql_schema() self.assertEqual(self._get_sqlite_column_type(schema, 'time'), @@ -1908,16 +1903,12 @@ def connect(cls): def setUp(self): self.conn = self.connect() - self.pandasSQL = sql.SQLiteDatabase(self.conn, 'sqlite') + self.pandasSQL = sql.SQLiteDatabase(self.conn) self._load_iris_data() self._load_test1_data() - def test_invalid_flavor(self): - self.assertRaises( - NotImplementedError, sql.SQLiteDatabase, self.conn, 'oracle') - def test_read_sql(self): self._read_sql_iris() @@ -1965,7 +1956,7 @@ def test_execute_sql(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False, flavor=self.flavor) + df.to_sql('test_date', self.conn, index=False) res = read_sql_query('SELECT * FROM test_date', self.conn) if self.flavor == 'sqlite': # comes back as strings @@ -1976,7 +1967,7 @@ def test_datetime_date(self): def test_datetime_time(self): # test support for datetime.time, GH #8341 df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False, flavor=self.flavor) + df.to_sql('test_time', self.conn, index=False) res = read_sql_query('SELECT * FROM test_time', self.conn) if self.flavor == 'sqlite': # comes back as strings @@ -2051,130 +2042,22 @@ def test_illegal_names(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) # Raise error on blank - self.assertRaises(ValueError, df.to_sql, "", self.conn, - flavor=self.flavor) + self.assertRaises(ValueError, df.to_sql, "", self.conn) for ndx, weird_name in enumerate( ['test_weird_name]', 'test_weird_name[', 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'', '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', '99beginswithnumber', '12345', u'\xe9']): - df.to_sql(weird_name, self.conn, flavor=self.flavor) + df.to_sql(weird_name, self.conn) sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name]) c_tbl = 'test_weird_col_name%d' % ndx - df2.to_sql(c_tbl, self.conn, flavor=self.flavor) + df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) -class TestMySQLLegacy(MySQLMixIn, TestSQLiteFallback): - """ - Test the legacy mode against a MySQL database. 
- - """ - flavor = 'mysql' - - @classmethod - def setUpClass(cls): - cls.setup_driver() - - # test connection - try: - cls.connect() - except cls.driver.err.OperationalError: - raise nose.SkipTest( - "{0} - can't connect to MySQL server".format(cls)) - - @classmethod - def setup_driver(cls): - try: - import pymysql - cls.driver = pymysql - except ImportError: - raise nose.SkipTest('pymysql not installed') - - @classmethod - def connect(cls): - return cls.driver.connect(host='127.0.0.1', user='root', passwd='', - db='pandas_nosetest') - - def _count_rows(self, table_name): - cur = self._get_exec() - cur.execute( - "SELECT count(*) AS count_1 FROM %s" % table_name) - rows = cur.fetchall() - return rows[0][0] - - def setUp(self): - try: - self.conn = self.connect() - except self.driver.err.OperationalError: - raise nose.SkipTest("Can't connect to MySQL server") - - self.pandasSQL = sql.SQLiteDatabase(self.conn, 'mysql') - - self._load_iris_data() - self._load_test1_data() - - def test_a_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn, - flavor='mysql') - self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='mysql'), - 'Table not written to DB') - - def _get_index_columns(self, tbl_name): - ixs = sql.read_sql_query( - "SHOW INDEX IN %s" % tbl_name, self.conn) - ix_cols = {} - for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - if ix_name not in ix_cols: - ix_cols[ix_name] = [] - ix_cols[ix_name].append(ix_col) - return list(ix_cols.values()) - - # TODO: cruft? - # def test_to_sql_save_index(self): - # self._to_sql_save_index() - - # for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - # if ix_name not in ix_cols: - # ix_cols[ix_name] = [] - # ix_cols[ix_name].append(ix_col) - # return ix_cols.values() - - def test_to_sql_save_index(self): - self._to_sql_save_index() - - def test_illegal_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) - - # These tables and columns should be ok - for ndx, ok_name in enumerate(['99beginswithnumber', '12345']): - df.to_sql(ok_name, self.conn, flavor=self.flavor, index=False, - if_exists='replace') - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', ok_name]) - - df2.to_sql('test_ok_col_name', self.conn, - flavor=self.flavor, index=False, - if_exists='replace') - - # For MySQL, these should raise ValueError - for ndx, illegal_name in enumerate( - ['test_illegal_name]', 'test_illegal_name[', - 'test_illegal_name`', 'test_illegal_name"', - 'test_illegal_name\'', '']): - self.assertRaises(ValueError, df.to_sql, illegal_name, self.conn, - flavor=self.flavor, index=False) - - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', illegal_name]) - self.assertRaises(ValueError, df2.to_sql, - 'test_illegal_col_name%d' % ndx, - self.conn, flavor=self.flavor, index=False) - - # ----------------------------------------------------------------------------- # -- Old tests from 0.13.1 (before refactor using sqlalchemy) @@ -2228,7 +2111,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.ix[0, 0] = np.nan - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(create_sql) @@ -2247,7 +2130,7 @@ def test_write_row_by_row(self): def test_execute(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(create_sql) ins = "INSERT INTO test VALUES (?, 
?, ?, ?)" @@ -2262,7 +2145,7 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') lines = create_sql.splitlines() for l in lines: tokens = l.split(' ') @@ -2270,7 +2153,7 @@ def test_schema(self): self.assertTrue(tokens[1] == 'DATETIME') frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],) + create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) lines = create_sql.splitlines() self.assertTrue('PRIMARY KEY ("A", "B")' in create_sql) cur = self.conn.cursor() @@ -2425,44 +2308,68 @@ def clean_up(test_table_to_drop): frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail') + sql.to_sql(frame=df_if_exists_1, con=self.conn, + name=table_name, if_exists='fail') self.assertRaises(ValueError, sql.to_sql, frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='sqlite', if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='sqlite', if_exists='append', index=False) + if_exists='append', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) +class TestSQLFlavorDeprecation(tm.TestCase): + """ + gh-13611: test that the 'flavor' parameter + is appropriately deprecated by checking the + functions that directly raise the warning + """ + + con = 1234 # don't need real connection for this + funcs = ['SQLiteDatabase', 'pandasSQL_builder'] + + def test_unsupported_flavor(self): + msg = 'is not supported' + + for func in self.funcs: + tm.assertRaisesRegexp(ValueError, msg, getattr(sql, func), + self.con, flavor='mysql') + + def test_deprecated_flavor(self): + for func in self.funcs: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + getattr(sql, func)(self.con, flavor='sqlite') + + +@unittest.skip("gh-13611: there is no support for MySQL " + "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn, tm.TestCase): @classmethod @@ -2531,7 +2438,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.ix[0, 0] = np.nan drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) @@ -2553,7 +2460,7 @@ def test_chunksize_read_type(self): drop_sql = "DROP TABLE IF EXISTS test" cur = self.conn.cursor() cur.execute(drop_sql) - sql.to_sql(frame, name='test', 
con=self.conn, flavor='mysql') + sql.to_sql(frame, name='test', con=self.conn) query = "select * from test" chunksize = 5 chunk_gen = pd.read_sql_query(sql=query, con=self.conn, @@ -2565,7 +2472,7 @@ def test_execute(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") @@ -2584,7 +2491,7 @@ def test_execute(self): def test_schema(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') lines = create_sql.splitlines() for l in lines: tokens = l.split(' ') @@ -2593,7 +2500,7 @@ def test_schema(self): frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],) + create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) lines = create_sql.splitlines() self.assertTrue('PRIMARY KEY (`A`, `B`)' in create_sql) cur = self.conn.cursor() @@ -2666,8 +2573,7 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + sql.to_sql(frame, name='test_table', con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. @@ -2687,7 +2593,7 @@ def _check_roundtrip(self, frame): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) sql.to_sql(frame2, name='test_table2', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) result = sql.read_sql("select * from test_table2", self.conn, index_col='Idx') expected = frame.copy() @@ -2707,7 +2613,7 @@ def test_tquery(self): cur = self.conn.cursor() cur.execute(drop_sql) sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) result = sql.tquery("select A from test_table", self.conn) expected = Series(frame.A.values, frame.index) # not to have name result = Series(result, frame.index) @@ -2733,7 +2639,7 @@ def test_uquery(self): cur = self.conn.cursor() cur.execute(drop_sql) sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' self.assertEqual(sql.uquery(stmt, con=self.conn), 1) @@ -2753,7 +2659,7 @@ def test_keyword_as_column_names(self): _skip_if_no_pymysql() df = DataFrame({'From': np.ones(5)}) sql.to_sql(df, con=self.conn, name='testkeywords', - if_exists='replace', flavor='mysql', index=False) + if_exists='replace', index=False) def test_if_exists(self): _skip_if_no_pymysql() @@ -2776,39 +2682,37 @@ def clean_up(test_table_to_drop): frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertRaises(ValueError, sql.to_sql, frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', 
if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='mysql', if_exists='replace', index=False) + if_exists='replace', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='mysql', if_exists='append', index=False) + if_exists='append', index=False) self.assertEqual(sql.tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) From 786edc7ed737d9f912613f29a62997017e729a37 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 20 Jul 2016 13:23:02 -0400 Subject: [PATCH 125/359] ENH: add time-window capability to .rolling xref #13327 closes #936 Author: Jeff Reback Closes #13513 from jreback/rolling and squashes the following commits: d8f3d73 [Jeff Reback] ENH: add time-window capability to .rolling --- ci/lint.sh | 14 +- doc/source/computation.rst | 85 ++ doc/source/timeseries.rst | 6 +- doc/source/whatsnew/v0.19.0.txt | 63 +- pandas/core/generic.py | 5 +- pandas/core/window.py | 363 +++++- pandas/tests/test_window.py | 573 ++++++++- pandas/window.pyx | 1921 +++++++++++++++++++++---------- setup.py | 6 +- 9 files changed, 2349 insertions(+), 687 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 9f582f72fcdd7..144febcfcece5 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -17,7 +17,19 @@ if [ "$LINT" ]; then fi done - echo "Linting DONE" + echo "Linting *.py DONE" + + echo "Linting *.pyx" + for path in 'window.pyx' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126,E128 + if [ $? -ne "0" ]; then + RET=1 + fi + + done + echo "Linting *.pyx DONE" echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 59675e33e724b..12e0ecfba97da 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -391,6 +391,91 @@ For some windowing functions, additional parameters must be specified: such that the weights are normalized with respect to each other. Weights of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. +.. _stats.moments.ts: + +Time-aware Rolling +~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +New in version 0.19.0 are the ability to pass an offset (or convertible) to a ``.rolling()`` method and have it produce +variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring +within the indicated time delta. + +This can be particularly useful for a non-regular time frequency index. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. 
ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + + +.. ipython:: python + + + dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + + dft + dft.rolling(2).sum() + + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + +.. _stats.moments.ts-versus-resampling: + +Time-aware Rolling vs. Resampling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They +both operate and perform reductive operations on time-indexed pandas objects. + +When using ``.rolling()`` with an offset. The offset is a time-delta. Take a backwards-in-time looking window, and +aggregate all of the values in that window (including the end-point, but not the start-point). This is the new value +at that point in the result. These are variable sized windows in time-space for each point of the input. You will get +a same sized result as the input. + +When using ``.resample()`` with an offset. Construct a new index that is the frequency of the offset. For each frequency +bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this +aggregation is the output for that frequency point. The windows are fixed size size in the frequency space. Your result +will have the shape of a regular frequency between the min and the max of the original input object. + +To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. + Centering Windows ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index f6a1e169afe9d..fd31eb1b584a8 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1284,7 +1284,11 @@ performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. -``resample`` is a time-based groupby, followed by a reduction method on each of its groups. +``.resample()`` is a time-based groupby, followed by a reduction method on each of its groups. + +.. 
note:: + + ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion `here ` See some :ref:`cookbook examples ` for some advanced strategies diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 57b0d8895f67b..cdae0d5c27c7d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -3,16 +3,17 @@ v0.19.0 (August ??, 2016) ------------------------- -This is a major release from 0.18.2 and includes a small number of API changes, several new features, +This is a major release from 0.18.1 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` +- ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` -.. contents:: What's new in v0.18.2 +.. contents:: What's new in v0.19.0 :local: :backlinks: none @@ -131,6 +132,64 @@ that forward filling happens automatically taking the most recent non-NaN value. This returns a merged DataFrame with the entries in the same order as the original left passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. +.. _whatsnew_0190.enhancements.rolling_ts: + +``.rolling()`` are now time-series aware +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`) +See the full documentation :ref:`here `. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + +.. ipython:: python + + + dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + + dft + dft.rolling(2).sum() + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + .. 
_whatsnew_0190.enhancements.read_csv_dupe_col_names_support: :func:`read_csv` has improved support for duplicate column names diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e59bec2dbd7e0..dd4be571ef2b4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5342,11 +5342,12 @@ def _add_series_or_dataframe_operations(cls): @Appender(rwindow.rolling.__doc__) def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, axis=0): + win_type=None, on=None, axis=0): axis = self._get_axis_number(axis) return rwindow.rolling(self, window=window, min_periods=min_periods, freq=freq, - center=center, win_type=win_type, axis=axis) + center=center, win_type=win_type, + on=on, axis=axis) cls.rolling = rolling diff --git a/pandas/core/window.py b/pandas/core/window.py index bc4d34529287b..9e2a27adc25a7 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -11,7 +11,11 @@ import numpy as np from collections import defaultdict -from pandas.types.generic import ABCSeries, ABCDataFrame +from pandas.types.generic import (ABCSeries, + ABCDataFrame, + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex) from pandas.types.common import (is_integer, is_bool, is_float_dtype, @@ -26,11 +30,14 @@ GroupByMixin) import pandas.core.common as com import pandas._window as _window +from pandas.tseries.offsets import DateOffset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import Substitution, Appender +from pandas.util.decorators import (Substitution, Appender, + cache_readonly) from textwrap import dedent + _shared_docs = dict() _doc_template = """ @@ -47,19 +54,21 @@ class _Window(PandasObject, SelectionMixin): _attributes = ['window', 'min_periods', 'freq', 'center', 'win_type', - 'axis'] + 'axis', 'on'] exclusions = set() def __init__(self, obj, window=None, min_periods=None, freq=None, - center=False, win_type=None, axis=0, **kwargs): + center=False, win_type=None, axis=0, on=None, **kwargs): if freq is not None: warnings.warn("The freq kw is deprecated and will be removed in a " "future version. 
You can resample prior to passing " "to a window function", FutureWarning, stacklevel=3) + self.__dict__.update(kwargs) self.blocks = [] self.obj = obj + self.on = on self.window = window self.min_periods = min_periods self.freq = freq @@ -72,6 +81,18 @@ def __init__(self, obj, window=None, min_periods=None, freq=None, def _constructor(self): return Window + @property + def is_datetimelike(self): + return None + + @property + def _on(self): + return None + + @property + def is_freq_type(self): + return self.win_type == 'freq' + def validate(self): if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") @@ -83,6 +104,7 @@ def _convert_freq(self, how=None): """ resample according to the how, return a new object """ obj = self._selected_obj + index = None if (self.freq is not None and isinstance(obj, (ABCSeries, ABCDataFrame))): if how is not None: @@ -92,13 +114,24 @@ def _convert_freq(self, how=None): stacklevel=6) obj = obj.resample(self.freq).aggregate(how or 'asfreq') - return obj + + return obj, index def _create_blocks(self, how): """ split data into blocks & return conformed data """ - obj = self._convert_freq(how) - return obj.as_blocks(copy=False).values(), obj + obj, index = self._convert_freq(how) + if index is not None: + index = self._on + + # filter out the on from the object + if self.on is not None: + if obj.ndim == 2: + obj = obj.reindex(columns=obj.columns.difference([self.on]), + copy=False) + blocks = obj.as_blocks(copy=False).values() + + return blocks, obj, index def _gotitem(self, key, ndim, subset=None): """ @@ -152,6 +185,21 @@ def __unicode__(self): return "{klass} [{attrs}]".format(klass=self._window_type, attrs=','.join(attrs)) + def _get_index(self, index=None): + """ + Return index as ndarrays + + Returns + ------- + tuple of (index, index_as_ndarray) + """ + + if self.is_freq_type: + if index is None: + index = self._on + return index, index.asi8 + return index, index + def _prep_values(self, values=None, kill_inf=True, how=None): if values is None: @@ -187,8 +235,8 @@ def _wrap_result(self, result, block=None, obj=None): if obj is None: obj = self._selected_obj - index = obj.index + if isinstance(result, np.ndarray): # coerce if necessary @@ -215,6 +263,9 @@ def _wrap_results(self, results, blocks, obj): obj : conformed data (may be resampled) """ + from pandas import Series + from pandas.core.index import _ensure_index + final = [] for result, block in zip(results, blocks): @@ -223,9 +274,31 @@ def _wrap_results(self, results, blocks, obj): return result final.append(result) + # if we have an 'on' column + # we want to put it back into the results + # in the same location + columns = self._selected_obj.columns + if self.on is not None \ + and not self._on.equals(obj.index): + + name = self._on.name + final.append(Series(self._on, index=obj.index, name=name)) + + if self._selection is not None: + + selection = _ensure_index(self._selection) + + # need to reorder to include original location of + # the on column (if its not already there) + if name not in selection: + columns = self.obj.columns + indexer = columns.get_indexer(selection.tolist() + [name]) + columns = columns.take(sorted(indexer)) + if not len(final): return obj.astype('float64') - return pd.concat(final, axis=1).reindex(columns=obj.columns) + return pd.concat(final, axis=1).reindex(columns=columns, + copy=False) def _center_window(self, result, window): """ center the result in the window """ @@ -271,18 +344,24 @@ def aggregate(self, arg, *args, 
**kwargs): class Window(_Window): """ - Provides rolling transformations. + Provides rolling window calculcations. .. versionadded:: 0.18.0 Parameters ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. + window : int, or offset + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. This is + new in 0.19.0 min_periods : int, default None Minimum number of observations in window required to have a value - (otherwise result is NA). + (otherwise result is NA). For a window that is specified by an offset, + this will default to 1. freq : string or DateOffset object, optional (default None) (DEPRECATED) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. @@ -290,11 +369,91 @@ class Window(_Window): Set the labels at the center of the window. win_type : string, default None Provide a window type. See the notes below. - axis : int, default 0 + on : string, optional + For a DataFrame, column on which to calculate + the rolling window, rather than the index + + .. versionadded:: 0.19.0 + + axis : int or string, default 0 Returns ------- - a Window sub-classed for the particular operation + a Window or Rolling sub-classed for the particular operation + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum() + B + 0 NaN + 1 1.0 + 2 2.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. + + >>> df.rolling(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicity set the min_periods + + >>> df.rolling(2, min_periods=1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + ....: index = [pd.Timestamp('20130101 09:00:00'), + ....: pd.Timestamp('20130101 09:00:02'), + ....: pd.Timestamp('20130101 09:00:03'), + ....: pd.Timestamp('20130101 09:00:05'), + ....: pd.Timestamp('20130101 09:00:06')]) + + >>> df + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. + + >>> df.rolling('2s').sum() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 Notes ----- @@ -305,7 +464,10 @@ class Window(_Window): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - The recognized window types are: + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: * ``boxcar`` * ``triang`` @@ -321,7 +483,8 @@ class Window(_Window): * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). 
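    For the window types above that take an extra argument (for example ``std``
    for ``gaussian``), the argument is passed to the aggregation call rather
    than to ``.rolling()`` itself. A minimal sketch, assuming scipy is
    available (output omitted, values illustrative only):

        s = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])
        # std is consumed by the weighted aggregation, not by .rolling()
        s.rolling(window=3, win_type='gaussian').mean(std=0.5)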
-""" + + """ def validate(self): super(Window, self).validate() @@ -329,7 +492,7 @@ def validate(self): window = self.window if isinstance(window, (list, tuple, np.ndarray)): pass - elif com.is_integer(window): + elif is_integer(window): if window < 0: raise ValueError("window must be non-negative") try: @@ -400,7 +563,7 @@ def _apply_window(self, mean=True, how=None, **kwargs): window = self._prep_window(**kwargs) center = self.center - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: @@ -529,7 +692,8 @@ def _apply(self, func, name=None, window=None, center=None, if check_minp is None: check_minp = _use_window - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) + index, indexi = self._get_index(index=index) results = [] for b in blocks: try: @@ -551,9 +715,10 @@ def _apply(self, func, name=None, window=None, center=None, def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) - # GH #12373: rolling functions error on float32 data - return cfunc(_ensure_float64(arg), - window, minp, **kwargs) + # ensure we are only rolling on floats + arg = _ensure_float64(arg) + return cfunc(arg, + window, minp, indexi, **kwargs) # calculation function if center: @@ -587,11 +752,13 @@ class _Rolling_and_Expanding(_Rolling): observations inside provided window.""" def count(self): - obj = self._convert_freq() + + blocks, obj, index = self._create_blocks(how=None) + index, indexi = self._get_index(index=index) + window = self._get_window() window = min(window, len(obj)) if not self.center else window - blocks, obj = self._create_blocks(how=None) results = [] for b in blocks: @@ -625,10 +792,12 @@ def apply(self, func, args=(), kwargs={}): _level = kwargs.pop('_level', None) # noqa window = self._get_window() offset = _offset(window, self.center) + index, indexi = self._get_index() def f(arg, window, min_periods): minp = _use_window(min_periods, window) - return _window.roll_generic(arg, window, minp, offset, func, args, + return _window.roll_generic(arg, window, minp, indexi, + offset, func, args, kwargs) return self._apply(f, func, args=args, kwargs=kwargs, @@ -695,10 +864,12 @@ def median(self, how=None, **kwargs): def std(self, ddof=1, *args, **kwargs): nv.validate_window_func('std', args, kwargs) window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(_window.roll_var(arg, window, minp, ddof)) + return _zsqrt(_window.roll_var(arg, window, minp, indexi, + ddof)) return self._apply(f, 'std', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -740,10 +911,12 @@ def kurt(self, **kwargs): def quantile(self, quantile, **kwargs): window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return _window.roll_quantile(arg, window, minp, quantile) + return _window.roll_quantile(arg, window, minp, indexi, + quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) @@ -823,43 +996,63 @@ def _get_corr(a, b): class Rolling(_Rolling_and_Expanding): - """ - Provides rolling window calculcations. - - .. versionadded:: 0.18.0 - Parameters - ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. 
- min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. - Specified as a frequency string or DateOffset object. - center : boolean, default False - Set the labels at the center of the window. - axis : int, default 0 + @cache_readonly + def is_datetimelike(self): + return isinstance(self._on, + (ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex)) + + @cache_readonly + def _on(self): + + if self.on is None: + return self.obj.index + elif (isinstance(self.obj, ABCDataFrame) and + self.on in self.obj.columns): + return pd.Index(self.obj[self.on]) + else: + raise ValueError("invalid on specified as {0}, " + "must be a column (if DataFrame) " + "or None".format(self.on)) - Returns - ------- - a Window sub-classed for the particular operation + def validate(self): + super(Rolling, self).validate() - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. + # we allow rolling on a datetimelike index + if (self.is_datetimelike and + isinstance(self.window, (compat.string_types, DateOffset))): - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - """ + # must be monotonic for on + if not self._on.is_monotonic: + formatted = self.on or 'index' + raise ValueError("{0} must be " + "monotonic".format(formatted)) - def validate(self): - super(Rolling, self).validate() - if not is_integer(self.window): + from pandas.tseries.frequencies import to_offset + try: + freq = to_offset(self.window) + except (TypeError, ValueError): + raise ValueError("passed window {0} in not " + "compat with a datetimelike " + "index".format(self.window)) + + # we don't allow center + if self.center: + raise NotImplementedError("center is not implemented " + "for datetimelike and offset " + "based windows") + + # this will raise ValueError on non-fixed freqs + self.window = freq.nanos + self.win_type = 'freq' + + # min_periods must be an integer + if self.min_periods is None: + self.min_periods = 1 + + elif not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: raise ValueError("window must be non-negative") @@ -876,6 +1069,11 @@ def aggregate(self, arg, *args, **kwargs): @Appender(_doc_template) @Appender(_shared_docs['count']) def count(self): + + # different impl for freq counting + if self.is_freq_type: + return self._apply('roll_count', 'count') + return super(Rolling, self).count() @Substitution(name='rolling') @@ -993,12 +1191,31 @@ class Expanding(_Rolling_and_Expanding): Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. - axis : int, default 0 + axis : int or string, default 0 Returns ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.expanding(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + Notes ----- By default, the result is set to the right edge of the window. 
This can be @@ -1205,6 +1422,25 @@ class EWM(_Rolling): ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + Notes ----- Exactly one of center of mass, span, half-life, and alpha must be provided. @@ -1248,6 +1484,7 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, self.adjust = adjust self.ignore_na = ignore_na self.axis = axis + self.on = None @property def _constructor(self): @@ -1276,7 +1513,7 @@ def _apply(self, func, how=None, **kwargs): y : type of input argument """ - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3693ebdb12e2f..7a35682eee3b0 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -11,7 +11,7 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, - notnull, concat) + notnull, concat, Timestamp) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow @@ -101,7 +101,7 @@ def tests_skip_nuisance(self): expected = pd.concat([r[['A', 'B']].sum(), df[['C']]], axis=1) result = r.sum() - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -319,6 +319,13 @@ class TestRolling(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.rolling(2).sum() + df.rolling(2, min_periods=1).sum() + def test_constructor(self): # GH 12669 @@ -372,6 +379,12 @@ class TestExpanding(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.expanding(2).sum() + def test_constructor(self): # GH 12669 @@ -408,6 +421,12 @@ class TestEWM(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.ewm(com=0.5).mean() + def test_constructor(self): for o in [self.series, self.frame]: c = o.ewm @@ -565,6 +584,7 @@ def _create_data(self): def test_dtypes(self): self._create_data() for f_name, d_name in product(self.funcs.keys(), self.data.keys()): + f = self.funcs[f_name] d = self.data[d_name] exp = self.expects[d_name][f_name] @@ -958,6 +978,7 @@ def test_rolling_median(self): name='median') def test_rolling_min(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_min, np.min, name='min') @@ -970,6 +991,7 @@ def test_rolling_min(self): window=3, min_periods=5) def test_rolling_max(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_max, np.max, name='max') @@ -2890,6 +2912,7 @@ def test_rolling_median_memory_error(self): Series(np.random.randn(n)).rolling(window=2, center=False).median() def test_rolling_min_max_numeric_types(self): + # GH12373 types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] types_test.extend([np.dtype("{}{}".format(sign, width)) @@ -2961,6 +2984,7 @@ def test_rolling(self): r = g.rolling(window=4) for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() 
expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -3007,6 +3031,7 @@ def test_expanding(self): r = g.expanding() for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) tm.assert_frame_equal(result, expected) @@ -3047,3 +3072,547 @@ def test_expanding_apply(self): result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) tm.assert_frame_equal(result, expected) + + +class TestRollingTS(tm.TestCase): + + # rolling time-series friendly + # xref GH13327 + + def setUp(self): + + self.regular = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}).set_index('A') + + self.ragged = DataFrame({'B': range(5)}) + self.ragged.index = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=[Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')]) + df + df.rolling('2s').sum() + + def test_valid(self): + + df = self.regular + + # not a valid freq + with self.assertRaises(ValueError): + df.rolling(window='foobar') + + # not a datetimelike index + with self.assertRaises(ValueError): + df.reset_index().rolling(window='foobar') + + # non-fixed freqs + for freq in ['2MS', pd.offsets.MonthBegin(2)]: + with self.assertRaises(ValueError): + df.rolling(window=freq) + + for freq in ['1D', pd.offsets.Day(2), '2ms']: + df.rolling(window=freq) + + # non-integer min_periods + for minp in [1.0, 'foo', np.array([1, 2, 3])]: + with self.assertRaises(ValueError): + df.rolling(window='1D', min_periods=minp) + + # center is not implemented + with self.assertRaises(NotImplementedError): + df.rolling(window='1D', center=True) + + def test_on(self): + + df = self.regular + + # not a valid column + with self.assertRaises(ValueError): + df.rolling(window='2s', on='foobar') + + # column is valid + df = df.copy() + df['C'] = pd.date_range('20130101', periods=len(df)) + df.rolling(window='2d', on='C').sum() + + # invalid columns + with self.assertRaises(ValueError): + df.rolling(window='2d', on='B') + + # ok even though on non-selected + df.rolling(window='2d', on='C').B.sum() + + def test_monotonic_on(self): + + # on/index must be monotonic + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}) + + self.assertTrue(df.A.is_monotonic) + df.rolling('2s', on='A').sum() + + df = df.set_index('A') + self.assertTrue(df.index.is_monotonic) + df.rolling('2s').sum() + + # non-monotonic + df.index = reversed(df.index.tolist()) + self.assertFalse(df.index.is_monotonic) + + with self.assertRaises(ValueError): + df.rolling('2s').sum() + + df = df.reset_index() + with self.assertRaises(ValueError): + df.rolling('2s', on='A').sum() + + def test_frame_on(self): + + df = DataFrame({'B': range(5), + 'C': pd.date_range('20130101 09:00:00', + periods=5, + freq='3s')}) + + df['A'] = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + # we are doing simulating using 'on' + expected = (df.set_index('A') + .rolling('2s') + .B + .sum() + .reset_index(drop=True) + ) + + 
result = (df.rolling('2s', on='A') + .B + .sum() + ) + tm.assert_series_equal(result, expected) + + # test as a frame + # we should be ignoring the 'on' as an aggregation column + # note that the expected is setting, computing, and reseting + # so the columns need to be switched compared + # to the actual result where they are ordered as in the + # original + expected = (df.set_index('A') + .rolling('2s')[['B']] + .sum() + .reset_index()[['B', 'A']] + ) + + result = (df.rolling('2s', on='A')[['B']] + .sum() + ) + tm.assert_frame_equal(result, expected) + + def test_frame_on2(self): + + # using multiple aggregation columns + df = DataFrame({'A': [0, 1, 2, 3, 4], + 'B': [0, 1, 2, np.nan, 4], + 'C': pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')])}, + columns=['A', 'C', 'B']) + + expected1 = DataFrame({'A': [0., 1, 3, 3, 7], + 'B': [0, 1, 3, np.nan, 4], + 'C': df['C']}, + columns=['A', 'C', 'B']) + + result = df.rolling('2s', on='C').sum() + expected = expected1 + tm.assert_frame_equal(result, expected) + + expected = Series([0, 1, 3, np.nan, 4], name='B') + result = df.rolling('2s', on='C').B.sum() + tm.assert_series_equal(result, expected) + + expected = expected1[['A', 'B', 'C']] + result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum() + tm.assert_frame_equal(result, expected) + + def test_basic_regular(self): + + df = self.regular.copy() + + df.index = pd.date_range('20130101', periods=5, freq='D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='1D').sum() + tm.assert_frame_equal(result, expected) + + df.index = pd.date_range('20130101', periods=5, freq='2D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1).sum() + result = df.rolling(window='2D').sum() + tm.assert_frame_equal(result, expected) + + def test_min_periods(self): + + # compare for min_periods + df = self.regular + + # these slightly different + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s').sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + def test_ragged_sum(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 3, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, np.nan, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s').sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 9] + 
tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=3).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 10] + tm.assert_frame_equal(result, expected) + + def test_ragged_mean(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_median(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_quantile(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_std(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.5] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + tm.assert_frame_equal(result, expected) + + def test_ragged_var(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.25] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.] 
+ tm.assert_frame_equal(result, expected) + + def test_ragged_skew(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_kurt(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 4 + [-1.2] + tm.assert_frame_equal(result, expected) + + def test_ragged_count(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 1, 1, 1] + tm.assert_frame_equal(result, expected) + + df = self.ragged + result = df.rolling(window='1s').count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 2, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).count() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 2, np.nan, 2] + tm.assert_frame_equal(result, expected) + + def test_regular_min(self): + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [0.0, 1, 2, 3, 4]}).set_index('A') + result = df.rolling('1s').min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [5, 4, 3, 4, 5]}).set_index('A') + + tm.assert_frame_equal(result, expected) + result = df.rolling('2s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling('5s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 3] + tm.assert_frame_equal(result, expected) + + def test_ragged_min(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_perf_min(self): + + N = 10000 + + dfp = DataFrame({'B': np.random.randn(N)}, + index=pd.date_range('20130101', + periods=N, + freq='s')) + expected = dfp.rolling(2, min_periods=1).min() + result = dfp.rolling('2s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + expected = dfp.rolling(200, min_periods=1).min() + result = dfp.rolling('200s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + def test_ragged_max(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', 
min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_ragged_apply(self): + + df = self.ragged + + f = lambda x: 1 + result = df.rolling(window='1s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + def test_all(self): + + # simple comparision of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window='1s') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = er.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = r.apply(lambda x: 1) + expected = er.apply(lambda x: 1) + tm.assert_frame_equal(result, expected) + + def test_all2(self): + + # more sophisticated comparision of integer vs. + # time-based windowing + df = DataFrame({'B': np.arange(50)}, + index=pd.date_range('20130101', + periods=50, freq='H') + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window='5H') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() + expected = df.groupby(df.index.day).apply( + agg_by_day).reset_index(level=0, drop=True) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/window.pyx b/pandas/window.pyx index bfe9152477a40..8235d68e2a88b 100644 --- a/pandas/window.pyx +++ b/pandas/window.pyx @@ -1,3 +1,6 @@ +# cython: profile=False +# cython: boundscheck=False, wraparound=False, cdivision=True + from numpy cimport * cimport numpy as np import numpy as np @@ -51,9 +54,10 @@ cdef double nan = NaN cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -# this is our util.pxd from util cimport numeric +from skiplist cimport * + cdef extern from "src/headers/math.h": double sqrt(double x) nogil int signbit(double) nogil @@ -69,16 +73,37 @@ include "skiplist.pyx" # - In Cython x * x is faster than x ** 2 for C types, this should be # periodically revisited to see if it's still true. 
# -# - -def _check_minp(win, minp, N, floor=1): + +def _check_minp(win, minp, N, floor=None): + """ + Parameters + ---------- + win: int + minp: int or None + N: len of window + floor: int, optional + default 1 + + Returns + ------- + minimum period + """ + + if minp is None: + minp = 1 + if not util.is_integer_object(minp): + raise ValueError("min_periods must be an integer") if minp > win: - raise ValueError('min_periods (%d) must be <= window (%d)' - % (minp, win)) + raise ValueError("min_periods (%d) must be <= " + "window (%d)" % (minp, win)) elif minp > N: minp = N + 1 elif minp < 0: raise ValueError('min_periods must be >= 0') + if floor is None: + floor = 1 + return max(minp, floor) # original C implementation by N. Devillard. @@ -96,757 +121,1227 @@ def _check_minp(win, minp, N, floor=1): # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -#------------------------------------------------------------------------------- -# Rolling sum -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_sum(ndarray[double_t] input, int win, int minp): - cdef double val, prev, sum_x = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) +# ---------------------------------------------------------------------- +# The indexer objects for rolling +# These define start/end indexers to compute offsets - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - # Not NaN - if val == val: - nobs += 1 - sum_x += val +cdef class WindowIndexer: - output[i] = NaN + cdef: + ndarray start, end + int64_t N, minp, win + bint is_variable - for i from minp - 1 <= i < N: - val = input[i] + def get_data(self): + return (self.start, self.end, self.N, + self.win, self.minp, + self.is_variable) - if val == val: - nobs += 1 - sum_x += val - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 +cdef class MockFixedWindowIndexer(WindowIndexer): + """ - if nobs >= minp: - output[i] = sum_x - else: - output[i] = NaN + We are just checking parameters of the indexer, + and returning a consistent API with fixed/variable + indexers. 
- return output + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring -#------------------------------------------------------------------------------- -# Rolling mean -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_mean(ndarray[double_t] input, - int win, int minp): - cdef: - double val, prev, result, sum_x = 0 - Py_ssize_t nobs = 0, i, neg_ct = 0 - Py_ssize_t N = len(input) + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): - cdef ndarray[double_t] output = np.empty(N, dtype=float) - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) + self.start = np.empty(0, dtype='int64') + self.end = np.empty(0, dtype='int64') + self.win = win - # Not NaN - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - output[i] = NaN +cdef class FixedWindowIndexer(WindowIndexer): + """ + create a fixed length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments - for i from minp - 1 <= i < N: - val = input[i] + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring the unit - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): + cdef ndarray start_s, start_e, end_s, end_e - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - if signbit(prev): - neg_ct -= 1 + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) - if nobs >= minp: - result = sum_x / nobs - if neg_ct == 0 and result < 0: - # all positive - output[i] = 0 - elif neg_ct == nobs and result > 0: - # all negative - output[i] = 0 - else: - output[i] = result - else: - output[i] = NaN + start_s = np.zeros(win, dtype='int64') + start_e = np.arange(win, self.N, dtype='int64') - win + 1 + self.start = np.concatenate([start_s, start_e]) - return output + end_s = np.arange(win, dtype='int64') + 1 + end_e = start_e + win + self.end = np.concatenate([end_s, end_e]) + self.win = win -#------------------------------------------------------------------------------- -# Exponentially weighted moving average -def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): +cdef class VariableWindowIndexer(WindowIndexer): """ - Compute exponentially-weighted moving average using center-of-mass. 
+ create a variable length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments Parameters ---------- - input : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: ndarray + index of the input - Returns - ------- - y : ndarray """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + ndarray index): - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output + self.is_variable = 1 + self.N = len(index) + self.minp = _check_minp(win, minp, self.N) - minp = max(minp, 1) + self.start = np.empty(self.N, dtype='int64') + self.start.fill(-1) - cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - cdef Py_ssize_t i, nobs + self.end = np.empty(self.N, dtype='int64') + self.end.fill(-1) - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha + self.build(index, win) - weighted_avg = input[0] - is_observation = (weighted_avg == weighted_avg) - nobs = int(is_observation) - output[0] = weighted_avg if (nobs >= minp) else NaN - old_wt = 1. + # max window size + self.win = (self.end - self.start).max() - for i from 1 <= i < N: - cur = input[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - if is_observation or (not ignore_na): - old_wt *= old_wt_factor - if is_observation: - if weighted_avg != cur: # avoid numerical errors on constant series - weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif is_observation: - weighted_avg = cur + def build(self, ndarray[int64_t] index, int64_t win): - output[i] = weighted_avg if (nobs >= minp) else NaN + cdef: + ndarray[int64_t] start, end + int64_t start_bound, end_bound, N + Py_ssize_t i, j - return output + start = self.start + end = self.end + N = self.N -#------------------------------------------------------------------------------- -# Exponentially weighted moving covariance + start[0] = 0 + end[0] = 1 -def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, N): + end_bound = index[i] + start_bound = index[i] - win + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + +def get_window_indexer(input, win, minp, index, floor=None, + use_mock=True): """ - Compute exponentially-weighted moving variance using center-of-mass. 
+ return the correct window indexer for the computation Parameters ---------- - input_x : ndarray (float64 type) - input_y : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - bias: int + input: 1d ndarray + win: integer, window size + minp: integer, minimum periods + index: 1d ndarray, optional + index to the input array + floor: optional + unit for flooring the unit + use_mock: boolean, default True + if we are a fixed indexer, return a mock indexer + instead of the FixedWindow Indexer. This is a type + compat Indexer that allows us to use a standard + code path with all of the indexers. Returns ------- - y : ndarray - """ + tuple of 1d int64 ndarrays of the offsets & data about the window - cdef Py_ssize_t N = len(input_x) - if len(input_y) != N: - raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) + """ - cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y - cdef Py_ssize_t i, nobs + if index is not None: + indexer = VariableWindowIndexer(input, win, minp, index) + elif use_mock: + indexer = MockFixedWindowIndexer(input, win, minp, index, floor) + else: + indexer = FixedWindowIndexer(input, win, minp, index, floor) + return indexer.get_data() - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha +# ---------------------------------------------------------------------- +# Rolling count +# this is only an impl for index not None, IOW, freq aware - mean_x = input_x[0] - mean_y = input_y[0] - is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) - nobs = int(is_observation) - if not is_observation: - mean_x = NaN - mean_y = NaN - output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN - cov = 0. - sum_wt = 1. - sum_wt2 = 1. - old_wt = 1. - for i from 1 <= i < N: - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - if mean_x != cur_x: # avoid numerical errors on constant series - mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) - if mean_y != cur_y: # avoid numerical errors on constant series - mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y +def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, count_x = 0.0 + int64_t s, e, nobs, N + Py_ssize_t i, j + ndarray[int64_t] start, end + ndarray[double_t] output - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - output[i] = ((numerator / denominator) * cov) if (denominator > 0.) 
else NaN - else: - output[i] = cov - else: - output[i] = NaN + start, end, N, win, minp, _ = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) - return output + with nogil: -#---------------------------------------------------------------------- -# Rolling variance + for i in range(0, N): + s = start[i] + e = end[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - """ - Numerically stable implementation using Welford's method. - """ - cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta - cdef Py_ssize_t i - cdef Py_ssize_t N = len(input) + if i == 0: - cdef ndarray[double_t] output = np.empty(N, dtype=float) + # setup + count_x = 0.0 + for j in range(s, e): + val = input[j] + if val == val: + count_x += 1.0 - minp = _check_minp(win, minp, N) + else: - # Check for windows larger than array, addresses #7297 - win = min(win, N) + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + count_x -= 1.0 - with nogil: - # Over the first window, observations can only be added, never removed - for i from 0 <= i < win: - val = input[i] + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + count_x += 1.0 - # Not NaN - if val == val: - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 + if count_x >= minp: + output[i] = count_x else: - val = NaN + output[i] = NaN - output[i] = val + return output - # After the first window, observations can both be added and removed - for i from win <= i < N: - val = input[i] - prev = input[i - win] +# ---------------------------------------------------------------------- +# Rolling sum - if val == val: - if prev == prev: - # Adding one observation and removing another one - delta = val - prev - prev -= mean_x - mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta - else: - # Adding one observation and not removing any - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - elif prev == prev: - # Adding no new observation, but removing one - nobs -= 1 - if nobs: - delta = (prev - mean_x) - mean_x -= delta / nobs - ssqdm_x -= delta * (prev - mean_x) - else: - mean_x = 0 - ssqdm_x = 0 - # Variance is unchanged if no observation is added or removed - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - output[i] = val +cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil: + cdef double result - return output + if nobs >= minp: + result = sum_x + else: + result = NaN + return result -#------------------------------------------------------------------------------- -# Rolling skewness -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_skew(ndarray[double_t] input, int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) +cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ add a value from the sum calc """ - # 3 components of the skewness equation - cdef double A, B, C, R + # Not NaN + if val == val: + 
nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val +cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ remove a value from the sum calc """ - output[i] = NaN + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val - for i from minp - 1 <= i < N: - val = input[i] - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val +def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) - nobs -= 1 - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: - output[i] = NaN - else: - R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) - else: - output[i] = NaN + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster - return output + if is_variable: -#------------------------------------------------------------------------------- -# Rolling kurtosis -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_kurt(ndarray[double_t] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) + # variable window + with nogil: - cdef ndarray[double_t] output = np.empty(N, dtype=float) + for i in range(0, N): + s = start[i] + e = end[i] - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K + if i == 0: - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + add_sum(input[j], &nobs, &sum_x) - # Not NaN - if val == val: - nobs += 1 + else: - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val + # calculate deletes + for j in range(start[i - 1], s): + remove_sum(input[j], &nobs, &sum_x) - output[i] = NaN + # calculate adds + for j in range(end[i - 1], e): + add_sum(input[j], &nobs, &sum_x) - for i from minp - 1 <= i < N: - val = input[i] + output[i] = calc_sum(minp, nobs, sum_x) - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val + else: - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev + # fixed window - nobs -= 1 + with nogil: - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - if B == 0 or nobs < 4: - output[i] = NaN + for i in range(0, minp - 1): + add_sum(input[i], &nobs, &sum_x) + output[i] = NaN - else: - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) + for i in range(minp - 1, N): + val = input[i] + add_sum(val, &nobs, &sum_x) 
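+                # once the window is full (i > win - 1), subtract the value
+                # that slides out before emitting the result for this position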
- output[i] = K + if i > win - 1: + prev_x = input[i - win] + remove_sum(prev_x, &nobs, &sum_x) - else: - output[i] = NaN + output[i] = calc_sum(minp, nobs, sum_x) return output -#------------------------------------------------------------------------------- -# Rolling median, min, max +# ---------------------------------------------------------------------- +# Rolling mean -from skiplist cimport * -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_median_c(ndarray[float64_t] arg, int win, int minp): - cdef: - double val, res, prev - bint err=0 - int ret=0 - skiplist_t *sl - Py_ssize_t midpoint, nobs = 0, i +cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs, + Py_ssize_t neg_ct, double sum_x) nogil: + cdef double result + if nobs >= minp: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + # all positive + result = 0 + elif neg_ct == nobs and result > 0: + # all negative + result = 0 + else: + pass + else: + result = NaN + return result - cdef Py_ssize_t N = len(arg) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - sl = skiplist_init(win) - if sl == NULL: - raise MemoryError("skiplist_init failed") +cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ add a value from the mean calc """ - minp = _check_minp(win, minp, N) + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val + if signbit(val): + neg_ct[0] = neg_ct[0] + 1 - with nogil: - for i from 0 <= i < minp - 1: - val = arg[i] - # Not NaN - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - output[i] = NaN +cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ remove a value from the mean calc """ + + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val + if signbit(val): + neg_ct[0] = neg_ct[0] - 1 - with nogil: - if not err: - for i from minp - 1 <= i < N: - val = arg[i] +def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, result, sum_x = 0 + int64_t s, e + bint is_variable + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) if i > win - 1: - prev = arg[i - win] + prev_x = input[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + return output + +# ---------------------------------------------------------------------- +# Rolling variance + + +cdef inline double 
calc_var(int64_t minp, int ddof, double nobs, + double ssqdm_x) nogil: + cdef double result + + # Variance is unchanged if no observation is added or removed + if (nobs >= minp) and (nobs > ddof): + + # pathological case + if nobs == 1: + result = 0 + else: + result = ssqdm_x / (nobs - ddof) + if result < 0: + result = 0 + else: + result = NaN + + return result + + +cdef inline void add_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ add a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + +cdef inline void remove_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ remove a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + if nobs[0]: + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] - delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + else: + mean_x[0] = 0 + ssqdm_x[0] = 0 + + +def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, + object index, int ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef: + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + int64_t s, e + bint is_variable + Py_ssize_t i, j, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # Check for windows larger than array, addresses #7297 + win = min(win, N) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we + # have 2 paths but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + # calculate deletes + for j in range(start[i - 1], s): + remove_var(input[j], &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + else: + + with nogil: + + # Over the first window, observations can only be added, never + # removed + for i from 0 <= i < win: + add_var(input[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + # After the first window, observations can both be added and + # removed + for i from win <= i < N: + val = input[i] + prev = input[i - win] + + if val == val: if prev == prev: - skiplist_remove(sl, prev) - nobs -= 1 + # Adding one observation and removing another one + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + + else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + return output + + +# ---------------------------------------------------------------------- +# Rolling skewness + +cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, + double xxx) nogil: + cdef double result, dnobs + cdef double A, B, C, R + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + B = xx / dnobs - A * A + C = xxx / dnobs - A * 
A * A - 3 * A * B + if B <= 0 or nobs < 3: + result = NaN + else: + R = sqrt(B) + result = ((sqrt(dnobs * (dnobs - 1.)) * C) / + ((dnobs - 2) * R * R * R)) + else: + result = NaN + + return result + +cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ add a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + +cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ remove a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + + +def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + + if i > win - 1: + prev = input[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling kurtosis + + +cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, + double xxx, double xxxx) nogil: + cdef double result, dnobs + cdef double A, B, C, D, R, K + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + R = A * A + B = xx / dnobs - R + R = R * A + C = xxx / dnobs - R - 3 * A * B + R = R * A + D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A + + if B == 0 or nobs < 4: + result = NaN + else: + K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) + result = K / ((dnobs - 2.) 
* (dnobs - 3.)) + else: + result = NaN + + return result + +cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ add a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + xxxx[0] = xxxx[0] + val * val * val * val + +cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ remove a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + xxxx[0] = xxxx[0] - val * val * val * val + + +def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + else: + + with nogil: + + for i from 0 <= i < minp - 1: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + + if i > win - 1: + prev = input[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling median, min, max + + +def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, res, prev + bint err=0, is_variable + int ret=0 + skiplist_t *sl + Py_ssize_t i, j + int64_t nobs = 0, N, s, e + int midpoint + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + if sl == NULL: + raise MemoryError("skiplist_init failed") + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + val = input[i] if val == val: nobs += 1 err = skiplist_insert(sl, val) != 1 if err: break - if nobs >= minp: - midpoint = nobs / 2 - if nobs % 2: - res = skiplist_get(sl, midpoint, &ret) - else: - res = (skiplist_get(sl, midpoint, &ret) + - skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist_remove(sl, val) + nobs -= 1 + + # calculate adds + for j in 
range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + if nobs >= minp: + midpoint = (nobs / 2) + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) else: - res = NaN + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN - output[i] = res + output[i] = res - skiplist_destroy(sl) + skiplist_destroy(sl) if err: raise MemoryError("skiplist_insert failed") return output -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Moving maximum / minimum code taken from Bottleneck under the terms # of its Simplified BSD license # https://github.com/kwgoodman/bottleneck -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_max(ndarray[numeric] a, int window, int minp): + +cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: + + if numeric in cython.floating: + if ai == ai: + nobs[0] = nobs[0] + 1 + elif is_max: + if numeric == cython.float: + ai = MINfloat32 + else: + ai = MINfloat64 + else: + if numeric == cython.float: + ai = MAXfloat32 + else: + ai = MAXfloat64 + + else: + nobs[0] = nobs[0] + 1 + + return ai + + +cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: + """ remove a value from the mm calc """ + if numeric in cython.floating and aold == aold: + nobs[0] = nobs[0] - 1 + + +cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, + numeric value) nogil: + cdef numeric result + + if numeric in cython.floating: + if nobs >= minp: + result = value + else: + result = NaN + else: + result = value + + return result + + +def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - a: numpy array + input: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN + index: ndarray, optional + index for window computation """ - return _roll_min_max(a, window, minp, 1) + return _roll_min_max(input, win, minp, index, is_max=1) + -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_min(ndarray[numeric] a, int window, int minp): +def roll_min(ndarray[numeric] input, int64_t win, int64_t minp, + object index): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - a: numpy array + input: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN + index: ndarray, optional + index for window computation """ - return _roll_min_max(a, window, minp, 0) - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef _roll_min_max(ndarray[numeric] a, int window, int minp, bint is_max): - "Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs." 
- cdef numeric ai, aold - cdef Py_ssize_t count - cdef Py_ssize_t* death - cdef numeric* ring - cdef numeric* minvalue - cdef numeric* end - cdef numeric* last - cdef Py_ssize_t i0 - cdef np.npy_intp *dim - dim = PyArray_DIMS(a) - cdef Py_ssize_t n0 = dim[0] - cdef np.npy_intp *dims = [n0] - cdef bint should_replace - cdef np.ndarray[numeric, ndim=1] y = PyArray_EMPTY(1, dims, PyArray_TYPE(a), 0) - - if window < 1: - raise ValueError('Invalid window size %d' - % (window)) - - if minp > window: - raise ValueError('Invalid min_periods size %d greater than window %d' - % (minp, window)) - - minp = _check_minp(window, minp, n0) - with nogil: - ring = malloc(window * sizeof(numeric)) - death = malloc(window * sizeof(Py_ssize_t)) - end = ring + window - last = ring + return _roll_min_max(input, win, minp, index, is_max=0) + +cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index, bint is_max): + """ + Moving min/max of 1d array of any numeric type along axis=0 + ignoring NaNs. + """ + + cdef: + numeric ai + bint is_variable, should_replace + int64_t s, e, N, i, j, removed + Py_ssize_t nobs = 0 + ndarray[int64_t] starti, endi + ndarray[numeric, ndim=1] output + cdef: + int64_t* death + numeric* ring + numeric* minvalue + numeric* end + numeric* last + + cdef: + cdef numeric r + + starti, endi, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index) + + output = np.empty(N, dtype=input.dtype) + + if is_variable: + + with nogil: + + for i in range(N): + s = starti[i] + e = endi[i] + + r = input[s] + nobs = 0 + for j in range(s, e): + + # adds, death at the i offset + ai = init_mm(input[j], &nobs, is_max) + + if is_max: + if ai > r: + r = ai + else: + if ai < r: + r = ai + + output[i] = calc_mm(minp, nobs, r) + + else: + + # setup the rings of death! 
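+            # Bottleneck's fixed-window scheme: ``ring`` is a circular buffer of
+            # candidate extreme values, ``death`` records the index at which each
+            # candidate drops out of the window, ``minvalue`` points at the current
+            # extreme and ``last`` at the most recently pushed candidate.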
+ ring = malloc(win * sizeof(numeric)) + death = malloc(win * sizeof(int64_t)) + + end = ring + win + last = ring minvalue = ring - ai = a[0] - if numeric in cython.floating: - if ai == ai: - minvalue[0] = ai - elif is_max: - minvalue[0] = MINfloat64 - else: - minvalue[0] = MAXfloat64 - else: - minvalue[0] = ai - death[0] = window - - count = 0 - for i0 in range(n0): - ai = a[i0] - if numeric in cython.floating: - if ai == ai: - count += 1 - elif is_max: - ai = MINfloat64 + ai = input[0] + minvalue[0] = init_mm(input[0], &nobs, is_max) + death[0] = win + nobs = 0 + + with nogil: + + for i in range(N): + ai = init_mm(input[i], &nobs, is_max) + + if i >= win: + remove_mm(input[i - win], &nobs) + + if death[minvalue - ring] == i: + minvalue = minvalue + 1 + if minvalue >= end: + minvalue = ring + + if is_max: + should_replace = ai >= minvalue[0] else: - ai = MAXfloat64 - else: - count += 1 - if i0 >= window: - aold = a[i0 - window] - if aold == aold: - count -= 1 - if death[minvalue-ring] == i0: - minvalue += 1 - if minvalue >= end: - minvalue = ring - should_replace = ai >= minvalue[0] if is_max else ai <= minvalue[0] - if should_replace: - minvalue[0] = ai - death[minvalue-ring] = i0 + window - last = minvalue - else: - should_replace = last[0] <= ai if is_max else last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - should_replace = last[0] <= ai if is_max else last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i0 + window - if numeric in cython.floating: - if count >= minp: - y[i0] = minvalue[0] + should_replace = ai <= minvalue[0] + if should_replace: + + minvalue[0] = ai + death[minvalue - ring] = i + win + last = minvalue + else: - y[i0] = NaN - else: - y[i0] = minvalue[0] - for i0 in range(minp - 1): - if numeric in cython.floating: - y[i0] = NaN - else: - y[i0] = 0 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i + win + + output[i] = calc_mm(minp, nobs, minvalue[0]) + + for i in range(minp - 1): + if numeric in cython.floating: + output[i] = NaN + else: + output[i] = 0 + + free(ring) + free(death) + + # print("output: {0}".format(output)) + return output - free(ring) - free(death) - return y -def roll_quantile(ndarray[float64_t, cast=True] input, int win, - int minp, double quantile): +def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, + int64_t minp, object index, double quantile): """ O(N log(window)) implementation using skip list """ - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - + cdef: + double val, prev, midpoint + IndexableSkiplist skiplist + int64_t nobs = 0, i, j, s, e, N + Py_ssize_t idx + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) skiplist = IndexableSkiplist(win) - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] + for i in range(0, N): + s = 
start[i] + e = end[i] - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) + if i == 0: - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] + # setup + val = input[i] + if val == val: + nobs += 1 + skiplist.insert(val) - if i > win - 1: - prev = input[i - win] + else: - if prev == prev: - skiplist.remove(prev) - nobs -= 1 + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist.remove(val) + nobs -= 1 - if val == val: - nobs += 1 - skiplist.insert(val) + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + skiplist.insert(val) if nobs >= minp: - idx = int((quantile / 1.) * (nobs - 1)) + idx = int(quantile * (nobs - 1)) output[i] = skiplist.get(idx) else: output[i] = NaN return output + def roll_generic(ndarray[float64_t, cast=True] input, - int win, int minp, int offset, - object func, object args, object kwargs): - cdef ndarray[double_t] output, counts, bufarr - cdef Py_ssize_t i, n - cdef float64_t *buf - cdef float64_t *oldbuf + int64_t win, int64_t minp, object index, + int offset, object func, + object args, object kwargs): + cdef: + ndarray[double_t] output, counts, bufarr + float64_t *buf + float64_t *oldbuf + int64_t nobs = 0, i, j, s, e, N + bint is_variable + ndarray[int64_t] start, end if not input.flags.c_contiguous: input = input.copy('C') @@ -855,36 +1350,60 @@ def roll_generic(ndarray[float64_t, cast=True] input, if n == 0: return input - minp = _check_minp(win, minp, n, floor=0) - output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index, + floor=0) + output = np.empty(N, dtype=float) - # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, n) - offset): - if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) - else: - output[i] = NaN + counts = roll_sum(np.concatenate([np.isfinite(input).astype(float), + np.array([0.] 
* offset)]), + win, minp, index)[offset:] - # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data - for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf - if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) - else: - output[i] = NaN - bufarr.data = oldbuf + if is_variable: - # truncated windows at the end - for i from int_max(n - offset, 0) <= i < n: - if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) - else: - output[i] = NaN + # variable window + if offset != 0: + raise ValueError("unable to roll_generic with a non-zero offset") + + for i in range(0, N): + s = start[i] + e = end[i] + + if counts[i] >= minp: + output[i] = func(input[s:e], *args, **kwargs) + else: + output[i] = NaN + + else: + + # truncated windows at the beginning, through first full-length window + for i from 0 <= i < (int_min(win, N) - offset): + if counts[i] >= minp: + output[i] = func(input[0: (i + offset + 1)], *args, **kwargs) + else: + output[i] = NaN + + # remaining full-length windows + buf = input.data + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + for i from (win - offset) <= i < (N - offset): + buf = buf + 1 + bufarr.data = buf + if counts[i] >= minp: + output[i] = func(bufarr, *args, **kwargs) + else: + output[i] = NaN + bufarr.data = oldbuf + + # truncated windows at the end + for i from int_max(N - offset, 0) <= i < N: + if counts[i] >= minp: + output[i] = func(input[int_max(i + offset - win + 1, 0): N], + *args, + **kwargs) + else: + output[i] = NaN return output @@ -952,3 +1471,179 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, output[in_i] = NaN return output + +# ---------------------------------------------------------------------- +# Exponentially weighted moving average + + +def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, + int minp): + """ + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + weighted_avg = input[0] + is_observation = (weighted_avg == weighted_avg) + nobs = int(is_observation) + output[0] = weighted_avg if (nobs >= minp) else NaN + old_wt = 1. + + for i from 1 <= i < N: + cur = input[i] + is_observation = (cur == cur) + nobs += int(is_observation) + if weighted_avg == weighted_avg: + + if is_observation or (not ignore_na): + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ((old_wt * weighted_avg) + + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. 
+ elif is_observation: + weighted_avg = cur + + output[i] = weighted_avg if (nobs >= minp) else NaN + + return output + +# ---------------------------------------------------------------------- +# Exponentially weighted moving covariance + + +def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, + double_t com, int adjust, int ignore_na, int minp, int bias): + """ + Compute exponentially-weighted moving variance using center-of-mass. + + Parameters + ---------- + input_x : ndarray (float64 type) + input_y : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + bias: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input_x) + if len(input_y) != N: + raise ValueError("arrays are of different lengths " + "(%d and %d)" % (N, len(input_y))) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + mean_x = input_x[0] + mean_y = input_y[0] + is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) + nobs = int(is_observation) + if not is_observation: + mean_x = NaN + mean_y = NaN + output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN + cov = 0. + sum_wt = 1. + sum_wt2 = 1. + old_wt = 1. + + for i from 1 <= i < N: + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += int(is_observation) + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + + # avoid numerical errors on constant series + if mean_x != cur_x: + mean_x = ((old_wt * old_mean_x) + + (new_wt * cur_x)) / (old_wt + new_wt) + + # avoid numerical errors on constant series + if mean_y != cur_y: + mean_y = ((old_wt * old_mean_y) + + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * + (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * + (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. 
+ elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + if (denominator > 0.): + output[i] = ((numerator / denominator) * cov) + else: + output[i] = NaN + else: + output[i] = cov + else: + output[i] = NaN + + return output diff --git a/setup.py b/setup.py index 0bff49c4976b8..58965fe9ae6d6 100755 --- a/setup.py +++ b/setup.py @@ -430,9 +430,9 @@ def pxd(name): 'depends': [srcpath('generated', suffix='.pyx'), srcpath('join', suffix='.pyx')]}, _window={'pyxfile': 'window', - 'pxdfiles': ['src/skiplist','src/util'], - 'depends': ['pandas/src/skiplist.pyx', - 'pandas/src/skiplist.h']}, + 'pxdfiles': ['src/skiplist', 'src/util'], + 'depends': ['pandas/src/skiplist.pyx', + 'pandas/src/skiplist.h']}, parser={'pyxfile': 'parser', 'depends': ['pandas/src/parser/tokenizer.h', 'pandas/src/parser/io.h', From 57b373c97a8cd72f29a6206c5859661d8b926a97 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Thu, 21 Jul 2016 02:25:33 +0900 Subject: [PATCH 126/359] CLN: Remove a test case about Timestamp to TestTimestamp (#13722) --- pandas/tseries/tests/test_timedeltas.py | 1 - pandas/tseries/tests/test_timeseries.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 36ae479c3dfcc..659101cb4cad2 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -188,7 +188,6 @@ def test_construction(self): self.assertEqual(Timedelta('').value, iNaT) self.assertEqual(Timedelta('nat').value, iNaT) self.assertEqual(Timedelta('NAT').value, iNaT) - self.assertTrue(isnull(Timestamp('nat'))) self.assertTrue(isnull(Timedelta('nat'))) # offset diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 59fc147ead4eb..9c97749c87103 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4389,6 +4389,8 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) + self.assertTrue(isnull(Timestamp('nat'))) + def test_roundtrip(self): # test value to string and back conversions From b25a2a1259f33ce8123b7f239f109ae42155d02c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 20 Jul 2016 17:11:18 -0400 Subject: [PATCH 127/359] DOC/DEPR: pivot_annual closes #736 Author: sinhrks Closes #13706 from sinhrks/pivot_annual and squashes the following commits: d097bab [sinhrks] DOC/DEPR: pivot_annual --- doc/source/cookbook.rst | 13 +++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/tests/test_util.py | 9 ++++++--- pandas/tseries/util.py | 8 ++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 0dbc79415af0b..38a816060e1bc 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -679,6 +679,19 @@ The :ref:`Pivot ` docs. 'Employed' : lambda x : sum(x), 'Grade' : lambda x : sum(x) / len(x)}) +`Plot pandas DataFrame with year over year data +`__ + +To create year and month crosstabulation: + +.. 
ipython:: python + + df = pd.DataFrame({'value': np.random.randn(36)}, + index=pd.date_range('2011-01-01', freq='M', periods=36)) + + pd.pivot_table(df, index=df.index.month, columns=df.index.year, + values='value', aggfunc='sum') + Apply ***** diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cdae0d5c27c7d..ee77660795852 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -589,7 +589,7 @@ Deprecations - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) - top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - +- ``pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) .. _whatsnew_0190.prior_deprecations: diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py index 9c5c9b7a03445..9d992995df3a7 100644 --- a/pandas/tseries/tests/test_util.py +++ b/pandas/tseries/tests/test_util.py @@ -21,7 +21,8 @@ def test_daily(self): rng = date_range('1/1/2000', '12/31/2004', freq='D') ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_annual(ts, 'D') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'D') doy = ts.index.dayofyear doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 @@ -53,7 +54,8 @@ def test_hourly(self): hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 hoy += 1 - annual = pivot_annual(ts_hourly) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts_hourly) ts_hourly = ts_hourly.astype(float) for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: @@ -78,7 +80,8 @@ def test_monthly(self): rng = date_range('1/1/2000', '12/31/2004', freq='M') ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_annual(ts, 'M') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'M') month = ts.index.month for i in range(1, 13): diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 98a93d22b09a6..7bac0567ea5c6 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -1,3 +1,5 @@ +import warnings + from pandas.compat import lrange import numpy as np from pandas.types.common import _ensure_platform_int @@ -7,6 +9,8 @@ def pivot_annual(series, freq=None): """ + Deprecated. Use ``pivot_table`` instead. + Group a series by years, taking leap years into account. The output has as many rows as distinct years in the original series, @@ -35,6 +39,10 @@ def pivot_annual(series, freq=None): ------- annual : DataFrame """ + + msg = "pivot_annual is deprecated. 
Use pivot_table instead" + warnings.warn(msg, FutureWarning) + index = series.index year = index.year years = nanops.unique1d(year) From 016b35276eea344b861147dfff2d4ff8ae52aadc Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 20 Jul 2016 17:22:45 -0400 Subject: [PATCH 128/359] PERF: Improve Period hashing closes #12817 Author: sinhrks Closes #13705 from sinhrks/period_hash and squashes the following commits: e1fb7f4 [sinhrks] PERF: Improve Period hasing --- asv_bench/benchmarks/period.py | 26 +++++- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/src/period.pyx | 2 +- pandas/tseries/tests/test_base.py | 138 ++++++++++++++++++++++------ pandas/tseries/tests/test_period.py | 13 +++ 5 files changed, 152 insertions(+), 29 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 012030a71ac82..c1b89ae1db75b 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,4 @@ -from pandas import PeriodIndex, date_range +from pandas import Series, Period, PeriodIndex, date_range class create_period_index_from_date_range(object): @@ -7,3 +7,27 @@ class create_period_index_from_date_range(object): def time_period_index(self): # Simulate irregular PeriodIndex PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') + + +class period_algorithm(object): + goal_time = 0.2 + + def setup(self): + data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), + Period('2011-03', freq='M'), Period('2011-04', freq='M')] + self.s = Series(data * 1000) + self.i = PeriodIndex(data, freq='M') + + def time_period_series_drop_duplicates(self): + self.s.drop_duplicates() + + def time_period_index_drop_duplicates(self): + self.i.drop_duplicates() + + def time_period_series_value_counts(self): + self.s.value_counts() + + def time_period_index_value_counts(self): + self.i.value_counts() + + diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index ee77660795852..73ce39b66fc27 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -628,6 +628,8 @@ Performance Improvements - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index.difference`` (:issue:`12044`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) +- Improved performance of hashing ``Period`` (:issue:`12817`) + .. 
_whatsnew_0190.bug_fixes: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 37f265ede07e7..45743d1cf70ff 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -727,7 +727,7 @@ cdef class _Period(object): (type(self).__name__, type(other).__name__)) def __hash__(self): - return hash((self.ordinal, self.freq)) + return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 4aa1e2f5d33dd..05f7d9d9ce7b8 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -491,13 +491,15 @@ def test_value_counts_unique(self): for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + tz=tz) exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, tz=tz) @@ -507,15 +509,20 @@ def test_value_counts_unique(self): '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + tz=tz) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], tz=tz) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), + expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -654,6 +661,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -1303,23 +1331,29 @@ def test_value_counts_unique(self): exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - 
tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = TimedeltaIndex( - ['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00', - '1 days 08:00:00', '1 days 08:00:00', pd.NaT]) + idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', + '1 days 09:00:00', '1 days 08:00:00', + '1 days 08:00:00', pd.NaT]) exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT - ]) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', + pd.NaT]) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -1454,6 +1488,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') @@ -2121,8 +2176,8 @@ def test_value_counts_unique(self): # GH 7735 idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') + idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + freq='H') exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', '2011-01-01 15:00', @@ -2131,24 +2186,31 @@ def test_value_counts_unique(self): '2011-01-01 10:00', '2011-01-01 09:00'], freq='H') expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) - expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.period_range('2011-01-01 09:00', freq='H', + periods=10) tm.assert_index_equal(idx.unique(), expected) idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') - exp_idx = PeriodIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + freq='H') expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = PeriodIndex( - 
['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], freq='H') expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -2164,6 +2226,28 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertEqual(idx.freq, result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.period_range('2011-01-01', '2011-01-31', freq='D', + name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_order_compat(self): def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c90cbbf80086a..e3a67289a587b 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -462,6 +462,19 @@ def test_period_deprecated_freq(self): p = Period('2016-03-01 09:00', freq=exp) tm.assertIsInstance(p, Period) + def test_hash(self): + self.assertEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01-01', freq='D')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='3M')), + hash(Period('2011-01', freq='2M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-02', freq='M'))) + def test_repr(self): p = Period('Jan-2000') self.assertIn('2000-01', repr(p)) From 49621311e7812b4bacd487421057ef3f79434bdd Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 20 Jul 2016 17:25:12 -0400 Subject: [PATCH 129/359] MAINT: Removed some warnings in tests Per discussion with @jreback here. 
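In essence the rewritten test pins down the non-deprecated construction and counting behaviour; a minimal sketch (illustrative variable names, not part of the patch)::

    import pandas as pd

    # missing values live in the codes; NaN is no longer a category itself
    s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None],
                                 categories=["a", "b"]))
    s.value_counts(dropna=True)    # counts only "a" and "b"
    s.value_counts(dropna=False)   # additionally counts the NaN entries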
Author: gfyoung Closes #13702 from gfyoung/test-warnings-remove and squashes the following commits: e7292d3 [gfyoung] MAINT: Removed some warnings in tests --- pandas/core/internals.py | 2 +- pandas/tests/test_categorical.py | 81 ++++++++++++-------------------- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ff12cfddbe9cd..8e77486457546 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1490,7 +1490,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] - mask = mask.reshape(new_values.shape) + mask = _safe_reshape(mask, new_values.shape) new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 35b1b8c1bf341..57b8bb1531551 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101,E1103,W0232 -import os import sys from datetime import datetime from distutils.version import LooseVersion @@ -2906,54 +2905,41 @@ def test_value_counts(self): tm.assert_series_equal(res, exp) def test_value_counts_with_nan(self): - # https://github.com/pydata/pandas/issues/9443 + # see gh-9443 + # sanity check s = pd.Series(["a", "b", "a"], dtype="category") - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - s = pd.Series(["a", "b", None, "a", None, None], dtype="category") - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"]))) - # When we aren't sorting by counts, and np.nan isn't a - # category, it should be last. 
- tm.assert_series_equal( - s.value_counts(dropna=False, sort=False), - pd.Series([2, 1, 3], - index=pd.CategoricalIndex(["a", "b", np.nan]))) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical(["a", "b", "a"], - categories=["a", "b", np.nan])) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - # internal categories are different because of NaN - exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=True), exp, - check_categorical=False) - exp = pd.Series([2, 1, 0], - index=pd.CategoricalIndex(["a", "b", np.nan])) - tm.assert_series_equal(s.value_counts(dropna=False), exp, - check_categorical=False) + # same Series via two different constructions --> same behaviour + series = [ + pd.Series(["a", "b", None, "a", None, None], dtype="category"), + pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b"])) + ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], - categories=["a", "b", np.nan])) + for s in series: + # None is a NaN value, so we exclude its count here + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=True), exp, - check_categorical=False) - exp = pd.Series([3, 2, 1], - index=pd.CategoricalIndex([np.nan, "a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=False), exp, - check_categorical=False) + # we don't exclude the count of None and sort by counts + exp = pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"])) + res = s.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. + exp = pd.Series([2, 1, 3], index=pd.CategoricalIndex(["a", "b", np.nan])) + res = s.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) def test_groupby(self): @@ -4113,16 +4099,11 @@ def f(): res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) - # make sure that fillna takes both missing values and NA categories - # into account - c = Categorical(["a", "b", np.nan]) - with tm.assert_produces_warning(FutureWarning): - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) - - c[0] = np.nan + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) - cat_exp = Categorical(["a", "b", "a"], categories=["a", "b", np.nan]) + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") From 634e95d8d0f79bcaded9e92b4bbce46dd9805da4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 20 Jul 2016 17:27:51 -0400 Subject: [PATCH 130/359] CLN: removed the 'diff' method for Index Deprecated all the way back in `0.15.0` here. 
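For anyone still calling the removed method, a minimal migration sketch (the index values are illustrative; `Index.difference` is the replacement named in the whatsnew entry below, and `diff` was only an alias for it):

    import pandas as pd

    idx = pd.Index([1, 2, 3, 4])
    other = pd.Index([3, 4, 5])

    # idx.diff(other) no longer exists after this patch;
    # Index.difference returns the same set difference.
    result = idx.difference(other)   # -> Index containing [1, 2]
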
Author: gfyoung Closes #13669 from gfyoung/remove-index-diff and squashes the following commits: 7dca659 [gfyoung] CLN: removed the 'diff' method for Index --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/base.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 73ce39b66fc27..5727b917fd08c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -602,6 +602,7 @@ Removal of prior version deprecations/changes - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) +- ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 567d2a458dafa..850d049ef9f45 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1965,8 +1965,6 @@ def difference(self, other): return this._shallow_copy(the_diff, name=result_name) - diff = deprecate('diff', difference) - def symmetric_difference(self, other, result_name=None): """ Compute the symmetric difference of two Index objects. From 210fea9d4dc4314f9bc4ddb5f7dab6fa87912ca9 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Wed, 20 Jul 2016 17:44:56 -0400 Subject: [PATCH 131/359] ERR: csv parser exceptions will now bubble up closes #13652 Author: Brett Rosen Closes #13693 from bdrosen96/brett/dont_swallow_exc and squashes the following commits: 0efe18b [Brett Rosen] Address review comments 6ed3a2e [Brett Rosen] Flake e966c26 [Brett Rosen] Test case for patch, plus fix to not swallow exceptions --- doc/source/whatsnew/v0.19.0.txt | 11 +++++----- pandas/io/tests/parser/common.py | 22 +++++++++++++++++++ .../io/tests/parser/data/sauron.SHIFT_JIS.csv | 14 ++++++++++++ pandas/io/tests/parser/test_parsers.py | 1 + pandas/parser.pyx | 18 ++++++++++++++- 5 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5727b917fd08c..58a92cfa5a784 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -309,7 +309,7 @@ Other enhancements - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) -- ``.to_stata()`` and ```StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) +- ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) .. 
_whatsnew_0190.api: @@ -317,7 +317,7 @@ API changes ~~~~~~~~~~~ -- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue: `12882`) +- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) @@ -330,7 +330,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - +- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) .. _whatsnew_0190.api.tolist: @@ -595,7 +595,6 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) @@ -689,8 +688,8 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) -- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) -- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) +- Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) +- Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 670f3df6f3984..11eed79e03267 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -3,6 +3,7 @@ import csv import 
os import platform +import codecs import re import sys @@ -45,6 +46,27 @@ def test_empty_decimal_marker(self): with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(StringIO(data), decimal='') + def test_bad_stream_exception(self): + # Issue 13652: + # This test validates that both python engine + # and C engine will raise UnicodeDecodeError instead of + # c engine raising CParserError and swallowing exception + # that caused read to fail. + handle = open(self.csv_shiftjs, "rb") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup('utf-8') + # stream must be binary UTF8 + stream = codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) + if compat.PY3: + msg = "'utf-8' codec can't decode byte" + else: + msg = "'utf8' codec can't decode byte" + with tm.assertRaisesRegexp(UnicodeDecodeError, msg): + self.read_csv(stream) + stream.close() + def test_read_csv(self): if not compat.PY3: if compat.is_platform_windows(): diff --git a/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv b/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv new file mode 100644 index 0000000000000..218ddf333ef52 --- /dev/null +++ b/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv @@ -0,0 +1,14 @@ +num, text +1,�T�E�����iSauron�A�A�C�k�A�̑n���̎� - ��O�I3019�N3��25���j�́AJ�ER�ER�E�g�[���L���̒����𕑑�Ƃ��������w�z�r�b�g�̖`���x�w�w�֕���x�w�V���}�����̕���x�̓o��l���B +2,�w�z�r�b�g�̖`���x�Ɍ��y�̂���u���l����Ȃ��t�v�i�f��w�z�r�b�g�V���[�Y�x�̎����ł́u���l�����i�l�N���}���T�[�j�v�j�Ƃ͔ނ̂��Ƃł���B +3,���̑��҂ł���w�w�֕���x�ɂ����Ắu��‚̎w�ցithe One Ring�j�v�̍���A�u�����iDark Lord�j�v�A�u���̎ҁithe One�j[1]�v�Ƃ��ēo�ꂷ��B�O�j�ɂ�����w�V���}�����̕���x�ł́A����̖��������S�X�̍ł��͂��鑤�߂ł������B +4,�T�E�����͌����A�A���_�i�n���j�̑n����S�����V�g�I�푰�A�C�k�A�̈���ł��������A�僁���R�[���̔��t�ɉ��S���đ—����A�A���_�ɊQ���Ȃ����݂ƂȂ����B +5,�u�T�E�����v�Ƃ̓N�E�F�����Łu�g�̖т̂悾�‚��́v�Ƃ����Ӗ��ł���A�V���_�����œ��l�̈Ӗ��ł��閼�O�u�S���T�E�A�v�ƌĂ΂�邱�Ƃ�����B +6,�����́A�T�E����������A���݌������G���t�ɂ�閼�ł���A�w�w�֕���x�쒆�ɂ����ăA���S�����́u����i�T�E�����j�͎����̖{���̖��͎g��Ȃ����A��������ɏ���������ɏo�����肷�邱�Ƃ������Ȃ��v�Ɣ������Ă���B +7,���̂ق��A���I�ɃG���t�ɑ΂��Ď��̂����Ƃ���閼�ɁA�u�A���i�^�[���i������N�j�v�A�u�A���^�m�i���M�ȍ׍H�t�j�v�A�u�A�E�����f�B���i�A�E���̉��l�j�v������B +8,���I�̍��̃T�E�����́A���݂ɕϐg����\�͂������Ă����B +9,���̔\�͂��g���Ό��ڗ킵�����h�ȊO���𑕂����Ƃ�A�܂�����ȘT��z����������Ƃ����������ɕς��邱�Ƃ��ł��A�G���t���狰���ꂽ�B +10,���I�Ɉ�‚̎w�ւ����グ���T�E�����́A���̗͂̎w�ւŐ�����鎖���₻�̏��L�҂��x�z�ł���悤�ɂȂ����B +11,�܂��A���̂��łтĂ��w�ւ�������艽�x�ł��h�邱�Ƃ��ł����B +12,�������k�[���m�[���v���̍ۂɔ��������̂�j�󂳂ꂽ��́A��x�Ɣ������ϐg���邱�Ƃ͂ł��Ȃ��Ȃ�A���̈��ӂ̋�̂悤�Ȍ�������낵���p�����Ƃ�Ȃ��Ȃ����Ƃ����B +13,�܂����΂��΁u�܂Ԃ��̂Ȃ��΂ɉ����ꂽ�ځv�Ƃ������S�ە\���ő�����ꂽ�B diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 21f903342a611..6001c85ae76b1 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -44,6 +44,7 @@ def setUp(self): self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv') class TestCParserHighMemory(BaseParser, CParserTests, tm.TestCase): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 3928bc8472113..b5d1c8b7acf2c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -10,7 +10,9 @@ import warnings from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE from cpython cimport 
(PyObject, PyBytes_FromString, PyBytes_AsString, PyBytes_Check, - PyUnicode_Check, PyUnicode_AsUTF8String) + PyUnicode_Check, PyUnicode_AsUTF8String, + PyErr_Occurred, PyErr_Fetch) +from cpython.ref cimport PyObject, Py_XDECREF from io.common import CParserError, DtypeWarning, EmptyDataError @@ -1878,6 +1880,20 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: cdef raise_parser_error(object base, parser_t *parser): + cdef: + object old_exc + PyObject *type + PyObject *value + PyObject *traceback + + if PyErr_Occurred(): + PyErr_Fetch(&type, &value, &traceback); + Py_XDECREF(type) + Py_XDECREF(traceback) + if value != NULL: + old_exc = value + Py_XDECREF(value) + raise old_exc message = '%s. C error: ' % base if parser.error_msg != NULL: if PY3: From 63a1e5c58af8ddc8dec192f39a0999aad74acaf9 Mon Sep 17 00:00:00 2001 From: Stephen Kappel Date: Sun, 8 May 2016 19:19:30 -0400 Subject: [PATCH 132/359] ENH: astype() allows col label -> dtype mapping as arg closes #7271 closes #13375 --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/generic.py | 41 ++++++++++++++++--- pandas/tests/frame/test_dtypes.py | 65 +++++++++++++++++++++++++++++- pandas/tests/series/test_dtypes.py | 16 ++++++++ pandas/tests/test_panel.py | 12 ++++++ 5 files changed, 130 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 58a92cfa5a784..febdd2c939164 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -331,6 +331,8 @@ API changes - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) +- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) + .. _whatsnew_0190.api.tolist: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dd4be571ef2b4..0863c8f1af385 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,4 +1,5 @@ # pylint: disable=W0231,E1101 +import collections import warnings import operator import weakref @@ -161,7 +162,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): @property def _constructor(self): - """Used when a manipulation result has the same dimesions as the + """Used when a manipulation result has the same dimensions as the original. """ raise AbstractMethodError(self) @@ -3000,7 +3001,11 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): Parameters ---------- - dtype : numpy.dtype or Python type + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire pandas object to + the same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. raise_on_error : raise on invalid input kwargs : keyword arguments to pass on to the constructor @@ -3008,10 +3013,36 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): ------- casted : type of caller """ + if isinstance(dtype, collections.Mapping): + if self.ndim == 1: # i.e. 
Series + if len(dtype) > 1 or list(dtype.keys())[0] != self.name: + raise KeyError('Only the Series name can be used for ' + 'the key in Series dtype mappings.') + new_type = list(dtype.values())[0] + return self.astype(new_type, copy, raise_on_error, **kwargs) + elif self.ndim > 2: + raise NotImplementedError( + 'astype() only accepts a dtype arg of type dict when ' + 'invoked on Series and DataFrames. A single dtype must be ' + 'specified when invoked on a Panel.' + ) + for col_name in dtype.keys(): + if col_name not in self: + raise KeyError('Only a column name can be used for the ' + 'key in a dtype mappings argument.') + from pandas import concat + results = [] + for col_name, col in self.iteritems(): + if col_name in dtype: + results.append(col.astype(dtype[col_name], copy=copy)) + else: + results.append(results.append(col.copy() if copy else col)) + return concat(results, axis=1, copy=False) - mgr = self._data.astype(dtype=dtype, copy=copy, - raise_on_error=raise_on_error, **kwargs) - return self._constructor(mgr).__finalize__(self) + # else, only a single dtype is given + new_data = self._data.astype(dtype=dtype, copy=copy, + raise_on_error=raise_on_error, **kwargs) + return self._constructor(new_data).__finalize__(self) def copy(self, deep=True): """ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index c650436eefaf3..817770b9da610 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -5,7 +5,7 @@ import numpy as np from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, - compat, option_context) + compat, concat, option_context) from pandas.compat import u from pandas.types.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData @@ -396,6 +396,69 @@ def test_astype_str(self): expected = DataFrame(['1.12345678901']) assert_frame_equal(result, expected) + def test_astype_dict(self): + # GH7271 + a = Series(date_range('2010-01-04', periods=5)) + b = Series(range(5)) + c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + d = Series(['1.0', '2', '3.14', '4', '5.4']) + df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) + original = df.copy(deep=True) + + # change type of a subset of columns + result = df.astype({'b': 'str', 'd': 'float32'}) + expected = DataFrame({ + 'a': a, + 'b': Series(['0', '1', '2', '3', '4']), + 'c': c, + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) + assert_frame_equal(result, expected) + assert_frame_equal(df, original) + + result = df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}) + expected = DataFrame({ + 'a': a, + 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), + 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + assert_frame_equal(result, expected) + assert_frame_equal(df, original) + + # change all columns + assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}), + df.astype(str)) + assert_frame_equal(df, original) + + # error should be raised when using something other than column labels + # in the keys of the dtype dict + self.assertRaises(KeyError, df.astype, {'b': str, 2: str}) + self.assertRaises(KeyError, df.astype, {'e': str}) + assert_frame_equal(df, original) + + # if the dtypes provided are the same as the original dtypes, the + # resulting DataFrame should be the same as the original DataFrame + equiv = df.astype({col: df[col].dtype for col in df.columns}) + assert_frame_equal(df, equiv) + assert_frame_equal(df, original) + + def 
test_astype_duplicate_col(self): + a1 = Series([1, 2, 3, 4, 5], name='a') + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b') + a2 = Series([0, 1, 2, 3, 4], name='a') + df = concat([a1, b, a2], axis=1) + + result = df.astype(str) + a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a') + b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str, + name='b') + a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a') + expected = concat([a1_str, b_str, a2_str], axis=1) + assert_frame_equal(result, expected) + + result = df.astype({'a': 'str'}) + expected = concat([a1_str, b, a2_str], axis=1) + assert_frame_equal(result, expected) + def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 6864eac603ded..9a406dfa10c35 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -133,6 +133,22 @@ def test_astype_unicode(self): reload(sys) # noqa sys.setdefaultencoding(former_encoding) + def test_astype_dict(self): + # GH7271 + s = Series(range(0, 10, 2), name='abc') + + result = s.astype({'abc': str}) + expected = Series(['0', '2', '4', '6', '8'], name='abc') + assert_series_equal(result, expected) + + result = s.astype({'abc': 'float64'}) + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', + name='abc') + assert_series_equal(result, expected) + + self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) + self.assertRaises(KeyError, s.astype, {0: str}) + def test_complexx(self): # GH4819 # complex access for ndarray compat diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index f2e13867d3bf0..d9c7c1dc0dc62 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1231,6 +1231,18 @@ def test_dtypes(self): expected = Series(np.dtype('float64'), index=self.panel.items) assert_series_equal(result, expected) + def test_astype(self): + # GH7271 + data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + panel = Panel(data, ['a', 'b'], ['c', 'd'], ['e', 'f']) + + str_data = np.array([[['1', '2'], ['3', '4']], + [['5', '6'], ['7', '8']]]) + expected = Panel(str_data, ['a', 'b'], ['c', 'd'], ['e', 'f']) + assert_panel_equal(panel.astype(str), expected) + + self.assertRaises(NotImplementedError, panel.astype, {0: str}) + def test_apply(self): # GH1148 From aa3ece3e88fb0a6a7d4702a28eb800b791973bef Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Thu, 21 Jul 2016 07:16:28 +0900 Subject: [PATCH 133/359] CLN: Replace float64_t with int64_t in _ensure_components (#13673) --- pandas/tslib.pyx | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 016c49ea2b859..5f487eedd1683 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2494,20 +2494,19 @@ cdef class _Timedelta(timedelta): """ compute the components """ - cdef int64_t sfrac, ifrac, ivalue = self.value - cdef float64_t frac + cdef int64_t sfrac, ifrac, frac, ivalue = self.value if self.is_populated: return # put frac in seconds - frac = float(ivalue)/1e9 + frac = ivalue/(1000*1000*1000) if frac < 0: self._sign = -1 # even fraction - if int(-frac/86400) != -frac/86400.0: - self._d = int(-frac/86400.0+1) + if (-frac % 86400) != 0: + self._d = -frac/86400 + 1 frac += 86400*self._d else: frac = -frac @@ -2516,39 +2515,37 @@ cdef class _Timedelta(timedelta): self._d = 0 if frac >= 86400: - self._d += int(frac / 86400) + 
self._d += frac / 86400 frac -= self._d * 86400 if frac >= 3600: - self._h = int(frac / 3600) + self._h = frac / 3600 frac -= self._h * 3600 else: self._h = 0 if frac >= 60: - self._m = int(frac / 60) + self._m = frac / 60 frac -= self._m * 60 else: self._m = 0 if frac >= 0: - self._s = int(frac) + self._s = frac frac -= self._s else: self._s = 0 - if frac != 0: - - # reset so we don't lose precision - sfrac = int((self._h*3600 + self._m*60 + self._s)*1e9) - if self._sign < 0: - ifrac = ivalue + self._d*DAY_NS - sfrac - else: - ifrac = ivalue - (self._d*DAY_NS + sfrac) + sfrac = (self._h*3600 + self._m*60 + self._s)*(1000*1000*1000) + if self._sign < 0: + ifrac = ivalue + self._d*DAY_NS - sfrac + else: + ifrac = ivalue - (self._d*DAY_NS + sfrac) - self._ms = int(ifrac/1e6) + if ifrac != 0: + self._ms = ifrac/(1000*1000) ifrac -= self._ms*1000*1000 - self._us = int(ifrac/1e3) + self._us = ifrac/1000 ifrac -= self._us*1000 self._ns = ifrac else: From 1ce8f8e0b8540252dac25497f29d4de66a8bea3f Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Wed, 20 Jul 2016 18:18:07 -0400 Subject: [PATCH 134/359] ERR: Add check for input array lengths in from_arrays method (GH13599) closes #13599 Author: Sahil Dua Closes #13728 from sahildua2305/multi-from-arrays-bug and squashes the following commits: dbd3ab8 [Sahil Dua] BUG: Add check for input array lengths in from_arrays method (GH13599) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/multi.py | 6 ++++++ pandas/tests/indexes/test_multi.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index febdd2c939164..b6168f7737654 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -681,6 +681,7 @@ Bug Fixes - Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) - Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) +- Bug in ``MultiIndex.from_arrays`` which didn't check for input array lengths matching (:issue:`13599`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 365a971f82a3b..184744915bd8d 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -848,6 +848,12 @@ def from_arrays(cls, arrays, sortorder=None, names=None): name = None if names is None else names[0] return Index(arrays[0], name=name) + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError('all arrays must be same length') + cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] levels = [c.categories for c in cats] labels = [c.codes for c in cats] diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 2734e90a1971b..0b65b6a9d09f5 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -632,6 +632,23 @@ def test_from_arrays_index_series_period(self): tm.assert_index_equal(result, result2) + def test_from_arrays_different_lengths(self): + # GH13599 + idx1 = [1, 2, 3] + idx2 = ['a', 'b'] + assertRaisesRegexp(ValueError, 
'^all arrays must be same length$', + MultiIndex.from_arrays, [idx1, idx2]) + + idx1 = [] + idx2 = ['a', 'b'] + assertRaisesRegexp(ValueError, '^all arrays must be same length$', + MultiIndex.from_arrays, [idx1, idx2]) + + idx1 = [1, 2, 3] + idx2 = [] + assertRaisesRegexp(ValueError, '^all arrays must be same length$', + MultiIndex.from_arrays, [idx1, idx2]) + def test_from_product(self): first = ['foo', 'bar', 'buz'] From 622297ccf12e9b013de6c7a1316dd9453176ecb1 Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Wed, 20 Jul 2016 19:16:39 -0400 Subject: [PATCH 135/359] TST/BUG: test unsortable in safe_sort and Categorical (GH13714) - Change test for Categorical.from_array - Add test for safe_sort - Change 2.7 build to latest numpy --- ci/requirements-2.7.build | 2 +- ci/requirements-2.7.run | 2 +- pandas/tests/test_algos.py | 14 +++++++++++++- pandas/tests/test_categorical.py | 15 +++------------ 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index eca8460468d34..b2e2038faf7c3 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -1,4 +1,4 @@ python-dateutil=2.4.1 pytz=2013b -numpy=1.9.3 +numpy cython=0.19.1 diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index 0c1132eaa62d3..560d6571b8771 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -1,6 +1,6 @@ python-dateutil=2.4.1 pytz=2013b -numpy=1.9.3 +numpy xlwt=0.7.5 numexpr pytables diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f18d869b3843d..cf23d096d99ba 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -5,7 +5,6 @@ from numpy.random import RandomState from numpy import nan import datetime - from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd @@ -116,6 +115,19 @@ def test_mixed_integer(self): labels = [0, 1, 2, 3, 0, -1, 1] result, result_labels = algos.safe_sort(values, labels) expected = np.array([0, 1, 'a', 'b'], dtype=object) + expected_labels = np.array([3, 1, 0, 2, 3, -1, 1]) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_unsortable(self): + # GH 13714 + arr = np.array([1, 2, datetime.datetime.now(), 0, 3], dtype=object) + if compat.PY2 and not pd._np_version_under1p10: + # RuntimeWarning: tp_compare didn't return -1 or -2 for exception + with tm.assert_produces_warning(RuntimeWarning): + tm.assertRaises(TypeError, algos.safe_sort, arr) + else: + tm.assertRaises(TypeError, algos.safe_sort, arr) def test_exceptions(self): with tm.assertRaisesRegexp(TypeError, diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 57b8bb1531551..42636c6330fba 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -97,18 +97,9 @@ def test_constructor_unsortable(self): factor = Categorical.from_array(arr, ordered=False) self.assertFalse(factor.ordered) - if compat.PY3: - self.assertRaises( - TypeError, lambda: Categorical.from_array(arr, ordered=True)) - else: - # this however will raise as cannot be sorted (on PY3 or older - # numpies) - if LooseVersion(np.__version__) < "1.10": - self.assertRaises( - TypeError, - lambda: Categorical.from_array(arr, ordered=True)) - else: - Categorical.from_array(arr, ordered=True) + # this however will raise as cannot be sorted + self.assertRaises( + TypeError, lambda: Categorical.from_array(arr, ordered=True)) def test_is_equal_dtype(self): From 
ae144bb4c3587c6e2e0e0434ad64729456348857 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 21 Jul 2016 03:45:45 +0900 Subject: [PATCH 136/359] BUG: DatetimeIndex raises AttributeError on win closes #13736 --- pandas/tseries/index.py | 5 ++++- pandas/tseries/tests/test_timeseries.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index d448ca9878b99..64f156f4b044c 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -329,9 +329,12 @@ def __new__(cls, data=None, subarr = tslib.cast_to_nanoseconds(data) else: subarr = data - elif data.dtype == _INT64_DTYPE: + else: + # must be integer dtype otherwise if isinstance(data, Int64Index): raise TypeError('cannot convert Int64Index->DatetimeIndex') + if data.dtype != _INT64_DTYPE: + data = data.astype(np.int64) subarr = data.view(_NS_DTYPE) if isinstance(subarr, DatetimeIndex): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 9c97749c87103..2a9696503eaa5 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1520,6 +1520,16 @@ def test_dti_constructor_years_only(self): (rng3, expected3), (rng4, expected4)]: tm.assert_index_equal(rng, expected) + def test_dti_constructor_small_int(self): + # GH 13721 + exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', + '1970-01-01 00:00:00.00000001', + '1970-01-01 00:00:00.00000002']) + + for dtype in [np.int64, np.int32, np.int16, np.int8]: + arr = np.array([0, 10, 20], dtype=dtype) + tm.assert_index_equal(DatetimeIndex(arr), exp) + def test_normalize(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D') From b37ec145eedd933b84fb26f4b6d67f292b6419c5 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Thu, 21 Jul 2016 19:54:49 +0900 Subject: [PATCH 137/359] TST/BUG: Added mote tests for Period(NaT) (#13737) TST/BUG: Added more tests for Period(NaT) --- doc/source/whatsnew/v0.19.0.txt | 6 ++++++ pandas/tseries/tests/test_period.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index b6168f7737654..11b94e08798a4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -527,9 +527,12 @@ Previous Behavior: New Behavior: +These result in ``pd.NaT`` without providing ``freq`` option. + .. ipython:: python pd.Period('NaT') + pd.Period(None) To be compat with ``Period`` addition and subtraction, ``pd.NaT`` now supports addition and subtraction with ``int``. Previously it raises ``ValueError``. @@ -549,6 +552,7 @@ New Behavior: pd.NaT + 1 pd.NaT - 1 + .. _whatsnew_0190.api.difference: ``Index.difference`` and ``.symmetric_difference`` changes @@ -701,6 +705,8 @@ Bug Fixes - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) +- Bug ``Series.isnull`` and ``Series.notnull`` ignore ``Period('NaT')`` (:issue:`13737`) +- Bug ``Series.fillna`` and ``Series.dropna`` don't affect to ``Period('NaT')`` (:issue:`13737`) - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) - Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index e3a67289a587b..0b0ee012a2f30 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -4294,6 +4294,36 @@ def test_constructor_cast_object(self): exp = Series(period_range('1/1/2000', periods=10)) tm.assert_series_equal(s, exp) + def test_isnull(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + tm.assert_series_equal(s.isnull(), Series([False, True])) + tm.assert_series_equal(s.notnull(), Series([True, False])) + + def test_fillna(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + + res = s.fillna(pd.Period('2012-01', freq='M')) + exp = Series([pd.Period('2011-01', freq='M'), + pd.Period('2012-01', freq='M')]) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'object') + + res = s.fillna('XXX') + exp = Series([pd.Period('2011-01', freq='M'), 'XXX']) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'object') + + def test_dropna(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + tm.assert_series_equal(s.dropna(), + Series([pd.Period('2011-01', freq='M')])) + def test_series_comparison_scalars(self): val = pd.Period('2000-01-04', freq='D') result = self.series > val From 0fe5a345f90e4d9029a0bf923245b51c6c6a8322 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 19 Jul 2016 18:06:55 +0100 Subject: [PATCH 138/359] ENH: Enable automatic writing of dates to Stata files Automatically select type %tc for datetime[ns] columns Change ValueErrors to NotImplementedError for unsupported types Add tests for select exceptions Improve to_stata and StataWriter docstrings closes #12259 closes #13710 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 39 ++++++++++------ pandas/io/stata.py | 83 ++++++++++++++++++++------------- pandas/io/tests/test_stata.py | 50 +++++++++++++++++++- 4 files changed, 124 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 11b94e08798a4..eb22cbd7d798f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -310,6 +310,7 @@ Other enhancements - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) +- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) .. _whatsnew_0190.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4fe7b318b3a18..a59668320de3d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1473,31 +1473,42 @@ def to_stata(self, fname, convert_dates=None, write_index=True, Parameters ---------- - fname : file path or buffer - Where to save the dta file. 
+ fname : str or buffer + String path of file-like object convert_dates : dict - Dictionary mapping column of datetime types to the stata internal - format that you want to use for the dates. Options are - 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a - number or a name. + Dictionary mapping columns containing datetime types to stata internal + format to use when wirting the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information write_index : bool Write the index to Stata dataset. encoding : str - Default is latin-1. Note that Stata does not support unicode. + Default is latin-1. Unicode is not supported byteorder : str - Can be ">", "<", "little", or "big". The default is None which uses - `sys.byteorder` + Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime - A date time to use when writing the file. Can be None, in which - case the current time is used. + A datetime to use as file creation date. Default is the current time dataset_label : str - A label for the data set. Should be 80 characters or smaller. + A label for the data set. Must be 80 characters or smaller. .. versionadded:: 0.19.0 variable_labels : dict - Dictionary containing columns as keys and variable labels as - values. Each label must be 80 characters or smaller. + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are noth either datetime64[ns] + or datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters Examples -------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d35466e8896ba..68a723d399c27 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -432,7 +432,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError("fmt %s not understood" % fmt) + raise ValueError("Format %s is not a known Stata date format" % fmt) conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack('", "<", "little", or "big". The default is None which uses - `sys.byteorder` + Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime - A date time to use when writing the file. Can be None, in which - case the current time is used. + A datetime to use as file creation date. Default is the current time dataset_label : str - A label for the data set. Should be 80 characters or smaller. + A label for the data set. Must be 80 characters or smaller. .. versionadded:: 0.19.0 @@ -1843,6 +1842,17 @@ class StataWriter(StataParser): The StataWriter instance has a write_file method, which will write the file to the given `fname`. 
+ Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are noth either datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + Examples -------- >>> import pandas as pd @@ -1861,7 +1871,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): super(StataWriter, self).__init__(encoding) - self._convert_dates = convert_dates + self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index self._time_stamp = time_stamp self._data_label = data_label @@ -2041,15 +2051,22 @@ def _prepare_pandas(self, data): self.varlist = data.columns.tolist() dtypes = data.dtypes - if self._convert_dates is not None: - self._convert_dates = _maybe_convert_to_int_keys( - self._convert_dates, self.varlist + + # Ensure all date columns are converted + for col in data: + if col in self._convert_dates: + continue + if is_datetime64_dtype(data[col]): + self._convert_dates[col] = 'tc' + + self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, + self.varlist) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type( + self._convert_dates[key] ) - for key in self._convert_dates: - new_type = _convert_datetime_to_stata_type( - self._convert_dates[key] - ) - dtypes[key] = np.dtype(new_type) + dtypes[key] = np.dtype(new_type) + self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.iteritems(): diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 91850e6ffe9b9..009e40c84f94b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -11,17 +11,17 @@ import nose import numpy as np +from pandas.tslib import NaT import pandas as pd import pandas.util.testing as tm from pandas import compat from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series -from pandas.types.common import is_categorical_dtype -from pandas.tslib import NaT from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) +from pandas.types.common import is_categorical_dtype class TestStata(tm.TestCase): @@ -1165,6 +1165,52 @@ def test_write_variable_label_errors(self): with tm.ensure_clean() as path: original.to_stata(path, variable_labels=variable_labels_long) + def test_default_date_conversion(self): + # GH 12259 + dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), + dt.datetime(2012, 12, 21, 12, 21, 12, 21000), + dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + reread = read_stata(path, convert_dates=True) + tm.assert_frame_equal(original, reread) + + original.to_stata(path, + write_index=False, + convert_dates={'dates': 'tc'}) + direct = read_stata(path, convert_dates=True) + tm.assert_frame_equal(reread, direct) + + def test_unsupported_type(self): + original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]}) + + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path) + + def test_unsupported_datetype(self): + dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 
12000), + dt.datetime(2012, 12, 21, 12, 21, 12, 21000), + dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path, convert_dates={'dates': 'tC'}) + + dates = pd.date_range('1-1-1990',periods=3,tz='Asia/Hong_Kong') + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From fc16f1fd21aee163e93e5713a0676f7a79838897 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2016 13:06:59 +0200 Subject: [PATCH 139/359] CLN: remove deprecated io.sql uquery and tquery functions (#13616) * CLN: remove deprecated io.sql uquery and tquery functions * linter -> remove ununsed imports --- doc/source/whatsnew/v0.19.0.txt | 3 + pandas/io/sql.py | 122 +---------------------------- pandas/io/tests/test_sql.py | 132 +++++--------------------------- 3 files changed, 25 insertions(+), 232 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index eb22cbd7d798f..0d8c658a05310 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -622,6 +622,9 @@ Removal of prior version deprecations/changes Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` +- The ``tquery`` and ``uquery`` functions in the ``pandas.io.sql`` module are removed (:issue:`5950`). + + .. _whatsnew_0190.performance: Performance Improvements diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b9eaa0e4d657b..dfc9e80aa27d1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -8,7 +8,6 @@ from datetime import datetime, date, time import warnings -import traceback import re import numpy as np @@ -18,7 +17,7 @@ from pandas.types.common import (is_list_like, is_datetime64tz_dtype) -from pandas.compat import (lzip, map, zip, raise_with_traceback, +from pandas.compat import (map, zip, raise_with_traceback, string_types, text_type) from pandas.core.api import DataFrame, Series from pandas.core.base import PandasObject @@ -192,125 +191,6 @@ def execute(sql, con, cur=None, params=None): return pandas_sql.execute(*args) -# ----------------------------------------------------------------------------- -# -- Deprecated tquery and uquery - -def _safe_fetch(cur): - try: - result = cur.fetchall() - if not isinstance(result, list): - result = list(result) - return result - except Exception as e: # pragma: no cover - excName = e.__class__.__name__ - if excName == 'OperationalError': - return [] - - -def tquery(sql, con=None, cur=None, retry=True): - """ - DEPRECATED. Returns list of tuples corresponding to each row in given sql - query. - - If only one column selected, then plain list is returned. 
- - To obtain the same result in the future, you can use the following: - - >>> execute(sql, con, params).fetchall() - - Parameters - ---------- - sql: string - SQL query to be executed - con: DBAPI2 connection, default: None - cur: deprecated, cursor is obtained from connection, default: None - retry: boolean value to specify whether to retry after failure - default: True - - Returns - ------- - Results Iterable - - """ - warnings.warn( - "tquery is deprecated, and will be removed in future versions. " - "You can use ``execute(...).fetchall()`` instead.", - FutureWarning, stacklevel=2) - - cur = execute(sql, con, cur=cur) - result = _safe_fetch(cur) - - if con is not None: - try: - cur.close() - con.commit() - except Exception as e: - excName = e.__class__.__name__ - if excName == 'OperationalError': # pragma: no cover - print('Failed to commit, may need to restart interpreter') - else: - raise - - traceback.print_exc() - if retry: - return tquery(sql, con=con, retry=False) - - if result and len(result[0]) == 1: - # python 3 compat - result = list(lzip(*result)[0]) - elif result is None: # pragma: no cover - result = [] - - return result - - -def uquery(sql, con=None, cur=None, retry=True, params=None): - """ - DEPRECATED. Does the same thing as tquery, but instead of returning - results, it returns the number of rows affected. Good for update queries. - - To obtain the same result in the future, you can use the following: - - >>> execute(sql, con).rowcount - - Parameters - ---------- - sql: string - SQL query to be executed - con: DBAPI2 connection, default: None - cur: deprecated, cursor is obtained from connection, default: None - retry: boolean value to specify whether to retry after failure - default: True - params: list or tuple, optional, default: None - List of parameters to pass to execute method. - - Returns - ------- - Number of affected rows - - """ - warnings.warn( - "uquery is deprecated, and will be removed in future versions. 
" - "You can use ``execute(...).rowcount`` instead.", - FutureWarning, stacklevel=2) - - cur = execute(sql, con, cur=cur, params=params) - - result = cur.rowcount - try: - con.commit() - except Exception as e: - excName = e.__class__.__name__ - if excName != 'OperationalError': - raise - - traceback.print_exc() - if retry: - print('Looks like your connection failed, reconnecting...') - return uquery(sql, con, retry=False) - return result - - # ----------------------------------------------------------------------------- # -- Read and write to DataFrames diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 41be39f9abaa6..f4001420a77b6 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -1070,17 +1070,6 @@ def test_get_schema2(self): create_sql = sql.get_schema(self.test_frame1, 'test') self.assertTrue('CREATE' in create_sql) - def test_tquery(self): - with tm.assert_produces_warning(FutureWarning): - iris_results = sql.tquery("SELECT * FROM iris", con=self.conn) - row = iris_results[0] - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) - - def test_uquery(self): - with tm.assert_produces_warning(FutureWarning): - rows = sql.uquery("SELECT * FROM iris LIMIT 1", con=self.conn) - self.assertEqual(rows, -1) - def _get_sqlite_column_type(self, schema, column): for col in schema.split('\n'): @@ -2091,6 +2080,15 @@ def format_query(sql, *args): return sql % tuple(processed_args) +def tquery(query, con=None, cur=None): + """Replace removed sql.tquery function""" + res = sql.execute(query, con=con, cur=cur).fetchall() + if res is None: + return None + else: + return list(res) + + def _skip_if_no_pymysql(): try: import pymysql # noqa @@ -2120,7 +2118,7 @@ def test_write_row_by_row(self): ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" for idx, row in frame.iterrows(): fmt_sql = format_query(ins, *row) - sql.tquery(fmt_sql, cur=cur) + tquery(fmt_sql, cur=cur) self.conn.commit() @@ -2200,7 +2198,7 @@ def test_execute_closed_connection(self): self.conn.close() try: sys.stdout = StringIO() - self.assertRaises(Exception, sql.tquery, "select * from test", + self.assertRaises(Exception, tquery, "select * from test", con=self.conn) finally: sys.stdout = sys.__stdout__ @@ -2232,42 +2230,6 @@ def _check_roundtrip(self, frame): expected.index.name = 'Idx' tm.assert_frame_equal(expected, result) - def test_tquery(self): - frame = tm.makeTimeDataFrame() - sql.to_sql(frame, name='test_table', con=self.conn, index=False) - result = sql.tquery("select A from test_table", self.conn) - expected = Series(frame.A.values, frame.index) # not to have name - result = Series(result, frame.index) - tm.assert_series_equal(result, expected) - - try: - sys.stdout = StringIO() - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn, retry=True) - finally: - sys.stdout = sys.__stdout__ - - def test_uquery(self): - frame = tm.makeTimeDataFrame() - sql.to_sql(frame, name='test_table', con=self.conn, index=False) - stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' - self.assertEqual(sql.uquery(stmt, con=self.conn), 1) - - try: - sys.stdout = StringIO() - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn, - retry=True) - finally: - sys.stdout = sys.__stdout__ - def 
test_keyword_as_column_names(self): df = DataFrame({'From': np.ones(5)}) sql.to_sql(df, con=self.conn, name='testkeywords', index=False) @@ -2324,22 +2286,22 @@ def clean_up(test_table_to_drop): # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='fail', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, if_exists='append', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) @@ -2445,7 +2407,7 @@ def test_write_row_by_row(self): ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" for idx, row in frame.iterrows(): fmt_sql = format_query(ins, *row) - sql.tquery(fmt_sql, cur=cur) + tquery(fmt_sql, cur=cur) self.conn.commit() @@ -2554,7 +2516,7 @@ def test_execute_closed_connection(self): self.conn.close() try: sys.stdout = StringIO() - self.assertRaises(Exception, sql.tquery, "select * from test", + self.assertRaises(Exception, tquery, "select * from test", con=self.conn) finally: sys.stdout = sys.__stdout__ @@ -2603,58 +2565,6 @@ def _check_roundtrip(self, frame): expected.index.names = result.index.names tm.assert_frame_equal(expected, result) - def test_tquery(self): - try: - import pymysql # noqa - except ImportError: - raise nose.SkipTest("no pymysql") - frame = tm.makeTimeDataFrame() - drop_sql = "DROP TABLE IF EXISTS test_table" - cur = self.conn.cursor() - cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, index=False) - result = sql.tquery("select A from test_table", self.conn) - expected = Series(frame.A.values, frame.index) # not to have name - result = Series(result, frame.index) - tm.assert_series_equal(result, expected) - - try: - sys.stdout = StringIO() - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn, retry=True) - finally: - sys.stdout = sys.__stdout__ - - def test_uquery(self): - try: - import pymysql # noqa - except ImportError: - raise nose.SkipTest("no pymysql") - frame = tm.makeTimeDataFrame() - drop_sql = "DROP TABLE IF EXISTS test_table" - cur = self.conn.cursor() - cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, index=False) - stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' - self.assertEqual(sql.uquery(stmt, con=self.conn), 1) - - try: - sys.stdout = StringIO() - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn, - retry=True) - finally: - sys.stdout = sys.__stdout__ - def test_keyword_as_column_names(self): _skip_if_no_pymysql() df = DataFrame({'From': 
np.ones(5)}) @@ -2698,22 +2608,22 @@ def clean_up(test_table_to_drop): # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='fail', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, if_exists='append', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) From e357ea1105723781032f0ec509236b69325633f4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2016 14:23:05 +0200 Subject: [PATCH 140/359] DOC: some general doc/sphinx fixes (#13740) * DOC: fix stata versionadded * DOC: fix sphinx warnings/errors * DOC: replace old whatsnew io.data example with plain code-block * fix lint issues --- doc/source/computation.rst | 16 +++++------- doc/source/ecosystem.rst | 6 ++--- doc/source/whatsnew/v0.15.1.txt | 46 ++++++++++++++++++++++++++++----- doc/source/whatsnew/v0.17.0.txt | 2 +- doc/source/whatsnew/v0.8.0.txt | 3 +-- pandas/core/frame.py | 26 ++++++++++--------- pandas/io/stata.py | 5 ++-- pandas/io/tests/test_stata.py | 2 +- 8 files changed, 68 insertions(+), 38 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 12e0ecfba97da..1414d2dd3c8dc 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -428,15 +428,13 @@ Using a non-regular, but still monotonic index, rolling with an integer window d .. ipython:: python - - dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index = pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) - + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) dft dft.rolling(2).sum() diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 0d010b47f393a..17ebd1f163f4f 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -25,7 +25,7 @@ Statistics and Machine Learning ------------------------------- `Statsmodels `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Statsmodels is the prominent python "statistics and econometrics library" and it has a long-standing special relationship with pandas. Statsmodels provides powerful statistics, @@ -78,7 +78,7 @@ more advanced types of plots then those offered by pandas. The `Vincent `__ project leverages `Vega `__ (that in turn, leverages `d3 `__) to create -plots. Although functional, as of Summer 2016 the Vincent project has not been updated +plots. 
Although functional, as of Summer 2016 the Vincent project has not been updated in over two years and is `unlikely to receive further updates `__. `IPython Vega `__ @@ -130,7 +130,7 @@ qgrid is "an interactive grid for sorting and filtering DataFrames in IPython Notebook" built with SlickGrid. `Spyder `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Spyder is a cross-platform Qt-based open-source Python IDE with editing, testing, debugging, and introspection features. diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index 2a4104c2d5dc4..a25e5a80b65fc 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -144,14 +144,46 @@ API changes Current behavior: - .. ipython:: python - :okwarning: + .. code-block:: ipython - from pandas.io.data import Options - aapl = Options('aapl','yahoo') - aapl.get_call_data().iloc[0:5,0:1] - aapl.expiry_dates - aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]).iloc[0:5,0:1] + In [17]: from pandas.io.data import Options + + In [18]: aapl = Options('aapl','yahoo') + + In [19]: aapl.get_call_data().iloc[0:5,0:1] + Out[19]: + Last + Strike Expiry Type Symbol + 80 2014-11-14 call AAPL141114C00080000 29.05 + 84 2014-11-14 call AAPL141114C00084000 24.80 + 85 2014-11-14 call AAPL141114C00085000 24.05 + 86 2014-11-14 call AAPL141114C00086000 22.76 + 87 2014-11-14 call AAPL141114C00087000 21.74 + + In [20]: aapl.expiry_dates + Out[20]: + [datetime.date(2014, 11, 14), + datetime.date(2014, 11, 22), + datetime.date(2014, 11, 28), + datetime.date(2014, 12, 5), + datetime.date(2014, 12, 12), + datetime.date(2014, 12, 20), + datetime.date(2015, 1, 17), + datetime.date(2015, 2, 20), + datetime.date(2015, 4, 17), + datetime.date(2015, 7, 17), + datetime.date(2016, 1, 15), + datetime.date(2017, 1, 20)] + + In [21]: aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]).iloc[0:5,0:1] + Out[21]: + Last + Strike Expiry Type Symbol + 109 2014-11-22 call AAPL141122C00109000 1.48 + 2014-11-28 call AAPL141128C00109000 1.79 + 110 2014-11-14 call AAPL141114C00110000 0.55 + 2014-11-22 call AAPL141122C00110000 1.02 + 2014-11-28 call AAPL141128C00110000 1.32 See the Options documentation in :ref:`Remote Data ` diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index ef9785d25f014..fc13224d3fe6e 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -965,7 +965,7 @@ Deprecations - ``TimeSeries`` deprecated in favor of ``Series`` (note that this has been an alias since 0.13.0), (:issue:`10890`) - ``SparsePanel`` deprecated and will be removed in a future version (:issue:`11157`). - ``Series.is_time_series`` deprecated in favor of ``Series.index.is_all_dates`` (:issue:`11135`) -- Legacy offsets (like ``'A@JAN'``) listed in :ref:`here ` are deprecated (note that this has been alias since 0.8.0), (:issue:`10878`) +- Legacy offsets (like ``'A@JAN'``) are deprecated (note that this has been alias since 0.8.0) (:issue:`10878`) - ``WidePanel`` deprecated in favor of ``Panel``, ``LongPanel`` in favor of ``DataFrame`` (note these have been aliases since < 0.11.0), (:issue:`10892`) - ``DataFrame.convert_objects`` has been deprecated in favor of type-specific functions ``pd.to_datetime``, ``pd.to_timestamp`` and ``pd.to_numeric`` (new in 0.17.0) (:issue:`11133`). 
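  For reference, the type-specific replacements can be applied column by column; a minimal sketch (the column names here are illustrative only, not from the patch):

  .. code-block:: python

     import pandas as pd

     df = pd.DataFrame({'num': ['1', '2', 'bad'],
                        'when': ['2016-01-01', '2016-07-21', 'not a date']})

     # errors='coerce' turns unparseable values into NaN / NaT instead of raising
     df['num'] = pd.to_numeric(df['num'], errors='coerce')
     df['when'] = pd.to_datetime(df['when'], errors='coerce')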
diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt index 0d2cfeb2d7cfc..cf6ac7c1e6ad2 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.txt @@ -134,7 +134,7 @@ Other new features - Move to klib-based hash tables for indexing; better performance and less memory usage than Python's dict - Add first, last, min, max, and prod optimized GroupBy functions -- New :ref:`ordered_merge ` function +- New :ref:`ordered_merge ` function - Add flexible :ref:`comparison ` instance methods eq, ne, lt, gt, etc. to DataFrame, Series - Improve :ref:`scatter_matrix ` plotting @@ -271,4 +271,3 @@ unique. In many cases it will no longer fail (some method like ``append`` still check for uniqueness unless disabled). However, all is not lost: you can inspect ``index.is_unique`` and raise an exception explicitly if it is ``False`` or go to a different code branch. - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a59668320de3d..558d3b1885bf5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1476,12 +1476,12 @@ def to_stata(self, fname, convert_dates=None, write_index=True, fname : str or buffer String path of file-like object convert_dates : dict - Dictionary mapping columns containing datetime types to stata internal - format to use when wirting the dates. Options are 'tc', 'td', 'tm', - 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. - Datetime columns that do not have a conversion type specified will be - converted to 'tc'. Raises NotImplementedError if a datetime column has - timezone information + Dictionary mapping columns containing datetime types to stata + internal format to use when wirting the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information write_index : bool Write the index to Stata dataset. encoding : str @@ -1489,15 +1489,15 @@ def to_stata(self, fname, convert_dates=None, write_index=True, byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime - A datetime to use as file creation date. Default is the current time + A datetime to use as file creation date. Default is the current + time. dataset_label : str A label for the data set. Must be 80 characters or smaller. - - .. versionadded:: 0.19.0 - variable_labels : dict - Dictionary containing columns as keys and variable labels as values. - Each label must be 80 characters or smaller. + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. + + .. versionadded:: 0.19.0 Raises ------ @@ -1510,6 +1510,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True, * Column listed in convert_dates is not in DataFrame * Categorical label contains more than 32,000 characters + .. versionadded:: 0.19.0 + Examples -------- >>> writer = StataWriter('./data_file.dta', data) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 68a723d399c27..59bc24acac6f8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1829,13 +1829,12 @@ class StataWriter(StataParser): A datetime to use as file creation date. Default is the current time dataset_label : str A label for the data set. Must be 80 characters or smaller. - - .. 
versionadded:: 0.19.0 - variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. + .. versionadded:: 0.19.0 + Returns ------- writer : StataWriter instance diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 009e40c84f94b..2e3182b69acaf 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1204,7 +1204,7 @@ def test_unsupported_datetype(self): with tm.ensure_clean() as path: original.to_stata(path, convert_dates={'dates': 'tC'}) - dates = pd.date_range('1-1-1990',periods=3,tz='Asia/Hong_Kong') + dates = pd.date_range('1-1-1990', periods=3, tz='Asia/Hong_Kong') original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], 'strs': ['apple', 'banana', 'cherry'], 'dates': dates}) From 4d3b6c11150da0e052bf170b70e12a81f606c398 Mon Sep 17 00:00:00 2001 From: Bob Baxley Date: Thu, 21 Jul 2016 10:16:49 -0400 Subject: [PATCH 141/359] ENH: Adding additional keywords to read_html for #13461 (#13575) --- doc/source/io.rst | 29 +++++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 + pandas/io/html.py | 58 +++++++++++++++++------------ pandas/io/tests/test_html.py | 66 +++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 24 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index da0444a8b8df9..113afa32d182e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1959,6 +1959,35 @@ Specify an HTML attribute dfs2 = read_html(url, attrs={'class': 'sortable'}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True +Specify values that should be converted to NaN + +.. code-block:: python + + dfs = read_html(url, na_values=['No Acquirer']) + +.. versionadded:: 0.19 + +Specify whether to keep the default set of NaN values + +.. code-block:: python + + dfs = read_html(url, keep_default_na=False) + +.. versionadded:: 0.19 + +Specify converters for columns. This is useful for numerical text data that has +leading zeros. By default columns that are numerical are cast to numeric +types and the leading zeros are lost. To avoid this, we can convert these +columns to strings. + +.. code-block:: python + + url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' + dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': + str}) + +.. versionadded:: 0.19 + Use some combination of the above .. 
code-block:: python diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0d8c658a05310..c264ce201d25f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -293,6 +293,8 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) + - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/html.py b/pandas/io/html.py index e0d84a9617ae4..3c38dae91eb89 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -611,10 +611,10 @@ def _expand_elements(body): body[ind] += empty * (lens_max - length) -def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, - decimal): - head, body, foot = data +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop('data') + header = kwargs.pop('header') + kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: body = [head] + body @@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows, # fill out elements of body that are "ragged" _expand_elements(body) - tp = TextParser(body, header=header, index_col=index_col, - skiprows=_get_skiprows(skiprows), - parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands, decimal=decimal) + tp = TextParser(body, header=header, **kwargs) df = tp.read() return df @@ -716,9 +713,7 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal): +def _parse(flavor, io, match, attrs, encoding, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -740,15 +735,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, ret = [] for table in tables: try: - ret.append(_data_to_frame(data=table, - header=header, - index_col=index_col, - skiprows=skiprows, - parse_dates=parse_dates, - tupleize_cols=tupleize_cols, - thousands=thousands, - decimal=decimal - )) + ret.append(_data_to_frame(data=table, **kwargs)) except EmptyDataError: # empty table continue return ret @@ -757,7 +744,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, - decimal='.'): + decimal='.', converters=None, na_values=None, + keep_default_na=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -839,6 +827,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + .. versionadded:: 0.19.0 + + na_values : iterable, default None + Custom NA values + + .. 
versionadded:: 0.19.0 + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to + + .. versionadded:: 0.19.0 + Returns ------- dfs : list of DataFrames @@ -881,6 +888,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') _validate_header_arg(header) - return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal) + return _parse(flavor=flavor, io=io, match=match, header=header, + index_col=index_col, skiprows=skiprows, + parse_dates=parse_dates, tupleize_cols=tupleize_cols, + thousands=thousands, attrs=attrs, encoding=encoding, + decimal=decimal, converters=converters, na_values=na_values, + keep_default_na=keep_default_na) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 5a95fe7727df0..7b4e775db9476 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -694,6 +694,72 @@ def test_bool_header_arg(self): with tm.assertRaises(TypeError): read_html(self.spam_data, header=arg) + def test_converters(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + +
+        <table>
+            <tr><th>a</th></tr>
+            <tr><td>0.763</td></tr>
+            <tr><td>0.244</td></tr>
+        </table>
""" + + expected_df = DataFrame({'a': ['0.763', '0.244']}) + html_df = read_html(html_data, converters={'a': str})[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_na_values(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + +
+        <table>
+            <tr><th>a</th></tr>
+            <tr><td>0.763</td></tr>
+            <tr><td>0.244</td></tr>
+        </table>
""" + + expected_df = DataFrame({'a': [0.763, np.nan]}) + html_df = read_html(html_data, na_values=[0.244])[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_keep_default_na(self): + html_data = """ + + + + + + + + + + + + +
+        <table>
+            <tr><th>a</th></tr>
+            <tr><td>N/A</td></tr>
+            <tr><td>NA</td></tr>
+        </table>
""" + + expected_df = DataFrame({'a': ['N/A', 'NA']}) + html_df = read_html(html_data, keep_default_na=False)[0] + tm.assert_frame_equal(expected_df, html_df) + + expected_df = DataFrame({'a': [np.nan, np.nan]}) + html_df = read_html(html_data, keep_default_na=True)[0] + tm.assert_frame_equal(expected_df, html_df) + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') From 4caacdff9c7c6499651269e5829a5a3412a46033 Mon Sep 17 00:00:00 2001 From: Sahil Dua Date: Thu, 21 Jul 2016 16:30:56 +0200 Subject: [PATCH 142/359] DOC: Add reference to frequency strings - [x] closes #13160 Author: Sahil Dua This patch had conflicts when merged, resolved by Committer: Joris Van den Bossche Closes #13632 from sahildua2305/frequency-strings-fix and squashes the following commits: b45ae21 [Sahil Dua] DOC: Add learn more text in generic and removed from window fff5da7 [Sahil Dua] Fix links and text for reference 7b7045a [Sahil Dua] DOC: Add reference to frequency strings in docs 02c44bb [Sahil Dua] Add reference to frequency strings; fixes #13160 --- doc/source/timeseries.rst | 12 ++++++------ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/generic.py | 6 ++++++ pandas/stats/moments.py | 21 +++++++++++++++++++++ pandas/tseries/index.py | 12 ++++++++++++ pandas/tseries/tdi.py | 6 ++++++ 6 files changed, 52 insertions(+), 7 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index fd31eb1b584a8..da19c6a7d2bec 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -567,11 +567,11 @@ DateOffset objects ------------------ In the preceding examples, we created DatetimeIndex objects at various -frequencies by passing in frequency strings like 'M', 'W', and 'BM to the -``freq`` keyword. Under the hood, these frequency strings are being translated -into an instance of pandas ``DateOffset``, which represents a regular -frequency increment. Specific offset logic like "month", "business day", or -"one hour" is represented in its various subclasses. +frequencies by passing in :ref:`frequency strings ` +like 'M', 'W', and 'BM to the ``freq`` keyword. Under the hood, these frequency +strings are being translated into an instance of pandas ``DateOffset``, +which represents a regular frequency increment. Specific offset logic like +"month", "business day", or "one hour" is represented in its various subclasses. .. csv-table:: :header: "Class name", "Description" @@ -953,7 +953,7 @@ You can use keyword arguments suported by either ``BusinessHour`` and ``CustomBu # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 -.. _timeseries.alias: +.. _timeseries.offset_aliases: Offset Aliases ~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c264ce201d25f..85a1d8899249a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -622,7 +622,7 @@ Removal of prior version deprecations/changes pandas/tseries/frequencies.py:465: FutureWarning: Freq "W@MON" is deprecated, use "W-MON" as alternative. Out[2]: DatetimeIndex(['2016-07-04', '2016-07-11', '2016-07-18'], dtype='datetime64[ns]', freq='W-MON') - Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` + Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` - The ``tquery`` and ``uquery`` functions in the ``pandas.io.sql`` module are removed (:issue:`5950`). 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0863c8f1af385..ee3cb52bd6e42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3982,6 +3982,9 @@ def asfreq(self, freq, method=None, how=None, normalize=False): Returns ------- converted : type of caller + + To learn more about the frequency strings, please see `this link + `__. """ from pandas.tseries.resample import asfreq return asfreq(self, freq, method=method, how=how, normalize=normalize) @@ -4053,6 +4056,9 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, range from 0 through 4. Defaults to 0 + To learn more about the offset strings, please see `this link + `__. + Examples -------- diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index bb475e47206c2..95b209aee0b0c 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -271,6 +271,9 @@ def rolling_count(arg, window, **kwargs): The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('rolling', 'count', arg, window=window, **kwargs) @@ -521,6 +524,9 @@ def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('rolling', 'quantile', @@ -570,6 +576,9 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('rolling', 'apply', @@ -642,6 +651,9 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ func = 'mean' if mean else 'sum' return ensure_compat('rolling', @@ -707,6 +719,9 @@ def expanding_count(arg, freq=None): The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('expanding', 'count', arg, freq=freq) @@ -735,6 +750,9 @@ def expanding_quantile(arg, quantile, min_periods=1, freq=None): The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. 
""" return ensure_compat('expanding', 'quantile', @@ -818,6 +836,9 @@ def expanding_apply(arg, func, min_periods=1, freq=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('expanding', 'apply', diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 64f156f4b044c..b87e9738b02ee 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -189,6 +189,9 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, Attempt to infer fall dst-transition hours based on order name : object Name to be stored in the index + + To learn more about the frequency strings, please see `this link + `__. """ _typ = 'datetimeindex' @@ -2026,6 +2029,9 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, ----- 2 of start, end, or periods must be specified + To learn more about the frequency strings, please see `this link + `__. + Returns ------- rng : DatetimeIndex @@ -2066,6 +2072,9 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, ----- 2 of start, end, or periods must be specified + To learn more about the frequency strings, please see `this link + `__. + Returns ------- rng : DatetimeIndex @@ -2117,6 +2126,9 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, ----- 2 of start, end, or periods must be specified + To learn more about the frequency strings, please see `this link + `__. + Returns ------- rng : DatetimeIndex diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 78ab333be8ea5..8aad5bdd35f65 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -110,6 +110,9 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, TimelikeOps, Int64Index): the 'left', 'right', or both sides (None) name : object Name to be stored in the index + + To learn more about the frequency strings, please see `this link + `__. """ _typ = 'timedeltaindex' @@ -1014,6 +1017,9 @@ def timedelta_range(start=None, end=None, periods=None, freq='D', Returns ------- rng : TimedeltaIndex + + To learn more about the frequency strings, please see `this link + `__. """ return TimedeltaIndex(start=start, end=end, periods=periods, freq=freq, name=name, From bb6b5e54edaf046389e8cce28e7cd27ee87f5fcc Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Thu, 21 Jul 2016 17:03:42 +0200 Subject: [PATCH 143/359] ENH: DataFrame sort columns by rows: sort_values(axis=1) closes #10806 Author: Jeffrey Gerard Closes #13622 from IamJeffG/GH10806 and squashes the following commits: ea2d89e [Jeffrey Gerard] More test cases. Clarify whatnew w/ example. 
f43ab2e [Jeffrey Gerard] Tweak whatsnew entry, once more 2773cdf [Jeffrey Gerard] Tweak whatsnew entry 0f23615 [Jeffrey Gerard] Whatsnew entry for DataFrame.sort_values by index (10806) 970e25b [Jeffrey Gerard] DataFrame sort columns by rows: sort_values(axis=1) Joris Van den Bossche: updated axis kwarg in docstring --- doc/source/whatsnew/v0.19.0.txt | 9 ++++++++ pandas/core/frame.py | 9 ++++---- pandas/core/generic.py | 3 ++- pandas/tests/frame/test_sorting.py | 34 ++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 85a1d8899249a..8ace344974b27 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -313,6 +313,15 @@ Other enhancements - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) +- ``DataFrame`` has gained support to re-order the columns based on the values + in a row using ``df.sort_values(by='...', axis=1)`` (:issue:`10806`) + + .. ipython:: python + + df = pd.DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, + index=['row1', 'row2']) + df.sort_values(by='row2', axis=1) + .. _whatsnew_0190.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 558d3b1885bf5..fe05b3715f45d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -101,7 +101,7 @@ _shared_doc_kwargs = dict( axes='index, columns', klass='DataFrame', - axes_single_arg="{0, 1, 'index', 'columns'}", + axes_single_arg="{0 or 'index', 1 or 'columns'}", optional_by=""" by : str or list of str Name or list of names which refer to the axis items.""") @@ -3184,9 +3184,8 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): axis = self._get_axis_number(axis) + other_axis = 0 if axis == 1 else 1 - if axis != 0: - raise ValueError('When sorting by column, axis must be 0 (rows)') if not isinstance(by, list): by = [by] if is_sequence(ascending) and len(by) != len(ascending): @@ -3202,7 +3201,7 @@ def trans(v): keys = [] for x in by: - k = self[x].values + k = self.xs(x, axis=other_axis).values if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(x)) @@ -3214,7 +3213,7 @@ def trans(v): from pandas.core.groupby import _nargsort by = by[0] - k = self[by].values + k = self.xs(by, axis=other_axis).values if k.ndim == 2: # try to be helpful diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ee3cb52bd6e42..e7a098351a0ab 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1979,7 +1979,8 @@ def add_suffix(self, suffix): Parameters ----------%(optional_by)s - axis : %(axes)s to direct sorting, default 0 + axis : %(axes_single_arg)s, default 0 + Axis to direct sorting ascending : bool or list of bool, default True Sort ascending vs. descending. Specify list for multiple sort orders. 
If this is a list of bools, must match the length of diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 4d57216c8f870..b7a38e9e13ebd 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -84,7 +84,7 @@ def test_sort_values(self): frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list('ABC')) - # by column + # by column (axis=0) sorted_df = frame.sort_values(by='A') indexer = frame['A'].argsort().values expected = frame.ix[frame.index[indexer]] @@ -116,9 +116,26 @@ def test_sort_values(self): self.assertRaises(ValueError, lambda: frame.sort_values( by=['A', 'B'], axis=2, inplace=True)) - msg = 'When sorting by column, axis must be 0' - with assertRaisesRegexp(ValueError, msg): - frame.sort_values(by='A', axis=1) + # by row (axis=1): GH 10806 + sorted_df = frame.sort_values(by=3, axis=1) + expected = frame + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=3, axis=1, ascending=False) + expected = frame.reindex(columns=['C', 'B', 'A']) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 2], axis='columns') + expected = frame.reindex(columns=['B', 'A', 'C']) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, + ascending=[True, False]) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) + expected = frame.reindex(columns=['C', 'B', 'A']) + assert_frame_equal(sorted_df, expected) msg = r'Length of ascending \(5\) != length of by \(2\)' with assertRaisesRegexp(ValueError, msg): @@ -133,6 +150,11 @@ def test_sort_values_inplace(self): expected = frame.sort_values(by='A') assert_frame_equal(sorted_df, expected) + sorted_df = frame.copy() + sorted_df.sort_values(by=1, axis=1, inplace=True) + expected = frame.sort_values(by=1, axis=1) + assert_frame_equal(sorted_df, expected) + sorted_df = frame.copy() sorted_df.sort_values(by='A', ascending=False, inplace=True) expected = frame.sort_values(by='A', ascending=False) @@ -179,6 +201,10 @@ def test_sort_nan(self): sorted_df = df.sort_values(['A'], na_position='first', ascending=False) assert_frame_equal(sorted_df, expected) + expected = df.reindex(columns=['B', 'A']) + sorted_df = df.sort_values(by=1, axis=1, na_position='first') + assert_frame_equal(sorted_df, expected) + # na_position='last', order expected = DataFrame( {'A': [1, 1, 2, 4, 6, 8, nan], From 253ed5d512cd37ddeaaed949de582502ddb9100f Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 22 Jul 2016 04:50:40 -0400 Subject: [PATCH 144/359] CLN: Removed the return_type param in StringMethods.split (#13701) Deprecated in #10085 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/strings.py | 4 +-- pandas/tests/test_strings.py | 45 ++------------------------------- 3 files changed, 4 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8ace344974b27..edca4289167e5 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -621,6 +621,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) - ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) +- ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) - Removal of the legacy time rules (offset aliases), deprecated since 
0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) Previous Behavior: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3150fc5d0143a..b49761367b9b5 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -15,7 +15,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin -from pandas.util.decorators import Appender, deprecate_kwarg +from pandas.util.decorators import Appender import re import pandas.lib as lib import warnings @@ -1401,8 +1401,6 @@ def cat(self, others=None, sep=None, na_rep=None): result = str_cat(data, others=others, sep=sep, na_rep=na_rep) return self._wrap_result(result, use_codes=(not self._is_categorical)) - @deprecate_kwarg('return_type', 'expand', mapping={'series': False, - 'frame': True}) @copy(str_split) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index fcdbec8fbc5c4..92fa7b976eb0e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1906,45 +1906,6 @@ def test_split_no_pat_with_nonzero_n(self): def test_split_to_dataframe(self): s = Series(['nosplit', 'alsonosplit']) - - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - - exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) - tm.assert_frame_equal(result, exp) - - s = Series(['some_equal_splits', 'with_no_nans']) - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) - tm.assert_frame_equal(result, exp) - - s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - exp = DataFrame({0: ['some', 'one'], - 1: ['unequal', 'of'], - 2: ['splits', 'these'], - 3: [NA, 'things'], - 4: [NA, 'is'], - 5: [NA, 'not']}) - tm.assert_frame_equal(result, exp) - - s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, - index=['preserve', 'me']) - tm.assert_frame_equal(result, exp) - - with tm.assertRaisesRegexp(ValueError, "expand must be"): - with tm.assert_produces_warning(FutureWarning): - s.str.split('_', return_type="some_invalid_type") - - def test_split_to_dataframe_expand(self): - s = Series(['nosplit', 'alsonosplit']) result = s.str.split('_', expand=True) exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) tm.assert_frame_equal(result, exp) @@ -1973,8 +1934,7 @@ def test_split_to_dataframe_expand(self): tm.assert_frame_equal(result, exp) with tm.assertRaisesRegexp(ValueError, "expand must be"): - with tm.assert_produces_warning(FutureWarning): - s.str.split('_', return_type="some_invalid_type") + s.str.split('_', expand="not_a_boolean") def test_split_to_multiindex_expand(self): idx = Index(['nosplit', 'alsonosplit']) @@ -1999,8 +1959,7 @@ def test_split_to_multiindex_expand(self): self.assertEqual(result.nlevels, 6) with tm.assertRaisesRegexp(ValueError, "expand must be"): - with tm.assert_produces_warning(FutureWarning): - idx.str.split('_', return_type="some_invalid_type") + idx.str.split('_', expand="not_a_boolean") def test_rsplit_to_dataframe_expand(self): s = Series(['nosplit', 
'alsonosplit']) From 9f94e6a04564e8269b43c5b4e83854d277ea30bd Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Fri, 22 Jul 2016 21:09:11 +0900 Subject: [PATCH 145/359] TST: Move plotting related tests to tests/plotting (#13621) related to #13579 --- pandas/tests/plotting/__init__.py | 0 pandas/tests/plotting/common.py | 552 +++++++ pandas/tests/plotting/test_boxplot_method.py | 374 +++++ .../plotting/test_datetimelike.py} | 10 +- .../test_frame.py} | 1380 +---------------- pandas/tests/plotting/test_groupby.py | 82 + pandas/tests/plotting/test_hist_method.py | 426 +++++ pandas/tests/plotting/test_misc.py | 277 ++++ pandas/tests/plotting/test_series.py | 807 ++++++++++ pandas/tests/test_graphics_others.py | 1033 ------------ setup.py | 1 + 11 files changed, 2533 insertions(+), 2409 deletions(-) create mode 100644 pandas/tests/plotting/__init__.py create mode 100644 pandas/tests/plotting/common.py create mode 100644 pandas/tests/plotting/test_boxplot_method.py rename pandas/{tseries/tests/test_plotting.py => tests/plotting/test_datetimelike.py} (99%) rename pandas/tests/{test_graphics.py => plotting/test_frame.py} (68%) create mode 100644 pandas/tests/plotting/test_groupby.py create mode 100644 pandas/tests/plotting/test_hist_method.py create mode 100644 pandas/tests/plotting/test_misc.py create mode 100644 pandas/tests/plotting/test_series.py delete mode 100644 pandas/tests/test_graphics_others.py diff --git a/pandas/tests/plotting/__init__.py b/pandas/tests/plotting/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py new file mode 100644 index 0000000000000..d80eb891c5bd6 --- /dev/null +++ b/pandas/tests/plotting/common.py @@ -0,0 +1,552 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose +import os +import warnings + +from pandas import DataFrame +from pandas.compat import zip, iteritems, OrderedDict +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +import pandas.util.testing as tm +from pandas.util.testing import (ensure_clean, + assert_is_valid_plot_return_object) + +import numpy as np +from numpy import random + +import pandas.tools.plotting as plotting + + +""" +This is a common base class used for various plotting tests +""" + + +def _skip_if_no_scipy_gaussian_kde(): + try: + from scipy.stats import gaussian_kde # noqa + except ImportError: + raise nose.SkipTest("scipy version doesn't support gaussian_kde") + + +def _ok_for_gaussian_kde(kind): + if kind in ['kde', 'density']: + try: + from scipy.stats import gaussian_kde # noqa + except ImportError: + return False + return True + + +@tm.mplskip +class TestPlotBase(tm.TestCase): + + def setUp(self): + + import matplotlib as mpl + mpl.rcdefaults() + + self.mpl_le_1_2_1 = plotting._mpl_le_1_2_1() + self.mpl_ge_1_3_1 = plotting._mpl_ge_1_3_1() + self.mpl_ge_1_4_0 = plotting._mpl_ge_1_4_0() + self.mpl_ge_1_5_0 = plotting._mpl_ge_1_5_0() + + if self.mpl_ge_1_4_0: + self.bp_n_objects = 7 + else: + self.bp_n_objects = 8 + if self.mpl_ge_1_5_0: + # 1.5 added PolyCollections to legend handler + # so we have twice as many items. 
+ self.polycollection_factor = 2 + else: + self.polycollection_factor = 1 + + # common test data + from pandas import read_csv + path = os.path.join(os.path.dirname(curpath()), 'data', 'iris.csv') + self.iris = read_csv(path) + + n = 100 + with tm.RNGContext(42): + gender = np.random.choice(['Male', 'Female'], size=n) + classroom = np.random.choice(['A', 'B', 'C'], size=n) + + self.hist_df = DataFrame({'gender': gender, + 'classroom': classroom, + 'height': random.normal(66, 4, size=n), + 'weight': random.normal(161, 32, size=n), + 'category': random.randint(4, size=n)}) + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform( + size=20)}) + + def tearDown(self): + tm.close() + + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + return plt + + @cache_readonly + def colorconverter(self): + import matplotlib.colors as colors + return colors.colorConverter + + def _check_legend_labels(self, axes, labels=None, visible=True): + """ + Check each axes has expected legend labels + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + labels : list-like + expected legend labels + visible : bool + expected legend visibility. labels are checked only when visible is + True + """ + + if visible and (labels is None): + raise ValueError('labels must be specified when visible is True') + axes = self._flatten_visible(axes) + for ax in axes: + if visible: + self.assertTrue(ax.get_legend() is not None) + self._check_text_labels(ax.get_legend().get_texts(), labels) + else: + self.assertTrue(ax.get_legend() is None) + + def _check_data(self, xp, rs): + """ + Check each axes has identical lines + + Parameters + ---------- + xp : matplotlib Axes object + rs : matplotlib Axes object + """ + xp_lines = xp.get_lines() + rs_lines = rs.get_lines() + + def check_line(xpl, rsl): + xpdata = xpl.get_xydata() + rsdata = rsl.get_xydata() + tm.assert_almost_equal(xpdata, rsdata) + + self.assertEqual(len(xp_lines), len(rs_lines)) + [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)] + tm.close() + + def _check_visible(self, collections, visible=True): + """ + Check each artist is visible or not + + Parameters + ---------- + collections : matplotlib Artist or its list-like + target Artist or its list or collection + visible : bool + expected visibility + """ + from matplotlib.collections import Collection + if not isinstance(collections, + Collection) and not com.is_list_like(collections): + collections = [collections] + + for patch in collections: + self.assertEqual(patch.get_visible(), visible) + + def _get_colors_mapped(self, series, colors): + unique = series.unique() + # unique and colors length can be differed + # depending on slice value + mapped = dict(zip(unique, colors)) + return [mapped[v] for v in series.values] + + def _check_colors(self, collections, linecolors=None, facecolors=None, + mapping=None): + """ + Check each artist has expected line colors and face colors + + Parameters + ---------- + collections : list-like + list or collection of target artist + linecolors : list-like which has the same length as collections + list of expected line colors + facecolors : list-like which has the same length as collections + list of expected face colors + mapping : Series + Series used for color grouping key + used for andrew_curves, parallel_coordinates, radviz test + """ + + from matplotlib.lines import Line2D + from matplotlib.collections 
import Collection, PolyCollection + conv = self.colorconverter + if linecolors is not None: + + if mapping is not None: + linecolors = self._get_colors_mapped(mapping, linecolors) + linecolors = linecolors[:len(collections)] + + self.assertEqual(len(collections), len(linecolors)) + for patch, color in zip(collections, linecolors): + if isinstance(patch, Line2D): + result = patch.get_color() + # Line2D may contains string color expression + result = conv.to_rgba(result) + elif isinstance(patch, PolyCollection): + result = tuple(patch.get_edgecolor()[0]) + else: + result = patch.get_edgecolor() + + expected = conv.to_rgba(color) + self.assertEqual(result, expected) + + if facecolors is not None: + + if mapping is not None: + facecolors = self._get_colors_mapped(mapping, facecolors) + facecolors = facecolors[:len(collections)] + + self.assertEqual(len(collections), len(facecolors)) + for patch, color in zip(collections, facecolors): + if isinstance(patch, Collection): + # returned as list of np.array + result = patch.get_facecolor()[0] + else: + result = patch.get_facecolor() + + if isinstance(result, np.ndarray): + result = tuple(result) + + expected = conv.to_rgba(color) + self.assertEqual(result, expected) + + def _check_text_labels(self, texts, expected): + """ + Check each text has expected labels + + Parameters + ---------- + texts : matplotlib Text object, or its list-like + target text, or its list + expected : str or list-like which has the same length as texts + expected text label, or its list + """ + if not com.is_list_like(texts): + self.assertEqual(texts.get_text(), expected) + else: + labels = [t.get_text() for t in texts] + self.assertEqual(len(labels), len(expected)) + for l, e in zip(labels, expected): + self.assertEqual(l, e) + + def _check_ticks_props(self, axes, xlabelsize=None, xrot=None, + ylabelsize=None, yrot=None): + """ + Check each axes has expected tick properties + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xlabelsize : number + expected xticks font size + xrot : number + expected xticks rotation + ylabelsize : number + expected yticks font size + yrot : number + expected yticks rotation + """ + from matplotlib.ticker import NullFormatter + axes = self._flatten_visible(axes) + for ax in axes: + if xlabelsize or xrot: + if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter): + # If minor ticks has NullFormatter, rot / fontsize are not + # retained + labels = ax.get_xticklabels() + else: + labels = ax.get_xticklabels() + ax.get_xticklabels( + minor=True) + + for label in labels: + if xlabelsize is not None: + self.assertAlmostEqual(label.get_fontsize(), + xlabelsize) + if xrot is not None: + self.assertAlmostEqual(label.get_rotation(), xrot) + + if ylabelsize or yrot: + if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter): + labels = ax.get_yticklabels() + else: + labels = ax.get_yticklabels() + ax.get_yticklabels( + minor=True) + + for label in labels: + if ylabelsize is not None: + self.assertAlmostEqual(label.get_fontsize(), + ylabelsize) + if yrot is not None: + self.assertAlmostEqual(label.get_rotation(), yrot) + + def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'): + """ + Check each axes has expected scales + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xaxis : {'linear', 'log'} + expected xaxis scale + yaxis : {'linear', 'log'} + expected yaxis scale + """ + axes = self._flatten_visible(axes) + for ax in axes: + self.assertEqual(ax.xaxis.get_scale(), xaxis) + 
self.assertEqual(ax.yaxis.get_scale(), yaxis) + + def _check_axes_shape(self, axes, axes_num=None, layout=None, + figsize=(8.0, 6.0)): + """ + Check expected number of axes is drawn in expected layout + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + axes_num : number + expected number of axes. Unnecessary axes should be set to + invisible. + layout : tuple + expected layout, (expected number of rows , columns) + figsize : tuple + expected figsize. default is matplotlib default + """ + visible_axes = self._flatten_visible(axes) + + if axes_num is not None: + self.assertEqual(len(visible_axes), axes_num) + for ax in visible_axes: + # check something drawn on visible axes + self.assertTrue(len(ax.get_children()) > 0) + + if layout is not None: + result = self._get_axes_layout(plotting._flatten(axes)) + self.assertEqual(result, layout) + + self.assert_numpy_array_equal( + np.round(visible_axes[0].figure.get_size_inches()), + np.array(figsize, dtype=np.float64)) + + def _get_axes_layout(self, axes): + x_set = set() + y_set = set() + for ax in axes: + # check axes coordinates to estimate layout + points = ax.get_position().get_points() + x_set.add(points[0][0]) + y_set.add(points[0][1]) + return (len(y_set), len(x_set)) + + def _flatten_visible(self, axes): + """ + Flatten axes, and filter only visible + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + + """ + axes = plotting._flatten(axes) + axes = [ax for ax in axes if ax.get_visible()] + return axes + + def _check_has_errorbars(self, axes, xerr=0, yerr=0): + """ + Check axes has expected number of errorbars + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xerr : number + expected number of x errorbar + yerr : number + expected number of y errorbar + """ + axes = self._flatten_visible(axes) + for ax in axes: + containers = ax.containers + xerr_count = 0 + yerr_count = 0 + for c in containers: + has_xerr = getattr(c, 'has_xerr', False) + has_yerr = getattr(c, 'has_yerr', False) + if has_xerr: + xerr_count += 1 + if has_yerr: + yerr_count += 1 + self.assertEqual(xerr, xerr_count) + self.assertEqual(yerr, yerr_count) + + def _check_box_return_type(self, returned, return_type, expected_keys=None, + check_ax_title=True): + """ + Check box returned type is correct + + Parameters + ---------- + returned : object to be tested, returned from boxplot + return_type : str + return_type passed to boxplot + expected_keys : list-like, optional + group labels in subplot case. If not passed, + the function checks assuming boxplot uses single ax + check_ax_title : bool + Whether to check the ax.title is the same as expected_key + Intended to be checked by calling from ``boxplot``. + Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. 
+ """ + from matplotlib.axes import Axes + types = {'dict': dict, 'axes': Axes, 'both': tuple} + if expected_keys is None: + # should be fixed when the returning default is changed + if return_type is None: + return_type = 'dict' + + self.assertTrue(isinstance(returned, types[return_type])) + if return_type == 'both': + self.assertIsInstance(returned.ax, Axes) + self.assertIsInstance(returned.lines, dict) + else: + # should be fixed when the returning default is changed + if return_type is None: + for r in self._flatten_visible(returned): + self.assertIsInstance(r, Axes) + return + + self.assertTrue(isinstance(returned, OrderedDict)) + self.assertEqual(sorted(returned.keys()), sorted(expected_keys)) + for key, value in iteritems(returned): + self.assertTrue(isinstance(value, types[return_type])) + # check returned dict has correct mapping + if return_type == 'axes': + if check_ax_title: + self.assertEqual(value.get_title(), key) + elif return_type == 'both': + if check_ax_title: + self.assertEqual(value.ax.get_title(), key) + self.assertIsInstance(value.ax, Axes) + self.assertIsInstance(value.lines, dict) + elif return_type == 'dict': + line = value['medians'][0] + axes = line.axes if self.mpl_ge_1_5_0 else line.get_axes() + if check_ax_title: + self.assertEqual(axes.get_title(), key) + else: + raise AssertionError + + def _check_grid_settings(self, obj, kinds, kws={}): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + + import matplotlib as mpl + + def is_grid_on(): + xoff = all(not g.gridOn + for g in self.plt.gca().xaxis.get_major_ticks()) + yoff = all(not g.gridOn + for g in self.plt.gca().yaxis.get_major_ticks()) + return not (xoff and yoff) + + spndx = 1 + for kind in kinds: + if not _ok_for_gaussian_kde(kind): + continue + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=False) + obj.plot(kind=kind, **kws) + self.assertFalse(is_grid_on()) + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=True) + obj.plot(kind=kind, grid=False, **kws) + self.assertFalse(is_grid_on()) + + if kind != 'pie': + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=True) + obj.plot(kind=kind, **kws) + self.assertTrue(is_grid_on()) + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=False) + obj.plot(kind=kind, grid=True, **kws) + self.assertTrue(is_grid_on()) + + def _maybe_unpack_cycler(self, rcParams, field='color'): + """ + Compat layer for MPL 1.5 change to color cycle + + Before: plt.rcParams['axes.color_cycle'] -> ['b', 'g', 'r'...] + After : plt.rcParams['axes.prop_cycle'] -> cycler(...) 
+ """ + if self.mpl_ge_1_5_0: + cyl = rcParams['axes.prop_cycle'] + colors = [v[field] for v in cyl] + else: + colors = rcParams['axes.color_cycle'] + return colors + + +def _check_plot_works(f, filterwarnings='always', **kwargs): + import matplotlib.pyplot as plt + ret = None + with warnings.catch_warnings(): + warnings.simplefilter(filterwarnings) + try: + try: + fig = kwargs['figure'] + except KeyError: + fig = plt.gcf() + + plt.clf() + + ax = kwargs.get('ax', fig.add_subplot(211)) # noqa + ret = f(**kwargs) + + assert_is_valid_plot_return_object(ret) + + try: + kwargs['ax'] = fig.add_subplot(212) + ret = f(**kwargs) + except Exception: + pass + else: + assert_is_valid_plot_return_object(ret) + + with ensure_clean(return_filelike=True) as path: + plt.savefig(path) + finally: + tm.close(fig) + + return ret + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py new file mode 100644 index 0000000000000..d499540827ab0 --- /dev/null +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose +import itertools +import string +from distutils.version import LooseVersion + +from pandas import Series, DataFrame, MultiIndex +from pandas.compat import range, lzip +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy import random +from numpy.random import randn + +import pandas.tools.plotting as plotting + +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works) + + +""" Test cases for .boxplot method """ + + +def _skip_if_mpl_14_or_dev_boxplot(): + # GH 8382 + # Boxplot failures on 1.4 and 1.4.1 + # Don't need try / except since that's done at class level + import matplotlib + if str(matplotlib.__version__) >= LooseVersion('1.4'): + raise nose.SkipTest("Matplotlib Regression in 1.4 and current dev.") + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + + @slow + def test_boxplot_legacy(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + df['indic'] = ['foo', 'bar'] * 3 + df['indic2'] = ['foo', 'bar', 'foo'] * 2 + + _check_plot_works(df.boxplot, return_type='dict') + _check_plot_works(df.boxplot, column=[ + 'one', 'two'], return_type='dict') + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, column=['one', 'two'], + by='indic') + _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='indic') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by=['indic', 'indic2']) + _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict') + _check_plot_works(df.boxplot, notch=1, return_type='dict') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='indic', notch=1) + + df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) + df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df['Y'] = Series(['A'] * 10) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='X') + + # When ax is supplied and required number of axes is 1, + # passed ax should be used: + fig, ax = self.plt.subplots() + axes = df.boxplot('Col1', by='X', ax=ax) + ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() + self.assertIs(ax_axes, axes) + + fig, ax = self.plt.subplots() + axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') + ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() + self.assertIs(ax_axes, axes['A']) + + # Multiple columns with an ax argument should use same figure + fig, ax = self.plt.subplots() + with tm.assert_produces_warning(UserWarning): + axes = df.boxplot(column=['Col1', 'Col2'], + by='X', ax=ax, return_type='axes') + self.assertIs(axes['Col1'].get_figure(), fig) + + # When by is None, check that all relevant lines are present in the + # dict + fig, ax = self.plt.subplots() + d = df.boxplot(ax=ax, return_type='dict') + lines = list(itertools.chain.from_iterable(d.values())) + self.assertEqual(len(ax.get_lines()), len(lines)) + + @slow + def test_boxplot_return_type_legacy(self): + # API change in https://github.com/pydata/pandas/pull/7096 + import matplotlib as mpl # noqa + + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + with tm.assertRaises(ValueError): + df.boxplot(return_type='NOTATYPE') + + with tm.assert_produces_warning(FutureWarning): + result = df.boxplot() + # change to Axes in future + self._check_box_return_type(result, 'dict') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='dict') + self._check_box_return_type(result, 'dict') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='axes') + self._check_box_return_type(result, 'axes') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='both') + self._check_box_return_type(result, 'both') + + @slow + def test_boxplot_axis_limits(self): + + def _check_ax_limits(col, ax): + y_min, y_max = ax.get_ylim() + self.assertTrue(y_min <= col.min()) + self.assertTrue(y_max >= col.max()) + + df = self.hist_df.copy() + df['age'] = np.random.randint(1, 20, df.shape[0]) + # One full row + height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') + _check_ax_limits(df['height'], height_ax) + _check_ax_limits(df['weight'], weight_ax) + self.assertEqual(weight_ax._sharey, height_ax) + + # Two rows, one partial + p = df.boxplot(['height', 'weight', 'age'], by='category') + height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] + dummy_ax = p[1, 1] + _check_ax_limits(df['height'], height_ax) + _check_ax_limits(df['weight'], weight_ax) + 
_check_ax_limits(df['age'], age_ax) + self.assertEqual(weight_ax._sharey, height_ax) + self.assertEqual(age_ax._sharey, height_ax) + self.assertIsNone(dummy_ax._sharey) + + @slow + def test_boxplot_empty_column(self): + _skip_if_mpl_14_or_dev_boxplot() + df = DataFrame(np.random.randn(20, 4)) + df.loc[:, 0] = np.nan + _check_plot_works(df.boxplot, return_type='axes') + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + @slow + def test_boxplot_legacy(self): + grouped = self.hist_df.groupby(by='gender') + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + tuples = lzip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), + index=MultiIndex.from_tuples(tuples)) + + grouped = df.groupby(level=1) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + grouped = df.unstack(level=1).groupby(level=0, axis=1) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @slow + def test_grouped_plot_fignums(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = np.random.choice(['male', 'female'], size=n) + df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) + gb = df.groupby('gender') + + res = gb.plot() + self.assertEqual(len(self.plt.get_fignums()), 2) + self.assertEqual(len(res), 2) + tm.close() + + res = gb.boxplot(return_type='axes') + self.assertEqual(len(self.plt.get_fignums()), 1) + self.assertEqual(len(res), 2) + tm.close() + + # now works with GH 5610 as gender is excluded + res = df.groupby('gender').hist() + tm.close() + + @slow + def test_grouped_box_return_type(self): + df = self.hist_df + + # old style: return_type=None + result = df.boxplot(by='gender') + self.assertIsInstance(result, np.ndarray) + self._check_box_return_type( + result, None, + expected_keys=['height', 'weight', 'category']) + + # now for groupby + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df.groupby('gender').boxplot() + self._check_box_return_type( + result, 'dict', expected_keys=['Male', 'Female']) + + columns2 = 'X B C D A G Y N Q O'.split() + df2 = DataFrame(random.randn(50, 10), columns=columns2) + categories2 = 'A B C D E F G H I J'.split() + df2['category'] = categories2 * 5 + + for t in ['dict', 'axes', 'both']: + returned = df.groupby('classroom').boxplot(return_type=t) + self._check_box_return_type( + returned, t, expected_keys=['A', 'B', 'C']) + + returned = df.boxplot(by='classroom', return_type=t) + self._check_box_return_type( + returned, t, + expected_keys=['height', 'weight', 'category']) + + returned = df2.groupby('category').boxplot(return_type=t) + self._check_box_return_type(returned, t, 
expected_keys=categories2) + + returned = df2.boxplot(by='category', return_type=t) + self._check_box_return_type(returned, t, expected_keys=columns2) + + @slow + def test_grouped_box_layout(self): + df = self.hist_df + + self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], + by=df.gender, layout=(1, 1)) + self.assertRaises(ValueError, df.boxplot, + column=['height', 'weight', 'category'], + layout=(2, 1), return_type='dict') + self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], + by=df.gender, layout=(-1, -1)) + + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('gender').boxplot, + column='height', return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) + + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) + + # GH 6769 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('classroom').boxplot, + column='height', return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + # GH 5897 + axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', + return_type='axes') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + for ax in [axes['height']]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible([ax.xaxis.get_label()], visible=False) + for ax in [axes['weight'], axes['category']]: + self._check_visible(ax.get_xticklabels()) + self._check_visible([ax.xaxis.get_label()]) + + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + layout=(3, 2), return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + layout=(3, -1), return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', + layout=(4, 1)) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) + + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', + layout=(-1, 1)) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) + + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], layout=(1, 4), + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) + + box = df.groupby('classroom').boxplot( # noqa + column=['height', 'weight', 'category'], layout=(1, -1), + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) + + @slow + def test_grouped_box_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + # check warning to ignore sharex / sharey + # this check should be done in the first function which + # passes multiple axes to plot, hist or boxplot + # location should be changed if other test is added + # which has earlier alphabetical order + with tm.assert_produces_warning(UserWarning): + fig, axes = 
self.plt.subplots(2, 2) + df.groupby('category').boxplot( + column='height', return_type='axes', ax=axes) + self._check_axes_shape(self.plt.gcf().axes, + axes_num=4, layout=(2, 2)) + + fig, axes = self.plt.subplots(2, 3) + with tm.assert_produces_warning(UserWarning): + returned = df.boxplot(column=['height', 'weight', 'category'], + by='gender', return_type='axes', ax=axes[0]) + returned = np.array(list(returned.values())) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[0]) + self.assertIs(returned[0].figure, fig) + + # draw on second row + with tm.assert_produces_warning(UserWarning): + returned = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], + return_type='axes', ax=axes[1]) + returned = np.array(list(returned.values())) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[1]) + self.assertIs(returned[0].figure, fig) + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + with tm.assert_produces_warning(UserWarning): + axes = df.groupby('classroom').boxplot(ax=axes) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tests/plotting/test_datetimelike.py similarity index 99% rename from pandas/tseries/tests/test_plotting.py rename to pandas/tests/plotting/test_datetimelike.py index 2255f9fae73de..3f09317915254 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -14,12 +14,18 @@ from pandas.util.testing import assert_series_equal, ensure_clean, slow import pandas.util.testing as tm -from pandas.tests.test_graphics import _skip_if_no_scipy_gaussian_kde +from pandas.tests.plotting.common import (TestPlotBase, + _skip_if_no_scipy_gaussian_kde) + + +""" Test cases for time series specific (freq conversion, etc) """ @tm.mplskip -class TestTSPlot(tm.TestCase): +class TestTSPlot(TestPlotBase): + def setUp(self): + TestPlotBase.setUp(self) freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'A'] idx = [period_range('12/31/1999', freq=x, periods=100) for x in freq] self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] diff --git a/pandas/tests/test_graphics.py b/pandas/tests/plotting/test_frame.py similarity index 68% rename from pandas/tests/test_graphics.py rename to pandas/tests/plotting/test_frame.py index 5493eb37c358b..311da4a92e45a 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/plotting/test_frame.py @@ -2,1293 +2,31 @@ # coding: utf-8 import nose -import itertools -import os import string import warnings from datetime import datetime, date -from pandas.types.common import is_list_like import pandas as pd from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range, bdate_range) -from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, - iteritems, OrderedDict, PY3) -from pandas.util.decorators import cache_readonly +from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, PY3) from pandas.formats.printing import pprint_thing import pandas.util.testing as tm -from pandas.util.testing import (ensure_clean, - assert_is_valid_plot_return_object, slow) +from pandas.util.testing import slow from pandas.core.config import set_option import numpy as np -from numpy import random from numpy.random import rand, randn import pandas.tools.plotting as 
plotting -""" -These tests are for ``Dataframe.plot`` and ``Series.plot``. -Other plot methods such as ``.hist``, ``.boxplot`` and other miscellaneous -are tested in test_graphics_others.py -""" +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, + _skip_if_no_scipy_gaussian_kde, + _ok_for_gaussian_kde) -def _skip_if_no_scipy_gaussian_kde(): - try: - from scipy.stats import gaussian_kde # noqa - except ImportError: - raise nose.SkipTest("scipy version doesn't support gaussian_kde") - - -def _ok_for_gaussian_kde(kind): - if kind in ['kde', 'density']: - try: - from scipy.stats import gaussian_kde # noqa - except ImportError: - return False - return True - - -@tm.mplskip -class TestPlotBase(tm.TestCase): - - def setUp(self): - - import matplotlib as mpl - mpl.rcdefaults() - - n = 100 - with tm.RNGContext(42): - gender = np.random.choice(['Male', 'Female'], size=n) - classroom = np.random.choice(['A', 'B', 'C'], size=n) - - self.hist_df = DataFrame({'gender': gender, - 'classroom': classroom, - 'height': random.normal(66, 4, size=n), - 'weight': random.normal(161, 32, size=n), - 'category': random.randint(4, size=n)}) - - self.mpl_le_1_2_1 = plotting._mpl_le_1_2_1() - self.mpl_ge_1_3_1 = plotting._mpl_ge_1_3_1() - self.mpl_ge_1_4_0 = plotting._mpl_ge_1_4_0() - self.mpl_ge_1_5_0 = plotting._mpl_ge_1_5_0() - - if self.mpl_ge_1_4_0: - self.bp_n_objects = 7 - else: - self.bp_n_objects = 8 - if self.mpl_ge_1_5_0: - # 1.5 added PolyCollections to legend handler - # so we have twice as many items. - self.polycollection_factor = 2 - else: - self.polycollection_factor = 1 - - def tearDown(self): - tm.close() - - @cache_readonly - def plt(self): - import matplotlib.pyplot as plt - return plt - - @cache_readonly - def colorconverter(self): - import matplotlib.colors as colors - return colors.colorConverter - - def _check_legend_labels(self, axes, labels=None, visible=True): - """ - Check each axes has expected legend labels - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - labels : list-like - expected legend labels - visible : bool - expected legend visibility. 
labels are checked only when visible is - True - """ - - if visible and (labels is None): - raise ValueError('labels must be specified when visible is True') - axes = self._flatten_visible(axes) - for ax in axes: - if visible: - self.assertTrue(ax.get_legend() is not None) - self._check_text_labels(ax.get_legend().get_texts(), labels) - else: - self.assertTrue(ax.get_legend() is None) - - def _check_data(self, xp, rs): - """ - Check each axes has identical lines - - Parameters - ---------- - xp : matplotlib Axes object - rs : matplotlib Axes object - """ - xp_lines = xp.get_lines() - rs_lines = rs.get_lines() - - def check_line(xpl, rsl): - xpdata = xpl.get_xydata() - rsdata = rsl.get_xydata() - tm.assert_almost_equal(xpdata, rsdata) - - self.assertEqual(len(xp_lines), len(rs_lines)) - [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)] - tm.close() - - def _check_visible(self, collections, visible=True): - """ - Check each artist is visible or not - - Parameters - ---------- - collections : matplotlib Artist or its list-like - target Artist or its list or collection - visible : bool - expected visibility - """ - from matplotlib.collections import Collection - if not isinstance(collections, - Collection) and not is_list_like(collections): - collections = [collections] - - for patch in collections: - self.assertEqual(patch.get_visible(), visible) - - def _get_colors_mapped(self, series, colors): - unique = series.unique() - # unique and colors length can be differed - # depending on slice value - mapped = dict(zip(unique, colors)) - return [mapped[v] for v in series.values] - - def _check_colors(self, collections, linecolors=None, facecolors=None, - mapping=None): - """ - Check each artist has expected line colors and face colors - - Parameters - ---------- - collections : list-like - list or collection of target artist - linecolors : list-like which has the same length as collections - list of expected line colors - facecolors : list-like which has the same length as collections - list of expected face colors - mapping : Series - Series used for color grouping key - used for andrew_curves, parallel_coordinates, radviz test - """ - - from matplotlib.lines import Line2D - from matplotlib.collections import Collection, PolyCollection - conv = self.colorconverter - if linecolors is not None: - - if mapping is not None: - linecolors = self._get_colors_mapped(mapping, linecolors) - linecolors = linecolors[:len(collections)] - - self.assertEqual(len(collections), len(linecolors)) - for patch, color in zip(collections, linecolors): - if isinstance(patch, Line2D): - result = patch.get_color() - # Line2D may contains string color expression - result = conv.to_rgba(result) - elif isinstance(patch, PolyCollection): - result = tuple(patch.get_edgecolor()[0]) - else: - result = patch.get_edgecolor() - - expected = conv.to_rgba(color) - self.assertEqual(result, expected) - - if facecolors is not None: - - if mapping is not None: - facecolors = self._get_colors_mapped(mapping, facecolors) - facecolors = facecolors[:len(collections)] - - self.assertEqual(len(collections), len(facecolors)) - for patch, color in zip(collections, facecolors): - if isinstance(patch, Collection): - # returned as list of np.array - result = patch.get_facecolor()[0] - else: - result = patch.get_facecolor() - - if isinstance(result, np.ndarray): - result = tuple(result) - - expected = conv.to_rgba(color) - self.assertEqual(result, expected) - - def _check_text_labels(self, texts, expected): - """ - Check each text has 
expected labels - - Parameters - ---------- - texts : matplotlib Text object, or its list-like - target text, or its list - expected : str or list-like which has the same length as texts - expected text label, or its list - """ - if not is_list_like(texts): - self.assertEqual(texts.get_text(), expected) - else: - labels = [t.get_text() for t in texts] - self.assertEqual(len(labels), len(expected)) - for l, e in zip(labels, expected): - self.assertEqual(l, e) - - def _check_ticks_props(self, axes, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None): - """ - Check each axes has expected tick properties - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - xlabelsize : number - expected xticks font size - xrot : number - expected xticks rotation - ylabelsize : number - expected yticks font size - yrot : number - expected yticks rotation - """ - from matplotlib.ticker import NullFormatter - axes = self._flatten_visible(axes) - for ax in axes: - if xlabelsize or xrot: - if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter): - # If minor ticks has NullFormatter, rot / fontsize are not - # retained - labels = ax.get_xticklabels() - else: - labels = ax.get_xticklabels() + ax.get_xticklabels( - minor=True) - - for label in labels: - if xlabelsize is not None: - self.assertAlmostEqual(label.get_fontsize(), - xlabelsize) - if xrot is not None: - self.assertAlmostEqual(label.get_rotation(), xrot) - - if ylabelsize or yrot: - if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter): - labels = ax.get_yticklabels() - else: - labels = ax.get_yticklabels() + ax.get_yticklabels( - minor=True) - - for label in labels: - if ylabelsize is not None: - self.assertAlmostEqual(label.get_fontsize(), - ylabelsize) - if yrot is not None: - self.assertAlmostEqual(label.get_rotation(), yrot) - - def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'): - """ - Check each axes has expected scales - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - xaxis : {'linear', 'log'} - expected xaxis scale - yaxis : {'linear', 'log'} - expected yaxis scale - """ - axes = self._flatten_visible(axes) - for ax in axes: - self.assertEqual(ax.xaxis.get_scale(), xaxis) - self.assertEqual(ax.yaxis.get_scale(), yaxis) - - def _check_axes_shape(self, axes, axes_num=None, layout=None, - figsize=(8.0, 6.0)): - """ - Check expected number of axes is drawn in expected layout - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - axes_num : number - expected number of axes. Unnecessary axes should be set to - invisible. - layout : tuple - expected layout, (expected number of rows , columns) - figsize : tuple - expected figsize. 
default is matplotlib default - """ - visible_axes = self._flatten_visible(axes) - - if axes_num is not None: - self.assertEqual(len(visible_axes), axes_num) - for ax in visible_axes: - # check something drawn on visible axes - self.assertTrue(len(ax.get_children()) > 0) - - if layout is not None: - result = self._get_axes_layout(plotting._flatten(axes)) - self.assertEqual(result, layout) - - self.assert_numpy_array_equal( - np.round(visible_axes[0].figure.get_size_inches()), - np.array(figsize, dtype=np.float64)) - - def _get_axes_layout(self, axes): - x_set = set() - y_set = set() - for ax in axes: - # check axes coordinates to estimate layout - points = ax.get_position().get_points() - x_set.add(points[0][0]) - y_set.add(points[0][1]) - return (len(y_set), len(x_set)) - - def _flatten_visible(self, axes): - """ - Flatten axes, and filter only visible - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - - """ - axes = plotting._flatten(axes) - axes = [ax for ax in axes if ax.get_visible()] - return axes - - def _check_has_errorbars(self, axes, xerr=0, yerr=0): - """ - Check axes has expected number of errorbars - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - xerr : number - expected number of x errorbar - yerr : number - expected number of y errorbar - """ - axes = self._flatten_visible(axes) - for ax in axes: - containers = ax.containers - xerr_count = 0 - yerr_count = 0 - for c in containers: - has_xerr = getattr(c, 'has_xerr', False) - has_yerr = getattr(c, 'has_yerr', False) - if has_xerr: - xerr_count += 1 - if has_yerr: - yerr_count += 1 - self.assertEqual(xerr, xerr_count) - self.assertEqual(yerr, yerr_count) - - def _check_box_return_type(self, returned, return_type, expected_keys=None, - check_ax_title=True): - """ - Check box returned type is correct - - Parameters - ---------- - returned : object to be tested, returned from boxplot - return_type : str - return_type passed to boxplot - expected_keys : list-like, optional - group labels in subplot case. If not passed, - the function checks assuming boxplot uses single ax - check_ax_title : bool - Whether to check the ax.title is the same as expected_key - Intended to be checked by calling from ``boxplot``. - Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. 
- """ - from matplotlib.axes import Axes - types = {'dict': dict, 'axes': Axes, 'both': tuple} - if expected_keys is None: - # should be fixed when the returning default is changed - if return_type is None: - return_type = 'dict' - - self.assertTrue(isinstance(returned, types[return_type])) - if return_type == 'both': - self.assertIsInstance(returned.ax, Axes) - self.assertIsInstance(returned.lines, dict) - else: - # should be fixed when the returning default is changed - if return_type is None: - for r in self._flatten_visible(returned): - self.assertIsInstance(r, Axes) - return - - self.assertTrue(isinstance(returned, OrderedDict)) - self.assertEqual(sorted(returned.keys()), sorted(expected_keys)) - for key, value in iteritems(returned): - self.assertTrue(isinstance(value, types[return_type])) - # check returned dict has correct mapping - if return_type == 'axes': - if check_ax_title: - self.assertEqual(value.get_title(), key) - elif return_type == 'both': - if check_ax_title: - self.assertEqual(value.ax.get_title(), key) - self.assertIsInstance(value.ax, Axes) - self.assertIsInstance(value.lines, dict) - elif return_type == 'dict': - line = value['medians'][0] - axes = line.axes if self.mpl_ge_1_5_0 else line.get_axes() - if check_ax_title: - self.assertEqual(axes.get_title(), key) - else: - raise AssertionError - - def _check_grid_settings(self, obj, kinds, kws={}): - # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - - import matplotlib as mpl - - def is_grid_on(): - xoff = all(not g.gridOn - for g in self.plt.gca().xaxis.get_major_ticks()) - yoff = all(not g.gridOn - for g in self.plt.gca().yaxis.get_major_ticks()) - return not (xoff and yoff) - - spndx = 1 - for kind in kinds: - if not _ok_for_gaussian_kde(kind): - continue - - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=False) - obj.plot(kind=kind, **kws) - self.assertFalse(is_grid_on()) - - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=True) - obj.plot(kind=kind, grid=False, **kws) - self.assertFalse(is_grid_on()) - - if kind != 'pie': - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=True) - obj.plot(kind=kind, **kws) - self.assertTrue(is_grid_on()) - - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=False) - obj.plot(kind=kind, grid=True, **kws) - self.assertTrue(is_grid_on()) - - def _maybe_unpack_cycler(self, rcParams, field='color'): - """ - Compat layer for MPL 1.5 change to color cycle - - Before: plt.rcParams['axes.color_cycle'] -> ['b', 'g', 'r'...] - After : plt.rcParams['axes.prop_cycle'] -> cycler(...) 
- """ - if self.mpl_ge_1_5_0: - cyl = rcParams['axes.prop_cycle'] - colors = [v[field] for v in cyl] - else: - colors = rcParams['axes.color_cycle'] - return colors - - -@tm.mplskip -class TestSeriesPlots(TestPlotBase): - - def setUp(self): - TestPlotBase.setUp(self) - import matplotlib as mpl - mpl.rcdefaults() - - self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' - - self.series = tm.makeStringSeries() - self.series.name = 'series' - - self.iseries = tm.makePeriodSeries() - self.iseries.name = 'iseries' - - @slow - def test_plot(self): - _check_plot_works(self.ts.plot, label='foo') - _check_plot_works(self.ts.plot, use_index=False) - axes = _check_plot_works(self.ts.plot, rot=0) - self._check_ticks_props(axes, xrot=0) - - ax = _check_plot_works(self.ts.plot, style='.', logy=True) - self._check_ax_scales(ax, yaxis='log') - - ax = _check_plot_works(self.ts.plot, style='.', logx=True) - self._check_ax_scales(ax, xaxis='log') - - ax = _check_plot_works(self.ts.plot, style='.', loglog=True) - self._check_ax_scales(ax, xaxis='log', yaxis='log') - - _check_plot_works(self.ts[:10].plot.bar) - _check_plot_works(self.ts.plot.area, stacked=False) - _check_plot_works(self.iseries.plot) - - for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: - if not _ok_for_gaussian_kde(kind): - continue - _check_plot_works(self.series[:5].plot, kind=kind) - - _check_plot_works(self.series[:10].plot.barh) - ax = _check_plot_works(Series(randn(10)).plot.bar, color='black') - self._check_colors([ax.patches[0]], facecolors=['black']) - - # GH 6951 - ax = _check_plot_works(self.ts.plot, subplots=True) - self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - - ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1)) - self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) - self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - - @slow - def test_plot_figsize_and_title(self): - # figsize and title - ax = self.series.plot(title='Test', figsize=(16, 8)) - self._check_text_labels(ax.title, 'Test') - self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) - - def test_dont_modify_rcParams(self): - # GH 8242 - if self.mpl_ge_1_5_0: - key = 'axes.prop_cycle' - else: - key = 'axes.color_cycle' - colors = self.plt.rcParams[key] - Series([1, 2, 3]).plot() - self.assertEqual(colors, self.plt.rcParams[key]) - - def test_ts_line_lim(self): - ax = self.ts.plot() - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) - self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) - tm.close() - - ax = self.ts.plot(secondary_y=True) - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) - self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) - - def test_ts_area_lim(self): - ax = self.ts.plot.area(stacked=False) - xmin, xmax = ax.get_xlim() - line = ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - tm.close() - - # GH 7471 - ax = self.ts.plot.area(stacked=False, x_compat=True) - xmin, xmax = ax.get_xlim() - line = ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - tm.close() - - tz_ts = self.ts.copy() - tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') - ax = tz_ts.plot.area(stacked=False, x_compat=True) - xmin, xmax = ax.get_xlim() - line = 
ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - tm.close() - - ax = tz_ts.plot.area(stacked=False, secondary_y=True) - xmin, xmax = ax.get_xlim() - line = ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - - def test_label(self): - s = Series([1, 2]) - ax = s.plot(label='LABEL', legend=True) - self._check_legend_labels(ax, labels=['LABEL']) - self.plt.close() - ax = s.plot(legend=True) - self._check_legend_labels(ax, labels=['None']) - self.plt.close() - # get name from index - s.name = 'NAME' - ax = s.plot(legend=True) - self._check_legend_labels(ax, labels=['NAME']) - self.plt.close() - # override the default - ax = s.plot(legend=True, label='LABEL') - self._check_legend_labels(ax, labels=['LABEL']) - self.plt.close() - # Add lebel info, but don't draw - ax = s.plot(legend=False, label='LABEL') - self.assertEqual(ax.get_legend(), None) # Hasn't been drawn - ax.legend() # draw it - self._check_legend_labels(ax, labels=['LABEL']) - - def test_line_area_nan_series(self): - values = [1, 2, np.nan, 3] - s = Series(values) - ts = Series(values, index=tm.makeDateIndex(k=4)) - - for d in [s, ts]: - ax = _check_plot_works(d.plot) - masked = ax.lines[0].get_ydata() - # remove nan for comparison purpose - exp = np.array([1, 2, 3], dtype=np.float64) - self.assert_numpy_array_equal(np.delete(masked.data, 2), exp) - self.assert_numpy_array_equal( - masked.mask, np.array([False, False, True, False])) - - expected = np.array([1, 2, 0, 3], dtype=np.float64) - ax = _check_plot_works(d.plot, stacked=True) - self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) - ax = _check_plot_works(d.plot.area) - self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) - ax = _check_plot_works(d.plot.area, stacked=False) - self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) - - def test_line_use_index_false(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - s.index.name = 'The Index' - ax = s.plot(use_index=False) - label = ax.get_xlabel() - self.assertEqual(label, '') - ax2 = s.plot.bar(use_index=False) - label2 = ax2.get_xlabel() - self.assertEqual(label2, '') - - @slow - def test_bar_log(self): - expected = np.array([1., 10., 100., 1000.]) - - if not self.mpl_le_1_2_1: - expected = np.hstack((.1, expected, 1e4)) - - ax = Series([200, 500]).plot.bar(log=True) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - tm.close() - - ax = Series([200, 500]).plot.barh(log=True) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) - tm.close() - - # GH 9905 - expected = np.array([1.0e-03, 1.0e-02, 1.0e-01, 1.0e+00]) - - if not self.mpl_le_1_2_1: - expected = np.hstack((1.0e-04, expected, 1.0e+01)) - - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') - self.assertEqual(ax.get_ylim(), (0.001, 0.10000000000000001)) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - tm.close() - - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') - self.assertEqual(ax.get_xlim(), (0.001, 0.10000000000000001)) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) - - @slow - def test_bar_ignore_index(self): - df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) - ax = df.plot.bar(use_index=False) - self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) - - def test_rotation(self): - df = DataFrame(randn(5, 5)) - # Default rot 0 - axes = df.plot() - self._check_ticks_props(axes, xrot=0) - - axes = 
df.plot(rot=30) - self._check_ticks_props(axes, xrot=30) - - def test_irregular_datetime(self): - rng = date_range('1/1/2000', '3/1/2000') - rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] - ser = Series(randn(len(rng)), rng) - ax = ser.plot() - xp = datetime(1999, 1, 1).toordinal() - ax.set_xlim('1/1/1999', '1/1/2001') - self.assertEqual(xp, ax.get_xlim()[0]) - - @slow - def test_pie_series(self): - # if sum of values is less than 1.0, pie handle them as rate and draw - # semicircle. - series = Series(np.random.randint(1, 5), - index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') - ax = _check_plot_works(series.plot.pie) - self._check_text_labels(ax.texts, series.index) - self.assertEqual(ax.get_ylabel(), 'YLABEL') - - # without wedge labels - ax = _check_plot_works(series.plot.pie, labels=None) - self._check_text_labels(ax.texts, [''] * 5) - - # with less colors than elements - color_args = ['r', 'g', 'b'] - ax = _check_plot_works(series.plot.pie, colors=color_args) - - color_expected = ['r', 'g', 'b', 'r', 'g'] - self._check_colors(ax.patches, facecolors=color_expected) - - # with labels and colors - labels = ['A', 'B', 'C', 'D', 'E'] - color_args = ['r', 'g', 'b', 'c', 'm'] - ax = _check_plot_works(series.plot.pie, labels=labels, - colors=color_args) - self._check_text_labels(ax.texts, labels) - self._check_colors(ax.patches, facecolors=color_args) - - # with autopct and fontsize - ax = _check_plot_works(series.plot.pie, colors=color_args, - autopct='%.2f', fontsize=7) - pcts = ['{0:.2f}'.format(s * 100) - for s in series.values / float(series.sum())] - iters = [iter(series.index), iter(pcts)] - expected_texts = list(next(it) for it in itertools.cycle(iters)) - self._check_text_labels(ax.texts, expected_texts) - for t in ax.texts: - self.assertEqual(t.get_fontsize(), 7) - - # includes negative value - with tm.assertRaises(ValueError): - series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) - series.plot.pie() - - # includes nan - series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], - name='YLABEL') - ax = _check_plot_works(series.plot.pie) - self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) - - def test_pie_nan(self): - s = Series([1, np.nan, 1, 1]) - ax = s.plot.pie(legend=True) - expected = ['0', '', '2', '3'] - result = [x.get_text() for x in ax.texts] - self.assertEqual(result, expected) - - @slow - def test_hist_df_kwargs(self): - df = DataFrame(np.random.randn(10, 2)) - ax = df.plot.hist(bins=5) - self.assertEqual(len(ax.patches), 10) - - @slow - def test_hist_df_with_nonnumerics(self): - # GH 9853 - with tm.RNGContext(1): - df = DataFrame( - np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df['E'] = ['x', 'y'] * 5 - ax = df.plot.hist(bins=5) - self.assertEqual(len(ax.patches), 20) - - ax = df.plot.hist() # bins=10 - self.assertEqual(len(ax.patches), 40) - - @slow - def test_hist_legacy(self): - _check_plot_works(self.ts.hist) - _check_plot_works(self.ts.hist, grid=False) - _check_plot_works(self.ts.hist, figsize=(8, 10)) - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, - by=self.ts.index.month) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, - by=self.ts.index.month, bins=5) - - fig, ax = self.plt.subplots(1, 1) - _check_plot_works(self.ts.hist, ax=ax) - _check_plot_works(self.ts.hist, ax=ax, figure=fig) - _check_plot_works(self.ts.hist, figure=fig) - tm.close() - - fig, (ax1, ax2) = self.plt.subplots(1, 2) - _check_plot_works(self.ts.hist, figure=fig, ax=ax1) - _check_plot_works(self.ts.hist, figure=fig, ax=ax2) - - with tm.assertRaises(ValueError): - self.ts.hist(by=self.ts.index, figure=fig) - - @slow - def test_hist_bins_legacy(self): - df = DataFrame(np.random.randn(10, 2)) - ax = df.hist(bins=2)[0][0] - self.assertEqual(len(ax.patches), 2) - - @slow - def test_hist_layout(self): - df = self.hist_df - with tm.assertRaises(ValueError): - df.height.hist(layout=(1, 1)) - - with tm.assertRaises(ValueError): - df.height.hist(layout=[1, 1]) - - @slow - def test_hist_layout_with_by(self): - df = self.hist_df - - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.gender, layout=(2, 1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.gender, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(4, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(2, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(-1, 4)) - self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.classroom, layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 2), - figsize=(12, 7)) - - @slow - def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf - x = Series(randn(2)) - y = Series(randn(2)) - subplot(121) - x.hist() - subplot(122) - y.hist() - fig = gcf() - axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() - self.assertEqual(len(axes), 2) - - @slow - def test_hist_secondary_legend(self): - # GH 9610 - df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) - - # primary -> secondary - ax = df['a'].plot.hist(legend=True) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b (right)']) - self.assertTrue(ax.get_yaxis().get_visible()) - self.assertTrue(ax.right_ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> secondary - ax = df['a'].plot.hist(legend=True, secondary_y=True) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) - # both legends are draw 
on left ax - # left axis must be invisible, right axis must be visible - self._check_legend_labels(ax.left_ax, - labels=['a (right)', 'b (right)']) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> primary - ax = df['a'].plot.hist(legend=True, secondary_y=True) - # right axes is returned - df['b'].plot.hist(ax=ax, legend=True) - # both legends are draw on left ax - # left and right axis must be visible - self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b']) - self.assertTrue(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - @slow - def test_df_series_secondary_legend(self): - # GH 9779 - df = DataFrame(np.random.randn(30, 3), columns=list('abc')) - s = Series(np.random.randn(30), name='x') - - # primary -> secondary (without passing ax) - ax = df.plot() - s.plot(legend=True, secondary_y=True) - # both legends are dran on left ax - # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) - self.assertTrue(ax.get_yaxis().get_visible()) - self.assertTrue(ax.right_ax.get_yaxis().get_visible()) - tm.close() - - # primary -> secondary (with passing ax) - ax = df.plot() - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) - self.assertTrue(ax.get_yaxis().get_visible()) - self.assertTrue(ax.right_ax.get_yaxis().get_visible()) - tm.close() - - # seconcary -> secondary (without passing ax) - ax = df.plot(secondary_y=True) - s.plot(legend=True, secondary_y=True) - # both legends are dran on left ax - # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] - self._check_legend_labels(ax.left_ax, labels=expected) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> secondary (with passing ax) - ax = df.plot(secondary_y=True) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] - self._check_legend_labels(ax.left_ax, expected) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> secondary (with passing ax) - ax = df.plot(secondary_y=True, mark_right=False) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left axis must be invisible and right axis must be visible - expected = ['a', 'b', 'c', 'x (right)'] - self._check_legend_labels(ax.left_ax, expected) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - @slow - def test_plot_fails_with_dupe_color_and_style(self): - x = Series(randn(2)) - with tm.assertRaises(ValueError): - x.plot(style='k--', color='k') - - @slow - def test_hist_kde(self): - ax = self.ts.plot.hist(logy=True) - self._check_ax_scales(ax, yaxis='log') - xlabels = ax.get_xticklabels() - # ticks are values, thus ticklabels are blank - self._check_text_labels(xlabels, [''] * len(xlabels)) - ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) - - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - 
_check_plot_works(self.ts.plot.kde) - _check_plot_works(self.ts.plot.density) - ax = self.ts.plot.kde(logy=True) - self._check_ax_scales(ax, yaxis='log') - xlabels = ax.get_xticklabels() - self._check_text_labels(xlabels, [''] * len(xlabels)) - ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) - - @slow - def test_kde_kwargs(self): - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - from numpy import linspace - _check_plot_works(self.ts.plot.kde, bw_method=.5, - ind=linspace(-100, 100, 20)) - _check_plot_works(self.ts.plot.density, bw_method=.5, - ind=linspace(-100, 100, 20)) - ax = self.ts.plot.kde(logy=True, bw_method=.5, - ind=linspace(-100, 100, 20)) - self._check_ax_scales(ax, yaxis='log') - self._check_text_labels(ax.yaxis.get_label(), 'Density') - - @slow - def test_kde_missing_vals(self): - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - s = Series(np.random.uniform(size=50)) - s[0] = np.nan - _check_plot_works(s.plot.kde) - - @slow - def test_hist_kwargs(self): - ax = self.ts.plot.hist(bins=5) - self.assertEqual(len(ax.patches), 5) - self._check_text_labels(ax.yaxis.get_label(), 'Frequency') - tm.close() - - if self.mpl_ge_1_3_1: - ax = self.ts.plot.hist(orientation='horizontal') - self._check_text_labels(ax.xaxis.get_label(), 'Frequency') - tm.close() - - ax = self.ts.plot.hist(align='left', stacked=True) - tm.close() - - @slow - def test_hist_kde_color(self): - ax = self.ts.plot.hist(logy=True, bins=10, color='b') - self._check_ax_scales(ax, yaxis='log') - self.assertEqual(len(ax.patches), 10) - self._check_colors(ax.patches, facecolors=['b'] * 10) - - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - ax = self.ts.plot.kde(logy=True, color='r') - self._check_ax_scales(ax, yaxis='log') - lines = ax.get_lines() - self.assertEqual(len(lines), 1) - self._check_colors(lines, ['r']) - - @slow - def test_boxplot_series(self): - ax = self.ts.plot.box(logy=True) - self._check_ax_scales(ax, yaxis='log') - xlabels = ax.get_xticklabels() - self._check_text_labels(xlabels, [self.ts.name]) - ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) - - @slow - def test_kind_both_ways(self): - s = Series(range(3)) - for kind in plotting._common_kinds + plotting._series_kinds: - if not _ok_for_gaussian_kde(kind): - continue - s.plot(kind=kind) - getattr(s.plot, kind)() - - @slow - def test_invalid_plot_data(self): - s = Series(list('abcd')) - for kind in plotting._common_kinds: - if not _ok_for_gaussian_kde(kind): - continue - with tm.assertRaises(TypeError): - s.plot(kind=kind) - - @slow - def test_valid_object_plot(self): - s = Series(lrange(10), dtype=object) - for kind in plotting._common_kinds: - if not _ok_for_gaussian_kde(kind): - continue - _check_plot_works(s.plot, kind=kind) - - def test_partially_invalid_plot_data(self): - s = Series(['a', 'b', 1.0, 2]) - for kind in plotting._common_kinds: - if not _ok_for_gaussian_kde(kind): - continue - with tm.assertRaises(TypeError): - s.plot(kind=kind) - - def test_invalid_kind(self): - s = Series([1, 2]) - with tm.assertRaises(ValueError): - s.plot(kind='aasdf') - - @slow - def test_dup_datetime_index_plot(self): - dr1 = date_range('1/1/2009', periods=4) - dr2 = date_range('1/2/2009', periods=4) - index = dr1.append(dr2) - values = randn(index.size) - s = Series(values, index=index) - _check_plot_works(s.plot) - - @slow - def test_errorbar_plot(self): - - s = Series(np.arange(10), name='x') - s_err = np.random.randn(10) - d_err = 
DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) - # test line and bar plots - kinds = ['line', 'bar'] - for kind in kinds: - ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, yerr=s_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, yerr=d_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind) - self._check_has_errorbars(ax, xerr=1, yerr=1) - - ax = _check_plot_works(s.plot, xerr=s_err) - self._check_has_errorbars(ax, xerr=1, yerr=0) - - # test time series plotting - ix = date_range('1/1/2000', '1/1/2001', freq='M') - ts = Series(np.arange(12), index=ix, name='x') - ts_err = Series(np.random.randn(12), index=ix) - td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) - - ax = _check_plot_works(ts.plot, yerr=ts_err) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(ts.plot, yerr=td_err) - self._check_has_errorbars(ax, xerr=0, yerr=1) - - # check incorrect lengths and types - with tm.assertRaises(ValueError): - s.plot(yerr=np.arange(11)) - - s_err = ['zzz'] * 10 - # in mpl 1.5+ this is a TypeError - with tm.assertRaises((ValueError, TypeError)): - s.plot(yerr=s_err) - - def test_table(self): - _check_plot_works(self.series.plot, table=True) - _check_plot_works(self.series.plot, table=self.series) - - @slow - def test_series_grid_settings(self): - # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - self._check_grid_settings(Series([1, 2, 3]), - plotting._series_kinds + - plotting._common_kinds) - - @slow - def test_standard_colors(self): - for c in ['r', 'red', 'green', '#FF0000']: - result = plotting._get_standard_colors(1, color=c) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(1, color=[c]) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(3, color=c) - self.assertEqual(result, [c] * 3) - - result = plotting._get_standard_colors(3, color=[c]) - self.assertEqual(result, [c] * 3) - - @slow - def test_standard_colors_all(self): - import matplotlib.colors as colors - - # multiple colors like mediumaquamarine - for c in colors.cnames: - result = plotting._get_standard_colors(num_colors=1, color=c) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=1, color=[c]) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=3, color=c) - self.assertEqual(result, [c] * 3) - - result = plotting._get_standard_colors(num_colors=3, color=[c]) - self.assertEqual(result, [c] * 3) - - # single letter colors like k - for c in colors.ColorConverter.colors: - result = plotting._get_standard_colors(num_colors=1, color=c) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=1, color=[c]) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=3, color=c) - self.assertEqual(result, [c] * 3) - - result = plotting._get_standard_colors(num_colors=3, color=[c]) - self.assertEqual(result, [c] * 3) - - def test_series_plot_color_kwargs(self): - # GH1890 - ax = Series(np.arange(12) + 1).plot(color='green') - self._check_colors(ax.get_lines(), linecolors=['green']) - - def test_time_series_plot_color_kwargs(self): - # #1890 - ax = Series(np.arange(12) + 1, 
index=date_range( - '1/1/2000', periods=12)).plot(color='green') - self._check_colors(ax.get_lines(), linecolors=['green']) - - def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - - if self.mpl_ge_1_5_0: - def_colors = self._maybe_unpack_cycler(mpl.rcParams) - else: - def_colors = mpl.rcParams['axes.color_cycle'] - index = date_range('1/1/2000', periods=12) - s = Series(np.arange(1, 13), index=index) - - ncolors = 3 - - for i in range(ncolors): - ax = s.plot() - self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) - - def test_xticklabels(self): - # GH11529 - s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) - ax = s.plot(xticks=[0, 3, 5, 9]) - exp = ['P%02d' % i for i in [0, 3, 5, 9]] - self._check_text_labels(ax.get_xticklabels(), exp) - - def test_custom_business_day_freq(self): - # GH7222 - from pandas.tseries.offsets import CustomBusinessDay - s = Series(range(100, 121), index=pd.bdate_range( - start='2014-05-01', end='2014-06-01', - freq=CustomBusinessDay(holidays=['2014-05-26']))) - - _check_plot_works(s.plot) +""" Test cases for DataFrame.plot """ @tm.mplskip @@ -1305,10 +43,6 @@ def setUp(self): "C": np.arange(20) + np.random.uniform( size=20)}) - from pandas import read_csv - path = os.path.join(curpath(), 'data', 'iris.csv') - self.iris = read_csv(path) - @slow def test_plot(self): df = self.tdf @@ -3939,103 +2673,6 @@ def test_rcParams_bar_colors(self): for c in barplot.patches]) -@tm.mplskip -class TestDataFrameGroupByPlots(TestPlotBase): - - def test_series_groupby_plotting_nominally_works(self): - n = 10 - weight = Series(np.random.normal(166, 20, size=n)) - height = Series(np.random.normal(60, 10, size=n)) - with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) - - weight.groupby(gender).plot() - tm.close() - height.groupby(gender).hist() - tm.close() - # Regression test for GH8733 - height.groupby(gender).plot(alpha=0.5) - tm.close() - - def test_plotting_with_float_index_works(self): - # GH 7025 - df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'val': np.random.randn(9)}, - index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0]) - - df.groupby('def')['val'].plot() - tm.close() - df.groupby('def')['val'].apply(lambda x: x.plot()) - tm.close() - - def test_hist_single_row(self): - # GH10214 - bins = np.arange(80, 100 + 2, 1) - df = DataFrame({"Name": ["AAA", "BBB"], - "ByCol": [1, 2], - "Mark": [85, 89]}) - df["Mark"].hist(by=df["ByCol"], bins=bins) - df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) - df["Mark"].hist(by=df["ByCol"], bins=bins) - - def test_plot_submethod_works(self): - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) - df.groupby('z').plot.scatter('x', 'y') - tm.close() - df.groupby('z')['x'].plot.line() - tm.close() - - def test_plot_kwargs(self): - - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) - - res = df.groupby('z').plot(kind='scatter', x='x', y='y') - # check that a scatter plot is effectively plotted: the axes should - # contain a PathCollection from the scatter plot (GH11805) - self.assertEqual(len(res['a'].collections), 1) - - res = df.groupby('z').plot.scatter(x='x', y='y') - self.assertEqual(len(res['a'].collections), 1) - - -def _check_plot_works(f, filterwarnings='always', **kwargs): - import matplotlib.pyplot as plt - ret = None - with warnings.catch_warnings(): - warnings.simplefilter(filterwarnings) - try: - try: - fig = kwargs['figure'] - except KeyError: - 
fig = plt.gcf() - - plt.clf() - - ax = kwargs.get('ax', fig.add_subplot(211)) # noqa - ret = f(**kwargs) - - assert_is_valid_plot_return_object(ret) - - try: - kwargs['ax'] = fig.add_subplot(212) - ret = f(**kwargs) - except Exception: - pass - else: - assert_is_valid_plot_return_object(ret) - - with ensure_clean(return_filelike=True) as path: - plt.savefig(path) - finally: - tm.close(fig) - - return ret - - def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt import matplotlib as mpl @@ -4050,11 +2687,6 @@ def _generate_4_axes_via_gridspec(): return gs, [ax_tl, ax_ll, ax_tr, ax_lr] -def curpath(): - pth, _ = os.path.split(os.path.abspath(__file__)) - return pth - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py new file mode 100644 index 0000000000000..101a6556c61bf --- /dev/null +++ b/pandas/tests/plotting/test_groupby.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose + +from pandas import Series, DataFrame +import pandas.util.testing as tm + +import numpy as np + +from pandas.tests.plotting.common import TestPlotBase + + +""" Test cases for GroupBy.plot """ + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + def test_series_groupby_plotting_nominally_works(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = np.random.choice(['male', 'female'], size=n) + + weight.groupby(gender).plot() + tm.close() + height.groupby(gender).hist() + tm.close() + # Regression test for GH8733 + height.groupby(gender).plot(alpha=0.5) + tm.close() + + def test_plotting_with_float_index_works(self): + # GH 7025 + df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'val': np.random.randn(9)}, + index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0]) + + df.groupby('def')['val'].plot() + tm.close() + df.groupby('def')['val'].apply(lambda x: x.plot()) + tm.close() + + def test_hist_single_row(self): + # GH10214 + bins = np.arange(80, 100 + 2, 1) + df = DataFrame({"Name": ["AAA", "BBB"], + "ByCol": [1, 2], + "Mark": [85, 89]}) + df["Mark"].hist(by=df["ByCol"], bins=bins) + df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) + df["Mark"].hist(by=df["ByCol"], bins=bins) + + def test_plot_submethod_works(self): + df = DataFrame({'x': [1, 2, 3, 4, 5], + 'y': [1, 2, 3, 2, 1], + 'z': list('ababa')}) + df.groupby('z').plot.scatter('x', 'y') + tm.close() + df.groupby('z')['x'].plot.line() + tm.close() + + def test_plot_kwargs(self): + + df = DataFrame({'x': [1, 2, 3, 4, 5], + 'y': [1, 2, 3, 2, 1], + 'z': list('ababa')}) + + res = df.groupby('z').plot(kind='scatter', x='x', y='y') + # check that a scatter plot is effectively plotted: the axes should + # contain a PathCollection from the scatter plot (GH11805) + self.assertEqual(len(res['a'].collections), 1) + + res = df.groupby('z').plot.scatter(x='x', y='y') + self.assertEqual(len(res['a'].collections), 1) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py new file mode 100644 index 0000000000000..c7bff5a31fc02 --- /dev/null +++ b/pandas/tests/plotting/test_hist_method.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose + +from pandas import Series, DataFrame 
+import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy.random import randn + +import pandas.tools.plotting as plotting +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works) + + +""" Test cases for .hist method """ + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + @slow + def test_hist_legacy(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) + + fig, ax = self.plt.subplots(1, 1) + _check_plot_works(self.ts.hist, ax=ax) + _check_plot_works(self.ts.hist, ax=ax, figure=fig) + _check_plot_works(self.ts.hist, figure=fig) + tm.close() + + fig, (ax1, ax2) = self.plt.subplots(1, 2) + _check_plot_works(self.ts.hist, figure=fig, ax=ax1) + _check_plot_works(self.ts.hist, figure=fig, ax=ax2) + + with tm.assertRaises(ValueError): + self.ts.hist(by=self.ts.index, figure=fig) + + @slow + def test_hist_bins_legacy(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.hist(bins=2)[0][0] + self.assertEqual(len(ax.patches), 2) + + @slow + def test_hist_layout(self): + df = self.hist_df + with tm.assertRaises(ValueError): + df.height.hist(layout=(1, 1)) + + with tm.assertRaises(ValueError): + df.height.hist(layout=[1, 1]) + + @slow + def test_hist_layout_with_by(self): + df = self.hist_df + + # _check_plot_works adds an `ax` kwarg to the method call + # so we get a warning about an axis being cleared, even + # though we don't explicing pass one, see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, + layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, + layout=(3, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, + layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.classroom, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape( + axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + + @slow + def test_hist_no_overlap(self): + from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) + y = Series(randn(2)) + 
subplot(121) + x.hist() + subplot(122) + y.hist() + fig = gcf() + axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() + self.assertEqual(len(axes), 2) + + @slow + def test_hist_by_no_extra_plots(self): + df = self.hist_df + axes = df.height.hist(by=df.gender) # noqa + self.assertEqual(len(self.plt.get_fignums()), 1) + + @slow + def test_plot_fails_when_ax_differs_from_figure(self): + from pylab import figure + fig1 = figure() + fig2 = figure() + ax1 = fig1.add_subplot(111) + with tm.assertRaises(AssertionError): + self.ts.hist(ax=ax1, figure=fig2) + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + + @slow + def test_hist_df_legacy(self): + from matplotlib.patches import Rectangle + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.hist_df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 3)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, grid=False) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + self.assertFalse(axes[1, 1].get_visible()) + + df = DataFrame(randn(100, 1)) + _check_plot_works(df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 6)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, layout=(4, 2)) + self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) + + # make sure sharex, sharey is handled + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, sharex=True, sharey=True) + + # handle figsize arg + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, figsize=(8, 10)) + + # check bins argument + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, bins=5) + + # make sure xlabelsize and xrot are handled + ser = df[0] + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + tm.close() + # make sure kwargs to hist are handled + ax = ser.hist(normed=True, cumulative=True, bins=4) + # height of last bin (index 5) must be 1.0 + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + self.assertAlmostEqual(rects[-1].get_height(), 1.0) + + tm.close() + ax = ser.hist(log=True) + # scale of y must be 'log' + self._check_ax_scales(ax, yaxis='log') + + tm.close() + + # propagate attr exception from matplotlib.Axes.hist + with tm.assertRaises(AttributeError): + ser.hist(foo='bar') + + @slow + def test_hist_layout(self): + df = DataFrame(randn(100, 3)) + + layout_to_expected_size = ( + {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 + {'layout': (2, 2), 'expected_size': (2, 2)}, + {'layout': (4, 1), 'expected_size': (4, 1)}, + {'layout': (1, 4), 'expected_size': (1, 4)}, + {'layout': (3, 3), 'expected_size': (3, 3)}, + {'layout': (-1, 4), 'expected_size': (1, 4)}, + {'layout': (4, -1), 'expected_size': (4, 1)}, + {'layout': (-1, 2), 'expected_size': (2, 2)}, + {'layout': (2, -1), 'expected_size': (2, 2)} + ) + + for layout_test in layout_to_expected_size: + axes = df.hist(layout=layout_test['layout']) + expected = layout_test['expected_size'] + self._check_axes_shape(axes, axes_num=3, layout=expected) + + # layout too small for all 4 plots + with tm.assertRaises(ValueError): + df.hist(layout=(1, 1)) + + # invalid 
format for layout + with tm.assertRaises(ValueError): + df.hist(layout=(1,)) + with tm.assertRaises(ValueError): + df.hist(layout=(-1, -1)) + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + @slow + def test_grouped_hist_legacy(self): + from matplotlib.patches import Rectangle + + df = DataFrame(randn(500, 2), columns=['A', 'B']) + df['C'] = np.random.randint(0, 4, 500) + df['D'] = ['X'] * 500 + + axes = plotting.grouped_hist(df.A, by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + axes = df.hist(by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + # group by a key with single value + axes = df.hist(by='D', rot=30) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + self._check_ticks_props(axes, xrot=30) + + tm.close() + # make sure kwargs to hist are handled + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = plotting.grouped_hist(df.A, by=df.C, normed=True, + cumulative=True, bins=4, + xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + # height of last bin (index 5) must be 1.0 + for ax in axes.ravel(): + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + height = rects[-1].get_height() + self.assertAlmostEqual(height, 1.0) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + tm.close() + axes = plotting.grouped_hist(df.A, by=df.C, log=True) + # scale of y must be 'log' + self._check_ax_scales(axes, yaxis='log') + + tm.close() + # propagate attr exception from matplotlib.Axes.hist + with tm.assertRaises(AttributeError): + plotting.grouped_hist(df.A, by=df.C, foo='bar') + + with tm.assert_produces_warning(FutureWarning): + df.hist(by='C', figsize='default') + + @slow + def test_grouped_hist_legacy2(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender_int = np.random.choice([0, 1], size=n) + df_int = DataFrame({'height': height, 'weight': weight, + 'gender': gender_int}) + gb = df_int.groupby('gender') + axes = gb.hist() + self.assertEqual(len(axes), 2) + self.assertEqual(len(self.plt.get_fignums()), 2) + tm.close() + + @slow + def test_grouped_hist_layout(self): + df = self.hist_df + self.assertRaises(ValueError, df.hist, column='weight', by=df.gender, + layout=(1, 1)) + self.assertRaises(ValueError, df.hist, column='height', by=df.category, + layout=(1, 3)) + self.assertRaises(ValueError, df.hist, column='height', by=df.category, + layout=(-1, -1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + axes = df.hist(column='height', by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column='height', by=df.category, layout=(-1, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column='height', by=df.category, + layout=(4, 2), figsize=(12, 8)) + self._check_axes_shape( + axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) + tm.close() + + # GH 6769 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.hist, column='height', by='classroom', layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, 
layout=(2, 2)) + + # without column + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, by='classroom') + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.hist(by='gender', layout=(3, 5)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) + + axes = df.hist(column=['height', 'weight', 'category']) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + @slow + def test_grouped_hist_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + fig, axes = self.plt.subplots(2, 3) + returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[0]) + self.assertIs(returned[0].figure, fig) + returned = df.hist(by='classroom', ax=axes[1]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[1]) + self.assertIs(returned[0].figure, fig) + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + axes = df.hist(column='height', ax=axes) + + @slow + def test_axis_share_x(self): + df = self.hist_df + # GH4089 + ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) + + # share x + self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) + + # don't share y + self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2)) + + @slow + def test_axis_share_y(self): + df = self.hist_df + ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) + + # share y + self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) + + # don't share x + self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2)) + + @slow + def test_axis_share_xy(self): + df = self.hist_df + ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, + sharey=True) + + # share both x and y + self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) + + self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py new file mode 100644 index 0000000000000..8b9a4fe05bb2e --- /dev/null +++ b/pandas/tests/plotting/test_misc.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose + +from pandas import Series, DataFrame +from pandas.compat import lmap +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy import random +from numpy.random import randn + +import pandas.tools.plotting as plotting +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, + _ok_for_gaussian_kde) + + +""" Test cases for misc plot functions """ + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + @slow + def test_autocorrelation_plot(self): + from pandas.tools.plotting import autocorrelation_plot + _check_plot_works(autocorrelation_plot, series=self.ts) + _check_plot_works(autocorrelation_plot, series=self.ts.values) 
+ + ax = autocorrelation_plot(self.ts, label='Test') + self._check_legend_labels(ax, labels=['Test']) + + @slow + def test_lag_plot(self): + from pandas.tools.plotting import lag_plot + _check_plot_works(lag_plot, series=self.ts) + _check_plot_works(lag_plot, series=self.ts, lag=5) + + @slow + def test_bootstrap_plot(self): + from pandas.tools.plotting import bootstrap_plot + _check_plot_works(bootstrap_plot, series=self.ts, size=10) + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + + @slow + def test_scatter_plot_legacy(self): + tm._skip_if_no_scipy() + + df = DataFrame(randn(100, 2)) + + def scat(**kwds): + return plotting.scatter_matrix(df, **kwds) + + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, marker='+') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, vmin=0) + if _ok_for_gaussian_kde('kde'): + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='kde') + if _ok_for_gaussian_kde('density'): + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='density') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='hist') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, range_padding=.1) + + def scat2(x, y, by=None, ax=None, figsize=None): + return plotting.scatter_plot(df, x, y, by, ax, figsize=None) + + _check_plot_works(scat2, x=0, y=1) + grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat2, x=0, y=1, by=grouper) + + def test_scatter_matrix_axis(self): + tm._skip_if_no_scipy() + scatter_matrix = plotting.scatter_matrix + + with tm.RNGContext(42): + df = DataFrame(randn(100, 3)) + + # we are plotting multiples on a sub-plot + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(scatter_matrix, filterwarnings='always', + frame=df, range_padding=.1) + axes0_labels = axes[0][0].yaxis.get_majorticklabels() + + # GH 5662 + expected = ['-2', '-1', '0', '1', '2'] + self._check_text_labels(axes0_labels, expected) + self._check_ticks_props( + axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + df[0] = ((df[0] - 2) / 3) + + # we are plotting multiples on a sub-plot + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(scatter_matrix, filterwarnings='always', + frame=df, range_padding=.1) + axes0_labels = axes[0][0].yaxis.get_majorticklabels() + expected = ['-1.2', '-1.0', '-0.8', '-0.6', '-0.4', '-0.2', '0.0'] + self._check_text_labels(axes0_labels, expected) + self._check_ticks_props( + axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + @slow + def test_andrews_curves(self): + from pandas.tools.plotting import andrews_curves + from matplotlib import cm + + df = self.iris + + _check_plot_works(andrews_curves, frame=df, class_column='Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=rgba) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=cnames) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, 
df['Name'].nunique())) + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + length = 10 + df = DataFrame({"A": random.rand(length), + "B": random.rand(length), + "C": random.rand(length), + "Name": ["A"] * length}) + + _check_plot_works(andrews_curves, frame=df, class_column='Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=rgba) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=cnames) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = andrews_curves(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + with tm.assert_produces_warning(FutureWarning): + andrews_curves(data=df, class_column='Name') + + @slow + def test_parallel_coordinates(self): + from pandas.tools.plotting import parallel_coordinates + from matplotlib import cm + + df = self.iris + + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name') + nlines = len(ax.get_lines()) + nxticks = len(ax.xaxis.get_ticklabels()) + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', color=rgba) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', color=cnames) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', axvlines=False) + assert len(ax.get_lines()) == (nlines - nxticks) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = parallel_coordinates(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(data=df, class_column='Name') + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(df, 'Name', colors=colors) + + @slow + def test_radviz(self): + from pandas.tools.plotting import radviz + from matplotlib import cm + + df = self.iris + _check_plot_works(radviz, frame=df, class_column='Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works( + radviz, frame=df, class_column='Name', color=rgba) + # skip Circle drawn as ticks + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors( + patches[:10], facecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 
'seagreen'] + _check_plot_works(radviz, frame=df, class_column='Name', color=cnames) + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) + + _check_plot_works(radviz, frame=df, + class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) + + colors = [[0., 0., 1., 1.], + [0., 0.5, 1., 1.], + [1., 0., 0., 1.]] + df = DataFrame({"A": [1, 2, 3], + "B": [2, 1, 3], + "C": [3, 2, 1], + "Name": ['b', 'g', 'r']}) + ax = radviz(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=colors) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py new file mode 100644 index 0000000000000..2bd2f8255569d --- /dev/null +++ b/pandas/tests/plotting/test_series.py @@ -0,0 +1,807 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose +import itertools + +from datetime import datetime + +import pandas as pd +from pandas import Series, DataFrame, date_range +from pandas.compat import range, lrange +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy.random import randn + +import pandas.tools.plotting as plotting +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, + _skip_if_no_scipy_gaussian_kde, + _ok_for_gaussian_kde) + + +""" Test cases for Series.plot """ + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.iseries = tm.makePeriodSeries() + self.iseries.name = 'iseries' + + @slow + def test_plot(self): + _check_plot_works(self.ts.plot, label='foo') + _check_plot_works(self.ts.plot, use_index=False) + axes = _check_plot_works(self.ts.plot, rot=0) + self._check_ticks_props(axes, xrot=0) + + ax = _check_plot_works(self.ts.plot, style='.', logy=True) + self._check_ax_scales(ax, yaxis='log') + + ax = _check_plot_works(self.ts.plot, style='.', logx=True) + self._check_ax_scales(ax, xaxis='log') + + ax = _check_plot_works(self.ts.plot, style='.', loglog=True) + self._check_ax_scales(ax, xaxis='log', yaxis='log') + + _check_plot_works(self.ts[:10].plot.bar) + _check_plot_works(self.ts.plot.area, stacked=False) + _check_plot_works(self.iseries.plot) + + for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: + if not _ok_for_gaussian_kde(kind): + continue + _check_plot_works(self.series[:5].plot, kind=kind) + + _check_plot_works(self.series[:10].plot.barh) + ax = _check_plot_works(Series(randn(10)).plot.bar, color='black') + self._check_colors([ax.patches[0]], facecolors=['black']) + + # GH 6951 + ax = _check_plot_works(self.ts.plot, subplots=True) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + + ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1)) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + + @slow + def test_plot_figsize_and_title(self): + # figsize and title + ax = 
self.series.plot(title='Test', figsize=(16, 8)) + self._check_text_labels(ax.title, 'Test') + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) + + def test_dont_modify_rcParams(self): + # GH 8242 + if self.mpl_ge_1_5_0: + key = 'axes.prop_cycle' + else: + key = 'axes.color_cycle' + colors = self.plt.rcParams[key] + Series([1, 2, 3]).plot() + self.assertEqual(colors, self.plt.rcParams[key]) + + def test_ts_line_lim(self): + ax = self.ts.plot() + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) + self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) + tm.close() + + ax = self.ts.plot(secondary_y=True) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) + self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) + + def test_ts_area_lim(self): + ax = self.ts.plot.area(stacked=False) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + tm.close() + + # GH 7471 + ax = self.ts.plot.area(stacked=False, x_compat=True) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + tm.close() + + tz_ts = self.ts.copy() + tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') + ax = tz_ts.plot.area(stacked=False, x_compat=True) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + tm.close() + + ax = tz_ts.plot.area(stacked=False, secondary_y=True) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + + def test_label(self): + s = Series([1, 2]) + ax = s.plot(label='LABEL', legend=True) + self._check_legend_labels(ax, labels=['LABEL']) + self.plt.close() + ax = s.plot(legend=True) + self._check_legend_labels(ax, labels=['None']) + self.plt.close() + # get name from index + s.name = 'NAME' + ax = s.plot(legend=True) + self._check_legend_labels(ax, labels=['NAME']) + self.plt.close() + # override the default + ax = s.plot(legend=True, label='LABEL') + self._check_legend_labels(ax, labels=['LABEL']) + self.plt.close() + # Add label info, but don't draw + ax = s.plot(legend=False, label='LABEL') + self.assertEqual(ax.get_legend(), None) # Hasn't been drawn + ax.legend() # draw it + self._check_legend_labels(ax, labels=['LABEL']) + + def test_line_area_nan_series(self): + values = [1, 2, np.nan, 3] + s = Series(values) + ts = Series(values, index=tm.makeDateIndex(k=4)) + + for d in [s, ts]: + ax = _check_plot_works(d.plot) + masked = ax.lines[0].get_ydata() + # remove nan for comparison purpose + exp = np.array([1, 2, 3], dtype=np.float64) + self.assert_numpy_array_equal(np.delete(masked.data, 2), exp) + self.assert_numpy_array_equal( + masked.mask, np.array([False, False, True, False])) + + expected = np.array([1, 2, 0, 3], dtype=np.float64) + ax = _check_plot_works(d.plot, stacked=True) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot.area) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot.area, stacked=False) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + + def test_line_use_index_false(self): + s = Series([1, 2, 3], index=['a', 'b', 'c']) +
s.index.name = 'The Index' + ax = s.plot(use_index=False) + label = ax.get_xlabel() + self.assertEqual(label, '') + ax2 = s.plot.bar(use_index=False) + label2 = ax2.get_xlabel() + self.assertEqual(label2, '') + + @slow + def test_bar_log(self): + expected = np.array([1., 10., 100., 1000.]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((.1, expected, 1e4)) + + ax = Series([200, 500]).plot.bar(log=True) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + ax = Series([200, 500]).plot.barh(log=True) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + tm.close() + + # GH 9905 + expected = np.array([1.0e-03, 1.0e-02, 1.0e-01, 1.0e+00]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((1.0e-04, expected, 1.0e+01)) + + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') + self.assertEqual(ax.get_ylim(), (0.001, 0.10000000000000001)) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') + self.assertEqual(ax.get_xlim(), (0.001, 0.10000000000000001)) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + + @slow + def test_bar_ignore_index(self): + df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + ax = df.plot.bar(use_index=False) + self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) + + def test_rotation(self): + df = DataFrame(randn(5, 5)) + # Default rot 0 + axes = df.plot() + self._check_ticks_props(axes, xrot=0) + + axes = df.plot(rot=30) + self._check_ticks_props(axes, xrot=30) + + def test_irregular_datetime(self): + rng = date_range('1/1/2000', '3/1/2000') + rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] + ser = Series(randn(len(rng)), rng) + ax = ser.plot() + xp = datetime(1999, 1, 1).toordinal() + ax.set_xlim('1/1/1999', '1/1/2001') + self.assertEqual(xp, ax.get_xlim()[0]) + + @slow + def test_pie_series(self): + # if the sum of values is less than 1.0, pie handles them as rates and + # draws a semicircle.
+ series = Series(np.random.randint(1, 5), + index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') + ax = _check_plot_works(series.plot.pie) + self._check_text_labels(ax.texts, series.index) + self.assertEqual(ax.get_ylabel(), 'YLABEL') + + # without wedge labels + ax = _check_plot_works(series.plot.pie, labels=None) + self._check_text_labels(ax.texts, [''] * 5) + + # with less colors than elements + color_args = ['r', 'g', 'b'] + ax = _check_plot_works(series.plot.pie, colors=color_args) + + color_expected = ['r', 'g', 'b', 'r', 'g'] + self._check_colors(ax.patches, facecolors=color_expected) + + # with labels and colors + labels = ['A', 'B', 'C', 'D', 'E'] + color_args = ['r', 'g', 'b', 'c', 'm'] + ax = _check_plot_works(series.plot.pie, labels=labels, + colors=color_args) + self._check_text_labels(ax.texts, labels) + self._check_colors(ax.patches, facecolors=color_args) + + # with autopct and fontsize + ax = _check_plot_works(series.plot.pie, colors=color_args, + autopct='%.2f', fontsize=7) + pcts = ['{0:.2f}'.format(s * 100) + for s in series.values / float(series.sum())] + iters = [iter(series.index), iter(pcts)] + expected_texts = list(next(it) for it in itertools.cycle(iters)) + self._check_text_labels(ax.texts, expected_texts) + for t in ax.texts: + self.assertEqual(t.get_fontsize(), 7) + + # includes negative value + with tm.assertRaises(ValueError): + series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) + series.plot.pie() + + # includes nan + series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], + name='YLABEL') + ax = _check_plot_works(series.plot.pie) + self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) + + def test_pie_nan(self): + s = Series([1, np.nan, 1, 1]) + ax = s.plot.pie(legend=True) + expected = ['0', '', '2', '3'] + result = [x.get_text() for x in ax.texts] + self.assertEqual(result, expected) + + @slow + def test_hist_df_kwargs(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.plot.hist(bins=5) + self.assertEqual(len(ax.patches), 10) + + @slow + def test_hist_df_with_nonnumerics(self): + # GH 9853 + with tm.RNGContext(1): + df = DataFrame( + np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) + df['E'] = ['x', 'y'] * 5 + ax = df.plot.hist(bins=5) + self.assertEqual(len(ax.patches), 20) + + ax = df.plot.hist() # bins=10 + self.assertEqual(len(ax.patches), 40) + + @slow + def test_hist_legacy(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, + by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, + by=self.ts.index.month, bins=5) + + fig, ax = self.plt.subplots(1, 1) + _check_plot_works(self.ts.hist, ax=ax) + _check_plot_works(self.ts.hist, ax=ax, figure=fig) + _check_plot_works(self.ts.hist, figure=fig) + tm.close() + + fig, (ax1, ax2) = self.plt.subplots(1, 2) + _check_plot_works(self.ts.hist, figure=fig, ax=ax1) + _check_plot_works(self.ts.hist, figure=fig, ax=ax2) + + with tm.assertRaises(ValueError): + self.ts.hist(by=self.ts.index, figure=fig) + + @slow + def test_hist_bins_legacy(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.hist(bins=2)[0][0] + self.assertEqual(len(ax.patches), 2) + + @slow + def test_hist_layout(self): + df = self.hist_df + with tm.assertRaises(ValueError): + df.height.hist(layout=(1, 1)) + + with tm.assertRaises(ValueError): + df.height.hist(layout=[1, 1]) + + @slow + def test_hist_layout_with_by(self): + df = self.hist_df + + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.gender, layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.gender, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.classroom, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), + figsize=(12, 7)) + + @slow + def test_hist_no_overlap(self): + from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) + y = Series(randn(2)) + subplot(121) + x.hist() + subplot(122) + y.hist() + fig = gcf() + axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() + self.assertEqual(len(axes), 2) + + @slow + def test_hist_secondary_legend(self): + # GH 9610 + df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) + + # primary -> secondary + ax = df['a'].plot.hist(legend=True) + df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) + # both legends are drawn on left ax + # left and right axis must be visible + self._check_legend_labels(ax, labels=['a', 'b (right)']) + self.assertTrue(ax.get_yaxis().get_visible()) + self.assertTrue(ax.right_ax.get_yaxis().get_visible()) + tm.close() + + # secondary -> secondary + ax = df['a'].plot.hist(legend=True, secondary_y=True) + df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) + # both legends are drawn
on left ax + # left axis must be invisible, right axis must be visible + self._check_legend_labels(ax.left_ax, + labels=['a (right)', 'b (right)']) + self.assertFalse(ax.left_ax.get_yaxis().get_visible()) + self.assertTrue(ax.get_yaxis().get_visible()) + tm.close() + + # secondary -> primary + ax = df['a'].plot.hist(legend=True, secondary_y=True) + # right axes is returned + df['b'].plot.hist(ax=ax, legend=True) + # both legends are drawn on left ax + # left and right axis must be visible + self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b']) + self.assertTrue(ax.left_ax.get_yaxis().get_visible()) + self.assertTrue(ax.get_yaxis().get_visible()) + tm.close() + + @slow + def test_df_series_secondary_legend(self): + # GH 9779 + df = DataFrame(np.random.randn(30, 3), columns=list('abc')) + s = Series(np.random.randn(30), name='x') + + # primary -> secondary (without passing ax) + ax = df.plot() + s.plot(legend=True, secondary_y=True) + # both legends are drawn on left ax + # left and right axis must be visible + self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) + self.assertTrue(ax.get_yaxis().get_visible()) + self.assertTrue(ax.right_ax.get_yaxis().get_visible()) + tm.close() + + # primary -> secondary (with passing ax) + ax = df.plot() + s.plot(ax=ax, legend=True, secondary_y=True) + # both legends are drawn on left ax + # left and right axis must be visible + self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) + self.assertTrue(ax.get_yaxis().get_visible()) + self.assertTrue(ax.right_ax.get_yaxis().get_visible()) + tm.close() + + # secondary -> secondary (without passing ax) + ax = df.plot(secondary_y=True) + s.plot(legend=True, secondary_y=True) + # both legends are drawn on left ax + # left axis must be invisible and right axis must be visible + expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] + self._check_legend_labels(ax.left_ax, labels=expected) + self.assertFalse(ax.left_ax.get_yaxis().get_visible()) + self.assertTrue(ax.get_yaxis().get_visible()) + tm.close() + + # secondary -> secondary (with passing ax) + ax = df.plot(secondary_y=True) + s.plot(ax=ax, legend=True, secondary_y=True) + # both legends are drawn on left ax + # left axis must be invisible and right axis must be visible + expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] + self._check_legend_labels(ax.left_ax, expected) + self.assertFalse(ax.left_ax.get_yaxis().get_visible()) + self.assertTrue(ax.get_yaxis().get_visible()) + tm.close() + + # secondary -> secondary (with passing ax) + ax = df.plot(secondary_y=True, mark_right=False) + s.plot(ax=ax, legend=True, secondary_y=True) + # both legends are drawn on left ax + # left axis must be invisible and right axis must be visible + expected = ['a', 'b', 'c', 'x (right)'] + self._check_legend_labels(ax.left_ax, expected) + self.assertFalse(ax.left_ax.get_yaxis().get_visible()) + self.assertTrue(ax.get_yaxis().get_visible()) + tm.close() + + @slow + def test_plot_fails_with_dupe_color_and_style(self): + x = Series(randn(2)) + with tm.assertRaises(ValueError): + x.plot(style='k--', color='k') + + @slow + def test_hist_kde(self): + ax = self.ts.plot.hist(logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + # ticks are values, thus ticklabels are blank + self._check_text_labels(xlabels, [''] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() +
_check_plot_works(self.ts.plot.kde) + _check_plot_works(self.ts.plot.density) + ax = self.ts.plot.kde(logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [''] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + + @slow + def test_kde_kwargs(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + from numpy import linspace + _check_plot_works(self.ts.plot.kde, bw_method=.5, + ind=linspace(-100, 100, 20)) + _check_plot_works(self.ts.plot.density, bw_method=.5, + ind=linspace(-100, 100, 20)) + ax = self.ts.plot.kde(logy=True, bw_method=.5, + ind=linspace(-100, 100, 20)) + self._check_ax_scales(ax, yaxis='log') + self._check_text_labels(ax.yaxis.get_label(), 'Density') + + @slow + def test_kde_missing_vals(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + s = Series(np.random.uniform(size=50)) + s[0] = np.nan + _check_plot_works(s.plot.kde) + + @slow + def test_hist_kwargs(self): + ax = self.ts.plot.hist(bins=5) + self.assertEqual(len(ax.patches), 5) + self._check_text_labels(ax.yaxis.get_label(), 'Frequency') + tm.close() + + if self.mpl_ge_1_3_1: + ax = self.ts.plot.hist(orientation='horizontal') + self._check_text_labels(ax.xaxis.get_label(), 'Frequency') + tm.close() + + ax = self.ts.plot.hist(align='left', stacked=True) + tm.close() + + @slow + def test_hist_kde_color(self): + ax = self.ts.plot.hist(logy=True, bins=10, color='b') + self._check_ax_scales(ax, yaxis='log') + self.assertEqual(len(ax.patches), 10) + self._check_colors(ax.patches, facecolors=['b'] * 10) + + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + ax = self.ts.plot.kde(logy=True, color='r') + self._check_ax_scales(ax, yaxis='log') + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + self._check_colors(lines, ['r']) + + @slow + def test_boxplot_series(self): + ax = self.ts.plot.box(logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [self.ts.name]) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + + @slow + def test_kind_both_ways(self): + s = Series(range(3)) + for kind in plotting._common_kinds + plotting._series_kinds: + if not _ok_for_gaussian_kde(kind): + continue + s.plot(kind=kind) + getattr(s.plot, kind)() + + @slow + def test_invalid_plot_data(self): + s = Series(list('abcd')) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + s.plot(kind=kind) + + @slow + def test_valid_object_plot(self): + s = Series(lrange(10), dtype=object) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + _check_plot_works(s.plot, kind=kind) + + def test_partially_invalid_plot_data(self): + s = Series(['a', 'b', 1.0, 2]) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + s.plot(kind=kind) + + def test_invalid_kind(self): + s = Series([1, 2]) + with tm.assertRaises(ValueError): + s.plot(kind='aasdf') + + @slow + def test_dup_datetime_index_plot(self): + dr1 = date_range('1/1/2009', periods=4) + dr2 = date_range('1/2/2009', periods=4) + index = dr1.append(dr2) + values = randn(index.size) + s = Series(values, index=index) + _check_plot_works(s.plot) + + @slow + def test_errorbar_plot(self): + + s = Series(np.arange(10), name='x') + s_err = np.random.randn(10) + d_err = 
DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) + # test line and bar plots + kinds = ['line', 'bar'] + for kind in kinds: + ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=1, yerr=1) + + ax = _check_plot_works(s.plot, xerr=s_err) + self._check_has_errorbars(ax, xerr=1, yerr=0) + + # test time series plotting + ix = date_range('1/1/2000', '1/1/2001', freq='M') + ts = Series(np.arange(12), index=ix, name='x') + ts_err = Series(np.random.randn(12), index=ix) + td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) + + ax = _check_plot_works(ts.plot, yerr=ts_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(ts.plot, yerr=td_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + # check incorrect lengths and types + with tm.assertRaises(ValueError): + s.plot(yerr=np.arange(11)) + + s_err = ['zzz'] * 10 + # in mpl 1.5+ this is a TypeError + with tm.assertRaises((ValueError, TypeError)): + s.plot(yerr=s_err) + + def test_table(self): + _check_plot_works(self.series.plot, table=True) + _check_plot_works(self.series.plot, table=self.series) + + @slow + def test_series_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings(Series([1, 2, 3]), + plotting._series_kinds + + plotting._common_kinds) + + @slow + def test_standard_colors(self): + for c in ['r', 'red', 'green', '#FF0000']: + result = plotting._get_standard_colors(1, color=c) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(1, color=[c]) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(3, color=c) + self.assertEqual(result, [c] * 3) + + result = plotting._get_standard_colors(3, color=[c]) + self.assertEqual(result, [c] * 3) + + @slow + def test_standard_colors_all(self): + import matplotlib.colors as colors + + # multiple colors like mediumaquamarine + for c in colors.cnames: + result = plotting._get_standard_colors(num_colors=1, color=c) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=1, color=[c]) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=3, color=c) + self.assertEqual(result, [c] * 3) + + result = plotting._get_standard_colors(num_colors=3, color=[c]) + self.assertEqual(result, [c] * 3) + + # single letter colors like k + for c in colors.ColorConverter.colors: + result = plotting._get_standard_colors(num_colors=1, color=c) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=1, color=[c]) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=3, color=c) + self.assertEqual(result, [c] * 3) + + result = plotting._get_standard_colors(num_colors=3, color=[c]) + self.assertEqual(result, [c] * 3) + + def test_series_plot_color_kwargs(self): + # GH1890 + ax = Series(np.arange(12) + 1).plot(color='green') + self._check_colors(ax.get_lines(), linecolors=['green']) + + def test_time_series_plot_color_kwargs(self): + # #1890 + ax = Series(np.arange(12) + 1, 
index=date_range( + '1/1/2000', periods=12)).plot(color='green') + self._check_colors(ax.get_lines(), linecolors=['green']) + + def test_time_series_plot_color_with_empty_kwargs(self): + import matplotlib as mpl + + if self.mpl_ge_1_5_0: + def_colors = self._maybe_unpack_cycler(mpl.rcParams) + else: + def_colors = mpl.rcParams['axes.color_cycle'] + index = date_range('1/1/2000', periods=12) + s = Series(np.arange(1, 13), index=index) + + ncolors = 3 + + for i in range(ncolors): + ax = s.plot() + self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) + + def test_xticklabels(self): + # GH11529 + s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) + ax = s.plot(xticks=[0, 3, 5, 9]) + exp = ['P%02d' % i for i in [0, 3, 5, 9]] + self._check_text_labels(ax.get_xticklabels(), exp) + + def test_custom_business_day_freq(self): + # GH7222 + from pandas.tseries.offsets import CustomBusinessDay + s = Series(range(100, 121), index=pd.bdate_range( + start='2014-05-01', end='2014-06-01', + freq=CustomBusinessDay(holidays=['2014-05-26']))) + + _check_plot_works(s.plot) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py deleted file mode 100644 index f9a210a492594..0000000000000 --- a/pandas/tests/test_graphics_others.py +++ /dev/null @@ -1,1033 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import nose -import itertools -import os -import string -from distutils.version import LooseVersion - -from pandas import Series, DataFrame, MultiIndex -from pandas.compat import range, lmap, lzip -import pandas.util.testing as tm -from pandas.util.testing import slow - -import numpy as np -from numpy import random -from numpy.random import randn - -import pandas.tools.plotting as plotting - -from pandas.tests.test_graphics import (TestPlotBase, _check_plot_works, - curpath, _ok_for_gaussian_kde) - - -""" -These tests are for ``DataFrame.hist``, ``DataFrame.boxplot`` and -other miscellaneous plots. -`Dataframe.plot`` and ``Series.plot`` are tested in test_graphics.py -""" - - -def _skip_if_mpl_14_or_dev_boxplot(): - # GH 8382 - # Boxplot failures on 1.4 and 1.4.1 - # Don't need try / except since that's done at class level - import matplotlib - if str(matplotlib.__version__) >= LooseVersion('1.4'): - raise nose.SkipTest("Matplotlib Regression in 1.4 and current dev.") - - -@tm.mplskip -class TestSeriesPlots(TestPlotBase): - - def setUp(self): - TestPlotBase.setUp(self) - import matplotlib as mpl - mpl.rcdefaults() - - self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' - - self.series = tm.makeStringSeries() - self.series.name = 'series' - - self.iseries = tm.makePeriodSeries() - self.iseries.name = 'iseries' - - @slow - def test_hist_legacy(self): - _check_plot_works(self.ts.hist) - _check_plot_works(self.ts.hist, grid=False) - _check_plot_works(self.ts.hist, figsize=(8, 10)) - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, by=self.ts.index.month) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) - - fig, ax = self.plt.subplots(1, 1) - _check_plot_works(self.ts.hist, ax=ax) - _check_plot_works(self.ts.hist, ax=ax, figure=fig) - _check_plot_works(self.ts.hist, figure=fig) - tm.close() - - fig, (ax1, ax2) = self.plt.subplots(1, 2) - _check_plot_works(self.ts.hist, figure=fig, ax=ax1) - _check_plot_works(self.ts.hist, figure=fig, ax=ax2) - - with tm.assertRaises(ValueError): - self.ts.hist(by=self.ts.index, figure=fig) - - @slow - def test_hist_bins_legacy(self): - df = DataFrame(np.random.randn(10, 2)) - ax = df.hist(bins=2)[0][0] - self.assertEqual(len(ax.patches), 2) - - @slow - def test_hist_layout(self): - df = self.hist_df - with tm.assertRaises(ValueError): - df.height.hist(layout=(1, 1)) - - with tm.assertRaises(ValueError): - df.height.hist(layout=[1, 1]) - - @slow - def test_hist_layout_with_by(self): - df = self.hist_df - - # _check_plot_works adds an `ax` kwarg to the method call - # so we get a warning about an axis being cleared, even - # though we don't explicing pass one, see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.gender, - layout=(2, 1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.gender, - layout=(3, -1)) - self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.category, - layout=(4, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(2, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(-1, 4)) - self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.classroom, layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) - - @slow - def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf - x = Series(randn(2)) - y = Series(randn(2)) - subplot(121) - x.hist() - subplot(122) - y.hist() - fig = gcf() - axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() - self.assertEqual(len(axes), 2) - - @slow - def test_hist_by_no_extra_plots(self): - df = self.hist_df - axes = df.height.hist(by=df.gender) # noqa - self.assertEqual(len(self.plt.get_fignums()), 1) - - @slow - def test_plot_fails_when_ax_differs_from_figure(self): - from pylab import figure - fig1 = figure() - fig2 = figure() - ax1 = fig1.add_subplot(111) - with tm.assertRaises(AssertionError): - self.ts.hist(ax=ax1, figure=fig2) - - @slow - def test_autocorrelation_plot(self): - from pandas.tools.plotting import autocorrelation_plot - _check_plot_works(autocorrelation_plot, 
series=self.ts) - _check_plot_works(autocorrelation_plot, series=self.ts.values) - - ax = autocorrelation_plot(self.ts, label='Test') - self._check_legend_labels(ax, labels=['Test']) - - @slow - def test_lag_plot(self): - from pandas.tools.plotting import lag_plot - _check_plot_works(lag_plot, series=self.ts) - _check_plot_works(lag_plot, series=self.ts, lag=5) - - @slow - def test_bootstrap_plot(self): - from pandas.tools.plotting import bootstrap_plot - _check_plot_works(bootstrap_plot, series=self.ts, size=10) - - -@tm.mplskip -class TestDataFramePlots(TestPlotBase): - - def setUp(self): - TestPlotBase.setUp(self) - import matplotlib as mpl - mpl.rcdefaults() - - self.tdf = tm.makeTimeDataFrame() - self.hexbin_df = DataFrame({ - "A": np.random.uniform(size=20), - "B": np.random.uniform(size=20), - "C": np.arange(20) + np.random.uniform(size=20)}) - - from pandas import read_csv - path = os.path.join(curpath(), 'data', 'iris.csv') - self.iris = read_csv(path) - - @slow - def test_boxplot_legacy(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) - df['indic'] = ['foo', 'bar'] * 3 - df['indic2'] = ['foo', 'bar', 'foo'] * 2 - - _check_plot_works(df.boxplot, return_type='dict') - _check_plot_works(df.boxplot, column=[ - 'one', 'two'], return_type='dict') - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, column=['one', 'two'], - by='indic') - _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='indic') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by=['indic', 'indic2']) - _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict') - _check_plot_works(df.boxplot, notch=1, return_type='dict') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='indic', notch=1) - - df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = Series(['A'] * 10) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='X') - - # When ax is supplied and required number of axes is 1, - # passed ax should be used: - fig, ax = self.plt.subplots() - axes = df.boxplot('Col1', by='X', ax=ax) - ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() - self.assertIs(ax_axes, axes) - - fig, ax = self.plt.subplots() - axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') - ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() - self.assertIs(ax_axes, axes['A']) - - # Multiple columns with an ax argument should use same figure - fig, ax = self.plt.subplots() - with tm.assert_produces_warning(UserWarning): - axes = df.boxplot(column=['Col1', 'Col2'], - by='X', ax=ax, return_type='axes') - self.assertIs(axes['Col1'].get_figure(), fig) - - # When by is None, check that all relevant lines are present in the - # dict - fig, ax = self.plt.subplots() - d = df.boxplot(ax=ax, return_type='dict') - lines = list(itertools.chain.from_iterable(d.values())) - self.assertEqual(len(ax.get_lines()), len(lines)) - - @slow - def test_boxplot_return_type_legacy(self): - # API change in https://github.com/pydata/pandas/pull/7096 - import matplotlib as mpl # noqa - - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) - with 
tm.assertRaises(ValueError): - df.boxplot(return_type='NOTATYPE') - - with tm.assert_produces_warning(FutureWarning): - result = df.boxplot() - # change to Axes in future - self._check_box_return_type(result, 'dict') - - with tm.assert_produces_warning(False): - result = df.boxplot(return_type='dict') - self._check_box_return_type(result, 'dict') - - with tm.assert_produces_warning(False): - result = df.boxplot(return_type='axes') - self._check_box_return_type(result, 'axes') - - with tm.assert_produces_warning(False): - result = df.boxplot(return_type='both') - self._check_box_return_type(result, 'both') - - @slow - def test_boxplot_axis_limits(self): - - def _check_ax_limits(col, ax): - y_min, y_max = ax.get_ylim() - self.assertTrue(y_min <= col.min()) - self.assertTrue(y_max >= col.max()) - - df = self.hist_df.copy() - df['age'] = np.random.randint(1, 20, df.shape[0]) - # One full row - height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) - self.assertEqual(weight_ax._sharey, height_ax) - - # Two rows, one partial - p = df.boxplot(['height', 'weight', 'age'], by='category') - height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] - dummy_ax = p[1, 1] - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) - _check_ax_limits(df['age'], age_ax) - self.assertEqual(weight_ax._sharey, height_ax) - self.assertEqual(age_ax._sharey, height_ax) - self.assertIsNone(dummy_ax._sharey) - - @slow - def test_boxplot_empty_column(self): - _skip_if_mpl_14_or_dev_boxplot() - df = DataFrame(np.random.randn(20, 4)) - df.loc[:, 0] = np.nan - _check_plot_works(df.boxplot, return_type='axes') - - @slow - def test_hist_df_legacy(self): - from matplotlib.patches import Rectangle - with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.hist_df.hist) - - # make sure layout is handled - df = DataFrame(randn(100, 3)) - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, grid=False) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - self.assertFalse(axes[1, 1].get_visible()) - - df = DataFrame(randn(100, 1)) - _check_plot_works(df.hist) - - # make sure layout is handled - df = DataFrame(randn(100, 6)) - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, layout=(4, 2)) - self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) - - # make sure sharex, sharey is handled - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.hist, sharex=True, sharey=True) - - # handle figsize arg - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.hist, figsize=(8, 10)) - - # check bins argument - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.hist, bins=5) - - # make sure xlabelsize and xrot are handled - ser = df[0] - xf, yf = 20, 18 - xrot, yrot = 30, 40 - axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - - xf, yf = 20, 18 - xrot, yrot = 30, 40 - axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - - tm.close() - # make sure kwargs to hist are handled - ax = ser.hist(normed=True, cumulative=True, bins=4) - # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] - 
self.assertAlmostEqual(rects[-1].get_height(), 1.0) - - tm.close() - ax = ser.hist(log=True) - # scale of y must be 'log' - self._check_ax_scales(ax, yaxis='log') - - tm.close() - - # propagate attr exception from matplotlib.Axes.hist - with tm.assertRaises(AttributeError): - ser.hist(foo='bar') - - @slow - def test_hist_layout(self): - df = DataFrame(randn(100, 3)) - - layout_to_expected_size = ( - {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 - {'layout': (2, 2), 'expected_size': (2, 2)}, - {'layout': (4, 1), 'expected_size': (4, 1)}, - {'layout': (1, 4), 'expected_size': (1, 4)}, - {'layout': (3, 3), 'expected_size': (3, 3)}, - {'layout': (-1, 4), 'expected_size': (1, 4)}, - {'layout': (4, -1), 'expected_size': (4, 1)}, - {'layout': (-1, 2), 'expected_size': (2, 2)}, - {'layout': (2, -1), 'expected_size': (2, 2)} - ) - - for layout_test in layout_to_expected_size: - axes = df.hist(layout=layout_test['layout']) - expected = layout_test['expected_size'] - self._check_axes_shape(axes, axes_num=3, layout=expected) - - # layout too small for all 4 plots - with tm.assertRaises(ValueError): - df.hist(layout=(1, 1)) - - # invalid format for layout - with tm.assertRaises(ValueError): - df.hist(layout=(1,)) - with tm.assertRaises(ValueError): - df.hist(layout=(-1, -1)) - - @slow - def test_scatter_plot_legacy(self): - tm._skip_if_no_scipy() - - df = DataFrame(randn(100, 2)) - - def scat(**kwds): - return plotting.scatter_matrix(df, **kwds) - - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, marker='+') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, vmin=0) - if _ok_for_gaussian_kde('kde'): - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, diagonal='kde') - if _ok_for_gaussian_kde('density'): - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, diagonal='density') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, diagonal='hist') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, range_padding=.1) - - def scat2(x, y, by=None, ax=None, figsize=None): - return plotting.scatter_plot(df, x, y, by, ax, figsize=None) - - _check_plot_works(scat2, x=0, y=1) - grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat2, x=0, y=1, by=grouper) - - def test_scatter_matrix_axis(self): - tm._skip_if_no_scipy() - scatter_matrix = plotting.scatter_matrix - - with tm.RNGContext(42): - df = DataFrame(randn(100, 3)) - - # we are plotting multiples on a sub-plot - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) - axes0_labels = axes[0][0].yaxis.get_majorticklabels() - - # GH 5662 - expected = ['-2', '-1', '0', '1', '2'] - self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - - df[0] = ((df[0] - 2) / 3) - - # we are plotting multiples on a sub-plot - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) - axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ['-1.2', '-1.0', '-0.8', '-0.6', '-0.4', '-0.2', '0.0'] - self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - - @slow 
- def test_andrews_curves(self): - from pandas.tools.plotting import andrews_curves - from matplotlib import cm - - df = self.iris - - _check_plot_works(andrews_curves, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) - self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) - self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) - - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - length = 10 - df = DataFrame({"A": random.rand(length), - "B": random.rand(length), - "C": random.rand(length), - "Name": ["A"] * length}) - - _check_plot_works(andrews_curves, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) - self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) - self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) - - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = andrews_curves(df, 'Name', color=colors) - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, linecolors=colors) - - with tm.assert_produces_warning(FutureWarning): - andrews_curves(data=df, class_column='Name') - - @slow - def test_parallel_coordinates(self): - from pandas.tools.plotting import parallel_coordinates - from matplotlib import cm - - df = self.iris - - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name') - nlines = len(ax.get_lines()) - nxticks = len(ax.xaxis.get_ticklabels()) - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=rgba) - self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=cnames) - self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) - - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', axvlines=False) - assert len(ax.get_lines()) == (nlines - nxticks) - - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = parallel_coordinates(df, 'Name', color=colors) - handles, labels = 
ax.get_legend_handles_labels() - self._check_colors(handles, linecolors=colors) - - with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(data=df, class_column='Name') - with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(df, 'Name', colors=colors) - - @slow - def test_radviz(self): - from pandas.tools.plotting import radviz - from matplotlib import cm - - df = self.iris - _check_plot_works(radviz, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works( - radviz, frame=df, class_column='Name', color=rgba) - # skip Circle drawn as ticks - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors( - patches[:10], facecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - _check_plot_works(radviz, frame=df, class_column='Name', color=cnames) - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) - - _check_plot_works(radviz, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) - - colors = [[0., 0., 1., 1.], - [0., 0.5, 1., 1.], - [1., 0., 0., 1.]] - df = DataFrame({"A": [1, 2, 3], - "B": [2, 1, 3], - "C": [3, 2, 1], - "Name": ['b', 'g', 'r']}) - ax = radviz(df, 'Name', color=colors) - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, facecolors=colors) - - -@tm.mplskip -class TestDataFrameGroupByPlots(TestPlotBase): - - @slow - def test_boxplot_legacy(self): - grouped = self.hist_df.groupby(by='gender') - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2)) - - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - tuples = lzip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) - - grouped = df.groupby(level=1) - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3)) - - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - grouped = df.unstack(level=1).groupby(level=0, axis=1) - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2)) - - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - @slow - def test_grouped_plot_fignums(self): - n = 10 - weight = Series(np.random.normal(166, 20, size=n)) - height = Series(np.random.normal(60, 10, size=n)) - with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) - df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) - gb = df.groupby('gender') - - res = gb.plot() - self.assertEqual(len(self.plt.get_fignums()), 2) - self.assertEqual(len(res), 2) - tm.close() - - res = gb.boxplot(return_type='axes') - self.assertEqual(len(self.plt.get_fignums()), 1) - 
self.assertEqual(len(res), 2) - tm.close() - - # now works with GH 5610 as gender is excluded - res = df.groupby('gender').hist() - tm.close() - - @slow - def test_grouped_hist_legacy(self): - from matplotlib.patches import Rectangle - - df = DataFrame(randn(500, 2), columns=['A', 'B']) - df['C'] = np.random.randint(0, 4, 500) - df['D'] = ['X'] * 500 - - axes = plotting.grouped_hist(df.A, by=df.C) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - tm.close() - axes = df.hist(by=df.C) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - tm.close() - # group by a key with single value - axes = df.hist(by='D', rot=30) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - self._check_ticks_props(axes, xrot=30) - - tm.close() - # make sure kwargs to hist are handled - xf, yf = 20, 18 - xrot, yrot = 30, 40 - axes = plotting.grouped_hist(df.A, by=df.C, normed=True, - cumulative=True, bins=4, - xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - # height of last bin (index 5) must be 1.0 - for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] - height = rects[-1].get_height() - self.assertAlmostEqual(height, 1.0) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - - tm.close() - axes = plotting.grouped_hist(df.A, by=df.C, log=True) - # scale of y must be 'log' - self._check_ax_scales(axes, yaxis='log') - - tm.close() - # propagate attr exception from matplotlib.Axes.hist - with tm.assertRaises(AttributeError): - plotting.grouped_hist(df.A, by=df.C, foo='bar') - - with tm.assert_produces_warning(FutureWarning): - df.hist(by='C', figsize='default') - - @slow - def test_grouped_hist_legacy2(self): - n = 10 - weight = Series(np.random.normal(166, 20, size=n)) - height = Series(np.random.normal(60, 10, size=n)) - with tm.RNGContext(42): - gender_int = np.random.choice([0, 1], size=n) - df_int = DataFrame({'height': height, 'weight': weight, - 'gender': gender_int}) - gb = df_int.groupby('gender') - axes = gb.hist() - self.assertEqual(len(axes), 2) - self.assertEqual(len(self.plt.get_fignums()), 2) - tm.close() - - @slow - def test_grouped_box_return_type(self): - df = self.hist_df - - # old style: return_type=None - result = df.boxplot(by='gender') - self.assertIsInstance(result, np.ndarray) - self._check_box_return_type( - result, None, - expected_keys=['height', 'weight', 'category']) - - # now for groupby - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.groupby('gender').boxplot() - self._check_box_return_type( - result, 'dict', expected_keys=['Male', 'Female']) - - columns2 = 'X B C D A G Y N Q O'.split() - df2 = DataFrame(random.randn(50, 10), columns=columns2) - categories2 = 'A B C D E F G H I J'.split() - df2['category'] = categories2 * 5 - - for t in ['dict', 'axes', 'both']: - returned = df.groupby('classroom').boxplot(return_type=t) - self._check_box_return_type( - returned, t, expected_keys=['A', 'B', 'C']) - - returned = df.boxplot(by='classroom', return_type=t) - self._check_box_return_type( - returned, t, - expected_keys=['height', 'weight', 'category']) - - returned = df2.groupby('category').boxplot(return_type=t) - self._check_box_return_type(returned, t, expected_keys=categories2) - - returned = df2.boxplot(by='category', return_type=t) - self._check_box_return_type(returned, t, expected_keys=columns2) - - @slow - def test_grouped_box_layout(self): - df = self.hist_df - - self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], - 
by=df.gender, layout=(1, 1)) - self.assertRaises(ValueError, df.boxplot, - column=['height', 'weight', 'category'], - layout=(2, 1), return_type='dict') - self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], - by=df.gender, layout=(-1, -1)) - - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('gender').boxplot, - column='height', return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) - - with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) - - # GH 6769 - with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('classroom').boxplot, - column='height', return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - - # GH 5897 - axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', - return_type='axes') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - for ax in [axes['height']]: - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible([ax.xaxis.get_label()], visible=False) - for ax in [axes['weight'], axes['category']]: - self._check_visible(ax.get_xticklabels()) - self._check_visible([ax.xaxis.get_label()]) - - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - - with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, 2), return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, -1), return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(4, 1)) - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) - - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(-1, 1)) - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) - - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], layout=(1, 4), - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) - - box = df.groupby('classroom').boxplot( # noqa - column=['height', 'weight', 'category'], layout=(1, -1), - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) - - @slow - def test_grouped_box_multiple_axes(self): - # GH 6970, GH 7069 - df = self.hist_df - - # check warning to ignore sharex / sharey - # this check should be done in the first function which - # passes multiple axes to plot, hist or boxplot - # location should be changed if other test is added - # which has earlier alphabetical order - with tm.assert_produces_warning(UserWarning): - fig, axes = self.plt.subplots(2, 2) - df.groupby('category').boxplot( - column='height', return_type='axes', ax=axes) - self._check_axes_shape(self.plt.gcf().axes, - axes_num=4, layout=(2, 2)) - - fig, axes = self.plt.subplots(2, 3) - with tm.assert_produces_warning(UserWarning): - returned = 
df.boxplot(column=['height', 'weight', 'category'], - by='gender', return_type='axes', ax=axes[0]) - returned = np.array(list(returned.values())) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[0]) - self.assertIs(returned[0].figure, fig) - - # draw on second row - with tm.assert_produces_warning(UserWarning): - returned = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], - return_type='axes', ax=axes[1]) - returned = np.array(list(returned.values())) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[1]) - self.assertIs(returned[0].figure, fig) - - with tm.assertRaises(ValueError): - fig, axes = self.plt.subplots(2, 3) - # pass different number of axes from required - with tm.assert_produces_warning(UserWarning): - axes = df.groupby('classroom').boxplot(ax=axes) - - @slow - def test_grouped_hist_layout(self): - df = self.hist_df - self.assertRaises(ValueError, df.hist, column='weight', by=df.gender, - layout=(1, 1)) - self.assertRaises(ValueError, df.hist, column='height', by=df.category, - layout=(1, 3)) - self.assertRaises(ValueError, df.hist, column='height', by=df.category, - layout=(-1, -1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, 1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, -1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - axes = df.hist(column='height', by=df.category, layout=(4, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = df.hist(column='height', by=df.category, layout=(-1, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = df.hist(column='height', by=df.category, - layout=(4, 2), figsize=(12, 8)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) - tm.close() - - # GH 6769 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.hist, column='height', by='classroom', layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - # without column - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, by='classroom') - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - axes = df.hist(by='gender', layout=(3, 5)) - self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) - - axes = df.hist(column=['height', 'weight', 'category']) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - @slow - def test_grouped_hist_multiple_axes(self): - # GH 6970, GH 7069 - df = self.hist_df - - fig, axes = self.plt.subplots(2, 3) - returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[0]) - self.assertIs(returned[0].figure, fig) - returned = df.hist(by='classroom', ax=axes[1]) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[1]) - self.assertIs(returned[0].figure, fig) - - with tm.assertRaises(ValueError): - fig, axes = self.plt.subplots(2, 3) - # pass different number of axes from required - axes = df.hist(column='height', ax=axes) - - @slow - def test_axis_share_x(self): - df = self.hist_df - # GH4089 - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) - 
- # share x - self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) - - # don't share y - self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2)) - self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2)) - - @slow - def test_axis_share_y(self): - df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) - - # share y - self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) - - # don't share x - self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2)) - self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2)) - - @slow - def test_axis_share_xy(self): - df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, - sharey=True) - - # share both x and y - self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) - - self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/setup.py b/setup.py index 58965fe9ae6d6..937b3509cf493 100755 --- a/setup.py +++ b/setup.py @@ -571,6 +571,7 @@ def pxd(name): 'pandas.tests.formats', 'pandas.tests.types', 'pandas.tests.test_msgpack', + 'pandas.tests.plotting', 'pandas.tools', 'pandas.tools.tests', 'pandas.tseries', From e533947cf828fbc75d6b754dbe0e3fa862b1647a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 23 Jul 2016 22:08:41 +0900 Subject: [PATCH 146/359] BUG: DatetimeIndex with nanosecond frequency does not include end - [x] closes #13672 - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` - [x] whatsnew entry Author: sinhrks Closes #13762 from sinhrks/date_range_nano and squashes the following commits: 0e63d70 [sinhrks] BUG: DatetimeIndex with nanosecond frequency does not include end --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/index.py | 3 +- pandas/tseries/tests/test_period.py | 9 ++++ pandas/tseries/tests/test_timeseries.py | 65 +++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index edca4289167e5..721da38baf67d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -746,6 +746,7 @@ Bug Fixes - Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) - Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) - Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) +- Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index b87e9738b02ee..a1775c11d2226 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1964,7 +1964,8 @@ def _generate_regular_range(start, end, periods, offset): b = Timestamp(start).value # cannot just use e = Timestamp(end) + 1 because arange breaks when 
# stride is too large, see GH10887 - e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + e = (b + (Timestamp(end).value - b) // stride * stride + + stride // 2 + 1) # end.tz == start.tz by this point due to _generate implementation tz = start.tz elif start is not None: diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 0b0ee012a2f30..7077a61092b9e 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1573,6 +1573,15 @@ def test_constructor_U(self): self.assertRaises(ValueError, period_range, '2007-1-1', periods=500, freq='X') + def test_constructor_nano(self): + idx = period_range(start=Period(ordinal=1, freq='N'), + end=Period(ordinal=4, freq='N'), freq='N') + exp = PeriodIndex([Period(ordinal=1, freq='N'), + Period(ordinal=2, freq='N'), + Period(ordinal=3, freq='N'), + Period(ordinal=4, freq='N')], freq='N') + tm.assert_index_equal(idx, exp) + def test_constructor_arrays_negative_year(self): years = np.arange(1960, 2000, dtype=np.int64).repeat(4) quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 2a9696503eaa5..511987e4db886 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1075,6 +1075,71 @@ def test_to_datetime_freq(self): self.assertEqual(xp.freq, rs.freq) self.assertEqual(xp.tzinfo, rs.tzinfo) + def test_range_edges(self): + # GH 13672 + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'), + end=Timestamp('1970-01-01 00:00:00.000000004'), + freq='N') + exp = DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000004'), + end=Timestamp('1970-01-01 00:00:00.000000001'), + freq='N') + exp = DatetimeIndex([]) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'), + end=Timestamp('1970-01-01 00:00:00.000000001'), + freq='N') + exp = DatetimeIndex(['1970-01-01 00:00:00.000000001']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000001'), + end=Timestamp('1970-01-01 00:00:00.000004'), + freq='U') + exp = DatetimeIndex(['1970-01-01 00:00:00.000001', + '1970-01-01 00:00:00.000002', + '1970-01-01 00:00:00.000003', + '1970-01-01 00:00:00.000004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.001'), + end=Timestamp('1970-01-01 00:00:00.004'), + freq='L') + exp = DatetimeIndex(['1970-01-01 00:00:00.001', + '1970-01-01 00:00:00.002', + '1970-01-01 00:00:00.003', + '1970-01-01 00:00:00.004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:01'), + end=Timestamp('1970-01-01 00:00:04'), freq='S') + exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02', + '1970-01-01 00:00:03', '1970-01-01 00:00:04']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:01'), + end=Timestamp('1970-01-01 00:04'), freq='T') + exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02', + '1970-01-01 00:03', '1970-01-01 00:04']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 01:00'), + end=Timestamp('1970-01-01 04:00'), freq='H') + exp = DatetimeIndex(['1970-01-01 
01:00', '1970-01-01 02:00', + '1970-01-01 03:00', '1970-01-01 04:00']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01'), + end=Timestamp('1970-01-04'), freq='D') + exp = DatetimeIndex(['1970-01-01', '1970-01-02', + '1970-01-03', '1970-01-04']) + tm.assert_index_equal(idx, exp) + def test_range_misspecified(self): # GH #1095 From 5a3b0710fb8ea94f1b61543afc3280dc8bc9ae14 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 23 Jul 2016 22:12:30 +0900 Subject: [PATCH 147/359] BUG: union_categoricals can't handle NaN - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` - [x] whatsnew not needed ``union_categoricals`` doesn't handle ``NaN`` properly. **on current master:** ``` from pandas.types.concat import union_categoricals union_categoricals([pd.Categorical([np.nan, 1]), pd.Categorical([2, np.nan])]) # [1, 1, 2, 2] # Categories (2, int64): [1, 2] union_categoricals([pd.Categorical([np.nan]), pd.Categorical([np.nan])]) # IndexError: cannot do a non-empty take from an empty axes. ``` Author: sinhrks Closes #13759 from sinhrks/union_categoricals_nan and squashes the following commits: 4312a32 [sinhrks] BUG: union_categoricals can't handle NaN --- pandas/tools/tests/test_concat.py | 53 +++++++++++++++++++++++++++++++ pandas/types/concat.py | 14 +++++--- 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 568cf63c02e30..13c6b72ade27b 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -889,6 +889,59 @@ def test_union_categorical(self): with tm.assertRaises(ValueError): union_categoricals([]) + def test_union_categoricals_nan(self): + # GH 13759 + res = union_categoricals([pd.Categorical([1, 2, np.nan]), + pd.Categorical([3, 2, np.nan])]) + exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical(['A', 'B']), + pd.Categorical(['B', 'B', np.nan])]) + exp = Categorical(['A', 'B', 'B', 'B', np.nan]) + tm.assert_categorical_equal(res, exp) + + val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), + pd.NaT] + val2 = [pd.NaT, pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-02-01')] + + res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) + exp = Categorical(val1 + val2, + categories=[pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-03-01'), + pd.Timestamp('2011-02-01')]) + tm.assert_categorical_equal(res, exp) + + # all NaN + res = union_categoricals([pd.Categorical([np.nan, np.nan]), + pd.Categorical(['X'])]) + exp = Categorical([np.nan, np.nan, 'X']) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical([np.nan, np.nan]), + pd.Categorical([np.nan, np.nan])]) + exp = Categorical([np.nan, np.nan, np.nan, np.nan]) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_empty(self): + # GH 13759 + res = union_categoricals([pd.Categorical([]), + pd.Categorical([])]) + exp = Categorical([]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical([]), + pd.Categorical([1.0])]) + exp = Categorical([1.0]) + tm.assert_categorical_equal(res, exp) + + # to make dtype equal + nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) + res = union_categoricals([nanc, + pd.Categorical([])]) + tm.assert_categorical_equal(res, nanc) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git 
a/pandas/types/concat.py b/pandas/types/concat.py index 3b30531fb30ac..c8af0ec62db86 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -6,6 +6,7 @@ import pandas.tslib as tslib from pandas import compat from pandas.compat import map +from pandas.core.algorithms import take_1d from .common import (is_categorical_dtype, is_sparse, is_datetimetz, @@ -254,10 +255,15 @@ def union_categoricals(to_union): new_codes = [] for c in to_union: - indexer = categories.get_indexer(c.categories) - new_codes.append(indexer.take(c.codes)) - codes = np.concatenate(new_codes) - return Categorical(codes, categories=categories, ordered=False, + if len(c.categories) > 0: + indexer = categories.get_indexer(c.categories) + new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) + else: + # must be all NaN + new_codes.append(c.codes) + + new_codes = np.concatenate(new_codes) + return Categorical(new_codes, categories=categories, ordered=False, fastpath=True) From 1cd1026ed7061882a05521a7c16200eadae591e4 Mon Sep 17 00:00:00 2001 From: Jim Crist Date: Sat, 23 Jul 2016 10:37:53 -0500 Subject: [PATCH 148/359] Groupby getitem works with all index types (#13731) Previously `df.groupby(0)[df.columns]` would fail if all column names were integers (meaning `df.columns` was an `Int64Index`). This was because the implementation of `__getitem__` in `SelectionMixin` was checking for `ABCIndex` when it probably should have checked for `ABCIndexClass`. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/base.py | 6 +++--- pandas/tests/test_groupby.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 721da38baf67d..bafb351b2f678 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -750,6 +750,7 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) +- Bug in ``df.groupby(...)[...]`` where getitem with ``Int64Index`` raised an error (:issue:`13731`) - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) diff --git a/pandas/core/base.py b/pandas/core/base.py index a0dfebdfde356..8c150d9fbb07e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -6,7 +6,7 @@ import numpy as np from pandas.types.missing import isnull -from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndex +from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass from pandas.types.common import (_ensure_object, is_object_dtype, is_list_like, is_scalar) @@ -299,7 +299,7 @@ def name(self): @property def _selection_list(self): if not isinstance(self._selection, (list, tuple, ABCSeries, - ABCIndex, np.ndarray)): + ABCIndexClass, np.ndarray)): return [self._selection] return self._selection @@ -330,7 +330,7 @@ def __getitem__(self, key): if self._selection is not None: raise Exception('Column(s) %s already selected' % self._selection) - if isinstance(key, (list, tuple, ABCSeries, ABCIndex, + if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = 
list(set(key).difference(self.obj.columns)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 258f36cb1b68f..3f5b4152afe31 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3769,6 +3769,22 @@ def test_getitem_list_of_columns(self): assert_frame_equal(result2, expected) assert_frame_equal(result3, expected) + def test_getitem_numeric_column_names(self): + # GH #13731 + df = DataFrame({0: list('abcd') * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8)}) + result = df.groupby(0)[df.columns[1:3]].mean() + result2 = df.groupby(0)[2, 4].mean() + result3 = df.groupby(0)[[2, 4]].mean() + + expected = df.ix[:, [0, 2, 4]].groupby(0).mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + def test_agg_multiple_functions_maintain_order(self): # GH #610 funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] From b23ec91fece6d7c3c8ffb74bdfaf8aef8b959175 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Sat, 23 Jul 2016 09:25:20 -0700 Subject: [PATCH 149/359] Closes #13688: added scroll to top button to footer block in theme (#13689) --- .../themes/nature_with_gtoc/layout.html | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html index fd0755e096023..ddf1e861f5f81 100644 --- a/doc/source/themes/nature_with_gtoc/layout.html +++ b/doc/source/themes/nature_with_gtoc/layout.html @@ -61,3 +61,37 @@

{{ _('Search') }}

{%- endblock %} + +{%- block footer %} + +Scroll To Top + +{% endblock %} \ No newline at end of file From b60e42b02040624f319cadd60a99fe50eb62d62e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 23 Jul 2016 21:28:48 +0200 Subject: [PATCH 150/359] ENH: Allow to_sql to recognize single sql type (GH11886) (#13614) --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/io/sql.py | 21 +++++++++++++++------ pandas/io/tests/test_sql.py | 16 ++++++++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index bafb351b2f678..646e8822ed46f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -310,6 +310,7 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) +- ``DataFrame.to_sql `` now allows a single value as the SQL type for all columns (:issue:`11886`). - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) @@ -322,7 +323,6 @@ Other enhancements index=['row1', 'row2']) df.sort_values(by='row2', axis=1) - .. _whatsnew_0190.api: API changes diff --git a/pandas/io/sql.py b/pandas/io/sql.py index dfc9e80aa27d1..49f277f6ba7bc 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -14,7 +14,7 @@ import pandas.lib as lib from pandas.types.missing import isnull from pandas.types.dtypes import DatetimeTZDtype -from pandas.types.common import (is_list_like, +from pandas.types.common import (is_list_like, is_dict_like, is_datetime64tz_dtype) from pandas.compat import (map, zip, raise_with_traceback, @@ -448,9 +448,10 @@ def to_sql(frame, name, con, flavor=None, schema=None, if_exists='fail', chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single SQLtype or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be a SQLAlchemy type, or a string for sqlite3 fallback connection. + If all columns are of the same type, one single value can be used. """ if if_exists not in ('fail', 'replace', 'append'): @@ -1121,11 +1122,15 @@ def to_sql(self, frame, name, if_exists='fail', index=True, chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type. + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. 
""" + if dtype and not is_dict_like(dtype): + dtype = {col_name: dtype for col_name in frame} + if dtype is not None: from sqlalchemy.types import to_instance, TypeEngine for col, my_type in dtype.items(): @@ -1473,11 +1478,15 @@ def to_sql(self, frame, name, if_exists='fail', index=True, chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a string. + be a string. If all columns are of the same type, one single value + can be used. """ + if dtype and not is_dict_like(dtype): + dtype = {col_name: dtype for col_name in frame} + if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index f4001420a77b6..21c3ea416e091 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -1537,6 +1537,15 @@ def test_dtype(self): self.assertTrue(isinstance(sqltype, sqlalchemy.String)) self.assertEqual(sqltype.length, 10) + # single dtype + df.to_sql('single_dtype_test', self.conn, dtype=sqlalchemy.TEXT) + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + sqltypea = meta.tables['single_dtype_test'].columns['A'].type + sqltypeb = meta.tables['single_dtype_test'].columns['B'].type + self.assertTrue(isinstance(sqltypea, sqlalchemy.TEXT)) + self.assertTrue(isinstance(sqltypeb, sqlalchemy.TEXT)) + def test_notnull_dtype(self): cols = {'Bool': Series([True, None]), 'Date': Series([datetime(2012, 5, 1), None]), @@ -2006,6 +2015,13 @@ def test_dtype(self): self.assertRaises(ValueError, df.to_sql, 'error', self.conn, dtype={'B': bool}) + # single dtype + df.to_sql('single_dtype_test', self.conn, dtype='STRING') + self.assertEqual( + self._get_sqlite_column_type('single_dtype_test', 'A'), 'STRING') + self.assertEqual( + self._get_sqlite_column_type('single_dtype_test', 'B'), 'STRING') + def test_notnull_dtype(self): if self.flavor == 'mysql': raise nose.SkipTest('Not applicable to MySQL legacy') From f919b9e72ec488f1aee08831d4464f86db398571 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Sun, 24 Jul 2016 22:50:32 +0900 Subject: [PATCH 151/359] PEP8: test/indexes/base (#13771) --- pandas/tests/indexes/test_base.py | 131 +++++++++++++++++------------- 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index cc5dd24292bb8..0ddc71b01c22a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2,9 +2,6 @@ from datetime import datetime, timedelta -# TODO(wesm): fix long line flake8 issues -# flake8: noqa - import pandas.util.testing as tm from pandas.indexes.api import Index, MultiIndex from .common import Base @@ -286,7 +283,8 @@ def test_constructor_dtypes(self): for idx in [Index(np.array([True, False, True], dtype=bool)), Index([True, False, True]), - Index(np.array([True, False, True], dtype=bool), dtype=bool), + Index(np.array([True, False, True], dtype=bool), + dtype=bool), Index([True, False, True], dtype=bool)]: self.assertIsInstance(idx, Index) self.assertEqual(idx.dtype, object) @@ -294,8 +292,10 @@ def test_constructor_dtypes(self): for idx in [Index(np.array([1, 2, 3], dtype=int), dtype='category'), Index([1, 2, 3], dtype='category'), 
Index(np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')]), dtype='category'), - Index([datetime(2011, 1, 1), datetime(2011, 1, 2)], dtype='category')]: + np_datetime64_compat('2011-01-02')]), + dtype='category'), + Index([datetime(2011, 1, 1), datetime(2011, 1, 2)], + dtype='category')]: self.assertIsInstance(idx, CategoricalIndex) for idx in [Index(np.array([np_datetime64_compat('2011-01-01'), @@ -304,7 +304,8 @@ def test_constructor_dtypes(self): self.assertIsInstance(idx, DatetimeIndex) for idx in [Index(np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')]), dtype=object), + np_datetime64_compat('2011-01-02')]), + dtype=object), Index([datetime(2011, 1, 1), datetime(2011, 1, 2)], dtype=object)]: self.assertNotIsInstance(idx, DatetimeIndex) @@ -483,10 +484,9 @@ def test_nanosecond_index_access(self): # self.assertEqual(first_value, # x['2013-01-01 00:00:00.000000050+0000']) - self.assertEqual( - first_value, - x[Timestamp(np_datetime64_compat('2013-01-01 00:00:00.000000050+0000', - 'ns'))]) + exp_ts = np_datetime64_compat('2013-01-01 00:00:00.000000050+0000', + 'ns') + self.assertEqual(first_value, x[Timestamp(exp_ts)]) def test_comparators(self): index = self.dateIndex @@ -1585,41 +1585,47 @@ def test_string_index_repr(self): expected = u"""Index(['あ', 'いい', 'ううう'], dtype='object')""" self.assertEqual(repr(idx), expected) else: - expected = u"""\ -Index([u'あ', u'いい', u'ううう'], dtype='object')""" + expected = u"""Index([u'あ', u'いい', u'ううう'], dtype='object')""" self.assertEqual(coerce(idx), expected) # multiple lines idx = pd.Index([u'あ', u'いい', u'ううう'] * 10) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - dtype='object')""" - + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + u"'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + u"'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう'],\n" + u" dtype='object')") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], - dtype='object')""" - + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい', u'ううう', u'あ',\n" + u" u'いい', u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ', u'いい', u'ううう', u'あ', " + u"u'いい', u'ううう', u'あ', u'いい', u'ううう'],\n" + u" dtype='object')") self.assertEqual(coerce(idx), expected) # truncated idx = pd.Index([u'あ', u'いい', u'ううう'] * 100) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', - ... - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - dtype='object', length=300)""" - + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + u"'あ', 'いい', 'ううう', 'あ',\n" + u" ...\n" + u" 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - ... 
- u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], - dtype='object', length=300)""" + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい', u'ううう', u'あ',\n" + u" ...\n" + u" u'ううう', u'あ', u'いい', u'ううう', u'あ', " + u"u'いい', u'ううう', u'あ', u'いい', u'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(coerce(idx), expected) @@ -1629,49 +1635,62 @@ def test_string_index_repr(self): # short idx = pd.Index([u'あ', u'いい', u'ううう']) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう'], dtype='object')""" + expected = (u"Index(['あ', 'いい', 'ううう'], " + u"dtype='object')") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう'], dtype='object')""" + expected = (u"Index([u'あ', u'いい', u'ううう'], " + u"dtype='object')") self.assertEqual(coerce(idx), expected) # multiple lines idx = pd.Index([u'あ', u'いい', u'ううう'] * 10) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう'], - dtype='object')""" + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう'],\n" + u" dtype='object')""") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], - dtype='object')""" + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう', u'あ',\n" + u" u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう'],\n" + u" dtype='object')") self.assertEqual(coerce(idx), expected) # truncated idx = pd.Index([u'あ', u'いい', u'ううう'] * 100) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', - ... - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう'], - dtype='object', length=300)""" + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ',\n" + u" ...\n" + u" 'ううう', 'あ', 'いい', 'ううう', 'あ', " + u"'いい', 'ううう', 'あ', 'いい',\n" + u" 'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', - ... 
- u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - u'いい', u'ううう'], - dtype='object', length=300)""" + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ',\n" + u" ...\n" + u" u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう', u'あ',\n" + u" u'いい', u'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(coerce(idx), expected) From 0067b90e593b4ef87c4ef4bc0a65bf4c4fd3d6b3 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Sun, 24 Jul 2016 22:51:12 +0900 Subject: [PATCH 152/359] BUG: value_counts may raise OutOfBoundsDatetime (#13772) --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/core/series.py | 12 +++++++----- pandas/indexes/base.py | 8 +++++--- pandas/tests/indexes/test_datetimelike.py | 16 ++++++++++++++-- pandas/tests/test_algos.py | 18 +++++++++++++++--- 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 646e8822ed46f..5f9ac3cc600fc 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -747,6 +747,8 @@ Bug Fixes - Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) - Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) +- Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) +- Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/core/series.py b/pandas/core/series.py index c3f5b1b8e641c..e1cff96b9741e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -291,11 +291,13 @@ def _set_axis(self, axis, labels, fastpath=False): if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - labels = DatetimeIndex(labels) - - # need to set here becuase we changed the index - if fastpath: - self._data.set_axis(axis, labels) + try: + labels = DatetimeIndex(labels) + # need to set here becuase we changed the index + if fastpath: + self._data.set_axis(axis, labels) + except tslib.OutOfBoundsDatetime: + pass self._set_subtyp(is_all_dates) object.__setattr__(self, '_index', labels) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 850d049ef9f45..b5ce456bda254 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -258,13 +258,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass elif inferred != 'string': if inferred.startswith('datetime'): - if (lib.is_datetime_with_singletz_array(subarr) or 'tz' in kwargs): # only when subarr has the same tz from pandas.tseries.index import DatetimeIndex - return DatetimeIndex(subarr, copy=copy, name=name, - **kwargs) + try: + return DatetimeIndex(subarr, copy=copy, + name=name, **kwargs) + except tslib.OutOfBoundsDatetime: + pass elif inferred.startswith('timedelta'): from pandas.tseries.tdi import TimedeltaIndex diff --git a/pandas/tests/indexes/test_datetimelike.py 
b/pandas/tests/indexes/test_datetimelike.py index 378e8c545ec83..9371bef8b8f2e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from datetime import timedelta, time +from datetime import datetime, timedelta, time import numpy as np @@ -12,7 +12,7 @@ import pandas.util.testing as tm import pandas as pd -from pandas.lib import Timestamp +from pandas.tslib import Timestamp, OutOfBoundsDatetime from .common import Base @@ -336,6 +336,18 @@ def test_construction_base_constructor(self): tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) + def test_construction_outofbounds(self): + # GH 13663 + dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1)] + exp = Index(dates, dtype=object) + # coerces to object + tm.assert_index_equal(Index(dates), exp) + + with tm.assertRaises(OutOfBoundsDatetime): + # can't create DatetimeIndex + DatetimeIndex(dates) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cf23d096d99ba..3c77d19aa7f3c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,7 +4,7 @@ import numpy as np from numpy.random import RandomState from numpy import nan -import datetime +from datetime import datetime from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd @@ -121,7 +121,7 @@ def test_mixed_integer(self): def test_unsortable(self): # GH 13714 - arr = np.array([1, 2, datetime.datetime.now(), 0, 3], dtype=object) + arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) if compat.PY2 and not pd._np_version_under1p10: # RuntimeWarning: tp_compare didn't return -1 or -2 for exception with tm.assert_produces_warning(RuntimeWarning): @@ -556,6 +556,18 @@ def test_value_counts_nat(self): tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) + def test_value_counts_datetime_outofbounds(self): + # GH 13663 + s = pd.Series([datetime(3000, 1, 1), datetime(5000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1), + datetime(3000, 1, 1), datetime(3000, 1, 1)]) + res = s.value_counts() + + exp_index = pd.Index([datetime(3000, 1, 1), datetime(5000, 1, 1), + datetime(6000, 1, 1)], dtype=object) + exp = pd.Series([3, 2, 1], index=exp_index) + tm.assert_series_equal(res, exp) + def test_categorical(self): s = Series(pd.Categorical(list('aaabbc'))) result = s.value_counts() @@ -818,7 +830,7 @@ def _check(arr): def test_pad_backfill_object_segfault(): old = np.array([], dtype='O') - new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') + new = np.array([datetime(2010, 12, 31)], dtype='O') result = _algos.pad_object(old, new) expected = np.array([-1], dtype=np.int64) From 7f02a46b973c1cfd2f6042e2d5a89922e38af5ef Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Sun, 24 Jul 2016 22:52:46 +0900 Subject: [PATCH 153/359] TST: add timedelta describe (#13769) --- pandas/tests/frame/test_analytics.py | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b71235a8f6576..370f3b5ee5b8b 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -295,6 +295,43 @@ def test_describe_datetime_columns(self): self.assertEqual(result.columns.freq, 'MS') self.assertEqual(result.columns.tz, expected.columns.tz) 
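+        # the repr should render the describe() values as Timedeltas
+        # (e.g. '3 days 00:00:00'), not as integer nanoseconds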
+ def test_describe_timedelta_values(self): + # GH 6145 + t1 = pd.timedelta_range('1 days', freq='D', periods=5) + t2 = pd.timedelta_range('1 hours', freq='H', periods=5) + df = pd.DataFrame({'t1': t1, 't2': t2}) + + expected = DataFrame({'t1': [5, pd.Timedelta('3 days'), + df.iloc[:, 0].std(), + pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days'), + pd.Timedelta('4 days'), + pd.Timedelta('5 days')], + 't2': [5, pd.Timedelta('3 hours'), + df.iloc[:, 1].std(), + pd.Timedelta('1 hours'), + pd.Timedelta('2 hours'), + pd.Timedelta('3 hours'), + pd.Timedelta('4 hours'), + pd.Timedelta('5 hours')]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + + res = df.describe() + tm.assert_frame_equal(res, expected) + + exp_repr = (" t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00") + self.assertEqual(repr(res), exp_repr) + def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame({ From 6cae23dae668e1e5ac465f5868557f00dfcb5e77 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 24 Jul 2016 14:58:09 +0900 Subject: [PATCH 154/359] BUG: datetime64[us] arrays with NaT cannot be cast to DatetimeIndex closes #13770 closes #9114 xref #13692 --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/tseries/tests/test_timeseries.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5f9ac3cc600fc..6fc87144a5da1 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -747,8 +747,10 @@ Bug Fixes - Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) - Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) + - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) +- Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 511987e4db886..7e17513a36394 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1595,6 +1595,17 @@ def test_dti_constructor_small_int(self): arr = np.array([0, 10, 20], dtype=dtype) tm.assert_index_equal(DatetimeIndex(arr), exp) + def test_dti_constructor_numpy_timeunits(self): + # GH 9114 + base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT']) + + for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', + 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: + values = 
base.values.astype(dtype) + + tm.assert_index_equal(DatetimeIndex(values), base) + tm.assert_index_equal(to_datetime(values), base) + def test_normalize(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D') From ee6c0cdbca17fb4a852fc8099e79c49faa662687 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 24 Jul 2016 09:55:42 -0400 Subject: [PATCH 155/359] BUG: Fix segfault in lib.isnullobj Weird segfault arises when you call `lib.isnullobj` with an array that uses 0-field values to mean `None`. Changed input to be a `Python` object (i.e. no typing), and the segfault went away. Discovered when there were segfaults in printing a `DataFrame` containing such an array. Closes #13717. Author: gfyoung Closes #13764 from gfyoung/isnullobj-segfault and squashes the following commits: 0338b5d [gfyoung] BUG: Fix segfault in lib.isnullobj --- asv_bench/benchmarks/frame_methods.py | 16 +++++++++ doc/source/whatsnew/v0.19.0.txt | 1 + pandas/lib.pyx | 16 ++++++--- pandas/tests/test_lib.py | 50 +++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 5c5a1df4ea1f8..a21dee2e612d2 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,4 +1,5 @@ from .pandas_vb_common import * +import string class frame_apply_axis_1(object): @@ -606,6 +607,21 @@ def time_frame_isnull(self): isnull(self.df) +class frame_isnull_strings(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + self.sample = np.array(list(string.ascii_lowercase) + + list(string.ascii_uppercase) + + list(string.whitespace)) + self.data = np.random.choice(self.sample, (1000, 1000)) + self.df = DataFrame(self.data) + + def time_frame_isnull(self): + isnull(self.df) + + class frame_isnull_obj(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 6fc87144a5da1..fdce84817e436 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -694,6 +694,7 @@ Bug Fixes - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) - Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) +- Bug in printing ``pd.DataFrame`` where unusual elements with the ``object`` dtype were causing segfaults (:issue:`13717`) - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) - Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 7cbb502315b64..bf1dd1246120b 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -342,11 +342,13 @@ def item_from_zerodim(object val): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj(ndarray[object] arr): +def isnullobj(ndarray arr): cdef Py_ssize_t i, n cdef object val cdef ndarray[uint8_t] result + assert arr.ndim == 1, "'arr' must be 1-D." 
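In user-facing terms, the segfault described above surfaced through ``pd.isnull`` and ``DataFrame`` printing rather than through ``lib.isnullobj`` directly. A minimal sketch of the now-safe call path (a hypothetical session, assuming a pandas build that includes this fix)::

    import numpy as np
    import pandas as pd

    # An object array whose single element is "empty" -- the pattern from
    # gh-13717 that previously crashed lib.isnullobj.
    arr = np.empty_like([None])

    # Both of these route through isnullobj internally and now return
    # normally instead of segfaulting.
    pd.isnull(arr)            # array([ True])
    print(pd.DataFrame(arr))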
+ n = len(arr) result = np.empty(n, dtype=np.uint8) for i from 0 <= i < n: @@ -356,11 +358,13 @@ def isnullobj(ndarray[object] arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj_old(ndarray[object] arr): +def isnullobj_old(ndarray arr): cdef Py_ssize_t i, n cdef object val cdef ndarray[uint8_t] result + assert arr.ndim == 1, "'arr' must be 1-D." + n = len(arr) result = np.zeros(n, dtype=np.uint8) for i from 0 <= i < n: @@ -370,11 +374,13 @@ def isnullobj_old(ndarray[object] arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj2d(ndarray[object, ndim=2] arr): +def isnullobj2d(ndarray arr): cdef Py_ssize_t i, j, n, m cdef object val cdef ndarray[uint8_t, ndim=2] result + assert arr.ndim == 2, "'arr' must be 2-D." + n, m = ( arr).shape result = np.zeros((n, m), dtype=np.uint8) for i from 0 <= i < n: @@ -386,11 +392,13 @@ def isnullobj2d(ndarray[object, ndim=2] arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj2d_old(ndarray[object, ndim=2] arr): +def isnullobj2d_old(ndarray arr): cdef Py_ssize_t i, j, n, m cdef object val cdef ndarray[uint8_t, ndim=2] result + assert arr.ndim == 2, "'arr' must be 2-D." + n, m = ( arr).shape result = np.zeros((n, m), dtype=np.uint8) for i from 0 <= i < n: diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 84d7226f1b2f5..80b5e41e881cd 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import numpy as np +import pandas as pd import pandas.lib as lib import pandas.util.testing as tm @@ -184,6 +185,55 @@ def test_get_reverse_indexer(self): self.assertTrue(np.array_equal(result, expected)) +class TestNullObj(tm.TestCase): + + _1d_methods = ['isnullobj', 'isnullobj_old'] + _2d_methods = ['isnullobj2d', 'isnullobj2d_old'] + + def _check_behavior(self, arr, expected): + for method in TestNullObj._1d_methods: + result = getattr(lib, method)(arr) + tm.assert_numpy_array_equal(result, expected) + + arr = np.atleast_2d(arr) + expected = np.atleast_2d(expected) + + for method in TestNullObj._2d_methods: + result = getattr(lib, method)(arr) + tm.assert_numpy_array_equal(result, expected) + + def test_basic(self): + arr = np.array([1, None, 'foo', -5.1, pd.NaT, np.nan]) + expected = np.array([False, True, False, False, True, True]) + + self._check_behavior(arr, expected) + + def test_non_obj_dtype(self): + arr = np.array([1, 3, np.nan, 5], dtype=float) + expected = np.array([False, False, True, False]) + + self._check_behavior(arr, expected) + + def test_empty_arr(self): + arr = np.array([]) + expected = np.array([], dtype=bool) + + self._check_behavior(arr, expected) + + def test_empty_str_inp(self): + arr = np.array([""]) # empty but not null + expected = np.array([False]) + + self._check_behavior(arr, expected) + + def test_empty_like(self): + # see gh-13717: no segfaults! + arr = np.empty_like([None]) + expected = np.array([True]) + + self._check_behavior(arr, expected) + + def test_duplicated_with_nas(): keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) From 6efd743ccbf2ef6c13ea0c71b7b2e2a022a99455 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Thu, 2 Jun 2016 16:51:05 -0700 Subject: [PATCH 156/359] ENH: Adding lines to read_json closes #9180 closes #13356 closes #13351 Author: Andy R. 
Terrel --- doc/source/io.rst | 23 +++++++++++ doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/core/generic.py | 12 +++++- pandas/io/json.py | 63 +++++++++++++++++++++++++---- pandas/io/tests/json/test_pandas.py | 52 ++++++++++++++++++++++++ 5 files changed, 142 insertions(+), 11 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 113afa32d182e..86da2561a36be 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1466,6 +1466,7 @@ with optional parameters: - ``force_ascii`` : force encoded string to be ASCII, default True. - ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. - ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. +- ``lines`` : If ``records`` orient, then will write each record per line as json. Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters. @@ -1656,6 +1657,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` None. By default the timestamp precision will be detected, if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to seconds, milliseconds, microseconds or nanoseconds respectively. +- ``lines`` : reads file as one json object per line. +- ``encoding`` : The encoding to use to decode py3 bytes. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -1845,6 +1848,26 @@ into a flat table. json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) +.. _io.jsonl: + +Line delimited json +''''''''''''''''''' + +.. versionadded:: 0.19.0 + +pandas is able to read and write line-delimited json files that are common in data processing pipelines +using Hadoop or Spark. + +.. ipython:: python + + jsonl = ''' + {"a":1,"b":2} + {"a":3,"b":4} + ''' + df = pd.read_json(jsonl, lines=True) + df + df.to_json(orient='records', lines=True) + HTML ---- diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index fdce84817e436..8f369d17e9230 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -325,6 +325,7 @@ Other enhancements .. _whatsnew_0190.api: + API changes ~~~~~~~~~~~ @@ -344,7 +345,7 @@ API changes - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - +- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) .. 
_whatsnew_0190.api.tolist: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e7a098351a0ab..005d5467c14cd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1017,7 +1017,7 @@ def __setstate__(self, state): def to_json(self, path_or_buf=None, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): """ Convert the object to a JSON string. @@ -1065,6 +1065,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. + lines : boolean, defalut False + If 'orient' is 'records' write out line delimited json format. Will + throw ValueError if incorrect 'orient' since others are not list + like. + + .. versionadded:: 0.19.0 + Returns ------- @@ -1077,7 +1084,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, - default_handler=default_handler) + default_handler=default_handler, + lines=lines) def to_hdf(self, path_or_buf, key, **kwargs): """Activate the HDFStore. diff --git a/pandas/io/json.py b/pandas/io/json.py index fd97e51208f7e..e697351484f68 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -7,22 +7,25 @@ import pandas.json as _json from pandas.tslib import iNaT -from pandas.compat import long, u +from pandas.compat import StringIO, long, u from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, _get_handle from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing loads = _json.loads dumps = _json.dumps -# interface to/from - +# interface to/from def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): + + if lines and orient != 'records': + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -37,6 +40,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', else: raise NotImplementedError("'obj' should be a Series or a DataFrame") + if lines: + s = _convert_to_line_delimits(s) + if isinstance(path_or_buf, compat.string_types): with open(path_or_buf, 'w') as fh: fh.write(s) @@ -105,7 +111,8 @@ def _format_axes(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None): + numpy=False, precise_float=False, date_unit=None, encoding=None, + lines=False): """ Convert a JSON string to pandas object @@ -178,13 +185,23 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, is to try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds respectively. + lines : boolean, default False + Read the file as a json object per line. + + .. versionadded:: 0.19.0 + + encoding : str, default is 'utf-8' + The encoding to use to decode py3 bytes. + + .. 
versionadded:: 0.19.0 Returns ------- result : Series or DataFrame """ - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf) + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, + encoding=encoding) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -195,7 +212,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, exists = False if exists: - with open(filepath_or_buffer, 'r') as fh: + with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh: json = fh.read() else: json = filepath_or_buffer @@ -204,6 +221,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, else: json = filepath_or_buffer + if lines: + # If given a json lines file, we break the string into lines, add + # commas and put it in a json list to make a valid json object. + lines = list(StringIO(json.strip())) + json = u'[' + u','.join(lines) + u']' + obj = None if typ == 'frame': obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, @@ -574,6 +597,30 @@ def is_ok(col): # JSON normalization routines +def _convert_to_line_delimits(s): + """Helper function that converts json lists to line delimited json.""" + + # Determine we have a JSON list to turn to lines otherwise just return the + # json object, only lists can + if not s[0] == '[' and s[-1] == ']': + return s + s = s[1:-1] + num_open_brackets_seen = 0 + commas_to_replace = [] + for idx, char in enumerate(s): # iter through to find all + if char == ',': # commas that should be \n + if num_open_brackets_seen == 0: + commas_to_replace.append(idx) + elif char == '{': + num_open_brackets_seen += 1 + elif char == '}': + num_open_brackets_seen -= 1 + s_arr = np.array(list(s)) # Turn to an array to set + s_arr[commas_to_replace] = '\n' # all commas at once. 
+ s = ''.join(s_arr) + return s + + def nested_to_record(ds, prefix="", level=0): """a simplified json_normalize diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 9f8aedc2e399e..6516ced7b5fb7 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -948,6 +948,58 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + def test_read_jsonl(self): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + self.assertEqual(result, expected) + + def test_latin_encoding(self): + if compat.PY2: + self.assertRaisesRegexp( + TypeError, '\[unicode\] is not implemented as a table column') + return + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + + # not sure how to remove latin-1 from code in python 2 and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(Series(val, dtype=dtype)) + + def roundtrip(s, encoding='latin-1'): + with ensure_clean('test.json') as path: + s.to_json(path, encoding=encoding) + retr = read_json(path, encoding=encoding) + assert_series_equal(s, retr, check_categorical=False) + + for s in examples: + roundtrip(s) + if __name__ == '__main__': import nose From 964b7bba7f5878c79130479f75461c58dd0c4b3e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 24 Jul 2016 11:25:33 -0400 Subject: [PATCH 157/359] TST: skip .to_json with encoding test as not implemented yet xref #13351 xref #13774 --- pandas/io/tests/json/test_pandas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 6516ced7b5fb7..d5eca946a3e7e 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1,4 +1,5 @@ # pylint: disable-msg=W0612,E1101 +import nose from pandas.compat import range, lrange, StringIO, OrderedDict import os @@ -967,6 +968,9 @@ def test_latin_encoding(self): TypeError, '\[unicode\] is not implemented as a table column') return + # GH 13774 + raise nose.SkipTest("encoding not implemented in .to_json(), xref #13774") + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], [b'E\xc9, 17', b'a', b'b', b'c'], [b'EE, 17', b'', b'a', b'b', b'c'], From 42cc66d651d8070abc53a0c7d1dc30a777095923 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Mon, 25 Jul 2016 00:34:24 +0900 Subject: [PATCH 158/359] PERF/BUG: improve factorize for datetimetz (#13750) --- asv_bench/benchmarks/algorithms.py | 17 ++++++++++++++ asv_bench/benchmarks/period.py | 16 +++++++++++++ asv_bench/benchmarks/timeseries.py | 20 ++++++++-------- doc/source/whatsnew/v0.19.0.txt | 3 +++ pandas/core/algorithms.py | 5 ++-- pandas/tseries/tests/test_timeseries.py | 31 
+++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 13 deletions(-) create mode 100644 asv_bench/benchmarks/algorithms.py diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py new file mode 100644 index 0000000000000..310a4c5549e4f --- /dev/null +++ b/asv_bench/benchmarks/algorithms.py @@ -0,0 +1,17 @@ +import numpy as np +import pandas as pd + + +class algorithm(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + self.int = pd.Int64Index(np.arange(N).repeat(5)) + self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + + def time_int_factorize(self): + self.int.factorize() + + def time_float_factorize(self): + self.int.factorize() diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index c1b89ae1db75b..75b2c2dcacfed 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -9,6 +9,22 @@ def time_period_index(self): PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') +class period_setitem(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.df = DataFrame(index=range(len(self.rng))) + + def time_period_setitem(self): + self.df['col'] = self.rng + + class period_algorithm(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 2b0d098670858..fda6ebb4b437e 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -218,20 +218,20 @@ def time_dti_reset_index_tz(self): self.df.reset_index() -class period_setitem(object): +class datetime_algorithm(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/1990', freq='S', periods=20000) - self.df = DataFrame(index=range(len(self.rng))) + N = 100000 + self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) + self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, + tz='Asia/Tokyo').repeat(5) + + def time_dti_factorize(self): + self.dti.factorize() - def time_period_setitem(self): - self.df['col'] = self.rng + def time_dti_tz_factorize(self): + self.dti_tz.factorize() class timeseries_1min_5min_mean(object): diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8f369d17e9230..d7966e3382869 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -652,6 +652,8 @@ Performance Improvements - Improved performance of ``Index.difference`` (:issue:`12044`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) - Improved performance of hashing ``Period`` (:issue:`12817`) +- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`) + .. 
_whatsnew_0190.bug_fixes: @@ -738,6 +740,7 @@ Bug Fixes - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) +- Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5cc54e61f6b2a..96a8582102cc9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -293,7 +293,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): is_datetimetz_type = is_datetimetz(values) if is_datetimetz_type: values = DatetimeIndex(values) - vals = values.tz_localize(None) + vals = values.asi8 is_datetime = is_datetime64_dtype(vals) is_timedelta = is_timedelta64_dtype(vals) @@ -313,8 +313,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if is_datetimetz_type: # reset tz - uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( - values.tz) + uniques = values._shallow_copy(uniques) elif is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 7e17513a36394..7b9999bd05c83 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3826,6 +3826,37 @@ def test_factorize(self): self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + def test_factorize_tz(self): + # GH 13750 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) + idx = base.repeat(5) + + exp_arr = np.arange(100).repeat(5) + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(res, base) + + def test_factorize_dst(self): + # GH 13750 + idx = pd.date_range('2016-11-06', freq='H', periods=12, + tz='US/Eastern') + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, np.arange(12)) + tm.assert_index_equal(res, idx) + + idx = pd.date_range('2016-06-13', freq='H', periods=12, + tz='US/Eastern') + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, np.arange(12)) + tm.assert_index_equal(res, idx) + def test_slice_with_negative_step(self): ts = Series(np.arange(20), date_range('2014-01-01', periods=20, freq='MS')) From 4556957c1ba748650e0fa2a1757965db317f56ac Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 24 Jul 2016 11:44:52 -0400 Subject: [PATCH 159/359] TST, COMPAT: Make MMapWrapper tests Windows compatible (#13732) --- pandas/io/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 5740944558a5d..0acf3244fe8fa 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -138,6 +138,6 @@ def test_next(self): for line in lines: next_line = next(wrapper) - self.assertEqual(next_line, line) + self.assertEqual(next_line.strip(), line.strip()) 
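As a usage illustration of the datetimetz ``factorize`` change above, a short sketch (assuming a pandas version that includes this patch; the DST-crossing date and timezone mirror the new tests)::

    import pandas as pd

    # Hourly, tz-aware data that crosses the US/Eastern DST transition,
    # with each timestamp repeated twice.
    idx = pd.date_range('2016-11-06', freq='H', periods=4,
                        tz='US/Eastern').repeat(2)

    # Factorizing now works on the underlying i8 values, so no
    # AmbiguousTimeError is raised and the uniques keep their timezone.
    codes, uniques = idx.factorize()
    codes        # array([0, 0, 1, 1, 2, 2, 3, 3])
    uniques.tz   # US/Eastern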
self.assertRaises(StopIteration, next, wrapper) From be7485fec09473b78dbd9c80aa4c525de9a296ac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 24 Jul 2016 11:54:11 -0400 Subject: [PATCH 160/359] PEP: for for json skiptest --- pandas/io/tests/json/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index d5eca946a3e7e..96756a0b2d74b 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -969,7 +969,8 @@ def test_latin_encoding(self): return # GH 13774 - raise nose.SkipTest("encoding not implemented in .to_json(), xref #13774") + raise nose.SkipTest("encoding not implemented in .to_json(), " + "xref #13774") values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], [b'E\xc9, 17', b'a', b'b', b'c'], @@ -1006,6 +1007,5 @@ def roundtrip(s, encoding='latin-1'): if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'], exit=False) From c1d8c09cca935d4e286ccdc7ca960a55dee2f476 Mon Sep 17 00:00:00 2001 From: yui-knk Date: Sun, 24 Jul 2016 11:57:19 -0400 Subject: [PATCH 161/359] BUG: Fix pd.Timedelta(None) to return NaT. Author: yui-knk Closes #13723 from yui-knk/timedelta_none and squashes the following commits: 46a0f38 [yui-knk] BUG: Fix pd.Timedelta(None) to return NaT. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/tests/test_timedeltas.py | 2 ++ pandas/tslib.pyx | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d7966e3382869..81906ba044088 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -764,3 +764,4 @@ Bug Fixes - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) +- Bug in ``pd.Timedelta(None)`` raises ``ValueError``. 
This is different from ``pd.Timestamp(None)`` (:issue:`13687`) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 659101cb4cad2..0bdf8590ec487 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -188,6 +188,8 @@ def test_construction(self): self.assertEqual(Timedelta('').value, iNaT) self.assertEqual(Timedelta('nat').value, iNaT) self.assertEqual(Timedelta('NAT').value, iNaT) + self.assertEqual(Timedelta(None).value, iNaT) + self.assertEqual(Timedelta(np.nan).value, iNaT) self.assertTrue(isnull(Timedelta('nat'))) # offset diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 5f487eedd1683..bc42adbab62b1 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2592,10 +2592,10 @@ class Timedelta(_Timedelta): """ - def __new__(cls, object value=None, unit=None, **kwargs): + def __new__(cls, object value=_no_input, unit=None, **kwargs): cdef _Timedelta td_base - if value is None: + if value is _no_input: if not len(kwargs): raise ValueError("cannot construct a Timedelta without a value/unit or descriptive keywords (days,seconds....)") From 2c047d4fbe88e7462ec82314f58d014e7b0ad54e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 24 Jul 2016 12:01:26 -0400 Subject: [PATCH 162/359] DOC: whatsnew for #13723 --- doc/source/whatsnew/v0.19.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 81906ba044088..f3acf403a1d65 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -346,6 +346,7 @@ API changes - More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) +- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) .. _whatsnew_0190.api.tolist: @@ -764,4 +765,3 @@ Bug Fixes - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) -- Bug in ``pd.Timedelta(None)`` raises ``ValueError``. 
This is different from ``pd.Timestamp(None)`` (:issue:`13687`) From 81a5c987ae77a99e809a8ef446533321647c7bc9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 24 Jul 2016 20:03:22 +0200 Subject: [PATCH 163/359] CNL: remove the io.data and io.wb modules in favor of pandas-datareader (GH13724) (#13735) --- doc/source/remote_data.rst | 333 +- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/io/data.py | 1253 +---- pandas/io/tests/data/yahoo_options1.html | 6065 ---------------------- pandas/io/tests/data/yahoo_options2.html | 5853 --------------------- pandas/io/tests/data/yahoo_options3.html | 2807 ---------- pandas/io/tests/test_data.py | 586 --- pandas/io/tests/test_wb.py | 114 - pandas/io/wb.py | 320 +- 9 files changed, 22 insertions(+), 17311 deletions(-) delete mode 100644 pandas/io/tests/data/yahoo_options1.html delete mode 100644 pandas/io/tests/data/yahoo_options2.html delete mode 100644 pandas/io/tests/data/yahoo_options3.html delete mode 100644 pandas/io/tests/test_data.py delete mode 100644 pandas/io/tests/test_wb.py diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index 842fcb6896680..019aa82fed1aa 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -2,34 +2,21 @@ .. currentmodule:: pandas -.. ipython:: python - :suppress: - - import os - import csv - import pandas as pd - - import numpy as np - np.random.seed(123456) - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt - plt.close('all') - - from pandas import * - options.display.max_rows=15 - import pandas.util.testing as tm - ****************** Remote Data Access ****************** .. _remote_data.pandas_datareader: -.. warning:: +DataReader +---------- - In pandas 0.17.0, the sub-package ``pandas.io.data`` will be removed in favor of a separately installable `pandas-datareader package `_. This will allow the data modules to be independently updated to your pandas installation. The API for ``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. (:issue:`8961`) +The sub-package ``pandas.io.data`` is removed in favor of a separately +installable `pandas-datareader package +`_. This will allow the data +modules to be independently updated to your pandas installation. The API for +``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. +(:issue:`8961`) You should replace the imports of the following: @@ -43,310 +30,6 @@ Remote Data Access from pandas_datareader import data, wb -.. _remote_data.data_reader: - -Functions from :mod:`pandas.io.data` and :mod:`pandas.io.ga` extract data from various Internet sources into a DataFrame. Currently the following sources are supported: - - - :ref:`Yahoo! Finance` - - :ref:`Google Finance` - - :ref:`St.Louis FED (FRED)` - - :ref:`Kenneth French's data library` - - :ref:`World Bank` - - :ref:`Google Analytics` - -It should be noted, that various sources support different kinds of data, so not all sources implement the same methods and the data elements returned might also differ. - -.. _remote_data.yahoo: - -Yahoo! Finance --------------- - -.. ipython:: python - :okwarning: - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - f = web.DataReader("F", 'yahoo', start, end) - f.ix['2010-01-04'] - -.. _remote_data.yahoo_options: - -Yahoo! Finance Options ----------------------- -***Experimental*** - -The ``Options`` class allows the download of options data from Yahoo! Finance. 
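For code that used the removed ``pandas.io.data`` module, the equivalent call goes through the separately installed package; a small migration sketch (assuming ``pandas-datareader`` is installed and Yahoo! Finance is reachable; ticker and dates are arbitrary)::

    import datetime

    from pandas_datareader import data

    start = datetime.datetime(2010, 1, 1)
    end = datetime.datetime(2013, 1, 27)

    # Same call signature as the removed pandas.io.data.DataReader
    f = data.DataReader('F', 'yahoo', start, end)
    f.ix['2010-01-04']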
- -The ``get_all_data`` method downloads and caches option data for all expiry months -and provides a formatted ``DataFrame`` with a hierarchical index, so it is easy to get -to the specific option you want. - -.. ipython:: python - - from pandas.io.data import Options - aapl = Options('aapl', 'yahoo') - data = aapl.get_all_data() - data.iloc[0:5, 0:5] - - # Show the $100 strike puts at all expiry dates: - data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5] - - # Show the volume traded of $100 strike puts at all expiry dates: - data.loc[(100, slice(None), 'put'),'Vol'].head() - -If you don't want to download all the data, more specific requests can be made. - -.. ipython:: python - - import datetime - expiry = datetime.date(2016, 1, 1) - data = aapl.get_call_data(expiry=expiry) - data.iloc[0:5:, 0:5] - -Note that if you call ``get_all_data`` first, this second call will happen much faster, -as the data is cached. - -If a given expiry date is not available, data for the next available expiry will be -returned (January 15, 2015 in the above example). - -Available expiry dates can be accessed from the ``expiry_dates`` property. - -.. ipython:: python - - aapl.expiry_dates - data = aapl.get_call_data(expiry=aapl.expiry_dates[0]) - data.iloc[0:5:, 0:5] - -A list-like object containing dates can also be passed to the expiry parameter, -returning options data for all expiry dates in the list. - -.. ipython:: python - - data = aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]) - data.iloc[0:5:, 0:5] - -The ``month`` and ``year`` parameters can be used to get all options data for a given month. - -.. _remote_data.google: - -Google Finance --------------- - -.. ipython:: python - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - f = web.DataReader("F", 'google', start, end) - f.ix['2010-01-04'] - -.. _remote_data.fred: - -FRED ----- - -.. ipython:: python - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - gdp=web.DataReader("GDP", "fred", start, end) - gdp.ix['2013-01-01'] - - # Multiple series: - inflation = web.DataReader(["CPIAUCSL", "CPILFESL"], "fred", start, end) - inflation.head() -.. _remote_data.ff: - -Fama/French ------------ - -Dataset names are listed at `Fama/French Data Library -`__. - -.. ipython:: python - - import pandas.io.data as web - ip = web.DataReader("5_Industry_Portfolios", "famafrench") - ip[4].ix[192607] - -.. _remote_data.wb: - -World Bank ----------- - -``pandas`` users can easily access thousands of panel data series from the -`World Bank's World Development Indicators `__ -by using the ``wb`` I/O functions. - -Indicators -~~~~~~~~~~ - -Either from exploring the World Bank site, or using the search function included, -every world bank indicator is accessible. - -For example, if you wanted to compare the Gross Domestic Products per capita in -constant dollars in North America, you would use the ``search`` function: - -.. code-block:: ipython - - In [1]: from pandas.io import wb - - In [2]: wb.search('gdp.*capita.*const').iloc[:,:2] - Out[2]: - id name - 3242 GDPPCKD GDP per Capita, constant US$, millions - 5143 NY.GDP.PCAP.KD GDP per capita (constant 2005 US$) - 5145 NY.GDP.PCAP.KN GDP per capita (constant LCU) - 5147 NY.GDP.PCAP.PP.KD GDP per capita, PPP (constant 2005 internation... - -Then you would use the ``download`` function to acquire the data from the World -Bank's servers: - -.. 
code-block:: ipython - - In [3]: dat = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008) - - In [4]: print(dat) - NY.GDP.PCAP.KD - country year - Canada 2008 36005.5004978584 - 2007 36182.9138439757 - 2006 35785.9698172849 - 2005 35087.8925933298 - Mexico 2008 8113.10219480083 - 2007 8119.21298908649 - 2006 7961.96818458178 - 2005 7666.69796097264 - United States 2008 43069.5819857208 - 2007 43635.5852068142 - 2006 43228.111147107 - 2005 42516.3934699993 - -The resulting dataset is a properly formatted ``DataFrame`` with a hierarchical -index, so it is easy to apply ``.groupby`` transformations to it: - -.. code-block:: ipython - - In [6]: dat['NY.GDP.PCAP.KD'].groupby(level=0).mean() - Out[6]: - country - Canada 35765.569188 - Mexico 7965.245332 - United States 43112.417952 - dtype: float64 - -Now imagine you want to compare GDP to the share of people with cellphone -contracts around the world. - -.. code-block:: ipython - - In [7]: wb.search('cell.*%').iloc[:,:2] - Out[7]: - id name - 3990 IT.CEL.SETS.FE.ZS Mobile cellular telephone users, female (% of ... - 3991 IT.CEL.SETS.MA.ZS Mobile cellular telephone users, male (% of po... - 4027 IT.MOB.COV.ZS Population coverage of mobile cellular telepho... - -Notice that this second search was much faster than the first one because -``pandas`` now has a cached list of available data series. - -.. code-block:: ipython - - In [13]: ind = ['NY.GDP.PCAP.KD', 'IT.MOB.COV.ZS'] - In [14]: dat = wb.download(indicator=ind, country='all', start=2011, end=2011).dropna() - In [15]: dat.columns = ['gdp', 'cellphone'] - In [16]: print(dat.tail()) - gdp cellphone - country year - Swaziland 2011 2413.952853 94.9 - Tunisia 2011 3687.340170 100.0 - Uganda 2011 405.332501 100.0 - Zambia 2011 767.911290 62.0 - Zimbabwe 2011 419.236086 72.4 - -Finally, we use the ``statsmodels`` package to assess the relationship between -our two variables using ordinary least squares regression. Unsurprisingly, -populations in rich countries tend to use cellphones at a higher rate: - -.. code-block:: ipython - - In [17]: import numpy as np - In [18]: import statsmodels.formula.api as smf - In [19]: mod = smf.ols("cellphone ~ np.log(gdp)", dat).fit() - In [20]: print(mod.summary()) - OLS Regression Results - ============================================================================== - Dep. Variable: cellphone R-squared: 0.297 - Model: OLS Adj. R-squared: 0.274 - Method: Least Squares F-statistic: 13.08 - Date: Thu, 25 Jul 2013 Prob (F-statistic): 0.00105 - Time: 15:24:42 Log-Likelihood: -139.16 - No. Observations: 33 AIC: 282.3 - Df Residuals: 31 BIC: 285.3 - Df Model: 1 - =============================================================================== - coef std err t P>|t| [95.0% Conf. Int.] - ------------------------------------------------------------------------------- - Intercept 16.5110 19.071 0.866 0.393 -22.384 55.406 - np.log(gdp) 9.9333 2.747 3.616 0.001 4.331 15.535 - ============================================================================== - Omnibus: 36.054 Durbin-Watson: 2.071 - Prob(Omnibus): 0.000 Jarque-Bera (JB): 119.133 - Skew: -2.314 Prob(JB): 1.35e-26 - Kurtosis: 11.077 Cond. No. 45.8 - ============================================================================== - -Country Codes -~~~~~~~~~~~~~ - -.. versionadded:: 0.15.1 - -The ``country`` argument accepts a string or list of mixed -`two `__ or `three `__ character -ISO country codes, as well as dynamic `World Bank exceptions `__ to the ISO standards. 
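The same World Bank workflow remains available through pandas-datareader's ``wb`` module; a brief sketch (assuming ``pandas-datareader`` is installed and the World Bank API is reachable; the indicator and country codes are the ones used in the removed docs above)::

    from pandas_datareader import wb

    # Search the indicator catalogue, then download per-capita GDP for a
    # few countries and aggregate by country.
    wb.search('gdp.*capita.*const').iloc[:, :2]
    dat = wb.download(indicator='NY.GDP.PCAP.KD',
                      country=['US', 'CA', 'MX'], start=2005, end=2008)
    dat['NY.GDP.PCAP.KD'].groupby(level=0).mean()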
- -For a list of the the hard-coded country codes (used solely for error handling logic) see ``pandas.io.wb.country_codes``. - -Problematic Country Codes & Indicators -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. note:: - - The World Bank's country list and indicators are dynamic. As of 0.15.1, - :func:`wb.download()` is more flexible. To achieve this, the warning - and exception logic changed. - -The world bank converts some country codes in their response, which makes error -checking by pandas difficult. Retired indicators still persist in the search. - -Given the new flexibility of 0.15.1, improved error handling by the user -may be necessary for fringe cases. - -To help identify issues: - -There are at least 4 kinds of country codes: - -1. Standard (2/3 digit ISO) - returns data, will warn and error properly. -2. Non-standard (WB Exceptions) - returns data, but will falsely warn. -3. Blank - silently missing from the response. -4. Bad - causes the entire response from WB to fail, always exception inducing. - -There are at least 3 kinds of indicators: - -1. Current - Returns data. -2. Retired - Appears in search results, yet won't return data. -3. Bad - Will not return data. - -Use the ``errors`` argument to control warnings and exceptions. Setting -errors to ignore or warn, won't stop failed responses. (ie, 100% bad -indicators, or a single "bad" (#4 above) country code). - -See docstrings for more info. .. _remote_data.ga: diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f3acf403a1d65..317383e866464 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -616,6 +616,8 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) +- The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of + the `pandas-datareader package `__ (:issue:`13724`). 
- ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) diff --git a/pandas/io/data.py b/pandas/io/data.py index 68151fbb091fa..e76790a6ab98b 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -1,1247 +1,6 @@ -""" -Module contains tools for collecting data from various remote sources - - -""" -# flake8: noqa - -import warnings -import tempfile -import datetime as dt -import time - -from collections import defaultdict - -import numpy as np - -from pandas.compat import( - StringIO, bytes_to_str, range, lmap, zip -) -import pandas.compat as compat -from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime, DatetimeIndex, DateOffset - -from pandas.types.common import is_list_like -from pandas.core.common import PandasError -from pandas.io.common import urlopen, ZipFile, urlencode -from pandas.tseries.offsets import MonthEnd -from pandas.util.testing import _network_error_classes -from pandas.io.html import read_html - -warnings.warn("\n" - "The pandas.io.data module is moved to a separate package " - "(pandas-datareader) and will be removed from pandas in a " - "future version.\nAfter installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.", - FutureWarning) - -class SymbolWarning(UserWarning): - pass - - -class RemoteDataError(PandasError, IOError): - pass - - -def DataReader(name, data_source=None, start=None, end=None, - retry_count=3, pause=0.001): - """ - Imports data from a number of online sources. - - Currently supports Yahoo! Finance, Google Finance, St. Louis FED (FRED) - and Kenneth French's data library. - - Parameters - ---------- - name : str or list of strs - the name of the dataset. Some data sources (yahoo, google, fred) will - accept a list of names. - data_source: str, default: None - the data source ("yahoo", "google", "fred", or "ff") - start : datetime, default: None - left boundary for range (defaults to 1/1/2010) - end : datetime, default: None - right boundary for range (defaults to today) - retry_count : int, default 3 - Number of times to retry query request. - pause : numeric, default 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - - Examples - ---------- - - # Data from Yahoo! 
Finance - gs = DataReader("GS", "yahoo") - - # Data from Google Finance - aapl = DataReader("AAPL", "google") - - # Data from FRED - vix = DataReader("VIXCLS", "fred") - - # Data from Fama/French - ff = DataReader("F-F_Research_Data_Factors", "famafrench") - ff = DataReader("F-F_Research_Data_Factors_weekly", "famafrench") - ff = DataReader("6_Portfolios_2x3", "famafrench") - ff = DataReader("F-F_ST_Reversal_Factor", "famafrench") - """ - start, end = _sanitize_dates(start, end) - - if data_source == "yahoo": - return get_data_yahoo(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, - retry_count=retry_count, pause=pause) - elif data_source == "google": - return get_data_google(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, - retry_count=retry_count, pause=pause) - elif data_source == "fred": - return get_data_fred(name, start, end) - elif data_source == "famafrench": - return get_data_famafrench(name) - - -def _sanitize_dates(start, end): - from pandas.core.datetools import to_datetime - start = to_datetime(start) - end = to_datetime(end) - if start is None: - start = dt.datetime(2010, 1, 1) - if end is None: - end = dt.datetime.today() - return start, end - - -def _in_chunks(seq, size): - """ - Return sequence in 'chunks' of size defined by size - """ - return (seq[pos:pos + size] for pos in range(0, len(seq), size)) - - -_yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', - 'time': 't1', 'short_ratio': 's7'} - - -_YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?' - - -def get_quote_yahoo(symbols): - """ - Get current yahoo quote - - Returns a DataFrame - """ - if isinstance(symbols, compat.string_types): - sym_list = symbols - else: - sym_list = '+'.join(symbols) - - # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm - request = ''.join(compat.itervalues(_yahoo_codes)) # code request string - header = list(_yahoo_codes.keys()) - - data = defaultdict(list) - - url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request) - - with urlopen(url_str) as url: - lines = url.readlines() - - for line in lines: - fields = line.decode('utf-8').strip().split(',') - for i, field in enumerate(fields): - if field[-2:] == '%"': - v = float(field.strip('"%')) - elif field[0] == '"': - v = field.strip('"') - else: - try: - v = float(field) - except ValueError: - v = field - data[header[i]].append(v) - - idx = data.pop('symbol') - return DataFrame(data, index=idx) - - -def get_quote_google(symbols): - raise NotImplementedError("Google Finance doesn't have this functionality") - - -def _retry_read_url(url, retry_count, pause, name): - for _ in range(retry_count): - time.sleep(pause) - - # kludge to close the socket ASAP - try: - with urlopen(url) as resp: - lines = resp.read() - except _network_error_classes: - pass - else: - rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, - parse_dates=True, na_values='-')[::-1] - # Yahoo! Finance sometimes does this awesome thing where they - # return 2 rows for the most recent business day - if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover - rs = rs[:-1] - - #Get rid of unicode characters in index name. - try: - rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore') - except AttributeError: - #Python 3 string has no decode method. 
- rs.index.name = rs.index.name.encode('ascii', 'ignore').decode() - - return rs - - raise IOError("after %d tries, %s did not " - "return a 200 for url %r" % (retry_count, name, url)) - - -_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?' - - -def _get_hist_yahoo(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from yahoo. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym + - '&a=%s' % (start.month - 1) + - '&b=%s' % start.day + - '&c=%s' % start.year + - '&d=%s' % (end.month - 1) + - '&e=%s' % end.day + - '&f=%s' % end.year + - '&g=%s' % interval + - '&ignore=.csv') - return _retry_read_url(url, retry_count, pause, 'Yahoo!') - - -_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?' - - -def _get_hist_google(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from google. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - - # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv - url = "%s%s" % (_HISTORICAL_GOOGLE_URL, - urlencode({"q": sym, - "startdate": start.strftime('%b %d, ' '%Y'), - "enddate": end.strftime('%b %d, %Y'), - "output": "csv"})) - return _retry_read_url(url, retry_count, pause, 'Google') - - -def _adjust_prices(hist_data, price_list=None): - """ - Return modifed DataFrame or Panel with adjusted prices based on - 'Adj Close' price. Adds 'Adj_Ratio' column. - """ - if price_list is None: - price_list = 'Open', 'High', 'Low', 'Close' - adj_ratio = hist_data['Adj Close'] / hist_data['Close'] - - data = hist_data.copy() - for item in price_list: - data[item] = hist_data[item] * adj_ratio - data['Adj_Ratio'] = adj_ratio - del data['Adj Close'] - return data - - -def _calc_return_index(price_df): - """ - Return a returns index from a input price df or series. Initial value - (typically NaN) is set to 1. - """ - df = price_df.pct_change().add(1).cumprod() - mask = df.ix[1].notnull() & df.ix[0].isnull() - df.ix[0][mask] = 1 - - # Check for first stock listings after starting date of index in ret_index - # If True, find first_valid_index and set previous entry to 1. - if (~mask).any(): - for sym in mask.index[~mask]: - tstamp = df[sym].first_valid_index() - t_idx = df.index.get_loc(tstamp) - 1 - df[sym].ix[t_idx] = 1 - - return df - - -_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?' - - -def get_components_yahoo(idx_sym): - """ - Returns DataFrame containing list of component information for - index represented in idx_sym from yahoo. Includes component symbol - (ticker), exchange, and name. 
- - Parameters - ---------- - idx_sym : str - Stock index symbol - Examples: - '^DJI' (Dow Jones Industrial Average) - '^NYA' (NYSE Composite) - '^IXIC' (NASDAQ Composite) - - See: http://finance.yahoo.com/indices for other index symbols - - Returns - ------- - idx_df : DataFrame - """ - stats = 'snx' - # URL of form: - # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv - url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}' - - idx_mod = idx_sym.replace('^', '@%5E') - url_str = url.format(idx_mod, stats, 1) - - idx_df = DataFrame() - mask = [True] - comp_idx = 1 - - # LOOP across component index structure, - # break when no new components are found - while True in mask: - url_str = url.format(idx_mod, stats, comp_idx) - with urlopen(url_str) as resp: - raw = resp.read() - lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"') - lines = [line.strip().split('","') for line in lines] - - temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) - temp_df = temp_df.drop_duplicates() - temp_df = temp_df.set_index('ticker') - mask = ~temp_df.index.isin(idx_df.index) - - comp_idx = comp_idx + 50 - idx_df = idx_df.append(temp_df[mask]) - - return idx_df - - -def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause, - method): - stocks = {} - failed = [] - passed = [] - for sym_group in _in_chunks(symbols, chunksize): - for sym in sym_group: - try: - stocks[sym] = method(sym, start, end, interval, retry_count, pause) - passed.append(sym) - except IOError: - warnings.warn('Failed to read symbol: {0!r}, replacing with ' - 'NaN.'.format(sym), SymbolWarning) - failed.append(sym) - - if len(passed) == 0: - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - try: - if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0: - df_na = stocks[passed[0]].copy() - df_na[:] = np.nan - for sym in failed: - stocks[sym] = df_na - return Panel(stocks).swapaxes('items', 'minor') - except AttributeError: - # cannot construct a panel with just 1D nans indicating no data - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - -_source_functions = {'google': _get_hist_google, 'yahoo': _get_hist_yahoo} - - -def _get_data_from(symbols, start, end, interval, retry_count, pause, adjust_price, - ret_index, chunksize, source): - - src_fn = _source_functions[source] - - # If a single symbol, (e.g., 'GOOG') - if isinstance(symbols, (compat.string_types, int)): - hist_data = src_fn(symbols, start, end, interval, retry_count, pause) - # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) - elif isinstance(symbols, DataFrame): - hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize, - retry_count, pause, src_fn) - else: - hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize, - retry_count, pause, src_fn) - if source.lower() == 'yahoo': - if ret_index: - hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) - if adjust_price: - hist_data = _adjust_prices(hist_data) - - return hist_data - - -def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25, interval='d'): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Yahoo! Finance servers, - pauses between downloading 'chunks' of symbols can be specified. 
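A sketch of the multi-symbol form this docstring describes, using the replacement package (assuming ``pandas-datareader``; tickers and dates are arbitrary)::

    from pandas_datareader import data

    # Several tickers at once return a Panel whose items are the price
    # fields, with dates on the major axis and symbols on the minor axis.
    px = data.get_data_yahoo(['AAPL', 'MSFT', 'GOOG'],
                             start='2014-01-01', end='2014-06-30')
    px['Adj Close'].head()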
- - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame, default: None - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default: 3 - Number of times to retry query request. - pause : numeric, default: 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - adjust_price : bool, default: False - If True, adjusts all prices in hist_data ('Open', 'High', 'Low', - 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops - 'Adj Close'. - ret_index : bool, default: False - If True, includes a simple return index 'Ret_Index' in hist_data. - chunksize : int, default: 25 - Number of symbols to download consecutively before intiating pause. - interval : string, default: 'd' - Time interval code, valid values are 'd' for daily, 'w' for weekly, - 'm' for monthly and 'v' for dividend. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - if interval not in ['d', 'w', 'm', 'v']: - raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'") - return _get_data_from(symbols, start, end, interval, retry_count, pause, - adjust_price, ret_index, chunksize, 'yahoo') - - -def get_data_google(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Google Finance servers, - pauses between downloading 'chunks' of symbols can be specified. - - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols. - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default: 3 - Number of times to retry query request. - pause : numeric, default: 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - chunksize : int, default: 25 - Number of symbols to download consecutively before intiating pause. - ret_index : bool, default: False - If True, includes a simple return index 'Ret_Index' in hist_data. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - return _get_data_from(symbols, start, end, None, retry_count, pause, - adjust_price, ret_index, chunksize, 'google') - - -_FRED_URL = "http://research.stlouisfed.org/fred2/series/" - - -def get_data_fred(name, start=dt.datetime(2010, 1, 1), - end=dt.datetime.today()): - """ - Get data for the given name from the St. Louis FED (FRED). - Date format is datetime - - Returns a DataFrame. - - If multiple names are passed for "series" then the index of the - DataFrame is the outer join of the indicies of each series. 
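The FRED path moved the same way; a short sketch of the multi-series behaviour described here, where the result's index is the outer join of the individual series' dates (assuming ``pandas-datareader``; the series codes come from the removed documentation above)::

    import datetime

    from pandas_datareader import data

    start = datetime.datetime(2010, 1, 1)
    end = datetime.datetime(2013, 1, 27)

    # Two CPI series from FRED, aligned on the union of their dates.
    inflation = data.DataReader(['CPIAUCSL', 'CPILFESL'], 'fred', start, end)
    inflation.head()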
- """ - start, end = _sanitize_dates(start, end) - - if not is_list_like(name): - names = [name] - else: - names = name - - urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for - n in names] - - def fetch_data(url, name): - with urlopen(url) as resp: - data = read_csv(resp, index_col=0, parse_dates=True, - header=None, skiprows=1, names=["DATE", name], - na_values='.') - try: - return data.truncate(start, end) - except KeyError: - if data.ix[3].name[7:12] == 'Error': - raise IOError("Failed to get the data. Check that {0!r} is " - "a valid FRED series.".format(name)) - raise - df = concat([fetch_data(url, n) for url, n in zip(urls, names)], - axis=1, join='outer') - return df - - -_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' - - -def get_data_famafrench(name): - # path of zip files - zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name) - - with urlopen(zip_file_path) as url: - raw = url.read() - - with tempfile.TemporaryFile() as tmpf: - tmpf.write(raw) - - with ZipFile(tmpf, 'r') as zf: - data = zf.open(zf.namelist()[0]).readlines() - - line_lengths = np.array(lmap(len, data)) - file_edges = np.where(line_lengths == 2)[0] - - datasets = {} - edges = zip(file_edges + 1, file_edges[1:]) - for i, (left_edge, right_edge) in enumerate(edges): - dataset = [d.split() for d in data[left_edge:right_edge]] - if len(dataset) > 10: - ncol_raw = np.array(lmap(len, dataset)) - ncol = np.median(ncol_raw) - header_index = np.where(ncol_raw == ncol - 1)[0][-1] - header = dataset[header_index] - ds_header = dataset[header_index + 1:] - # to ensure the header is unique - header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, - start=1)] - index = np.array([d[0] for d in ds_header], dtype=int) - dataset = np.array([d[1:] for d in ds_header], dtype=float) - datasets[i] = DataFrame(dataset, index, columns=header) - - return datasets - - -# Items needed for options class -CUR_MONTH = dt.datetime.now().month -CUR_YEAR = dt.datetime.now().year -CUR_DAY = dt.datetime.now().day - - -def _two_char(s): - return '{0:0>2}'.format(s) - - -class Options(object): - """ - ***Experimental*** - This class fetches call/put data for a given stock/expiry month. - - It is instantiated with a string representing the ticker symbol. - - The class has the following methods: - get_options_data:(month, year, expiry) - get_call_data:(month, year, expiry) - get_put_data: (month, year, expiry) - get_near_stock_price(opt_frame, above_below) - get_all_data(call, put) - get_forward_data(months, call, put) (deprecated) - - Examples - -------- - # Instantiate object with ticker - >>> aapl = Options('aapl', 'yahoo') - - # Fetch next expiry call data - >>> calls = aapl.get_call_data() - - # Can now access aapl.calls instance variable - >>> aapl.calls - - # Fetch next expiry put data - >>> puts = aapl.get_put_data() - - # Can now access aapl.puts instance variable - >>> aapl.puts - - # cut down the call data to be 3 below and 3 above the stock price. 
- >>> cut_calls = aapl.get_near_stock_price(call=True, above_below=3) - - # Fetch call and put data with expiry from now to 8 months out - >>> forward_data = aapl.get_forward_data(8, call=True, put=True) - - # Fetch all call and put data - >>> all_data = aapl.get_all_data() - """ - - _TABLE_LOC = {'calls': 1, 'puts': 2} - _OPTIONS_BASE_URL = 'http://finance.yahoo.com/q/op?s={sym}' - _FINANCE_BASE_URL = 'http://finance.yahoo.com' - - def __init__(self, symbol, data_source=None): - """ Instantiates options_data with a ticker saved as symbol """ - self.symbol = symbol.upper() - if data_source is None: - warnings.warn("Options(symbol) is deprecated, use Options(symbol," - " data_source) instead", FutureWarning, stacklevel=2) - data_source = "yahoo" - if data_source != "yahoo": - raise NotImplementedError("currently only yahoo supported") - - def get_options_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets call/put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - calls and puts. See the following example: - - >>> aapl = Options('aapl', 'yahoo') # Create object - >>> aapl.calls # will give an AttributeError - >>> aapl.get_options() # Get data and set ivars - >>> aapl.calls # Doesn't throw AttributeError - - Also note that aapl.calls and appl.puts will always be the calls - and puts for the next expiry. If the user calls this method with - a different expiry, the ivar will be named callsYYMMDD or putsYYMMDD, - where YY, MM and DD are, respectively, two digit representations of - the year, month and day for the expiry of the options. 
- - """ - return concat([f(month, year, expiry) - for f in (self.get_put_data, - self.get_call_data)]).sortlevel() - - def _get_option_frames_from_yahoo(self, expiry): - url = self._yahoo_url_from_expiry(expiry) - option_frames = self._option_frames_from_url(url) - frame_name = '_frames' + self._expiry_to_string(expiry) - setattr(self, frame_name, option_frames) - return option_frames - - @staticmethod - def _expiry_to_string(expiry): - m1 = _two_char(expiry.month) - d1 = _two_char(expiry.day) - return str(expiry.year)[-2:] + m1 + d1 - - def _yahoo_url_from_expiry(self, expiry): - try: - expiry_links = self._expiry_links - - except AttributeError: - _, expiry_links = self._get_expiry_dates_and_links() - - return self._FINANCE_BASE_URL + expiry_links[expiry] - - def _option_frames_from_url(self, url): - frames = read_html(url) - nframes = len(frames) - frames_req = max(self._TABLE_LOC.values()) - if nframes < frames_req: - raise RemoteDataError("%s options tables found (%s expected)" % (nframes, frames_req)) - - if not hasattr(self, 'underlying_price'): - try: - self.underlying_price, self.quote_time = self._underlying_price_and_time_from_url(url) - except IndexError: - self.underlying_price, self.quote_time = np.nan, np.nan - - calls = frames[self._TABLE_LOC['calls']] - puts = frames[self._TABLE_LOC['puts']] - - calls = self._process_data(calls, 'call') - puts = self._process_data(puts, 'put') - - return {'calls': calls, 'puts': puts} - - def _underlying_price_and_time_from_url(self, url): - root = self._parse_url(url) - underlying_price = self._underlying_price_from_root(root) - quote_time = self._quote_time_from_root(root) - return underlying_price, quote_time - - @staticmethod - def _underlying_price_from_root(root): - underlying_price = root.xpath('.//*[@class="time_rtq_ticker Fz-30 Fw-b"]')[0]\ - .getchildren()[0].text - underlying_price = underlying_price.replace(',', '') #GH11 - - try: - underlying_price = float(underlying_price) - except ValueError: - underlying_price = np.nan - - return underlying_price - - @staticmethod - def _quote_time_from_root(root): - #Gets the time of the quote, note this is actually the time of the underlying price. - try: - quote_time_text = root.xpath('.//*[@class="time_rtq Fz-m"]')[0].getchildren()[1].getchildren()[0].text - ##TODO: Enable timezone matching when strptime can match EST with %Z - quote_time_text = quote_time_text.split(' ')[0] - quote_time = dt.datetime.strptime(quote_time_text, "%I:%M%p") - quote_time = quote_time.replace(year=CUR_YEAR, month=CUR_MONTH, day=CUR_DAY) - except ValueError: - quote_time = np.nan - - return quote_time - - def _get_option_data(self, expiry, name): - frame_name = '_frames' + self._expiry_to_string(expiry) - - try: - frames = getattr(self, frame_name) - except AttributeError: - frames = self._get_option_frames_from_yahoo(expiry) - - option_data = frames[name] - if expiry != self.expiry_dates[0]: - name += self._expiry_to_string(expiry) - - setattr(self, name, option_data) - return option_data - - def get_call_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets call/put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. 
- - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - call_data: pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - calls and puts. See the following example: - - >>> aapl = Options('aapl', 'yahoo') # Create object - >>> aapl.calls # will give an AttributeError - >>> aapl.get_call_data() # Get data and set ivars - >>> aapl.calls # Doesn't throw AttributeError - - Also note that aapl.calls will always be the calls for the next - expiry. If the user calls this method with a different month - or year, the ivar will be named callsYYMMDD where YY, MM and DD are, - respectively, two digit representations of the year, month and day - for the expiry of the options. - """ - expiry = self._try_parse_dates(year, month, expiry) - return self._get_data_in_date_range(expiry, call=True, put=False) - - def get_put_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - put_data: pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - puts. See the following example: - - >>> aapl = Options('aapl') # Create object - >>> aapl.puts # will give an AttributeError - >>> aapl.get_put_data() # Get data and set ivars - >>> aapl.puts # Doesn't throw AttributeError - - return self.__setattr__(self, str(str(x) + str(y))) - - Also note that aapl.puts will always be the puts for the next - expiry. 
If the user calls this method with a different month - or year, the ivar will be named putsYYMMDD where YY, MM and DD are, - respectively, two digit representations of the year, month and day - for the expiry of the options. - """ - expiry = self._try_parse_dates(year, month, expiry) - return self._get_data_in_date_range(expiry, put=True, call=False) - - def get_near_stock_price(self, above_below=2, call=True, put=False, - month=None, year=None, expiry=None): - """ - ***Experimental*** - Returns a data frame of options that are near the current stock price. - - Parameters - ---------- - above_below : number, int, optional (default=2) - The number of strike prices above and below the stock price that - should be taken - - call : bool, default: True - Tells the function whether or not it should be using calls - - put : bool, default: False - Tells the function weather or not it should be using puts - - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - chopped: DataFrame - The resultant DataFrame chopped down to be 2 * above_below + 1 rows - desired. If there isn't data as far out as the user has asked for - then - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - expiry = self._try_parse_dates(year, month, expiry) - data = self._get_data_in_date_range(expiry, call=call, put=put) - return self.chop_data(data, above_below, self.underlying_price) - - def chop_data(self, df, above_below=2, underlying_price=None): - """Returns a data frame only options that are near the current stock price.""" - - if not underlying_price: - try: - underlying_price = self.underlying_price - except AttributeError: - underlying_price = np.nan - - max_strike = max(df.index.get_level_values('Strike')) - min_strike = min(df.index.get_level_values('Strike')) - - if not np.isnan(underlying_price) and min_strike < underlying_price < max_strike: - start_index = np.where(df.index.get_level_values('Strike') - > underlying_price)[0][0] - - get_range = slice(start_index - above_below, - start_index + above_below + 1) - df = df[get_range].dropna(how='all') - - return df - - def _try_parse_dates(self, year, month, expiry): - """ - Validates dates provided by user. Ensures the user either provided both a month and a year or an expiry. - - Parameters - ---------- - year : int - Calendar year - - month : int - Calendar month - - expiry : date-like or convertible, (preferred) - Expiry date - - Returns - ------- - list of expiry dates (datetime.date) - """ - - #Checks if the user gave one of the month or the year but not both and did not provide an expiry: - if (month is not None and year is None) or (month is None and year is not None) and expiry is None: - msg = "You must specify either (`year` and `month`) or `expiry` " \ - "or none of these options for the next expiry." 
- raise ValueError(msg) - - if expiry is not None: - if hasattr(expiry, '__iter__'): - expiry = [self._validate_expiry(exp) for exp in expiry] - else: - expiry = [self._validate_expiry(expiry)] - - if len(expiry) == 0: - raise ValueError('No expiries available for given input.') - - elif year is None and month is None: - #No arguments passed, provide next expiry - year = CUR_YEAR - month = CUR_MONTH - expiry = dt.date(year, month, 1) - expiry = [self._validate_expiry(expiry)] - - else: - #Year and month passed, provide all expiries in that month - expiry = [expiry for expiry in self.expiry_dates if expiry.year == year and expiry.month == month] - if len(expiry) == 0: - raise ValueError('No expiries available in %s-%s' % (year, month)) - - return expiry - - def _validate_expiry(self, expiry): - """Ensures that an expiry date has data available on Yahoo - If the expiry date does not have options that expire on that day, return next expiry""" - - expiry_dates = self.expiry_dates - expiry = to_datetime(expiry) - if hasattr(expiry, 'date'): - expiry = expiry.date() - - if expiry in expiry_dates: - return expiry - else: - index = DatetimeIndex(expiry_dates).sort_values() - return index[index.date >= expiry][0].date() - - def get_forward_data(self, months, call=True, put=False, near=False, - above_below=2): - """ - ***Experimental*** - Gets either call, put, or both data for months starting in the current - month and going out in the future a specified amount of time. - - Parameters - ---------- - months : number, int - How many months to go out in the collection of the data. This is - inclusive. - - call : bool, optional (default=True) - Whether or not to collect data for call options - - put : bool, optional (default=False) - Whether or not to collect data for put options. - - near : bool, optional (default=False) - Whether this function should get only the data near the - current stock price. Uses Options.get_near_stock_price - - above_below : number, int, optional (default=2) - The number of strike prices above and below the stock price that - should be taken if the near option is set to True - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - warnings.warn("get_forward_data() is deprecated", FutureWarning, - stacklevel=2) - end_date = dt.date.today() + MonthEnd(months) - dates = (date for date in self.expiry_dates if date <= end_date.date()) - data = self._get_data_in_date_range(dates, call=call, put=put) - if near: - data = self.chop_data(data, above_below=above_below) - return data - - def get_all_data(self, call=True, put=True): - """ - ***Experimental*** - Gets either call, put, or both data for all available months starting - in the current month. 
- - Parameters - ---------- - call : bool, optional (default=True) - Whether or not to collect data for call options - - put : bool, optional (default=True) - Whether or not to collect data for put options. - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - - try: - expiry_dates = self.expiry_dates - except AttributeError: - expiry_dates, _ = self._get_expiry_dates_and_links() - - return self._get_data_in_date_range(dates=expiry_dates, call=call, put=put) - - def _get_data_in_date_range(self, dates, call=True, put=True): - - to_ret = Series({'calls': call, 'puts': put}) - to_ret = to_ret[to_ret].index - data = [] - - for name in to_ret: - for expiry_date in dates: - nam = name + self._expiry_to_string(expiry_date) - try: # Try to access on the instance - frame = getattr(self, nam) - except AttributeError: - frame = self._get_option_data(expiry=expiry_date, name=name) - data.append(frame) - - return concat(data).sortlevel() - - @property - def expiry_dates(self): - """ - Returns a list of available expiry dates - """ - try: - expiry_dates = self._expiry_dates - except AttributeError: - expiry_dates, _ = self._get_expiry_dates_and_links() - return expiry_dates - - def _get_expiry_dates_and_links(self): - """ - Gets available expiry dates. - - Returns - ------- - Tuple of: - List of datetime.date objects - Dict of datetime.date objects as keys and corresponding links - """ - - url = self._OPTIONS_BASE_URL.format(sym=self.symbol) - root = self._parse_url(url) - - try: - links = root.xpath('//*[@id="options_menu"]/form/select/option') - except IndexError: - raise RemoteDataError('Expiry dates not available') - - expiry_dates = [dt.datetime.strptime(element.text, "%B %d, %Y").date() for element in links] - links = [element.attrib['data-selectbox-link'] for element in links] - - if len(expiry_dates) == 0: - raise RemoteDataError('Data not available') - - expiry_links = dict(zip(expiry_dates, links)) - self._expiry_links = expiry_links - self._expiry_dates = expiry_dates - return expiry_dates, expiry_links - - def _parse_url(self, url): - """ - Downloads and parses a URL, returns xml root. - - """ - try: - from lxml.html import parse - except ImportError: - raise ImportError("Please install lxml if you want to use the " - "{0!r} class".format(self.__class__.__name__)) - try: - doc = parse(url) - except _network_error_classes: - raise RemoteDataError("Unable to parse URL " - "{0!r}".format(url)) - else: - root = doc.getroot() - if root is None: - raise RemoteDataError("Parsed URL {0!r} has no root" - "element".format(url)) - return root - - def _process_data(self, frame, type): - """ - Adds columns for Expiry, IsNonstandard (ie: deliverable is not 100 shares) - and Tag (the tag indicating what is actually deliverable, None if standard). 
- - """ - frame.columns = ['Strike', 'Symbol', 'Last', 'Bid', 'Ask', 'Chg', 'PctChg', 'Vol', 'Open_Int', 'IV'] - frame["Rootexp"] = frame.Symbol.str[0:-9] - frame["Root"] = frame.Rootexp.str[0:-6] - frame["Expiry"] = to_datetime(frame.Rootexp.str[-6:]) - #Removes dashes in equity ticker to map to option ticker. - #Ex: BRK-B to BRKB140517C00100000 - frame["IsNonstandard"] = frame['Root'] != self.symbol.replace('-', '') - del frame["Rootexp"] - frame["Underlying"] = self.symbol - try: - frame['Underlying_Price'] = self.underlying_price - frame["Quote_Time"] = self.quote_time - except AttributeError: - frame['Underlying_Price'] = np.nan - frame["Quote_Time"] = np.nan - frame.rename(columns={'Open Int': 'Open_Int'}, inplace=True) - frame['Type'] = type - frame.set_index(['Strike', 'Expiry', 'Type', 'Symbol'], inplace=True) - - return frame +raise ImportError( + "The pandas.io.data module is moved to a separate package " + "(pandas-datareader). After installing the pandas-datareader package " + "(https://github.com/pydata/pandas-datareader), you can change " + "the import ``from pandas.io import data, wb`` to " + "``from pandas_datareader import data, wb``.") diff --git a/pandas/io/tests/data/yahoo_options1.html b/pandas/io/tests/data/yahoo_options1.html deleted file mode 100644 index 2846a2bd12732..0000000000000 --- a/pandas/io/tests/data/yahoo_options1.html +++ /dev/null @@ -1,6065 +0,0 @@ - - - - - AAPL Options | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/data/yahoo_options2.html b/pandas/io/tests/data/yahoo_options2.html deleted file mode 100644 index bae9c193e03e1..0000000000000 --- a/pandas/io/tests/data/yahoo_options2.html +++ /dev/null @@ -1,5853 +0,0 @@ - - - - - AAPL Option Chain | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/data/yahoo_options3.html b/pandas/io/tests/data/yahoo_options3.html deleted file mode 100644 index 6e79bb9bf9f36..0000000000000 --- a/pandas/io/tests/data/yahoo_options3.html +++ /dev/null @@ -1,2807 +0,0 @@ - - - - - SPWR Option Chain | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py deleted file mode 100644 index 1efa8b13598a7..0000000000000 --- a/pandas/io/tests/test_data.py +++ /dev/null @@ -1,586 +0,0 @@ -# flake8: noqa - -from __future__ import print_function -from pandas import compat -import warnings -import nose -from nose.tools import assert_equal -from datetime import datetime -import os - -import numpy as np -import pandas as pd -from pandas import DataFrame, Timestamp -from pandas.util.testing import (assert_series_equal, assert_produces_warning, - network, assert_frame_equal) -import pandas.util.testing as tm - -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - from pandas.io import data as web - -from pandas.io.data import DataReader, SymbolWarning, RemoteDataError, _yahoo_codes - -if compat.PY3: - from urllib.error import HTTPError -else: - from urllib2 import HTTPError - - -def _skip_if_no_lxml(): - try: - import lxml - except ImportError: - raise nose.SkipTest("no lxml") - -def _skip_if_no_bs(): - try: - import bs4 - import html5lib - except ImportError: - raise nose.SkipTest("no html5lib/bs4") - - -def assert_n_failed_equals_n_null_columns(wngs, obj, cls=SymbolWarning): - all_nan_cols = pd.Series(dict((k, pd.isnull(v).all()) for k, v in - compat.iteritems(obj))) - n_all_nan_cols = all_nan_cols.sum() - valid_warnings = pd.Series([wng for wng in wngs if wng.category == cls]) - assert_equal(len(valid_warnings), n_all_nan_cols) - failed_symbols = all_nan_cols[all_nan_cols].index - msgs = valid_warnings.map(lambda x: x.message) - assert msgs.str.contains('|'.join(failed_symbols)).all() - - -class TestGoogle(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestGoogle, cls).setUpClass() - cls.locales = tm.get_locales(prefix='en_US') - if not cls.locales: - raise nose.SkipTest("US English locale not available for testing") - - @classmethod - def tearDownClass(cls): - super(TestGoogle, cls).tearDownClass() - del cls.locales - - @network - def test_google(self): - # asserts that google is minimally working and that it throws - # an exception when DataReader can't get a 200 response from - # google - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - for locale in self.locales: - with tm.set_locale(locale): - panel = web.DataReader("F", 'google', start, end) - self.assertEqual(panel.Close[-1], 13.68) - - self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", - 'google', start, end) - - @network - def test_get_quote_fails(self): - self.assertRaises(NotImplementedError, web.get_quote_google, - pd.Series(['GOOG', 'AAPL', 'GOOG'])) - - @network - def test_get_goog_volume(self): - for locale in self.locales: - with tm.set_locale(locale): - df = web.get_data_google('GOOG').sort_index() - self.assertEqual(df.Volume.ix['JAN-02-2015'], 1446662) - - @network - def test_get_multi1(self): - for locale in self.locales: - sl = ['AAPL', 'AMZN', 'GOOG'] - with tm.set_locale(locale): - pan = web.get_data_google(sl, '2012', '2013') - ts = pan.Close.GOOG.index[pan.Close.AAPL < pan.Close.GOOG] - if (hasattr(pan, 'Close') and hasattr(pan.Close, 'GOOG') and - hasattr(pan.Close, 'AAPL')): - self.assertEqual(ts[0].dayofyear, 3) - else: - self.assertRaises(AttributeError, lambda: pan.Close) - - @network - def test_get_multi_invalid(self): - sl = ['AAPL', 'AMZN', 'INVALID'] - with tm.assert_produces_warning(SymbolWarning): - pan = web.get_data_google(sl, '2012') - self.assertIn('INVALID', 
pan.minor_axis) - - @network - def test_get_multi_all_invalid(self): - sl = ['INVALID', 'INVALID2', 'INVALID3'] - with tm.assert_produces_warning(SymbolWarning): - self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') - - @network - def test_get_multi2(self): - with warnings.catch_warnings(record=True) as w: - for locale in self.locales: - with tm.set_locale(locale): - pan = web.get_data_google(['GE', 'MSFT', 'INTC'], - 'JAN-01-12', 'JAN-31-12') - result = pan.Close.ix['01-18-12'] - assert_n_failed_equals_n_null_columns(w, result) - - # sanity checking - - self.assertTrue(np.issubdtype(result.dtype, np.floating)) - result = pan.Open.ix['Jan-15-12':'Jan-20-12'] - self.assertEqual((4, 3), result.shape) - assert_n_failed_equals_n_null_columns(w, result) - - @network - def test_dtypes(self): - #GH3995, #GH8980 - data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertTrue(np.issubdtype(data.Open.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Close.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Low.dtype, np.number)) - self.assertTrue(np.issubdtype(data.High.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Volume.dtype, np.number)) - - @network - def test_unicode_date(self): - #GH8967 - data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertEqual(data.index.name, 'Date') - - -class TestYahoo(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestYahoo, cls).setUpClass() - _skip_if_no_lxml() - - @network - def test_yahoo(self): - # asserts that yahoo is minimally working and that it throws - # an exception when DataReader can't get a 200 response from - # yahoo - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - self.assertEqual(web.DataReader("F", 'yahoo', start, end)['Close'][-1], - 13.68) - - @network - def test_yahoo_fails(self): - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", - 'yahoo', start, end) - - @network - def test_get_quote_series(self): - df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) - assert_series_equal(df.ix[0], df.ix[2]) - - @network - def test_get_quote_string(self): - df = web.get_quote_yahoo('GOOG') - - @network - def test_get_quote_string(self): - _yahoo_codes.update({'MarketCap': 'j1'}) - df = web.get_quote_yahoo('GOOG') - self.assertFalse(pd.isnull(df['MarketCap'][0])) - - @network - def test_get_quote_stringlist(self): - df = web.get_quote_yahoo(['GOOG', 'AAPL', 'GOOG']) - assert_series_equal(df.ix[0], df.ix[2]) - - @network - def test_get_components_dow_jones(self): - raise nose.SkipTest('unreliable test, receive partial components back for dow_jones') - - df = web.get_components_yahoo('^DJI') #Dow Jones - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(len(df), 30) - - @network - def test_get_components_dax(self): - raise nose.SkipTest('unreliable test, receive partial components back for dax') - - df = web.get_components_yahoo('^GDAXI') #DAX - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(len(df), 30) - self.assertEqual(df[df.name.str.contains('adidas', case=False)].index, - 'ADS.DE') - - @network - def test_get_components_nasdaq_100(self): - # as of 7/12/13 the conditional will test false because the link is invalid - raise nose.SkipTest('unreliable test, receive partial components back for nasdaq_100') - - df = web.get_components_yahoo('^NDX') #NASDAQ-100 - self.assertIsInstance(df, pd.DataFrame) - - if len(df) > 1: - # Usual culprits, 
should be around for a while - self.assertTrue('AAPL' in df.index) - self.assertTrue('GOOG' in df.index) - self.assertTrue('AMZN' in df.index) - else: - expected = DataFrame({'exchange': 'N/A', 'name': '@^NDX'}, - index=['@^NDX']) - assert_frame_equal(df, expected) - - @network - def test_get_data_single_symbol(self): - #single symbol - #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d - # just test that we succeed - web.get_data_yahoo('GOOG') - - @network - def test_get_data_interval(self): - # daily interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='d') - self.assertEqual(len(pan), 252) - - # weekly interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='w') - self.assertEqual(len(pan), 53) - - # montly interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='m') - self.assertEqual(len(pan), 12) - - # dividend data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='v') - self.assertEqual(len(pan), 4) - - # test fail on invalid interval - self.assertRaises(ValueError, web.get_data_yahoo, 'XOM', interval='NOT VALID') - - @network - def test_get_data_multiple_symbols(self): - # just test that we succeed - sl = ['AAPL', 'AMZN', 'GOOG'] - web.get_data_yahoo(sl, '2012') - - @network - def test_get_data_multiple_symbols_two_dates(self): - pan = web.get_data_yahoo(['GE', 'MSFT', 'INTC'], 'JAN-01-12', - 'JAN-31-12') - result = pan.Close.ix['01-18-12'] - self.assertEqual(len(result), 3) - - # sanity checking - self.assertTrue(np.issubdtype(result.dtype, np.floating)) - - expected = np.array([[18.99, 28.4, 25.18], - [18.58, 28.31, 25.13], - [19.03, 28.16, 25.52], - [18.81, 28.82, 25.87]]) - result = pan.Open.ix['Jan-15-12':'Jan-20-12'] - self.assertEqual(expected.shape, result.shape) - - @network - def test_get_date_ret_index(self): - pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987', - ret_index=True) - self.assertTrue(hasattr(pan, 'Ret_Index')) - if hasattr(pan, 'Ret_Index') and hasattr(pan.Ret_Index, 'INTC'): - tstamp = pan.Ret_Index.INTC.first_valid_index() - result = pan.Ret_Index.ix[tstamp]['INTC'] - self.assertEqual(result, 1.0) - - # sanity checking - self.assertTrue(np.issubdtype(pan.values.dtype, np.floating)) - - -class TestYahooOptions(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestYahooOptions, cls).setUpClass() - raise nose.SkipTest('disable Yahoo Options tests') - - _skip_if_no_lxml() - _skip_if_no_bs() - raise nose.SkipTest('unreliable test') - - # aapl has monthlies - cls.aapl = web.Options('aapl', 'yahoo') - d = (Timestamp.today() + pd.offsets.MonthBegin(1)).normalize() - cls.year = d.year - cls.month = d.month - cls.expiry = d - cls.expiry2 = d + pd.offsets.MonthBegin(1) - cls.dirpath = tm.get_data_path() - cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html') - cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html') - cls.html3 = os.path.join(cls.dirpath, 'yahoo_options3.html') #Empty table GH#22 - cls.data1 = cls.aapl._option_frames_from_url(cls.html1)['puts'] - - @classmethod - def tearDownClass(cls): - super(TestYahooOptions, cls).tearDownClass() - del cls.aapl, cls.expiry - - @network - def test_get_options_data(self): - # regression test GH6105 - self.assertRaises(ValueError, self.aapl.get_options_data, month=3) - self.assertRaises(ValueError, self.aapl.get_options_data, year=1992) - - try: - options = self.aapl.get_options_data(expiry=self.expiry) - except RemoteDataError as e: - raise 
nose.SkipTest(e) - self.assertTrue(len(options) > 1) - - @network - def test_get_near_stock_price(self): - try: - options = self.aapl.get_near_stock_price(call=True, put=True, - expiry=[self.expiry,self.expiry2]) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(options) > 1) - - @network - def test_get_call_data(self): - try: - calls = self.aapl.get_call_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(calls) > 1) - - @network - def test_get_put_data(self): - try: - puts = self.aapl.get_put_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(puts) > 1) - - @network - def test_get_expiry_dates(self): - try: - dates, _ = self.aapl._get_expiry_dates_and_links() - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(dates) > 1) - - @network - def test_get_all_data(self): - - try: - data = self.aapl.get_all_data(put=True) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_data_with_list(self): - try: - data = self.aapl.get_call_data(expiry=self.aapl.expiry_dates) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_all_data_calls_only(self): - try: - data = self.aapl.get_all_data(call=True, put=False) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_underlying_price(self): - #GH7 - try: - options_object = web.Options('^spxpm', 'yahoo') - url = options_object._yahoo_url_from_expiry(options_object.expiry_dates[0]) - root = options_object._parse_url(url) - quote_price = options_object._underlying_price_from_root(root) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertIsInstance(quote_price, float) - - def test_sample_page_price_quote_time1(self): - #Tests the weekend quote time format - price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html1) - self.assertIsInstance(price, (int, float, complex)) - self.assertIsInstance(quote_time, (datetime, Timestamp)) - - def test_chop(self): - #regression test for #7625 - self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) - chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100) - self.assertIsInstance(chopped, DataFrame) - self.assertTrue(len(chopped) > 1) - - def test_chop_out_of_strike_range(self): - #regression test for #7625 - self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) - chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100000) - self.assertIsInstance(chopped, DataFrame) - self.assertTrue(len(chopped) > 1) - - - @network - def test_sample_page_price_quote_time2(self): - #Tests the EDT page format - #regression test for #8741 - price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html2) - self.assertIsInstance(price, (int, float, complex)) - self.assertIsInstance(quote_time, (datetime, Timestamp)) - - @network - def test_sample_page_chg_float(self): - #Tests that numeric columns with comma's are appropriately dealt with - self.assertEqual(self.data1['Chg'].dtype, 'float64') - - @network - def test_month_year(self): - try: - data = self.aapl.get_call_data(month=self.month, year=self.year) - except RemoteDataError as e: - raise nose.SkipTest(e) - - self.assertTrue(len(data) > 1) - - @network - def test_empty_table(self): - #GH22 - empty = 
self.aapl._option_frames_from_url(self.html3)['puts'] - self.assertTrue(len(empty) == 0) - - -class TestOptionsWarnings(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestOptionsWarnings, cls).setUpClass() - - @classmethod - def tearDownClass(cls): - super(TestOptionsWarnings, cls).tearDownClass() - - @network - def test_options_source_warning(self): - with assert_produces_warning(): - aapl = web.Options('aapl') - - -class TestDataReader(tm.TestCase): - - @network - def test_read_yahoo(self): - gs = DataReader("GS", "yahoo") - self.assertIsInstance(gs, DataFrame) - - @network - def test_read_google(self): - gs = DataReader("GS", "google") - self.assertIsInstance(gs, DataFrame) - - @network - def test_read_fred(self): - vix = DataReader("VIXCLS", "fred") - self.assertIsInstance(vix, DataFrame) - - @network - def test_read_famafrench(self): - raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') - for name in ("F-F_Research_Data_Factors", - "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3", - "F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"): - ff = DataReader(name, "famafrench") - self.assertTrue(ff is not None) - self.assertIsInstance(ff, dict) - - -class TestFred(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestFred, cls).setUpClass() - raise nose.SkipTest('disable Fred tests') - - @network - def test_fred(self): - raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') - - # Throws an exception when DataReader can't get a 200 response from - # FRED. - - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - received = web.DataReader("GDP", "fred", start, end)['GDP'].tail(1)[0] - self.assertTrue(int(received) > 10000) - - self.assertRaises(Exception, web.DataReader, "NON EXISTENT SERIES", - 'fred', start, end) - - @network - def test_fred_nan(self): - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - df = web.DataReader("DFII5", "fred", start, end) - self.assertTrue(pd.isnull(df.ix['2010-01-01'][0])) - - @network - def test_fred_parts(self): - raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') - - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - df = web.get_data_fred("CPIAUCSL", start, end) - self.assertEqual(df.ix['2010-05-01'][0], 217.23) - - t = df.CPIAUCSL.values - self.assertTrue(np.issubdtype(t.dtype, np.floating)) - self.assertEqual(t.shape, (37,)) - - @network - def test_fred_part2(self): - expected = [[576.7], - [962.9], - [684.7], - [848.3], - [933.3]] - result = web.get_data_fred("A09024USA144NNBR", start="1915").ix[:5] - tm.assert_numpy_array_equal(result.values, np.array(expected)) - - @network - def test_invalid_series(self): - name = "NOT A REAL SERIES" - self.assertRaises(Exception, web.get_data_fred, name) - - @network - def test_fred_multi(self): - raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') - - names = ['CPIAUCSL', 'CPALTT01USQ661S', 'CPILFESL'] - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - received = web.DataReader(names, "fred", start, end).head(1) - expected = DataFrame([[217.478, 0.99701529, 220.544]], columns=names, - index=[pd.tslib.Timestamp('2010-01-01 00:00:00')]) - expected.index.rename('DATE', inplace=True) - assert_frame_equal(received, expected, check_less_precise=True) - - @network - def test_fred_multi_bad_series(self): - - names = ['NOTAREALSERIES', 'CPIAUCSL', "ALSO FAKE"] - with tm.assertRaises(HTTPError): - DataReader(names, data_source="fred") - - -if __name__ == '__main__': - 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py deleted file mode 100644 index 42884b19de03a..0000000000000 --- a/pandas/io/tests/test_wb.py +++ /dev/null @@ -1,114 +0,0 @@ -# flake8: noqa - -import nose - -import pandas -from pandas.compat import u -from pandas.util.testing import network -from pandas.util.testing import assert_frame_equal -import pandas.util.testing as tm - -# deprecated -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - from pandas.io.wb import search, download, get_countries - -class TestWB(tm.TestCase): - - @tm.slow - @network - def test_wdi_search(self): - - # Test that a name column exists, and that some results were returned - # ...without being too strict about what the actual contents of the - # results actually are. The fact that there are some, is good enough. - - result = search('gdp.*capita.*constant') - self.assertTrue(result.name.str.contains('GDP').any()) - - @tm.slow - @network - def test_wdi_download(self): - - # Test a bad indicator with double (US), triple (USA), - # standard (CA, MX), non standard (KSV), - # duplicated (US, US, USA), and unknown (BLA) country codes - - # ...but NOT a crash inducing country code (World bank strips pandas - # users of the luxury of laziness, because they create their - # own exceptions, and don't clean up legacy country codes. - # ...but NOT a retired indicator (User should want it to error.) - - cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA'] - inds = ['NY.GDP.PCAP.CD','BAD.INDICATOR'] - - expected = {'NY.GDP.PCAP.CD': {('Canada', '2003'): 28026.006013044702, ('Mexico', '2003'): 6601.0420648056606, ('Canada', '2004'): 31829.522562759001, ('Kosovo', '2003'): 1969.56271307405, ('Mexico', '2004'): 7042.0247834044303, ('United States', '2004'): 41928.886136479705, ('United States', '2003'): 39682.472247320402, ('Kosovo', '2004'): 2135.3328465238301}} - expected = pandas.DataFrame(expected) - #Round, to ignore revisions to data. - expected = pandas.np.round(expected,decimals=-3) - expected.sort(inplace=True) - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - result.sort(inplace=True) - #Round, to ignore revisions to data. - result = pandas.np.round(result,decimals=-3) - expected.index = result.index - assert_frame_equal(result, pandas.DataFrame(expected)) - - @tm.slow - @network - def test_wdi_download_w_retired_indicator(self): - - cntry_codes = ['CA', 'MX', 'US'] - # Despite showing up in the search feature, and being listed online, - # the api calls to GDPPCKD don't work in their own query builder, nor - # pandas module. GDPPCKD used to be a common symbol. - # This test is written to ensure that error messages to pandas users - # continue to make sense, rather than a user getting some missing - # key error, cause their JSON message format changed. If - # World bank ever finishes the deprecation of this symbol, - # this nose test should still pass. - - inds = ['GDPPCKD'] - - try: - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - # If for some reason result actually ever has data, it's cause WB - # fixed the issue with this ticker. Find another bad one. - except ValueError as e: - raise nose.SkipTest("No indicators returned data: {0}".format(e)) - - # if it ever gets here, it means WB unretired the indicator. 
- # even if they dropped it completely, it would still get caught above - # or the WB API changed somehow in a really unexpected way. - if len(result) > 0: - raise nose.SkipTest("Invalid results") - - @tm.slow - @network - def test_wdi_download_w_crash_inducing_countrycode(self): - - cntry_codes = ['CA', 'MX', 'US', 'XXX'] - inds = ['NY.GDP.PCAP.CD'] - - try: - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - except ValueError as e: - raise nose.SkipTest("No indicators returned data: {0}".format(e)) - - # if it ever gets here, it means the country code XXX got used by WB - # or the WB API changed somehow in a really unexpected way. - if len(result) > 0: - raise nose.SkipTest("Invalid results") - - @tm.slow - @network - def test_wdi_get_countries(self): - result = get_countries() - self.assertTrue('Zimbabwe' in list(result['name'])) - self.assertTrue(len(result) > 100) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/wb.py b/pandas/io/wb.py index 81b4947f06b16..5dc4d9ce1adc4 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -1,314 +1,6 @@ -# -*- coding: utf-8 -*- - -# flake8: noqa - -from __future__ import print_function - -from pandas.compat import map, reduce, range, lrange -from pandas.io.common import urlopen -from pandas.io import json -import pandas -import numpy as np -import warnings - -warnings.warn("\n" - "The pandas.io.wb module is moved to a separate package " - "(pandas-datareader) and will be removed from pandas in a " - "future version.\nAfter installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.", - FutureWarning) - - -# This list of country codes was pulled from wikipedia during October 2014. -# While some exceptions do exist, it is the best proxy for countries supported -# by World Bank. It is an aggregation of the 2-digit ISO 3166-1 alpha-2, and -# 3-digit ISO 3166-1 alpha-3, codes, with 'all', 'ALL', and 'All' appended ot -# the end. 
- -country_codes = ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR', \ - 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD', 'BE', \ - 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', \ - 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD', \ - 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', \ - 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', \ - 'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'FI', \ - 'FJ', 'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF', \ - 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', \ - 'GT', 'GU', 'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', \ - 'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', \ - 'JE', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', \ - 'KP', 'KR', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', \ - 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', \ - 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', \ - 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', \ - 'NC', 'NE', 'NF', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NU', \ - 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG', 'PH', 'PK', 'PL', 'PM', \ - 'PN', 'PR', 'PS', 'PT', 'PW', 'PY', 'QA', 'RE', 'RO', 'RS', \ - 'RU', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE', 'SG', 'SH', 'SI', \ - 'SJ', 'SK', 'SL', 'SM', 'SN', 'SO', 'SR', 'SS', 'ST', 'SV', \ - 'SX', 'SY', 'SZ', 'TC', 'TD', 'TF', 'TG', 'TH', 'TJ', 'TK', \ - 'TL', 'TM', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'TZ', 'UA', \ - 'UG', 'UM', 'US', 'UY', 'UZ', 'VA', 'VC', 'VE', 'VG', 'VI', \ - 'VN', 'VU', 'WF', 'WS', 'YE', 'YT', 'ZA', 'ZM', 'ZW', \ - 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', \ - 'ARG', 'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', \ - 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', \ - 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', \ - 'BRA', 'BRB', 'BRN', 'BTN', 'BVT', 'BWA', 'CAF', 'CAN', \ - 'CCK', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', \ - 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CXR', \ - 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', \ - 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', \ - 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', \ - 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', \ - 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', \ - 'HKG', 'HMD', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', \ - 'IND', 'IOT', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', \ - 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', \ - 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', \ - 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', \ - 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', \ - 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', \ - 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', \ - 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', \ - 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', 'PER', \ - 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', \ - 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', \ - 'SDN', 'SEN', 'SGP', 'SGS', 'SHN', 'SJM', 'SLB', 'SLE', \ - 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', 'SUR', \ - 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', \ - 'TCD', 'TGO', 'THA', 'TJK', 'TKL', 'TKM', 'TLS', 'TON', \ - 'TTO', 'TUN', 'TUR', 'TUV', 'TWN', 'TZA', 'UGA', 'UKR', \ - 'UMI', 'URY', 'USA', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', \ - 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', \ - 'ZWE', 'all', 'ALL', 'All'] - -def download(country=None, indicator=None, - 
start=2003, end=2005,errors='warn'): - """ - Download data series from the World Bank's World Development Indicators - - Parameters - ---------- - - indicator: string or list of strings - taken from the ``id`` field in ``WDIsearch()`` - - country: string or list of strings. - ``all`` downloads data for all countries - 2 or 3 character ISO country codes select individual - countries (e.g.``US``,``CA``) or (e.g.``USA``,``CAN``). The codes - can be mixed. - - The two ISO lists of countries, provided by wikipedia, are hardcoded - into pandas as of 11/10/2014. - - start: int - First year of the data series - - end: int - Last year of the data series (inclusive) - - errors: str {'ignore', 'warn', 'raise'}, default 'warn' - Country codes are validated against a hardcoded list. This controls - the outcome of that validation, and attempts to also apply - to the results from world bank. - - errors='raise', will raise a ValueError on a bad country code. - - Returns - ------- - - ``pandas`` DataFrame with columns: country, iso_code, year, - indicator value. - - """ - if country is None: - country = ['MX', 'CA', 'US'] - if indicator is None: - indicator = ['NY.GDP.MKTP.CD', 'NY.GNS.ICTR.ZS'] - - if type(country) == str: - country = [country] - - bad_countries = np.setdiff1d(country, country_codes) - - # Validate the input - if len(bad_countries) > 0: - tmp = ", ".join(bad_countries) - if errors == 'raise': - raise ValueError("Invalid Country Code(s): %s" % tmp) - if errors == 'warn': - warnings.warn('Non-standard ISO country codes: %s' % tmp) - - # Work with a list of indicators - if type(indicator) == str: - indicator = [indicator] - - # Download - data = [] - bad_indicators = {} - for ind in indicator: - one_indicator_data,msg = _get_data(ind, country, start, end) - if msg == "Success": - data.append(one_indicator_data) - else: - bad_indicators[ind] = msg - - if len(bad_indicators.keys()) > 0: - bad_ind_msgs = [i + " : " + m for i,m in bad_indicators.items()] - bad_ind_msgs = "\n\n".join(bad_ind_msgs) - bad_ind_msgs = "\n\nInvalid Indicators:\n\n%s" % bad_ind_msgs - if errors == 'raise': - raise ValueError(bad_ind_msgs) - if errors == 'warn': - warnings.warn(bad_ind_msgs) - - # Confirm we actually got some data, and build Dataframe - if len(data) > 0: - out = reduce(lambda x, y: x.merge(y, how='outer'), data) - out = out.drop('iso_code', axis=1) - out = out.set_index(['country', 'year']) - out = out._convert(datetime=True, numeric=True) - return out - else: - msg = "No indicators returned data." - if errors == 'ignore': - msg += " Set errors='warn' for more information." 
- raise ValueError(msg) - -def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', - start=2002, end=2005): - - if type(country) == str: - country = [country] - - countries = ';'.join(country) - - # Build URL for api call - url = ("http://api.worldbank.org/countries/" + countries + "/indicators/" + - indicator + "?date=" + str(start) + ":" + str(end) + - "&per_page=25000&format=json") - - # Download - with urlopen(url) as response: - data = response.read() - - # Check to see if there is a possible problem - possible_message = json.loads(data)[0] - if 'message' in possible_message.keys(): - msg = possible_message['message'][0] - try: - msg = msg['key'].split() + ["\n "] + msg['value'].split() - wb_err = ' '.join(msg) - except: - wb_err = "" - if 'key' in msg.keys(): - wb_err = msg['key'] + "\n " - if 'value' in msg.keys(): - wb_err += msg['value'] - error_msg = "Problem with a World Bank Query \n %s" - return None, error_msg % wb_err - - if 'total' in possible_message.keys(): - if possible_message['total'] == 0: - return None, "No results from world bank." - - # Parse JSON file - data = json.loads(data)[1] - country = [x['country']['value'] for x in data] - iso_code = [x['country']['id'] for x in data] - year = [x['date'] for x in data] - value = [x['value'] for x in data] - # Prepare output - out = pandas.DataFrame([country, iso_code, year, value]).T - out.columns = ['country', 'iso_code', 'year', indicator] - return out,"Success" - -def get_countries(): - """Query information about countries - """ - url = 'http://api.worldbank.org/countries/?per_page=1000&format=json' - with urlopen(url) as response: - data = response.read() - data = json.loads(data)[1] - data = pandas.DataFrame(data) - data.adminregion = [x['value'] for x in data.adminregion] - data.incomeLevel = [x['value'] for x in data.incomeLevel] - data.lendingType = [x['value'] for x in data.lendingType] - data.region = [x['value'] for x in data.region] - data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'}) - return data - -def get_indicators(): - """Download information about all World Bank data series - """ - url = 'http://api.worldbank.org/indicators?per_page=50000&format=json' - with urlopen(url) as response: - data = response.read() - data = json.loads(data)[1] - data = pandas.DataFrame(data) - # Clean fields - data.source = [x['value'] for x in data.source] - fun = lambda x: x.encode('ascii', 'ignore') - data.sourceOrganization = data.sourceOrganization.apply(fun) - # Clean topic field - - def get_value(x): - try: - return x['value'] - except: - return '' - fun = lambda x: [get_value(y) for y in x] - data.topics = data.topics.apply(fun) - data.topics = data.topics.apply(lambda x: ' ; '.join(x)) - # Clean outpu - data = data.sort(columns='id') - data.index = pandas.Index(lrange(data.shape[0])) - return data - -_cached_series = None - - -def search(string='gdp.*capi', field='name', case=False): - """ - Search available data series from the world bank - - Parameters - ---------- - - string: string - regular expression - field: string - id, name, source, sourceNote, sourceOrganization, topics - See notes below - case: bool - case sensitive search? - - Notes - ----- - - The first time this function is run it will download and cache the full - list of available series. Depending on the speed of your network - connection, this can take time. Subsequent searches will use the cached - copy, so they should be much faster. - - id : Data series indicator (for use with the ``indicator`` argument of - ``WDI()``) e.g. 
NY.GNS.ICTR.GN.ZS" - name: Short description of the data series - source: Data collection project - sourceOrganization: Data collection organization - note: - sourceNote: - topics: - """ - # Create cached list of series if it does not exist - global _cached_series - if type(_cached_series) is not pandas.core.frame.DataFrame: - _cached_series = get_indicators() - data = _cached_series[field] - idx = data.str.contains(string, case=case) - out = _cached_series.ix[idx].dropna() - return out +raise ImportError( + "The pandas.io.wb module is moved to a separate package " + "(pandas-datareader). After installing the pandas-datareader package " + "(https://github.com/pydata/pandas-datareader), you can change " + "the import ``from pandas.io import data, wb`` to " + "``from pandas_datareader import data, wb``.") From 474fd0519b768efd1c7ec6555930873bf9910e00 Mon Sep 17 00:00:00 2001 From: Michael Scherer Date: Sun, 24 Jul 2016 20:13:43 +0200 Subject: [PATCH 164/359] DOC: Add Fedora and Centos install instructions (#13588) --- doc/source/install.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index b43d2b8aac517..82d2dcd1cc709 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -165,8 +165,9 @@ To install pandas for Python 3 you may need to use the package ``python3-pandas` Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python-pandas`` Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python-pandas`` Ubuntu, unstable (daily builds), `PythonXY PPA `__; activate by: ``sudo add-apt-repository ppa:pythonxy/pythonxy-devel && sudo apt-get update``, ``sudo apt-get install python-pandas`` - OpenSuse & Fedora, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` - + OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` + Fedora, stable, `official Fedora repository `__ , ``dnf install python-pandas`` + Centos/RHEL, stable, `EPEL repository `__ , ``yum install python-pandas`` From 5f524d61fd336b850d34f13d5ffb2b6136073f21 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 25 Jul 2016 07:35:29 -0400 Subject: [PATCH 165/359] ENH: add is_leap_year property for datetime-like closes #13727 Author: sinhrks Closes #13739 from sinhrks/is_leapyear and squashes the following commits: 5d227ee [sinhrks] ENH: add is_leapyear property for datetime-like --- doc/source/api.rst | 2 + doc/source/timeseries.rst | 1 + doc/source/whatsnew/v0.19.0.txt | 6 ++- pandas/src/period.pyx | 3 ++ pandas/tests/series/test_datetime_values.py | 2 +- pandas/tseries/index.py | 53 ++++++++++----------- pandas/tseries/period.py | 20 +++++--- pandas/tseries/tests/test_period.py | 28 ++++++++++- pandas/tseries/tests/test_timeseries.py | 32 +++++++++++-- pandas/tseries/tests/test_util.py | 18 ++++++- pandas/tseries/util.py | 4 ++ pandas/tslib.pyx | 27 +++++++++-- 12 files changed, 149 insertions(+), 47 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index e8fe26e8a525d..7b9fbb9b41a79 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -472,6 +472,7 @@ These can be accessed like ``Series.dt.``. 
Series.dt.is_quarter_end Series.dt.is_year_start Series.dt.is_year_end + Series.dt.is_leap_year Series.dt.daysinmonth Series.dt.days_in_month Series.dt.tz @@ -1497,6 +1498,7 @@ Time/Date Components DatetimeIndex.is_quarter_end DatetimeIndex.is_year_start DatetimeIndex.is_year_end + DatetimeIndex.is_leap_year DatetimeIndex.inferred_freq Selecting diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index da19c6a7d2bec..b8f747757987c 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -560,6 +560,7 @@ There are several time/date properties that one can access from ``Timestamp`` or is_quarter_end,"Logical indicating if last day of quarter (defined by frequency)" is_year_start,"Logical indicating if first day of year (defined by frequency)" is_year_end,"Logical indicating if last day of year (defined by frequency)" + is_leap_year,"Logical indicating if the date belongs to a leap year" Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, see the :ref:`docs ` diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 317383e866464..0d70ff47a416e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -347,6 +347,8 @@ API changes - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) +- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) + .. _whatsnew_0190.api.tolist: @@ -609,7 +611,9 @@ Deprecations - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) - top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) -- ``pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) +- ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) +- ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. Datetime-likes now have a ``.is_leap_year`` property. (:issue:`13727`) + .. 
_whatsnew_0190.prior_deprecations: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 45743d1cf70ff..965ed53a4b802 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -913,6 +913,9 @@ cdef class _Period(object): property daysinmonth: def __get__(self): return self.days_in_month + property is_leap_year: + def __get__(self): + return bool(is_leapyear(self._field(0))) @classmethod def now(cls, freq=None): diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index c25895548dcb9..6211597b4a91b 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -32,7 +32,7 @@ def test_dt_namespace_accessor(self): ok_for_base = ['year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', 'quarter', 'freq', 'days_in_month', - 'daysinmonth'] + 'daysinmonth', 'is_leap_year'] ok_for_period = ok_for_base + ['qyear', 'start_time', 'end_time'] ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq'] ok_for_dt = ok_for_base + ['date', 'time', 'microsecond', 'nanosecond', diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index a1775c11d2226..4a7ba0286aab1 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -72,11 +72,14 @@ def f(self): self.freq.kwds.get('month', 12)) if self.freq else 12) - result = tslib.get_start_end_field( - values, field, self.freqstr, month_kw) + result = tslib.get_start_end_field(values, field, self.freqstr, + month_kw) elif field in ['weekday_name']: result = tslib.get_date_name_field(values, field) return self._maybe_mask_results(result) + elif field in ['is_leap_year']: + # no need to mask NaT + return tslib.get_date_field(values, field) else: result = tslib.get_date_field(values, field) @@ -227,7 +230,8 @@ def _join_i8_wrapper(joinf, **kwargs): 'daysinmonth', 'date', 'time', 'microsecond', 'nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'tz', 'freq', 'weekday_name'] + 'is_year_end', 'tz', 'freq', 'weekday_name', + 'is_leap_year'] _is_numeric_dtype = False _infer_as_myclass = True @@ -1521,29 +1525,21 @@ def _set_freq(self, value): doc="get/set the frequncy of the Index") year = _field_accessor('year', 'Y', "The year of the datetime") - month = _field_accessor( - 'month', 'M', "The month as January=1, December=12") + month = _field_accessor('month', 'M', + "The month as January=1, December=12") day = _field_accessor('day', 'D', "The days of the datetime") hour = _field_accessor('hour', 'h', "The hours of the datetime") minute = _field_accessor('minute', 'm', "The minutes of the datetime") second = _field_accessor('second', 's', "The seconds of the datetime") - microsecond = _field_accessor( - 'microsecond', - 'us', - "The microseconds of the datetime") - nanosecond = _field_accessor( - 'nanosecond', - 'ns', - "The nanoseconds of the datetime") - weekofyear = _field_accessor( - 'weekofyear', - 'woy', - "The week ordinal of the year") + microsecond = _field_accessor('microsecond', 'us', + "The microseconds of the datetime") + nanosecond = _field_accessor('nanosecond', 'ns', + "The nanoseconds of the datetime") + weekofyear = _field_accessor('weekofyear', 'woy', + "The week ordinal of the year") week = weekofyear - dayofweek = _field_accessor( - 'dayofweek', - 'dow', - "The day of the week with Monday=0, Sunday=6") + dayofweek = _field_accessor('dayofweek', 'dow', + "The day of the week with Monday=0, 
Sunday=6") weekday = dayofweek weekday_name = _field_accessor( @@ -1551,14 +1547,9 @@ def _set_freq(self, value): 'weekday_name', "The name of day in a week (ex: Friday)\n\n.. versionadded:: 0.18.1") - dayofyear = _field_accessor( - 'dayofyear', - 'doy', - "The ordinal day of the year") - quarter = _field_accessor( - 'quarter', - 'q', - "The quarter of the date") + dayofyear = _field_accessor('dayofyear', 'doy', + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', @@ -1588,6 +1579,10 @@ def _set_freq(self, value): 'is_year_end', 'is_year_end', "Logical indicating if last day of year (defined by frequency)") + is_leap_year = _field_accessor( + 'is_leap_year', + 'is_leap_year', + "Logical indicating if the date belongs to a leap year") @property def time(self): diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index dffb71cff526a..810c89b3f969b 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -165,7 +165,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', 'quarter', 'qyear', 'freq', 'days_in_month', 'daysinmonth', - 'to_timestamp', 'asfreq', 'start_time', 'end_time'] + 'to_timestamp', 'asfreq', 'start_time', 'end_time', + 'is_leap_year'] _is_numeric_dtype = False _infer_as_myclass = True @@ -509,17 +510,22 @@ def to_datetime(self, dayfirst=False): second = _field_accessor('second', 7, "The second of the period") weekofyear = _field_accessor('week', 8, "The week ordinal of the year") week = weekofyear - dayofweek = _field_accessor( - 'dayofweek', 10, "The day of the week with Monday=0, Sunday=6") + dayofweek = _field_accessor('dayofweek', 10, + "The day of the week with Monday=0, Sunday=6") weekday = dayofweek - dayofyear = day_of_year = _field_accessor( - 'dayofyear', 9, "The ordinal day of the year") + dayofyear = day_of_year = _field_accessor('dayofyear', 9, + "The ordinal day of the year") quarter = _field_accessor('quarter', 2, "The quarter of the date") qyear = _field_accessor('qyear', 1) - days_in_month = _field_accessor( - 'days_in_month', 11, "The number of days in the month") + days_in_month = _field_accessor('days_in_month', 11, + "The number of days in the month") daysinmonth = days_in_month + @property + def is_leap_year(self): + """ Logical indicating if the date belongs to a leap year """ + return tslib._isleapyear_arr(self.year) + @property def start_time(self): return self.to_timestamp(how='start') diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 7077a61092b9e..88ab239790aa1 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1514,6 +1514,22 @@ def test_asfreq_mult(self): self.assertEqual(result.ordinal, expected.ordinal) self.assertEqual(result.freq, expected.freq) + def test_is_leap_year(self): + # GH 13727 + for freq in ['A', 'M', 'D', 'H']: + p = Period('2000-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + self.assertIsInstance(p.is_leap_year, bool) + + p = Period('1999-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + + p = Period('2004-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + + p = Period('2100-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + class TestPeriodIndex(tm.TestCase): def setUp(self): @@ -3130,9 +3146,10 @@ def test_fields(self): def _check_all_fields(self, periodindex): fields = 
['year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', - 'quarter', 'qyear', 'days_in_month'] + 'quarter', 'qyear', 'days_in_month', 'is_leap_year'] periods = list(periodindex) + s = pd.Series(periodindex) for field in fields: field_idx = getattr(periodindex, field) @@ -3140,6 +3157,14 @@ def _check_all_fields(self, periodindex): for x, val in zip(periods, field_idx): self.assertEqual(getattr(x, field), val) + if len(s) == 0: + continue + + field_s = getattr(s.dt, field) + self.assertEqual(len(periodindex), len(field_s)) + for x, val in zip(periods, field_s): + self.assertEqual(getattr(x, field), val) + def test_is_full(self): index = PeriodIndex([2005, 2007, 2009], freq='A') self.assertFalse(index.is_full) @@ -4569,6 +4594,7 @@ def test_get_period_field_array_raises_on_out_of_range(self): self.assertRaises(ValueError, _period.get_period_field_arr, -1, np.empty(1), 0) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 7b9999bd05c83..09fb4beb74f28 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -969,13 +969,20 @@ def test_nat_vector_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', - 'days_in_month'] + 'days_in_month', 'is_leap_year'] + for field in fields: result = getattr(idx, field) - expected = [getattr(x, field) if x is not NaT else np.nan - for x in idx] + expected = [getattr(x, field) for x in idx] self.assert_numpy_array_equal(result, np.array(expected)) + s = pd.Series(idx) + + for field in fields: + result = getattr(s.dt, field) + expected = [getattr(x, field) for x in idx] + self.assert_series_equal(result, pd.Series(expected)) + def test_nat_scalar_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', @@ -4761,6 +4768,25 @@ def test_timestamp_compare_series(self): result = right_f(Timestamp('nat'), s_nat) tm.assert_series_equal(result, expected) + def test_is_leap_year(self): + # GH 13727 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + dt = Timestamp('2000-01-01 00:00:00', tz=tz) + self.assertTrue(dt.is_leap_year) + self.assertIsInstance(dt.is_leap_year, bool) + + dt = Timestamp('1999-01-01 00:00:00', tz=tz) + self.assertFalse(dt.is_leap_year) + + dt = Timestamp('2004-01-01 00:00:00', tz=tz) + self.assertTrue(dt.is_leap_year) + + dt = Timestamp('2100-01-01 00:00:00', tz=tz) + self.assertFalse(dt.is_leap_year) + + self.assertFalse(pd.NaT.is_leap_year) + self.assertIsInstance(pd.NaT.is_leap_year, bool) + class TestSlicing(tm.TestCase): def test_slice_year(self): diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py index 9d992995df3a7..96da32a4a845c 100644 --- a/pandas/tseries/tests/test_util.py +++ b/pandas/tseries/tests/test_util.py @@ -25,7 +25,9 @@ def test_daily(self): annual = pivot_annual(ts, 'D') doy = ts.index.dayofyear - doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 for i in range(1, 367): subset = ts[doy == i] @@ -51,7 +53,9 @@ def test_hourly(self): grouped = ts_hourly.groupby(ts_hourly.index.year) hoy = grouped.apply(lambda x: 
x.reset_index(drop=True)) hoy = hoy.index.droplevel(0).values - hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 hoy += 1 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -100,6 +104,16 @@ def test_period_daily(self): def test_period_weekly(self): pass + def test_isleapyear_deprecate(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2000)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertFalse(isleapyear(2001)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2004)) + def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 7bac0567ea5c6..59daa8d7780b4 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -95,6 +95,10 @@ def isleapyear(year): year : integer / sequence A given (list of) year(s). """ + + msg = "isleapyear is deprecated. Use .is_leap_year property instead" + warnings.warn(msg, FutureWarning) + year = np.asarray(year) return np.logical_or(year % 400 == 0, np.logical_and(year % 4 == 0, year % 100 > 0)) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index bc42adbab62b1..56a007bfa352c 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -546,6 +546,10 @@ class Timestamp(_Timestamp): def is_year_end(self): return self._get_start_end_field('is_year_end') + @property + def is_leap_year(self): + return bool(is_leapyear(self.year)) + def tz_localize(self, tz, ambiguous='raise', errors='raise'): """ Convert naive Timestamp to local time zone, or remove @@ -753,6 +757,10 @@ class NaTType(_NaT): # GH 10939 return np.nan + @property + def is_leap_year(self): + return False + def __rdiv__(self, other): return _nat_rdivide_op(self, other) @@ -771,7 +779,8 @@ class NaTType(_NaT): fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', - 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name'] + 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', + 'weekday_name'] for field in fields: prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) @@ -4431,6 +4440,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = days_in_month(dts) return out + elif field == 'is_leap_year': + return _isleapyear_arr(get_date_field(dtindex, 'Y')) raise ValueError("Field %s not supported" % field) @@ -4821,8 +4832,18 @@ def dates_normalized(ndarray[int64_t] stamps, tz=None): # Some general helper functions #---------------------------------------------------------------------- -def isleapyear(int64_t year): - return is_leapyear(year) + +cpdef _isleapyear_arr(ndarray years): + cdef: + ndarray[int8_t] out + + # to make NaT result as False + out = np.zeros(len(years), dtype='int8') + out[np.logical_or(years % 400 == 0, + np.logical_and(years % 4 == 0, + years % 100 > 0))] = 1 + return out.view(bool) + def monthrange(int64_t year, int64_t month): cdef: From 2166ac1394da3fcad4c152cc1cb16e40b89ba08f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 25 Jul 2016 08:00:51 -0400 Subject: [PATCH 166/359] PERF: Improve duplicated perf closes #10235 Author: sinhrks Closes #13751 from sinhrks/perf_duplicated and squashes the 
following commits: 12fb5ac [sinhrks] PERF: Improve duplicated perf --- asv_bench/benchmarks/algorithms.py | 14 +++ doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/algorithms.py | 52 +++++++++- pandas/core/base.py | 20 ++-- pandas/hashtable.pyx | 84 ++++++++++++++++ pandas/lib.pyx | 40 -------- pandas/tests/indexes/test_multi.py | 2 +- pandas/tests/test_algos.py | 150 +++++++++++++++++++++++++++++ pandas/tests/test_lib.py | 39 -------- 9 files changed, 314 insertions(+), 88 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 310a4c5549e4f..6eac7b4831f0f 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -7,6 +7,11 @@ class algorithm(object): def setup(self): N = 100000 + + self.int_unique = pd.Int64Index(np.arange(N * 5)) + # cache is_unique + self.int_unique.is_unique + self.int = pd.Int64Index(np.arange(N).repeat(5)) self.float = pd.Float64Index(np.random.randn(N).repeat(5)) @@ -15,3 +20,12 @@ def time_int_factorize(self): def time_float_factorize(self): self.int.factorize() + + def time_int_unique_duplicated(self): + self.int_unique.duplicated() + + def time_int_duplicated(self): + self.int.duplicated() + + def time_float_duplicated(self): + self.float.duplicated() diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0d70ff47a416e..cdab02265aa5c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -656,6 +656,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) +- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`) - Improved performance of ``Index.difference`` (:issue:`12044`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) - Improved performance of hashing ``Period`` (:issue:`12817`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 96a8582102cc9..52b1a3aae788c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,7 +8,8 @@ from pandas import compat, lib, tslib, _np_version_under1p8 from pandas.types.cast import _maybe_promote -from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex +from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex, + ABCDatetimeIndex) from pandas.types.common import (is_integer_dtype, is_int64_dtype, is_categorical_dtype, @@ -448,6 +449,55 @@ def _value_counts_arraylike(values, dropna=True): return keys, counts +def duplicated(values, keep='first'): + """ + Return boolean ndarray denoting duplicate values + + .. versionadded:: 0.19.0 + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. 
+ + Returns + ------- + duplicated : ndarray + """ + + dtype = values.dtype + + # no need to revert to original type + if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype): + if isinstance(values, (ABCSeries, ABCIndex)): + values = values.values.view(np.int64) + else: + values = values.view(np.int64) + elif is_period_arraylike(values): + from pandas.tseries.period import PeriodIndex + values = PeriodIndex(values).asi8 + elif is_categorical_dtype(dtype): + values = values.values.codes + elif isinstance(values, (ABCSeries, ABCIndex)): + values = values.values + + if is_integer_dtype(dtype): + values = _ensure_int64(values) + duplicated = htable.duplicated_int64(values, keep=keep) + elif is_float_dtype(dtype): + values = _ensure_float64(values) + duplicated = htable.duplicated_float64(values, keep=keep) + else: + values = _ensure_object(values) + duplicated = htable.duplicated_object(values, keep=keep) + + return duplicated + + def mode(values): """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" # must sort because hash order isn't necessarily defined. diff --git a/pandas/core/base.py b/pandas/core/base.py index 8c150d9fbb07e..0f9eb14be40db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,7 +7,7 @@ from pandas.types.missing import isnull from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.types.common import (_ensure_object, is_object_dtype, +from pandas.types.common import (is_object_dtype, is_list_like, is_scalar) from pandas.core import common as com @@ -1014,6 +1014,7 @@ def is_monotonic(self): """ from pandas import Index return Index(self).is_monotonic + is_monotonic_increasing = is_monotonic @property @@ -1171,6 +1172,10 @@ def searchsorted(self, key, side='left', sorter=None): False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): + if isinstance(self, ABCIndexClass): + if self.is_unique: + return self._shallow_copy() + duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] if inplace: @@ -1200,13 +1205,14 @@ def drop_duplicates(self, keep='first', inplace=False): False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) def duplicated(self, keep='first'): - keys = com._values_from_object(_ensure_object(self.values)) - duplicated = lib.duplicated(keys, keep=keep) - try: - return self._constructor(duplicated, + from pandas.core.algorithms import duplicated + if isinstance(self, ABCIndexClass): + if self.is_unique: + return np.zeros(len(self), dtype=np.bool) + return duplicated(self, keep=keep) + else: + return self._constructor(duplicated(self, keep=keep), index=self.index).__finalize__(self) - except AttributeError: - return np.array(duplicated, dtype=bool) # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index e1c3733a0449d..18e54621e8bf5 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1073,6 +1073,90 @@ def mode_int64(int64_t[:] values): return modes[:j+1] + +def duplicated_object(ndarray[object] values, object keep='first'): + cdef: + Py_ssize_t i, n + dict seen = dict() + object row + + n = len(values) + cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) + + if keep == 'last': + for i from n > i >= 0: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = i + result[i] = 0 + elif keep == 'first': + for i from 0 <= i < n: + row = 
values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = i + result[i] = 0 + elif keep is False: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + result[seen[row]] = 1 + else: + seen[row] = i + result[i] = 0 + else: + raise ValueError('keep must be either "first", "last" or False') + + return result.view(np.bool_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_float64(ndarray[float64_t, ndim=1] values, + object keep='first'): + cdef: + int ret = 0, k + float64_t value + Py_ssize_t i, n = len(values) + kh_float64_t * table = kh_init_float64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_float64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_float64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_float64(table) + return out + + @cython.wraparound(False) @cython.boundscheck(False) def duplicated_int64(ndarray[int64_t, ndim=1] values, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index bf1dd1246120b..0473ae79adce5 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1394,46 +1394,6 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null): return result -def duplicated(ndarray[object] values, object keep='first'): - cdef: - Py_ssize_t i, n - dict seen = dict() - object row - - n = len(values) - cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) - - if keep == 'last': - for i from n > i >= 0: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep == 'first': - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep is False: - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - result[seen[row]] = 1 - else: - seen[row] = i - result[i] = 0 - else: - raise ValueError('keep must be either "first", "last" or False') - - return result.view(np.bool_) - - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 0b65b6a9d09f5..408f81fe1e982 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1860,7 +1860,7 @@ def check(nlevels, with_nulls): for keep in ['first', 'last', False]: left = mi.duplicated(keep=keep) - right = pd.lib.duplicated(mi.values, keep=keep) + right = pd.hashtable.duplicated_object(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3c77d19aa7f3c..9535a3f97955c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -667,6 +667,156 @@ def test_value_counts_normalized(self): tm.assert_series_equal(result, expected) +class TestDuplicated(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_duplicated_with_nas(self): + keys = np.array([0, 1, np.nan, 0, 2, np.nan], 
dtype=object) + + result = algos.duplicated(keys) + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='first') + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='last') + expected = np.array([True, False, True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array([True, False, True, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, + [0, np.nan, 0, np.nan] * 2)): + keys[i] = t + + result = algos.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = np.array(falses + trues) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='last') + expected = np.array(trues + falses) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array(trues + trues) + tm.assert_numpy_array_equal(result, expected) + + def test_numeric_object_likes(self): + cases = [np.array([1, 2, 1, 5, 3, + 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, + 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, + 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), + np.array(['a', 'b', 'a', 'e', 'c', + 'b', 'd', 'a', 'e', 'f'], dtype=object)] + + exp_first = np.array([False, False, True, False, False, + True, False, True, True, False]) + exp_last = np.array([True, True, True, True, False, + False, False, False, False, False]) + exp_false = exp_first | exp_last + + for case in cases: + res_first = algos.duplicated(case, keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [pd.Index(case), pd.Index(case, dtype='category')]: + res_first = idx.duplicated(keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [pd.Series(case), pd.Series(case, dtype='category')]: + res_first = s.duplicated(keep='first') + tm.assert_series_equal(res_first, pd.Series(exp_first)) + + res_last = s.duplicated(keep='last') + tm.assert_series_equal(res_last, pd.Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, pd.Series(exp_false)) + + def test_datetime_likes(self): + + dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03', + '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06'] + td = ['1 days', '2 days', '1 days', 'NaT', '3 days', + '2 days', '4 days', '1 days', 'NaT', '6 days'] + + cases = [np.array([pd.Timestamp(d) for d in dt]), + np.array([pd.Timestamp(d, tz='US/Eastern') for d in dt]), + np.array([pd.Period(d, freq='D') for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td])] + + exp_first = np.array([False, False, True, False, False, + True, False, True, True, False]) + exp_last = np.array([True, True, True, True, False, + False, False, False, False, 
False]) + exp_false = exp_first | exp_last + + for case in cases: + print(case) + res_first = algos.duplicated(case, keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [pd.Index(case), pd.Index(case, dtype='category')]: + res_first = idx.duplicated(keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [pd.Series(case), pd.Series(case, dtype='category')]: + res_first = s.duplicated(keep='first') + tm.assert_series_equal(res_first, pd.Series(exp_first)) + + res_last = s.duplicated(keep='last') + tm.assert_series_equal(res_last, pd.Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, pd.Series(exp_false)) + + def test_unique_index(self): + cases = [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)] + for case in cases: + self.assertTrue(case.is_unique) + tm.assert_numpy_array_equal(case.duplicated(), + np.array([False, False, False])) + + class GroupVarTestMixin(object): def test_group_var_generic_1d(self): diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 80b5e41e881cd..945f8004687cd 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -234,45 +234,6 @@ def test_empty_like(self): self._check_behavior(arr, expected) -def test_duplicated_with_nas(): - keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) - - result = lib.duplicated(keys) - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='first') - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = [True, False, True, False, False, False] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = [True, False, True, True, False, True] - assert (np.array_equal(result, expected)) - - keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, - [0, np.nan, 0, np.nan] * 2)): - keys[i] = t - - result = lib.duplicated(keys) - falses = [False] * 4 - trues = [True] * 4 - expected = falses + trues - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = trues + falses - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = trues + trues - assert (np.array_equal(result, expected)) - - if __name__ == '__main__': import nose From 4c2840e1b055771e740c8096a295ba6136d8302d Mon Sep 17 00:00:00 2001 From: mpuels Date: Mon, 25 Jul 2016 17:07:00 +0200 Subject: [PATCH 167/359] BUG: Fix .to_excel() for MultiIndex containing a NaN value #13511 (#13551) --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/formats/format.py | 6 +++++- pandas/io/tests/test_excel.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cdab02265aa5c..04a749dfbc5bc 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -772,3 +772,5 @@ Bug Fixes - Bug where 
``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) + +- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 436a9d5d5d4c8..50d54ddb95100 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1839,7 +1839,11 @@ def _format_hierarchical_rows(self): for spans, levels, labels in zip(level_lengths, self.df.index.levels, self.df.index.labels): - values = levels.take(labels) + + values = levels.take(labels, + allow_fill=levels._can_hold_na, + fill_value=True) + for i in spans: if spans[i] > 1: yield ExcelCell(self.rowcounter + i, gcolidx, diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 55a7f5350719d..34e47ebcfcf5a 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1328,6 +1328,20 @@ def test_to_excel_multiindex(self): parse_dates=False) tm.assert_frame_equal(frame, df) + # GH13511 + def test_to_excel_multiindex_nan_label(self): + _skip_if_no_xlrd() + + frame = pd.DataFrame({'A': [None, 2, 3], + 'B': [10, 20, 30], + 'C': np.random.sample(3)}) + frame = frame.set_index(['A', 'B']) + + with ensure_clean(self.ext) as path: + frame.to_excel(path, merge_cells=self.merge_cells) + df = read_excel(path, index_col=[0, 1]) + tm.assert_frame_equal(frame, df) + # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells From 4dd734c9c3f38ddc19fa77fea71abfb6c1f3f2ba Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Mon, 25 Jul 2016 08:14:31 -0700 Subject: [PATCH 168/359] DOC: fix slashes in read_csv line_terminator/sep kwargs descriptions (#13761) --- pandas/core/frame.py | 4 ++-- pandas/io/parsers.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fe05b3715f45d..4ffd9c5466b6c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1306,7 +1306,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.', **kwds): - """Write DataFrame to a comma-separated values (csv) file + r"""Write DataFrame to a comma-separated values (csv) file Parameters ---------- @@ -1343,7 +1343,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, a string representing the compression to use in the output file, allowed values are 'gzip', 'bz2', 'xz', only used when the first argument is a filename - line_terminator : string, default '\\n' + line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file quoting : optional constant from csv module diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f6a84ea9debaa..bedf21318aa83 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -277,11 +277,11 @@ Parser engine to use. 
The C engine is faster while the python engine is currently more feature-complete.""" -_sep_doc = """sep : str, default {default} +_sep_doc = r"""sep : str, default {default} Delimiter to use. If sep is None, will try to automatically determine - this. Separators longer than 1 character and different from '\s+' will be - interpreted as regular expressions, will force use of the python parsing - engine and will ignore quotes in the data. Regex example: '\\r\\t'""" + this. Separators longer than 1 character and different from ``'\s+'`` will + be interpreted as regular expressions, will force use of the python parsing + engine and will ignore quotes in the data. Regex example: ``'\r\t'``""" _read_csv_doc = """ Read CSV (comma-separated) file into DataFrame From 136794524a8f8954c0b8bf8f0327870d83cbf8d4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 25 Jul 2016 17:55:50 -0400 Subject: [PATCH 169/359] CLN: Removed copy parameter in xs_* methods Title is self-explanatory. Picks up where #6919 left off. Author: gfyoung Closes #13781 from gfyoung/xs-copy-remove and squashes the following commits: c314bc1 [gfyoung] CLN: Removed copy parameter in xs_* methods --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/generic.py | 14 ++++---------- pandas/core/panel.py | 24 +++--------------------- pandas/tests/test_panel.py | 7 ------- 4 files changed, 8 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 04a749dfbc5bc..089ff94bbbb8c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -629,6 +629,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) - ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) +- ``Series.xs``, ``DataFrame.xs``, ``Panel.xs``, ``Panel.major_xs``, and ``Panel.minor_xs`` have dropped the ``copy`` parameter (:issue:`13781`) - ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 005d5467c14cd..f57b94fe0a326 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1671,7 +1671,7 @@ def take(self, indices, axis=0, convert=True, is_copy=True, **kwargs): return result - def xs(self, key, axis=0, level=None, copy=None, drop_level=True): + def xs(self, key, axis=0, level=None, drop_level=True): """ Returns a cross-section (row(s) or column(s)) from the Series/DataFrame. Defaults to cross-section on the rows (axis=0). @@ -1685,8 +1685,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): level : object, defaults to first n levels (n=1 or len(key)) In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position. - copy : boolean [deprecated] - Whether to make a copy of the data drop_level : boolean, default True If False, returns object with same levels as self. 
@@ -1742,10 +1740,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - axis = self._get_axis_number(axis) labels = self._get_axis(axis) if level is not None: @@ -1800,9 +1794,9 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): if not is_list_like(new_values) or self.ndim == 1: return _maybe_box_datetimelike(new_values) - result = self._constructor_sliced(new_values, index=self.columns, - name=self.index[loc], copy=copy, - dtype=new_values.dtype) + result = self._constructor_sliced( + new_values, index=self.columns, + name=self.index[loc], dtype=new_values.dtype) else: result = self.iloc[loc] diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 4d61563cccce5..1d49ac5e2be86 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -758,7 +758,7 @@ def _combine_panel(self, other, func): return self._constructor(result_values, items, major, minor) - def major_xs(self, key, copy=None): + def major_xs(self, key): """ Return slice of panel along major axis @@ -766,8 +766,6 @@ def major_xs(self, key, copy=None): ---------- key : object Major axis label - copy : boolean [deprecated] - Whether to make a copy of the data Returns ------- @@ -783,13 +781,9 @@ def major_xs(self, key, copy=None): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - return self.xs(key, axis=self._AXIS_LEN - 2) - def minor_xs(self, key, copy=None): + def minor_xs(self, key): """ Return slice of panel along minor axis @@ -797,8 +791,6 @@ def minor_xs(self, key, copy=None): ---------- key : object Minor axis label - copy : boolean [deprecated] - Whether to make a copy of the data Returns ------- @@ -814,13 +806,9 @@ def minor_xs(self, key, copy=None): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - return self.xs(key, axis=self._AXIS_LEN - 1) - def xs(self, key, axis=1, copy=None): + def xs(self, key, axis=1): """ Return slice of panel along selected axis @@ -829,8 +817,6 @@ def xs(self, key, axis=1, copy=None): key : object Label axis : {'items', 'major', 'minor}, default 1/'major' - copy : boolean [deprecated] - Whether to make a copy of the data Returns ------- @@ -845,10 +831,6 @@ def xs(self, key, axis=1, copy=None): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - axis = self._get_axis_number(axis) if axis == 0: return self[key] diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index d9c7c1dc0dc62..a37e7ca732bde 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2597,13 +2597,6 @@ def test_panel_index(): tm.assert_index_equal(index, expected) -def test_import_warnings(): - # GH8152 - panel = Panel(np.random.rand(3, 3, 3)) - with assert_produces_warning(): - panel.major_xs(1, copy=False) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 309e1fef435dc8bc04ec84a0f3dff6024bb88879 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Jul 2016 19:33:25 -0400 Subject: [PATCH 170/359] DEPR: Panel4d and panelND closes #13564 Author: Jeff Reback Closes #13776 from jreback/panelnd and 
squashes the following commits: 3f55809 [Jeff Reback] DEPR: Panel4D and panelND --- doc/source/dsintro.rst | 8 + doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/api/tests/test_api.py | 5 +- pandas/core/common.py | 8 +- pandas/core/panel4d.py | 15 + pandas/core/panelnd.py | 18 + pandas/io/tests/test_packers.py | 4 +- pandas/io/tests/test_pytables.py | 537 ++++++------- pandas/sparse/panel.py | 2 +- pandas/sparse/tests/test_panel.py | 33 +- pandas/tests/test_generic.py | 102 ++- pandas/tests/test_panel.py | 10 - pandas/tests/test_panel4d.py | 1125 ++++++++++++---------------- pandas/tests/test_panelnd.py | 116 +-- pandas/tests/types/test_missing.py | 9 +- pandas/tools/tests/test_concat.py | 40 +- pandas/util/testing.py | 1 - 17 files changed, 992 insertions(+), 1043 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index bbb43396a85b9..6334167b2c746 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -941,6 +941,10 @@ method: Panel4D (Experimental) ---------------------- +.. warning:: + + In 0.19.0 ``Panel4D`` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data is with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. + ``Panel4D`` is a 4-Dimensional named container very much like a ``Panel``, but having 4 named dimensions. It is intended as a test bed for more N-Dimensional named containers. @@ -1026,6 +1030,10 @@ copy by default unless the data are heterogeneous): PanelND (Experimental) ---------------------- +.. warning:: + + In 0.19.0 ``PanelND`` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data is with the `xarray package `__. + PanelND is a module with a set of factory functions to enable a user to construct N-dimensional named containers like Panel4D, with a custom set of axis labels. Thus a domain-specific container can easily be created. diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 089ff94bbbb8c..9aa206adc513f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -613,7 +613,7 @@ Deprecations - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) - ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. Datetime-likes now have a ``.is_leap_year`` property. (:issue:`13727`) - +- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data is with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. (:issue:`13564`) ..
_whatsnew_0190.prior_deprecations: diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 0aefdbeae0518..8143f925af3e0 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -58,11 +58,10 @@ class TestPDApi(Base, tm.TestCase): # these are already deprecated; awaiting removal deprecated_classes = ['SparsePanel', 'TimeSeries', 'WidePanel', - 'SparseTimeSeries'] + 'SparseTimeSeries', 'Panel4D'] # these should be deperecated in the future - deprecated_classes_in_future = ['Panel', 'Panel4D', - 'SparseList', 'Term'] + deprecated_classes_in_future = ['SparseList', 'Term', 'Panel'] # these should be removed from top-level namespace remove_classes_from_top_level_namespace = ['Expr'] diff --git a/pandas/core/common.py b/pandas/core/common.py index 99dd2e9f5b8a9..054b899f1ded2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -14,7 +14,7 @@ from pandas.compat import long, zip, iteritems from pandas.core.config import get_option from pandas.types.generic import ABCSeries -from pandas.types.common import _NS_DTYPE, is_integer +from pandas.types.common import _NS_DTYPE from pandas.types.inference import _iterable_not_string from pandas.types.missing import isnull from pandas.api import types @@ -31,7 +31,7 @@ def wrapper(*args, **kwargs): warnings.warn("pandas.core.common.{t} is deprecated. " "import from the public API: " "pandas.api.types.{t} instead".format(t=t), - FutureWarning, stacklevel=2) + FutureWarning, stacklevel=3) return getattr(types, t)(*args, **kwargs) return wrapper @@ -57,7 +57,7 @@ def wrapper(*args, **kwargs): "These are not longer public API functions, " "but can be imported from " "pandas.types.common.{t} instead".format(t=t), - FutureWarning, stacklevel=2) + FutureWarning, stacklevel=3) return getattr(common, t)(*args, **kwargs) return wrapper @@ -578,7 +578,7 @@ def _random_state(state=None): np.random.RandomState """ - if is_integer(state): + if types.is_integer(state): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state diff --git a/pandas/core/panel4d.py b/pandas/core/panel4d.py index 33bd79195cc77..f32de29c5c167 100644 --- a/pandas/core/panel4d.py +++ b/pandas/core/panel4d.py @@ -1,5 +1,6 @@ """ Panel4D: a 4-d dict like collection of panels """ +import warnings from pandas.core.panelnd import create_nd_panel_factory from pandas.core.panel import Panel @@ -18,6 +19,11 @@ having 4 named dimensions. It is intended as a test bed for more N-Dimensional named containers. + DEPRECATED. Panel4D is deprecated and will be removed in a future version. + The recommended way to represent these types of n-dimensional data are with + the `xarray package `__. + Pandas provides a `.to_xarray()` method to automate this conversion. 
+ Parameters ---------- data : ndarray (labels x items x major x minor), or dict of Panels @@ -37,6 +43,15 @@ def panel4d_init(self, data=None, labels=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None): + # deprecation GH13564 + warnings.warn("\nPanel4D is deprecated and will be removed in a " + "future version.\nThe recommended way to represent " + "these types of n-dimensional data are with\n" + "the `xarray package " + "`__.\n" + "Pandas provides a `.to_xarray()` method to help " + "automate this conversion.\n", + FutureWarning, stacklevel=2) self._init_data(data=data, labels=labels, items=items, major_axis=major_axis, minor_axis=minor_axis, copy=copy, dtype=dtype) diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 04fbaab30b42e..26ceeea654e4e 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -1,5 +1,6 @@ """ Factory methods to create N-D panels """ +import warnings from pandas.compat import zip import pandas.compat as compat @@ -8,6 +9,11 @@ def create_nd_panel_factory(klass_name, orders, slices, slicer, aliases=None, stat_axis=2, info_axis=0, ns=None): """ manufacture a n-d class: + DEPRECATED. Panelnd is deprecated and will be removed in a future version. + The recommended way to represent these types of n-dimensional data are with + the `xarray package `__. + Pandas provides a `.to_xarray()` method to automate this conversion. + Parameters ---------- klass_name : the klass name @@ -44,6 +50,18 @@ def create_nd_panel_factory(klass_name, orders, slices, slicer, aliases=None, # define the methods #### def __init__(self, *args, **kwargs): + + # deprecation GH13564 + warnings.warn("\n{klass} is deprecated and will be removed in a " + "future version.\nThe recommended way to represent " + "these types of n-dimensional data are with the\n" + "`xarray package " + "`__.\n" + "Pandas provides a `.to_xarray()` method to help " + "automate this conversion.\n".format( + klass=self.__class__.__name__), + FutureWarning, stacklevel=2) + if not (kwargs.get('data') or len(args)): raise Exception("must supply at least a data argument to [%s]" % klass_name) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 0a491a69af8e2..fe5972d35d5ec 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -1,6 +1,5 @@ import nose -import warnings import os import datetime import numpy as np @@ -545,7 +544,8 @@ def test_sparse_frame(self): def test_sparse_panel(self): - with warnings.catch_warnings(record=True): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): items = ['x', 'y', 'z'] p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ab5362da21a7d..89d2f13f256fe 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -133,13 +133,15 @@ def _maybe_remove(store, key): pass -def compat_assert_produces_warning(w, f): +@contextmanager +def compat_assert_produces_warning(w): """ don't produce a warning under PY3 """ if compat.PY3: - f() + yield else: - with tm.assert_produces_warning(expected_warning=w): - f() + with tm.assert_produces_warning(expected_warning=w, + check_stacklevel=False): + yield class Base(tm.TestCase): @@ -808,28 +810,30 @@ def test_append(self): assert_panel_equal(store['wp1'], wp) # ndim - p4d = tm.makePanel4D() - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :]) - store.append('p4d', p4d.ix[:, :, 
10:, :]) - assert_panel4d_equal(store['p4d'], p4d) - - # test using axis labels - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=[ - 'items', 'major_axis', 'minor_axis']) - store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'items', 'major_axis', 'minor_axis']) - assert_panel4d_equal(store['p4d'], p4d) - - # test using differnt number of items on each axis - p4d2 = p4d.copy() - p4d2['l4'] = p4d['l1'] - p4d2['l5'] = p4d['l1'] - _maybe_remove(store, 'p4d2') - store.append( - 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) - assert_panel4d_equal(store['p4d2'], p4d2) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + p4d = tm.makePanel4D() + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :]) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + + # test using axis labels + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + assert_panel4d_equal(store['p4d'], p4d) + + # test using differnt number of items on each axis + p4d2 = p4d.copy() + p4d2['l4'] = p4d['l1'] + p4d2['l5'] = p4d['l1'] + _maybe_remove(store, 'p4d2') + store.append( + 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) + assert_panel4d_equal(store['p4d2'], p4d2) # test using differt order of items on the non-index axes _maybe_remove(store, 'wp1') @@ -1220,72 +1224,74 @@ def test_append_with_different_block_ordering(self): self.assertRaises(ValueError, store.append, 'df', df) def test_ndim_indexables(self): - """ test using ndim tables in new ways""" - - with ensure_clean_store(self.path) as store: - - p4d = tm.makePanel4D() - - def check_indexers(key, indexers): - for i, idx in enumerate(indexers): - self.assertTrue(getattr(getattr( - store.root, key).table.description, idx)._v_pos == i) - - # append then change (will take existing schema) - indexers = ['items', 'major_axis', 'minor_axis'] - - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store.select('p4d'), p4d) - check_indexers('p4d', indexers) - - # same as above, but try to append with differnt axes - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'labels', 'items', 'major_axis']) - assert_panel4d_equal(store.select('p4d'), p4d) - check_indexers('p4d', indexers) - - # pass incorrect number of axes - _maybe_remove(store, 'p4d') - self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[ - :, :, :10, :], axes=['major_axis', 'minor_axis']) - - # different than default indexables #1 - indexers = ['labels', 'major_axis', 'minor_axis'] - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store['p4d'], p4d) - check_indexers('p4d', indexers) - - # different than default indexables #2 - indexers = ['major_axis', 'labels', 'minor_axis'] - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store['p4d'], p4d) - check_indexers('p4d', indexers) - - # partial selection - result = store.select('p4d', ['labels=l1']) - expected = p4d.reindex(labels=['l1']) - assert_panel4d_equal(result, expected) - - # 
partial selection2 - result = store.select('p4d', [Term( - 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) - expected = p4d.reindex( - labels=['l1'], items=['ItemA'], minor_axis=['B']) - assert_panel4d_equal(result, expected) - - # non-existant partial selection - result = store.select('p4d', [Term( - 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) - expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) - assert_panel4d_equal(result, expected) + # test using ndim tables in new ways + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with ensure_clean_store(self.path) as store: + + p4d = tm.makePanel4D() + + def check_indexers(key, indexers): + for i, idx in enumerate(indexers): + descr = getattr(store.root, key).table.description + self.assertTrue(getattr(descr, idx)._v_pos == i) + + # append then change (will take existing schema) + indexers = ['items', 'major_axis', 'minor_axis'] + + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # same as above, but try to append with differnt axes + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'labels', 'items', 'major_axis']) + assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # pass incorrect number of axes + _maybe_remove(store, 'p4d') + self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[ + :, :, :10, :], axes=['major_axis', 'minor_axis']) + + # different than default indexables #1 + indexers = ['labels', 'major_axis', 'minor_axis'] + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # different than default indexables #2 + indexers = ['major_axis', 'labels', 'minor_axis'] + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # partial selection + result = store.select('p4d', ['labels=l1']) + expected = p4d.reindex(labels=['l1']) + assert_panel4d_equal(result, expected) + + # partial selection2 + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + expected = p4d.reindex( + labels=['l1'], items=['ItemA'], minor_axis=['B']) + assert_panel4d_equal(result, expected) + + # non-existant partial selection + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + expected = p4d.reindex(labels=['l1'], items=[], + minor_axis=['B']) + assert_panel4d_equal(result, expected) def test_append_with_strings(self): @@ -1816,24 +1822,27 @@ def test_append_misc(self): with ensure_clean_store(self.path) as store: - # unsuported data types for non-tables - p4d = tm.makePanel4D() - self.assertRaises(TypeError, store.put, 'p4d', p4d) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): - # unsuported data types - self.assertRaises(TypeError, store.put, 'abc', None) - self.assertRaises(TypeError, store.put, 'abc', '123') - self.assertRaises(TypeError, store.put, 'abc', 123) - self.assertRaises(TypeError, store.put, 'abc', np.arange(5)) + # unsuported data types for non-tables + p4d = 
tm.makePanel4D() + self.assertRaises(TypeError, store.put, 'p4d', p4d) - df = tm.makeDataFrame() - store.append('df', df, chunksize=1) - result = store.select('df') - tm.assert_frame_equal(result, df) + # unsuported data types + self.assertRaises(TypeError, store.put, 'abc', None) + self.assertRaises(TypeError, store.put, 'abc', '123') + self.assertRaises(TypeError, store.put, 'abc', 123) + self.assertRaises(TypeError, store.put, 'abc', np.arange(5)) - store.append('df1', df, expectedrows=10) - result = store.select('df1') - tm.assert_frame_equal(result, df) + df = tm.makeDataFrame() + store.append('df', df, chunksize=1) + result = store.select('df') + tm.assert_frame_equal(result, df) + + store.append('df1', df, expectedrows=10) + result = store.select('df1') + tm.assert_frame_equal(result, df) # more chunksize in append tests def check(obj, comparator): @@ -1855,8 +1864,9 @@ def check(obj, comparator): p = tm.makePanel() check(p, assert_panel_equal) - p4d = tm.makePanel4D() - check(p4d, assert_panel4d_equal) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() + check(p4d, assert_panel4d_equal) # empty frame, GH4273 with ensure_clean_store(self.path) as store: @@ -2022,19 +2032,20 @@ def test_table_mixed_dtypes(self): store.append('p1_mixed', wp) assert_panel_equal(store.select('p1_mixed'), wp) - # ndim - wp = tm.makePanel4D() - wp['obj1'] = 'foo' - wp['obj2'] = 'bar' - wp['bool1'] = wp['l1'] > 0 - wp['bool2'] = wp['l2'] > 0 - wp['int1'] = 1 - wp['int2'] = 2 - wp = wp.consolidate() - - with ensure_clean_store(self.path) as store: - store.append('p4d_mixed', wp) - assert_panel4d_equal(store.select('p4d_mixed'), wp) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # ndim + wp = tm.makePanel4D() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['l1'] > 0 + wp['bool2'] = wp['l2'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() + + with ensure_clean_store(self.path) as store: + store.append('p4d_mixed', wp) + assert_panel4d_equal(store.select('p4d_mixed'), wp) def test_unimplemented_dtypes_table_columns(self): @@ -2355,29 +2366,34 @@ def test_invalid_terms(self): with ensure_clean_store(self.path) as store: - df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.ix[0:4, 'string'] = 'bar' - wp = tm.makePanel() - p4d = tm.makePanel4D() - store.put('df', df, format='table') - store.put('wp', wp, format='table') - store.put('p4d', p4d, format='table') - - # some invalid terms - self.assertRaises(ValueError, store.select, - 'wp', "minor=['A', 'B']") - self.assertRaises(ValueError, store.select, - 'wp', ["index=['20121114']"]) - self.assertRaises(ValueError, store.select, 'wp', [ - "index=['20121114', '20121114']"]) - self.assertRaises(TypeError, Term) - - # more invalid - self.assertRaises(ValueError, store.select, 'df', 'df.index[3]') - self.assertRaises(SyntaxError, store.select, 'df', 'index>') - self.assertRaises(ValueError, store.select, 'wp', - "major_axis<'20000108' & minor_axis['A', 'B']") + with compat_assert_produces_warning(FutureWarning): + + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[0:4, 'string'] = 'bar' + wp = tm.makePanel() + + p4d = tm.makePanel4D() + store.put('df', df, format='table') + store.put('wp', wp, format='table') + store.put('p4d', p4d, format='table') + + # some invalid terms + self.assertRaises(ValueError, store.select, + 'wp', "minor=['A', 'B']") + self.assertRaises(ValueError, store.select, + 'wp', ["index=['20121114']"]) + 
self.assertRaises(ValueError, store.select, 'wp', [ + "index=['20121114', '20121114']"]) + self.assertRaises(TypeError, Term) + + # more invalid + self.assertRaises( + ValueError, store.select, 'df', 'df.index[3]') + self.assertRaises(SyntaxError, store.select, 'df', 'index>') + self.assertRaises( + ValueError, store.select, 'wp', + "major_axis<'20000108' & minor_axis['A', 'B']") # from the docs with ensure_clean_path(self.path) as path: @@ -2404,12 +2420,16 @@ def test_terms(self): with ensure_clean_store(self.path) as store: wp = tm.makePanel() - p4d = tm.makePanel4D() wpneg = Panel.fromDict({-1: tm.makeDataFrame(), 0: tm.makeDataFrame(), 1: tm.makeDataFrame()}) + + with compat_assert_produces_warning(FutureWarning): + + p4d = tm.makePanel4D() + store.put('p4d', p4d, format='table') + store.put('wp', wp, format='table') - store.put('p4d', p4d, format='table') store.put('wpneg', wpneg, format='table') # panel @@ -2425,12 +2445,15 @@ def test_terms(self): tm.assert_panel_equal(result, expected) # p4d - result = store.select('p4d', [Term('major_axis<"20000108"'), - Term("minor_axis=['A', 'B']"), - Term("items=['ItemA', 'ItemB']")]) - expected = p4d.truncate(after='20000108').reindex( - minor=['A', 'B'], items=['ItemA', 'ItemB']) - assert_panel4d_equal(result, expected) + with compat_assert_produces_warning(FutureWarning): + + result = store.select('p4d', + [Term('major_axis<"20000108"'), + Term("minor_axis=['A', 'B']"), + Term("items=['ItemA', 'ItemB']")]) + expected = p4d.truncate(after='20000108').reindex( + minor=['A', 'B'], items=['ItemA', 'ItemB']) + assert_panel4d_equal(result, expected) # back compat invalid terms terms = [dict(field='major_axis', op='>', value='20121114'), @@ -2442,34 +2465,34 @@ def test_terms(self): check_stacklevel=False): Term(t) - # valid terms - terms = [ - ('major_axis=20121114'), - ('major_axis>20121114'), - (("major_axis=['20121114', '20121114']"),), - ('major_axis=datetime.datetime(2012, 11, 14)'), - 'major_axis> 20121114', - 'major_axis >20121114', - 'major_axis > 20121114', - (("minor_axis=['A', 'B']"),), - (("minor_axis=['A', 'B']"),), - ((("minor_axis==['A', 'B']"),),), - (("items=['ItemA', 'ItemB']"),), - ('items=ItemA'), - ] - - for t in terms: - store.select('wp', t) - store.select('p4d', t) - - # valid for p4d only - terms = [ - (("labels=['l1', 'l2']"),), - Term("labels=['l1', 'l2']"), - ] - - for t in terms: - store.select('p4d', t) + with compat_assert_produces_warning(FutureWarning): + + # valid terms + terms = [('major_axis=20121114'), + ('major_axis>20121114'), + (("major_axis=['20121114', '20121114']"),), + ('major_axis=datetime.datetime(2012, 11, 14)'), + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', + (("minor_axis=['A', 'B']"),), + (("minor_axis=['A', 'B']"),), + ((("minor_axis==['A', 'B']"),),), + (("items=['ItemA', 'ItemB']"),), + ('items=ItemA'), + ] + + for t in terms: + store.select('wp', t) + store.select('p4d', t) + + # valid for p4d only + terms = [(("labels=['l1', 'l2']"),), + Term("labels=['l1', 'l2']"), + ] + + for t in terms: + store.select('p4d', t) with tm.assertRaisesRegexp(TypeError, 'Only named functions are supported'): @@ -4405,12 +4428,12 @@ def test_legacy_table_read(self): def test_legacy_0_10_read(self): # legacy from 0.10 - with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_0.10.h5'), - mode='r') as store: - str(store) - for k in store.keys(): - store.select(k) + with compat_assert_produces_warning(FutureWarning): + path = tm.get_data_path('legacy_hdf/legacy_0.10.h5') + 
with ensure_clean_store(path, mode='r') as store: + str(store) + for k in store.keys(): + store.select(k) def test_legacy_0_11_read(self): # legacy from 0.11 @@ -4429,65 +4452,69 @@ def test_legacy_0_11_read(self): def test_copy(self): - def do_copy(f=None, new_f=None, keys=None, propindexes=True, **kwargs): - try: - if f is None: - f = tm.get_data_path(os.path.join('legacy_hdf', - 'legacy_0.10.h5')) - - store = HDFStore(f, 'r') - - if new_f is None: - import tempfile - fd, new_f = tempfile.mkstemp() - - tstore = store.copy( - new_f, keys=keys, propindexes=propindexes, **kwargs) + with compat_assert_produces_warning(FutureWarning): - # check keys - if keys is None: - keys = store.keys() - self.assertEqual(set(keys), set(tstore.keys())) - - # check indicies & nrows - for k in tstore.keys(): - if tstore.get_storer(k).is_table: - new_t = tstore.get_storer(k) - orig_t = store.get_storer(k) - - self.assertEqual(orig_t.nrows, new_t.nrows) - - # check propindixes - if propindexes: - for a in orig_t.axes: - if a.is_indexed: - self.assertTrue(new_t[a.name].is_indexed) - - finally: - safe_close(store) - safe_close(tstore) + def do_copy(f=None, new_f=None, keys=None, + propindexes=True, **kwargs): try: - os.close(fd) - except: - pass - safe_remove(new_f) - - do_copy() - do_copy(keys=['/a', '/b', '/df1_mixed']) - do_copy(propindexes=False) - - # new table - df = tm.makeDataFrame() + if f is None: + f = tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) + + store = HDFStore(f, 'r') + + if new_f is None: + import tempfile + fd, new_f = tempfile.mkstemp() + + tstore = store.copy( + new_f, keys=keys, propindexes=propindexes, **kwargs) + + # check keys + if keys is None: + keys = store.keys() + self.assertEqual(set(keys), set(tstore.keys())) + + # check indicies & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) + + self.assertEqual(orig_t.nrows, new_t.nrows) + + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + self.assertTrue( + new_t[a.name].is_indexed) + + finally: + safe_close(store) + safe_close(tstore) + try: + os.close(fd) + except: + pass + safe_remove(new_f) + + do_copy() + do_copy(keys=['/a', '/b', '/df1_mixed']) + do_copy(propindexes=False) + + # new table + df = tm.makeDataFrame() - try: - path = create_tempfile(self.path) - st = HDFStore(path) - st.append('df', df, data_columns=['A']) - st.close() - do_copy(f=path) - do_copy(f=path, propindexes=False) - finally: - safe_remove(path) + try: + path = create_tempfile(self.path) + st = HDFStore(path) + st.append('df', df, data_columns=['A']) + st.close() + do_copy(f=path) + do_copy(f=path, propindexes=False) + finally: + safe_remove(path) def test_legacy_table_write(self): raise nose.SkipTest("cannot write legacy tables") @@ -4567,12 +4594,10 @@ def test_unicode_index(self): unicode_values = [u('\u03c3'), u('\u03c3\u03c3')] - def f(): + with compat_assert_produces_warning(PerformanceWarning): s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) - compat_assert_produces_warning(PerformanceWarning, f) - def test_unicode_longer_encoded(self): # GH 11234 char = '\u0394' @@ -5041,16 +5066,18 @@ def test_complex_across_dimensions(self): s = Series(complex128, index=list('abcd')) df = DataFrame({'A': s, 'B': s}) p = Panel({'One': df, 'Two': df}) - p4d = pd.Panel4D({'i': p, 'ii': p}) - objs = [df, p, p4d] - comps = [tm.assert_frame_equal, tm.assert_panel_equal, - 
tm.assert_panel4d_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='table') - reread = read_hdf(path, 'obj') - comp(obj, reread) + with compat_assert_produces_warning(FutureWarning): + p4d = pd.Panel4D({'i': p, 'ii': p}) + + objs = [df, p, p4d] + comps = [tm.assert_frame_equal, tm.assert_panel_equal, + tm.assert_panel4d_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(self.path) as path: + obj.to_hdf(path, 'obj', format='table') + reread = read_hdf(path, 'obj') + comp(obj, reread) def test_complex_indexing_error(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 0996cd3bd826a..4370d040d8eaf 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -71,7 +71,7 @@ def __init__(self, frames=None, items=None, major_axis=None, # deprecation #11157 warnings.warn("SparsePanel is deprecated and will be removed in a " - "future version", FutureWarning, stacklevel=2) + "future version", FutureWarning, stacklevel=3) if frames is None: frames = {} diff --git a/pandas/sparse/tests/test_panel.py b/pandas/sparse/tests/test_panel.py index e988ddebd92f0..09d861fe0a9ac 100644 --- a/pandas/sparse/tests/test_panel.py +++ b/pandas/sparse/tests/test_panel.py @@ -55,7 +55,7 @@ def setUp(self): 'ItemC': panel_data3(), 'ItemD': panel_data1(), } - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.panel = SparsePanel(self.data_dict) @staticmethod @@ -76,12 +76,12 @@ def test_constructor(self): # deprecation GH11157 def test_deprecation(self): - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): SparsePanel() # GH 9272 def test_constructor_empty(self): - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): sp = SparsePanel() self.assertEqual(len(sp.items), 0) self.assertEqual(len(sp.major_axis), 0) @@ -104,7 +104,8 @@ def _test_roundtrip(panel): def test_dense_to_sparse(self): wp = Panel.from_dict(self.data_dict) - dwp = wp.to_sparse() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + dwp = wp.to_sparse() tm.assertIsInstance(dwp['ItemA']['A'], SparseSeries) def test_to_dense(self): @@ -127,7 +128,8 @@ def _compare_with_dense(panel): _compare_with_dense(self.panel) _compare_with_dense(self.panel.reindex(items=['ItemA'])) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): zero_panel = SparsePanel(self.data_dict, default_fill_value=0) self.assertRaises(Exception, zero_panel.to_frame) @@ -154,15 +156,18 @@ def test_setitem(self): self.assertRaises(Exception, self.panel.__setitem__, 'item6', 1) def test_set_value(self): - def _check_loc(item, major, minor, val=1.5): - res = self.panel.set_value(item, major, minor, val) - self.assertIsNot(res, self.panel) - self.assertEqual(res.get_value(item, major, minor), val) - - _check_loc('ItemA', self.panel.major_axis[4], self.panel.minor_axis[3]) - _check_loc('ItemF', self.panel.major_axis[4], self.panel.minor_axis[3]) - _check_loc('ItemF', 'foo', self.panel.minor_axis[3]) - _check_loc('ItemE', 'foo', 'bar') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + def _check_loc(item, major, minor, val=1.5): + res = self.panel.set_value(item, major, minor, 
val) + self.assertIsNot(res, self.panel) + self.assertEqual(res.get_value(item, major, minor), val) + + _check_loc('ItemA', self.panel.major_axis[4], + self.panel.minor_axis[3]) + _check_loc('ItemF', self.panel.major_axis[4], + self.panel.minor_axis[3]) + _check_loc('ItemF', 'foo', self.panel.minor_axis[3]) + _check_loc('ItemE', 'foo', 'bar') def test_delitem_pop(self): del self.panel['ItemB'] diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index a53e79439b017..cdcd8b1bcba60 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1532,17 +1532,41 @@ def test_to_xarray(self): tm._skip_if_no_xarray() from xarray import DataArray - p = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p = tm.makePanel4D() - result = p.to_xarray() - self.assertIsInstance(result, DataArray) - self.assertEqual(len(result.coords), 4) - assert_almost_equal(list(result.coords.keys()), - ['labels', 'items', 'major_axis', 'minor_axis']) - self.assertEqual(len(result.dims), 4) - - # non-convertible - self.assertRaises(ValueError, lambda: result.to_pandas()) + result = p.to_xarray() + self.assertIsInstance(result, DataArray) + self.assertEqual(len(result.coords), 4) + assert_almost_equal(list(result.coords.keys()), + ['labels', 'items', 'major_axis', + 'minor_axis']) + self.assertEqual(len(result.dims), 4) + + # non-convertible + self.assertRaises(ValueError, lambda: result.to_pandas()) + +# run all the tests, but wrap each in a warning catcher +for t in ['test_rename', 'test_rename_axis', 'test_get_numeric_data', + 'test_get_default', 'test_nonzero', + 'test_numpy_1_7_compat_numeric_methods', + 'test_downcast', 'test_constructor_compound_dtypes', + 'test_head_tail', + 'test_size_compat', 'test_split_compat', + 'test_unexpected_keyword', + 'test_stat_unexpected_keyword', 'test_api_compat', + 'test_stat_non_defaults_args', + 'test_clip', 'test_truncate_out_of_bounds', 'test_numpy_clip', + 'test_metadata_propagation']: + + def f(): + def tester(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + return getattr(super(TestPanel4D, self), t)() + return tester + + setattr(TestPanel4D, t, f()) class TestNDFrame(tm.TestCase): @@ -1674,8 +1698,9 @@ def test_squeeze(self): tm.assert_frame_equal(df.squeeze(), df) for p in [tm.makePanel()]: tm.assert_panel_equal(p.squeeze(), p) - for p4d in [tm.makePanel4D()]: - tm.assert_panel4d_equal(p4d.squeeze(), p4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p4d in [tm.makePanel4D()]: + tm.assert_panel4d_equal(p4d.squeeze(), p4d) # squeezing df = tm.makeTimeDataFrame().reindex(columns=['A']) @@ -1687,11 +1712,13 @@ def test_squeeze(self): p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A']) tm.assert_series_equal(p.squeeze(), p.ix['ItemA', :, 'A']) - p4d = tm.makePanel4D().reindex(labels=['label1']) - tm.assert_panel_equal(p4d.squeeze(), p4d['label1']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D().reindex(labels=['label1']) + tm.assert_panel_equal(p4d.squeeze(), p4d['label1']) - p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA']) - tm.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA']) + tm.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA']) # don't fail with 0 length dimensions GH11229 & GH8999 
empty_series = pd.Series([], name='five') @@ -1726,11 +1753,13 @@ def test_transpose(self): .transpose(1, 2, 0), p) tm.assertRaisesRegexp(TypeError, msg, p.transpose, 2, 0, 1, axes=(2, 0, 1)) - for p4d in [tm.makePanel4D()]: - tm.assert_panel4d_equal(p4d.transpose(2, 0, 3, 1) - .transpose(1, 3, 0, 2), p4d) - tm.assertRaisesRegexp(TypeError, msg, p4d.transpose, - 2, 0, 3, 1, axes=(2, 0, 3, 1)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p4d in [tm.makePanel4D()]: + tm.assert_panel4d_equal(p4d.transpose(2, 0, 3, 1) + .transpose(1, 3, 0, 2), p4d) + tm.assertRaisesRegexp(TypeError, msg, p4d.transpose, + 2, 0, 3, 1, axes=(2, 0, 3, 1)) def test_numpy_transpose(self): msg = "the 'axes' parameter is not supported" @@ -1752,10 +1781,11 @@ def test_numpy_transpose(self): np.transpose(p, axes=(2, 0, 1)), axes=(1, 2, 0)), p) - p4d = tm.makePanel4D() - tm.assert_panel4d_equal(np.transpose( - np.transpose(p4d, axes=(2, 0, 3, 1)), - axes=(1, 3, 0, 2)), p4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() + tm.assert_panel4d_equal(np.transpose( + np.transpose(p4d, axes=(2, 0, 3, 1)), + axes=(1, 3, 0, 2)), p4d) def test_take(self): indices = [1, 5, -2, 6, 3, -1] @@ -1780,21 +1810,25 @@ def test_take(self): major_axis=p.major_axis, minor_axis=p.minor_axis) tm.assert_panel_equal(out, expected) - for p4d in [tm.makePanel4D()]: - out = p4d.take(indices) - expected = Panel4D(data=p4d.values.take(indices, axis=0), - labels=p4d.labels.take(indices), - major_axis=p4d.major_axis, - minor_axis=p4d.minor_axis, - items=p4d.items) - tm.assert_panel4d_equal(out, expected) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p4d in [tm.makePanel4D()]: + out = p4d.take(indices) + expected = Panel4D(data=p4d.values.take(indices, axis=0), + labels=p4d.labels.take(indices), + major_axis=p4d.major_axis, + minor_axis=p4d.minor_axis, + items=p4d.items) + tm.assert_panel4d_equal(out, expected) def test_take_invalid_kwargs(self): indices = [-3, 2, 0, 1] s = tm.makeFloatSeries() df = tm.makeTimeDataFrame() p = tm.makePanel() - p4d = tm.makePanel4D() + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() for obj in (s, df, p, p4d): msg = "take\(\) got an unexpected keyword argument 'foo'" diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index a37e7ca732bde..46eba1772c47a 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -157,16 +157,6 @@ def alt(x): self._check_stat_op('sem', alt) - # def test_skew(self): - # from scipy.stats import skew - - # def alt(x): - # if len(x) < 3: - # return np.nan - # return skew(x, bias=False) - - # self._check_stat_op('skew', alt) - def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): if obj is None: obj = self.panel diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 16a55c7ec4aeb..50ede3f2c2367 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -205,20 +205,22 @@ def test_get_axis_name(self): self.assertEqual(self.panel4d._get_axis_name(3), 'minor_axis') def test_arith(self): - self._test_op(self.panel4d, operator.add) - self._test_op(self.panel4d, operator.sub) - self._test_op(self.panel4d, operator.mul) - self._test_op(self.panel4d, operator.truediv) - self._test_op(self.panel4d, operator.floordiv) - self._test_op(self.panel4d, operator.pow) - - self._test_op(self.panel4d, lambda x, y: y + x) - 
self._test_op(self.panel4d, lambda x, y: y - x) - self._test_op(self.panel4d, lambda x, y: y * x) - self._test_op(self.panel4d, lambda x, y: y / x) - self._test_op(self.panel4d, lambda x, y: y ** x) - - self.assertRaises(Exception, self.panel4d.__add__, self.panel4d['l1']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self._test_op(self.panel4d, operator.add) + self._test_op(self.panel4d, operator.sub) + self._test_op(self.panel4d, operator.mul) + self._test_op(self.panel4d, operator.truediv) + self._test_op(self.panel4d, operator.floordiv) + self._test_op(self.panel4d, operator.pow) + + self._test_op(self.panel4d, lambda x, y: y + x) + self._test_op(self.panel4d, lambda x, y: y - x) + self._test_op(self.panel4d, lambda x, y: y * x) + self._test_op(self.panel4d, lambda x, y: y / x) + self._test_op(self.panel4d, lambda x, y: y ** x) + + self.assertRaises(Exception, self.panel4d.__add__, + self.panel4d['l1']) @staticmethod def _test_op(panel4d, op): @@ -235,41 +237,47 @@ def test_iteritems(self): len(self.panel4d.labels)) def test_combinePanel4d(self): - result = self.panel4d.add(self.panel4d) - self.assert_panel4d_equal(result, self.panel4d * 2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.panel4d.add(self.panel4d) + self.assert_panel4d_equal(result, self.panel4d * 2) def test_neg(self): - self.assert_panel4d_equal(-self.panel4d, self.panel4d * -1) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assert_panel4d_equal(-self.panel4d, self.panel4d * -1) def test_select(self): - p = self.panel4d + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + p = self.panel4d - # select labels - result = p.select(lambda x: x in ('l1', 'l3'), axis='labels') - expected = p.reindex(labels=['l1', 'l3']) - self.assert_panel4d_equal(result, expected) + # select labels + result = p.select(lambda x: x in ('l1', 'l3'), axis='labels') + expected = p.reindex(labels=['l1', 'l3']) + self.assert_panel4d_equal(result, expected) - # select items - result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') - expected = p.reindex(items=['ItemA', 'ItemC']) - self.assert_panel4d_equal(result, expected) + # select items + result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') + expected = p.reindex(items=['ItemA', 'ItemC']) + self.assert_panel4d_equal(result, expected) - # select major_axis - result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') - new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] - expected = p.reindex(major=new_major) - self.assert_panel4d_equal(result, expected) + # select major_axis + result = p.select(lambda x: x >= datetime(2000, 1, 15), + axis='major') + new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] + expected = p.reindex(major=new_major) + self.assert_panel4d_equal(result, expected) - # select minor_axis - result = p.select(lambda x: x in ('D', 'A'), axis=3) - expected = p.reindex(minor=['A', 'D']) - self.assert_panel4d_equal(result, expected) + # select minor_axis + result = p.select(lambda x: x in ('D', 'A'), axis=3) + expected = p.reindex(minor=['A', 'D']) + self.assert_panel4d_equal(result, expected) - # corner case, empty thing - result = p.select(lambda x: x in ('foo',), axis='items') - self.assert_panel4d_equal(result, p.reindex(items=[])) + # corner case, empty thing + result = p.select(lambda x: x in ('foo',), axis='items') + self.assert_panel4d_equal(result, p.reindex(items=[])) def 
test_get_value(self): + for item in self.panel.items: for mjr in self.panel.major_axis[::2]: for mnr in self.panel.minor_axis: @@ -278,19 +286,21 @@ def test_get_value(self): assert_almost_equal(result, expected) def test_abs(self): - result = self.panel4d.abs() - expected = np.abs(self.panel4d) - self.assert_panel4d_equal(result, expected) - p = self.panel4d['l1'] - result = p.abs() - expected = np.abs(p) - assert_panel_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.panel4d.abs() + expected = np.abs(self.panel4d) + self.assert_panel4d_equal(result, expected) + + p = self.panel4d['l1'] + result = p.abs() + expected = np.abs(p) + assert_panel_equal(result, expected) - df = p['ItemA'] - result = df.abs() - expected = np.abs(df) - assert_frame_equal(result, expected) + df = p['ItemA'] + result = df.abs() + expected = np.abs(df) + assert_frame_equal(result, expected) class CheckIndexing(object): @@ -301,48 +311,50 @@ def test_getitem(self): self.assertRaises(Exception, self.panel4d.__getitem__, 'ItemQ') def test_delitem_and_pop(self): - expected = self.panel4d['l2'] - result = self.panel4d.pop('l2') - assert_panel_equal(expected, result) - self.assertNotIn('l2', self.panel4d.labels) - - del self.panel4d['l3'] - self.assertNotIn('l3', self.panel4d.labels) - self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') - - values = np.empty((4, 4, 4, 4)) - values[0] = 0 - values[1] = 1 - values[2] = 2 - values[3] = 3 - - panel4d = Panel4D(values, lrange(4), lrange(4), lrange(4), lrange(4)) - - # did we delete the right row? - - panel4dc = panel4d.copy() - del panel4dc[0] - assert_panel_equal(panel4dc[1], panel4d[1]) - assert_panel_equal(panel4dc[2], panel4d[2]) - assert_panel_equal(panel4dc[3], panel4d[3]) - - panel4dc = panel4d.copy() - del panel4dc[1] - assert_panel_equal(panel4dc[0], panel4d[0]) - assert_panel_equal(panel4dc[2], panel4d[2]) - assert_panel_equal(panel4dc[3], panel4d[3]) - - panel4dc = panel4d.copy() - del panel4dc[2] - assert_panel_equal(panel4dc[1], panel4d[1]) - assert_panel_equal(panel4dc[0], panel4d[0]) - assert_panel_equal(panel4dc[3], panel4d[3]) - - panel4dc = panel4d.copy() - del panel4dc[3] - assert_panel_equal(panel4dc[1], panel4d[1]) - assert_panel_equal(panel4dc[2], panel4d[2]) - assert_panel_equal(panel4dc[0], panel4d[0]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = self.panel4d['l2'] + result = self.panel4d.pop('l2') + assert_panel_equal(expected, result) + self.assertNotIn('l2', self.panel4d.labels) + + del self.panel4d['l3'] + self.assertNotIn('l3', self.panel4d.labels) + self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') + + values = np.empty((4, 4, 4, 4)) + values[0] = 0 + values[1] = 1 + values[2] = 2 + values[3] = 3 + + panel4d = Panel4D(values, lrange(4), lrange(4), + lrange(4), lrange(4)) + + # did we delete the right row? 
+ panel4dc = panel4d.copy() + del panel4dc[0] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[1] + assert_panel_equal(panel4dc[0], panel4d[0]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[2] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[0], panel4d[0]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[3] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[0], panel4d[0]) def test_setitem(self): # LongPanel with one item @@ -378,95 +390,83 @@ def test_setitem(self): def test_setitem_by_indexer(self): - # Panel - panel4dc = self.panel4d.copy() - p = panel4dc.iloc[0] - - def func(): - self.panel4d.iloc[0] = p - self.assertRaises(NotImplementedError, func) - - # DataFrame - panel4dc = self.panel4d.copy() - df = panel4dc.iloc[0, 0] - df.iloc[:] = 1 - panel4dc.iloc[0, 0] = df - self.assertTrue((panel4dc.iloc[0, 0].values == 1).all()) - - # Series - panel4dc = self.panel4d.copy() - s = panel4dc.iloc[0, 0, :, 0] - s.iloc[:] = 1 - panel4dc.iloc[0, 0, :, 0] = s - self.assertTrue((panel4dc.iloc[0, 0, :, 0].values == 1).all()) - - # scalar - panel4dc = self.panel4d.copy() - panel4dc.iloc[0] = 1 - panel4dc.iloc[1] = True - panel4dc.iloc[2] = 'foo' - self.assertTrue((panel4dc.iloc[0].values == 1).all()) - self.assertTrue(panel4dc.iloc[1].values.all()) - self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + # Panel + panel4dc = self.panel4d.copy() + p = panel4dc.iloc[0] + + def func(): + self.panel4d.iloc[0] = p + self.assertRaises(NotImplementedError, func) + + # DataFrame + panel4dc = self.panel4d.copy() + df = panel4dc.iloc[0, 0] + df.iloc[:] = 1 + panel4dc.iloc[0, 0] = df + self.assertTrue((panel4dc.iloc[0, 0].values == 1).all()) + + # Series + panel4dc = self.panel4d.copy() + s = panel4dc.iloc[0, 0, :, 0] + s.iloc[:] = 1 + panel4dc.iloc[0, 0, :, 0] = s + self.assertTrue((panel4dc.iloc[0, 0, :, 0].values == 1).all()) + + # scalar + panel4dc = self.panel4d.copy() + panel4dc.iloc[0] = 1 + panel4dc.iloc[1] = True + panel4dc.iloc[2] = 'foo' + self.assertTrue((panel4dc.iloc[0].values == 1).all()) + self.assertTrue(panel4dc.iloc[1].values.all()) + self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) def test_setitem_by_indexer_mixed_type(self): - # GH 8702 - self.panel4d['foo'] = 'bar' - # scalar - panel4dc = self.panel4d.copy() - panel4dc.iloc[0] = 1 - panel4dc.iloc[1] = True - panel4dc.iloc[2] = 'foo' - self.assertTrue((panel4dc.iloc[0].values == 1).all()) - self.assertTrue(panel4dc.iloc[1].values.all()) - self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # GH 8702 + self.panel4d['foo'] = 'bar' + + # scalar + panel4dc = self.panel4d.copy() + panel4dc.iloc[0] = 1 + panel4dc.iloc[1] = True + panel4dc.iloc[2] = 'foo' + self.assertTrue((panel4dc.iloc[0].values == 1).all()) + self.assertTrue(panel4dc.iloc[1].values.all()) + self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) def test_comparisons(self): - p1 = tm.makePanel4D() - p2 = tm.makePanel4D() - - tp = p1.reindex(labels=p1.labels.tolist() + ['foo']) - p = p1[p1.labels[0]] - - def test_comp(func): - result = func(p1, 
p2) - self.assert_numpy_array_equal(result.values, - func(p1.values, p2.values)) - - # versus non-indexed same objs - self.assertRaises(Exception, func, p1, tp) - - # versus different objs - self.assertRaises(Exception, func, p1, p) - - result3 = func(self.panel4d, 0) - self.assert_numpy_array_equal(result3.values, - func(self.panel4d.values, 0)) - - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) - - def test_setitem_ndarray(self): - raise nose.SkipTest("skipping for now") - # from pandas import DateRange, datetools - - # timeidx = DateRange(start=datetime(2009,1,1), - # end=datetime(2009,12,31), - # offset=datetools.MonthEnd()) - # lons_coarse = np.linspace(-177.5, 177.5, 72) - # lats_coarse = np.linspace(-87.5, 87.5, 36) - # P = Panel(items=timeidx, major_axis=lons_coarse, - # minor_axis=lats_coarse) - # data = np.random.randn(72*36).reshape((72,36)) - # key = datetime(2009,2,28) - # P[key] = data# - - # assert_almost_equal(P[key].values, data) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p1 = tm.makePanel4D() + p2 = tm.makePanel4D() + + tp = p1.reindex(labels=p1.labels.tolist() + ['foo']) + p = p1[p1.labels[0]] + + def test_comp(func): + result = func(p1, p2) + self.assert_numpy_array_equal(result.values, + func(p1.values, p2.values)) + + # versus non-indexed same objs + self.assertRaises(Exception, func, p1, tp) + + # versus different objs + self.assertRaises(Exception, func, p1, p) + + result3 = func(self.panel4d, 0) + self.assert_numpy_array_equal(result3.values, + func(self.panel4d.values, 0)) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) def test_major_xs(self): ref = self.panel4d['l1']['ItemA'] @@ -521,42 +521,43 @@ def test_xs(self): self.assertIsNotNone(result.is_copy) def test_getitem_fancy_labels(self): - panel4d = self.panel4d + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + panel4d = self.panel4d - labels = panel4d.labels[[1, 0]] - items = panel4d.items[[1, 0]] - dates = panel4d.major_axis[::2] - cols = ['D', 'C', 'F'] + labels = panel4d.labels[[1, 0]] + items = panel4d.items[[1, 0]] + dates = panel4d.major_axis[::2] + cols = ['D', 'C', 'F'] - # all 4 specified - assert_panel4d_equal(panel4d.ix[labels, items, dates, cols], - panel4d.reindex(labels=labels, items=items, - major=dates, minor=cols)) + # all 4 specified + assert_panel4d_equal(panel4d.ix[labels, items, dates, cols], + panel4d.reindex(labels=labels, items=items, + major=dates, minor=cols)) - # 3 specified - assert_panel4d_equal(panel4d.ix[:, items, dates, cols], - panel4d.reindex(items=items, major=dates, - minor=cols)) + # 3 specified + assert_panel4d_equal(panel4d.ix[:, items, dates, cols], + panel4d.reindex(items=items, major=dates, + minor=cols)) - # 2 specified - assert_panel4d_equal(panel4d.ix[:, :, dates, cols], - panel4d.reindex(major=dates, minor=cols)) + # 2 specified + assert_panel4d_equal(panel4d.ix[:, :, dates, cols], + panel4d.reindex(major=dates, minor=cols)) - assert_panel4d_equal(panel4d.ix[:, items, :, cols], - panel4d.reindex(items=items, minor=cols)) + assert_panel4d_equal(panel4d.ix[:, items, :, cols], + panel4d.reindex(items=items, minor=cols)) - assert_panel4d_equal(panel4d.ix[:, items, dates, :], - panel4d.reindex(items=items, major=dates)) + assert_panel4d_equal(panel4d.ix[:, items, dates, :], + panel4d.reindex(items=items, 
major=dates)) - # only 1 - assert_panel4d_equal(panel4d.ix[:, items, :, :], - panel4d.reindex(items=items)) + # only 1 + assert_panel4d_equal(panel4d.ix[:, items, :, :], + panel4d.reindex(items=items)) - assert_panel4d_equal(panel4d.ix[:, :, dates, :], - panel4d.reindex(major=dates)) + assert_panel4d_equal(panel4d.ix[:, :, dates, :], + panel4d.reindex(major=dates)) - assert_panel4d_equal(panel4d.ix[:, :, :, cols], - panel4d.reindex(minor=cols)) + assert_panel4d_equal(panel4d.ix[:, :, :, cols], + panel4d.reindex(minor=cols)) def test_getitem_fancy_slice(self): pass @@ -564,11 +565,6 @@ def test_getitem_fancy_slice(self): def test_getitem_fancy_ints(self): pass - def test_getitem_fancy_xs(self): - raise nose.SkipTest("skipping for now") - # self.assertRaises(NotImplementedError, self.panel4d.major_xs) - # self.assertRaises(NotImplementedError, self.panel4d.minor_xs) - def test_get_value(self): for label in self.panel4d.labels: for item in self.panel4d.items: @@ -580,22 +576,28 @@ def test_get_value(self): assert_almost_equal(result, expected) def test_set_value(self): - for label in self.panel4d.labels: - for item in self.panel4d.items: - for mjr in self.panel4d.major_axis[::2]: - for mnr in self.panel4d.minor_axis: - self.panel4d.set_value(label, item, mjr, mnr, 1.) - assert_almost_equal( - self.panel4d[label][item][mnr][mjr], 1.) - # resize - res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) - tm.assertIsInstance(res, Panel4D) - self.assertIsNot(res, self.panel4d) - self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) - self.assertTrue(is_float_dtype(res3['l4'].values)) + for label in self.panel4d.labels: + for item in self.panel4d.items: + for mjr in self.panel4d.major_axis[::2]: + for mnr in self.panel4d.minor_axis: + self.panel4d.set_value(label, item, mjr, mnr, 1.) + assert_almost_equal( + self.panel4d[label][item][mnr][mjr], 1.) 
+ + res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) + self.assertTrue(is_float_dtype(res3['l4'].values)) + + # resize + res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) + tm.assertIsInstance(res, Panel4D) + self.assertIsNot(res, self.panel4d) + self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) + + res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) + self.assertTrue(is_float_dtype(res3['l4'].values)) class TestPanel4d(tm.TestCase, CheckIndexing, SafeForSparse, @@ -608,194 +610,150 @@ def assert_panel4d_equal(cls, x, y): assert_panel4d_equal(x, y) def setUp(self): - self.panel4d = tm.makePanel4D(nper=8) - add_nans(self.panel4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.panel4d = tm.makePanel4D(nper=8) + add_nans(self.panel4d) def test_constructor(self): - # with BlockManager - panel4d = Panel4D(self.panel4d._data) - self.assertIs(panel4d._data, self.panel4d._data) - - panel4d = Panel4D(self.panel4d._data, copy=True) - self.assertIsNot(panel4d._data, self.panel4d._data) - assert_panel4d_equal(panel4d, self.panel4d) - - # strings handled prop - # panel4d = Panel4D([[['foo', 'foo', 'foo',], - # ['foo', 'foo', 'foo']]]) - # self.assertEqual(wp.values.dtype, np.object_) - - vals = self.panel4d.values - - # no copy - panel4d = Panel4D(vals) - self.assertIs(panel4d.values, vals) - - # copy - panel4d = Panel4D(vals, copy=True) - self.assertIsNot(panel4d.values, vals) - - # GH #8285, test when scalar data is used to construct a Panel4D - # if dtype is not passed, it should be inferred - value_and_dtype = [(1, 'int64'), (3.14, 'float64'), - ('foo', np.object_)] - for (val, dtype) in value_and_dtype: - panel4d = Panel4D(val, labels=range(2), items=range( - 3), major_axis=range(4), minor_axis=range(5)) - vals = np.empty((2, 3, 4, 5), dtype=dtype) - vals.fill(val) - assert_panel4d_equal(panel4d, Panel4D(vals, dtype=dtype)) - - # test the case when dtype is passed - panel4d = Panel4D(1, labels=range(2), items=range( - 3), major_axis=range(4), minor_axis=range(5), dtype='float32') - vals = np.empty((2, 3, 4, 5), dtype='float32') - vals.fill(1) - assert_panel4d_equal(panel4d, Panel4D(vals, dtype='float32')) - def test_constructor_cast(self): - zero_filled = self.panel4d.fillna(0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + panel4d = Panel4D(self.panel4d._data) + self.assertIs(panel4d._data, self.panel4d._data) + + panel4d = Panel4D(self.panel4d._data, copy=True) + self.assertIsNot(panel4d._data, self.panel4d._data) + assert_panel4d_equal(panel4d, self.panel4d) + + vals = self.panel4d.values + + # no copy + panel4d = Panel4D(vals) + self.assertIs(panel4d.values, vals) + + # copy + panel4d = Panel4D(vals, copy=True) + self.assertIsNot(panel4d.values, vals) + + # GH #8285, test when scalar data is used to construct a Panel4D + # if dtype is not passed, it should be inferred + value_and_dtype = [(1, 'int64'), (3.14, 'float64'), + ('foo', np.object_)] + for (val, dtype) in value_and_dtype: + panel4d = Panel4D(val, labels=range(2), items=range( + 3), major_axis=range(4), minor_axis=range(5)) + vals = np.empty((2, 3, 4, 5), dtype=dtype) + vals.fill(val) + expected = Panel4D(vals, dtype=dtype) + assert_panel4d_equal(panel4d, expected) + + # test the case when dtype is passed + panel4d = Panel4D(1, labels=range(2), items=range( + 3), major_axis=range(4), minor_axis=range(5), dtype='float32') + vals = np.empty((2, 3, 4, 5), dtype='float32') + vals.fill(1) + + expected = Panel4D(vals, 
dtype='float32') + assert_panel4d_equal(panel4d, expected) - casted = Panel4D(zero_filled._data, dtype=int) - casted2 = Panel4D(zero_filled.values, dtype=int) - - exp_values = zero_filled.values.astype(int) - assert_almost_equal(casted.values, exp_values) - assert_almost_equal(casted2.values, exp_values) + def test_constructor_cast(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + zero_filled = self.panel4d.fillna(0) - casted = Panel4D(zero_filled._data, dtype=np.int32) - casted2 = Panel4D(zero_filled.values, dtype=np.int32) + casted = Panel4D(zero_filled._data, dtype=int) + casted2 = Panel4D(zero_filled.values, dtype=int) - exp_values = zero_filled.values.astype(np.int32) - assert_almost_equal(casted.values, exp_values) - assert_almost_equal(casted2.values, exp_values) + exp_values = zero_filled.values.astype(int) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) - # can't cast - data = [[['foo', 'bar', 'baz']]] - self.assertRaises(ValueError, Panel, data, dtype=float) + casted = Panel4D(zero_filled._data, dtype=np.int32) + casted2 = Panel4D(zero_filled.values, dtype=np.int32) - def test_constructor_empty_panel(self): - empty = Panel() - self.assertEqual(len(empty.items), 0) - self.assertEqual(len(empty.major_axis), 0) - self.assertEqual(len(empty.minor_axis), 0) + exp_values = zero_filled.values.astype(np.int32) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) - def test_constructor_observe_dtype(self): - # GH #411 - panel = Panel(items=lrange(3), major_axis=lrange(3), - minor_axis=lrange(3), dtype='O') - self.assertEqual(panel.values.dtype, np.object_) + # can't cast + data = [[['foo', 'bar', 'baz']]] + self.assertRaises(ValueError, Panel, data, dtype=float) def test_consolidate(self): - self.assertTrue(self.panel4d._data.is_consolidated()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(self.panel4d._data.is_consolidated()) - self.panel4d['foo'] = 1. - self.assertFalse(self.panel4d._data.is_consolidated()) + self.panel4d['foo'] = 1. 
+ self.assertFalse(self.panel4d._data.is_consolidated()) - panel4d = self.panel4d.consolidate() - self.assertTrue(panel4d._data.is_consolidated()) + panel4d = self.panel4d.consolidate() + self.assertTrue(panel4d._data.is_consolidated()) def test_ctor_dict(self): - l1 = self.panel4d['l1'] - l2 = self.panel4d['l2'] - - d = {'A': l1, 'B': l2.ix[['ItemB'], :, :]} - # d2 = {'A' : itema._series, 'B' : itemb[5:]._series} - # d3 = {'A' : DataFrame(itema._series), - # 'B' : DataFrame(itemb[5:]._series)} - - panel4d = Panel4D(d) - # wp2 = Panel.from_dict(d2) # nested Dict - # wp3 = Panel.from_dict(d3) - # self.assertTrue(wp.major_axis.equals(self.panel.major_axis)) - assert_panel_equal(panel4d['A'], self.panel4d['l1']) - assert_frame_equal(panel4d.ix['B', 'ItemB', :, :], - self.panel4d.ix['l2', ['ItemB'], :, :]['ItemB']) - - # intersect - # wp = Panel.from_dict(d, intersect=True) - # self.assertTrue(wp.major_axis.equals(itemb.index[5:])) - - # use constructor - # assert_panel_equal(Panel(d), Panel.from_dict(d)) - # assert_panel_equal(Panel(d2), Panel.from_dict(d2)) - # assert_panel_equal(Panel(d3), Panel.from_dict(d3)) - - # cast - # dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) - # for k, v in d.iteritems()) - # result = Panel(dcasted, dtype=int) - # expected = Panel(dict((k, v.astype(int)) - # for k, v in dcasted.iteritems())) - # assert_panel_equal(result, expected) - - def test_constructor_dict_mixed(self): - data = dict((k, v.values) for k, v in self.panel4d.iteritems()) - result = Panel4D(data) - exp_major = Index(np.arange(len(self.panel4d.major_axis))) - self.assert_index_equal(result.major_axis, exp_major) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + l1 = self.panel4d['l1'] + l2 = self.panel4d['l2'] - result = Panel4D(data, - labels=self.panel4d.labels, - items=self.panel4d.items, - major_axis=self.panel4d.major_axis, - minor_axis=self.panel4d.minor_axis) - assert_panel4d_equal(result, self.panel4d) + d = {'A': l1, 'B': l2.ix[['ItemB'], :, :]} + panel4d = Panel4D(d) - data['l2'] = self.panel4d['l2'] - result = Panel4D(data) - assert_panel4d_equal(result, self.panel4d) - - # corner, blow up - data['l2'] = data['l2']['ItemB'] - self.assertRaises(Exception, Panel4D, data) - - data['l2'] = self.panel4d['l2'].values[:, :, :-1] - self.assertRaises(Exception, Panel4D, data) - - def test_constructor_resize(self): - data = self.panel4d._data - labels = self.panel4d.labels[:-1] - items = self.panel4d.items[:-1] - major = self.panel4d.major_axis[:-1] - minor = self.panel4d.minor_axis[:-1] + assert_panel_equal(panel4d['A'], self.panel4d['l1']) + assert_frame_equal(panel4d.ix['B', 'ItemB', :, :], + self.panel4d.ix['l2', ['ItemB'], :, :]['ItemB']) - result = Panel4D(data, labels=labels, items=items, - major_axis=major, minor_axis=minor) - expected = self.panel4d.reindex( - labels=labels, items=items, major=major, minor=minor) - assert_panel4d_equal(result, expected) - - result = Panel4D(data, items=items, major_axis=major) - expected = self.panel4d.reindex(items=items, major=major) - assert_panel4d_equal(result, expected) + def test_constructor_dict_mixed(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + data = dict((k, v.values) for k, v in self.panel4d.iteritems()) + result = Panel4D(data) - result = Panel4D(data, items=items) - expected = self.panel4d.reindex(items=items) - assert_panel4d_equal(result, expected) + exp_major = Index(np.arange(len(self.panel4d.major_axis))) + self.assert_index_equal(result.major_axis, exp_major) - 
result = Panel4D(data, minor_axis=minor) - expected = self.panel4d.reindex(minor=minor) - assert_panel4d_equal(result, expected) + result = Panel4D(data, + labels=self.panel4d.labels, + items=self.panel4d.items, + major_axis=self.panel4d.major_axis, + minor_axis=self.panel4d.minor_axis) + assert_panel4d_equal(result, self.panel4d) - def test_from_dict_mixed_orient(self): - raise nose.SkipTest("skipping for now") - # df = tm.makeDataFrame() - # df['foo'] = 'bar' + data['l2'] = self.panel4d['l2'] - # data = {'k1' : df, - # 'k2' : df} + result = Panel4D(data) + assert_panel4d_equal(result, self.panel4d) - # panel = Panel.from_dict(data, orient='minor') + # corner, blow up + data['l2'] = data['l2']['ItemB'] + self.assertRaises(Exception, Panel4D, data) - # self.assertEqual(panel['foo'].values.dtype, np.object_) - # self.assertEqual(panel['A'].values.dtype, np.float64) + data['l2'] = self.panel4d['l2'].values[:, :, :-1] + self.assertRaises(Exception, Panel4D, data) - def test_values(self): - self.assertRaises(Exception, Panel, np.random.randn(5, 5, 5), - lrange(5), lrange(5), lrange(4)) + def test_constructor_resize(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + data = self.panel4d._data + labels = self.panel4d.labels[:-1] + items = self.panel4d.items[:-1] + major = self.panel4d.major_axis[:-1] + minor = self.panel4d.minor_axis[:-1] + + result = Panel4D(data, labels=labels, items=items, + major_axis=major, minor_axis=minor) + expected = self.panel4d.reindex( + labels=labels, items=items, major=major, minor=minor) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, items=items, major_axis=major) + expected = self.panel4d.reindex(items=items, major=major) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, items=items) + expected = self.panel4d.reindex(items=items) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, minor_axis=minor) + expected = self.panel4d.reindex(minor=minor) + assert_panel4d_equal(result, expected) def test_conform(self): + p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) conformed = self.panel4d.conform(p) @@ -804,208 +762,155 @@ def test_conform(self): tm.assert_index_equal(conformed.minor_axis, self.panel4d.minor_axis) def test_reindex(self): - ref = self.panel4d['l2'] - - # labels - result = self.panel4d.reindex(labels=['l1', 'l2']) - assert_panel_equal(result['l2'], ref) - - # items - result = self.panel4d.reindex(items=['ItemA', 'ItemB']) - assert_frame_equal(result['l2']['ItemB'], ref['ItemB']) - - # major - new_major = list(self.panel4d.major_axis[:10]) - result = self.panel4d.reindex(major=new_major) - assert_frame_equal( - result['l2']['ItemB'], ref['ItemB'].reindex(index=new_major)) - - # raise exception put both major and major_axis - self.assertRaises(Exception, self.panel4d.reindex, - major_axis=new_major, major=new_major) - - # minor - new_minor = list(self.panel4d.minor_axis[:2]) - result = self.panel4d.reindex(minor=new_minor) - assert_frame_equal( - result['l2']['ItemB'], ref['ItemB'].reindex(columns=new_minor)) - - result = self.panel4d.reindex(labels=self.panel4d.labels, - items=self.panel4d.items, - major=self.panel4d.major_axis, - minor=self.panel4d.minor_axis) - - # don't necessarily copy - result = self.panel4d.reindex() - assert_panel4d_equal(result, self.panel4d) - self.assertFalse(result is self.panel4d) - - # with filling - smaller_major = self.panel4d.major_axis[::5] - smaller = self.panel4d.reindex(major=smaller_major) - - larger = 
smaller.reindex(major=self.panel4d.major_axis, - method='pad') - - assert_panel_equal(larger.ix[:, :, self.panel4d.major_axis[1], :], - smaller.ix[:, :, smaller_major[0], :]) - - # don't necessarily copy - result = self.panel4d.reindex( - major=self.panel4d.major_axis, copy=False) - assert_panel4d_equal(result, self.panel4d) - self.assertTrue(result is self.panel4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + ref = self.panel4d['l2'] + + # labels + result = self.panel4d.reindex(labels=['l1', 'l2']) + assert_panel_equal(result['l2'], ref) + + # items + result = self.panel4d.reindex(items=['ItemA', 'ItemB']) + assert_frame_equal(result['l2']['ItemB'], ref['ItemB']) + + # major + new_major = list(self.panel4d.major_axis[:10]) + result = self.panel4d.reindex(major=new_major) + assert_frame_equal( + result['l2']['ItemB'], ref['ItemB'].reindex(index=new_major)) + + # raise exception put both major and major_axis + self.assertRaises(Exception, self.panel4d.reindex, + major_axis=new_major, major=new_major) + + # minor + new_minor = list(self.panel4d.minor_axis[:2]) + result = self.panel4d.reindex(minor=new_minor) + assert_frame_equal( + result['l2']['ItemB'], ref['ItemB'].reindex(columns=new_minor)) + + result = self.panel4d.reindex(labels=self.panel4d.labels, + items=self.panel4d.items, + major=self.panel4d.major_axis, + minor=self.panel4d.minor_axis) + + # don't necessarily copy + result = self.panel4d.reindex() + assert_panel4d_equal(result, self.panel4d) + self.assertFalse(result is self.panel4d) + + # with filling + smaller_major = self.panel4d.major_axis[::5] + smaller = self.panel4d.reindex(major=smaller_major) + + larger = smaller.reindex(major=self.panel4d.major_axis, + method='pad') + + assert_panel_equal(larger.ix[:, :, self.panel4d.major_axis[1], :], + smaller.ix[:, :, smaller_major[0], :]) + + # don't necessarily copy + result = self.panel4d.reindex( + major=self.panel4d.major_axis, copy=False) + assert_panel4d_equal(result, self.panel4d) + self.assertTrue(result is self.panel4d) def test_not_hashable(self): - p4D_empty = Panel4D() - self.assertRaises(TypeError, hash, p4D_empty) - self.assertRaises(TypeError, hash, self.panel4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4D_empty = Panel4D() + self.assertRaises(TypeError, hash, p4D_empty) + self.assertRaises(TypeError, hash, self.panel4d) def test_reindex_like(self): # reindex_like - smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], - items=self.panel4d.items[:-1], - major=self.panel4d.major_axis[:-1], - minor=self.panel4d.minor_axis[:-1]) - smaller_like = self.panel4d.reindex_like(smaller) - assert_panel4d_equal(smaller, smaller_like) - - def test_take(self): - raise nose.SkipTest("skipping for now") - - # # axis == 0 - # result = self.panel.take([2, 0, 1], axis=0) - # expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) - # assert_panel_equal(result, expected)# - - # # axis >= 1 - # result = self.panel.take([3, 0, 1, 2], axis=2) - # expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) - # assert_panel_equal(result, expected) - - # self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) - # self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], + items=self.panel4d.items[:-1], + major=self.panel4d.major_axis[:-1], + minor=self.panel4d.minor_axis[:-1]) + smaller_like = 
self.panel4d.reindex_like(smaller) + assert_panel4d_equal(smaller, smaller_like) def test_sort_index(self): - import random - - rlabels = list(self.panel4d.labels) - ritems = list(self.panel4d.items) - rmajor = list(self.panel4d.major_axis) - rminor = list(self.panel4d.minor_axis) - random.shuffle(rlabels) - random.shuffle(ritems) - random.shuffle(rmajor) - random.shuffle(rminor) - - random_order = self.panel4d.reindex(labels=rlabels) - sorted_panel4d = random_order.sort_index(axis=0) - assert_panel4d_equal(sorted_panel4d, self.panel4d) - - # descending - # random_order = self.panel.reindex(items=ritems) - # sorted_panel = random_order.sort_index(axis=0, ascending=False) - # assert_panel_equal(sorted_panel, - # self.panel.reindex(items=self.panel.items[::-1])) - - # random_order = self.panel.reindex(major=rmajor) - # sorted_panel = random_order.sort_index(axis=1) - # assert_panel_equal(sorted_panel, self.panel) - - # random_order = self.panel.reindex(minor=rminor) - # sorted_panel = random_order.sort_index(axis=2) - # assert_panel_equal(sorted_panel, self.panel) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + import random + + rlabels = list(self.panel4d.labels) + ritems = list(self.panel4d.items) + rmajor = list(self.panel4d.major_axis) + rminor = list(self.panel4d.minor_axis) + random.shuffle(rlabels) + random.shuffle(ritems) + random.shuffle(rmajor) + random.shuffle(rminor) + + random_order = self.panel4d.reindex(labels=rlabels) + sorted_panel4d = random_order.sort_index(axis=0) + assert_panel4d_equal(sorted_panel4d, self.panel4d) def test_fillna(self): - self.assertFalse(np.isfinite(self.panel4d.values).all()) - filled = self.panel4d.fillna(0) - self.assertTrue(np.isfinite(filled.values).all()) - self.assertRaises(NotImplementedError, - self.panel4d.fillna, method='pad') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertFalse(np.isfinite(self.panel4d.values).all()) + filled = self.panel4d.fillna(0) + self.assertTrue(np.isfinite(filled.values).all()) - def test_swapaxes(self): - result = self.panel4d.swapaxes('labels', 'items') - self.assertIs(result.items, self.panel4d.labels) + self.assertRaises(NotImplementedError, + self.panel4d.fillna, method='pad') - result = self.panel4d.swapaxes('labels', 'minor') - self.assertIs(result.labels, self.panel4d.minor_axis) - - result = self.panel4d.swapaxes('items', 'minor') - self.assertIs(result.items, self.panel4d.minor_axis) - - result = self.panel4d.swapaxes('items', 'major') - self.assertIs(result.items, self.panel4d.major_axis) - - result = self.panel4d.swapaxes('major', 'minor') - self.assertIs(result.major_axis, self.panel4d.minor_axis) - - # this should also work - result = self.panel4d.swapaxes(0, 1) - self.assertIs(result.labels, self.panel4d.items) + def test_swapaxes(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.panel4d.swapaxes('labels', 'items') + self.assertIs(result.items, self.panel4d.labels) - # this works, but return a copy - result = self.panel4d.swapaxes('items', 'items') - assert_panel4d_equal(self.panel4d, result) - self.assertNotEqual(id(self.panel4d), id(result)) + result = self.panel4d.swapaxes('labels', 'minor') + self.assertIs(result.labels, self.panel4d.minor_axis) - def test_to_frame(self): - raise nose.SkipTest("skipping for now") - # # filtered - # filtered = self.panel.to_frame() - # expected = self.panel.to_frame().dropna(how='any') - # assert_frame_equal(filtered, expected) + result = 
self.panel4d.swapaxes('items', 'minor') + self.assertIs(result.items, self.panel4d.minor_axis) - # # unfiltered - # unfiltered = self.panel.to_frame(filter_observations=False) - # assert_panel_equal(unfiltered.to_panel(), self.panel) + result = self.panel4d.swapaxes('items', 'major') + self.assertIs(result.items, self.panel4d.major_axis) - # # names - # self.assertEqual(unfiltered.index.names, ('major', 'minor')) + result = self.panel4d.swapaxes('major', 'minor') + self.assertIs(result.major_axis, self.panel4d.minor_axis) - def test_to_frame_mixed(self): - raise nose.SkipTest("skipping for now") - # panel = self.panel.fillna(0) - # panel['str'] = 'foo' - # panel['bool'] = panel['ItemA'] > 0 + # this should also work + result = self.panel4d.swapaxes(0, 1) + self.assertIs(result.labels, self.panel4d.items) - # lp = panel.to_frame() - # wp = lp.to_panel() - # self.assertEqual(wp['bool'].values.dtype, np.bool_) - # assert_frame_equal(wp['bool'], panel['bool']) + # this works, but return a copy + result = self.panel4d.swapaxes('items', 'items') + assert_panel4d_equal(self.panel4d, result) + self.assertNotEqual(id(self.panel4d), id(result)) def test_update(self): - p4d = Panel4D([[[[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]]) - - other = Panel4D([[[[3.6, 2., np.nan]], - [[np.nan, np.nan, 7]]]]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = Panel4D([[[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]]) - p4d.update(other) + other = Panel4D([[[[3.6, 2., np.nan]], + [[np.nan, np.nan, 7]]]]) - expected = Panel4D([[[[3.6, 2, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 7], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]]) + p4d.update(other) - assert_panel4d_equal(p4d, expected) + expected = Panel4D([[[[3.6, 2, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]]) - def test_filter(self): - raise nose.SkipTest("skipping for now") - - def test_apply(self): - raise nose.SkipTest("skipping for now") + assert_panel4d_equal(p4d, expected) def test_dtypes(self): @@ -1013,98 +918,36 @@ def test_dtypes(self): expected = Series(np.dtype('float64'), index=self.panel4d.labels) assert_series_equal(result, expected) - def test_compound(self): - raise nose.SkipTest("skipping for now") - # compounded = self.panel.compound() - - # assert_series_equal(compounded['ItemA'], - # (1 + self.panel['ItemA']).product(0) - 1) - - def test_shift(self): - raise nose.SkipTest("skipping for now") - # # major - # idx = self.panel.major_axis[0] - # idx_lag = self.panel.major_axis[1] - - # shifted = self.panel.shift(1) - - # assert_frame_equal(self.panel.major_xs(idx), - # shifted.major_xs(idx_lag)) - - # # minor - # idx = self.panel.minor_axis[0] - # idx_lag = self.panel.minor_axis[1] - - # shifted = self.panel.shift(1, axis='minor') - - # assert_frame_equal(self.panel.minor_xs(idx), - # shifted.minor_xs(idx_lag)) - - # self.assertRaises(Exception, self.panel.shift, 1, axis='items') - - def test_multiindex_get(self): - raise nose.SkipTest("skipping for now") - # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b',2)], - # names=['first', 'second']) - # wp = 
Panel(np.random.random((4,5,5)), - # items=ind, - # major_axis=np.arange(5), - # minor_axis=np.arange(5)) - # f1 = wp['a'] - # f2 = wp.ix['a'] - # assert_panel_equal(f1, f2) - - # self.assertTrue((f1.items == [1, 2]).all()) - # self.assertTrue((f2.items == [1, 2]).all()) - - # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - # names=['first', 'second']) - - def test_multiindex_blocks(self): - raise nose.SkipTest("skipping for now") - # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - # names=['first', 'second']) - # wp = Panel(self.panel._data) - # wp.items = ind - # f1 = wp['a'] - # self.assertTrue((f1.items == [1, 2]).all()) - - # f1 = wp[('b',1)] - # self.assertTrue((f1.columns == ['A', 'B', 'C', 'D']).all()) - def test_repr_empty(self): - empty = Panel4D() - repr(empty) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + empty = Panel4D() + repr(empty) def test_rename(self): - mapper = { - 'l1': 'foo', - 'l2': 'bar', - 'l3': 'baz' - } + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + mapper = {'l1': 'foo', + 'l2': 'bar', + 'l3': 'baz'} - renamed = self.panel4d.rename_axis(mapper, axis=0) - exp = Index(['foo', 'bar', 'baz']) - self.assert_index_equal(renamed.labels, exp) + renamed = self.panel4d.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assert_index_equal(renamed.labels, exp) - renamed = self.panel4d.rename_axis(str.lower, axis=3) - exp = Index(['a', 'b', 'c', 'd']) - self.assert_index_equal(renamed.minor_axis, exp) + renamed = self.panel4d.rename_axis(str.lower, axis=3) + exp = Index(['a', 'b', 'c', 'd']) + self.assert_index_equal(renamed.minor_axis, exp) - # don't copy - renamed_nocopy = self.panel4d.rename_axis(mapper, axis=0, copy=False) - renamed_nocopy['foo'] = 3. - self.assertTrue((self.panel4d['l1'].values == 3).all()) + # don't copy + renamed_nocopy = self.panel4d.rename_axis(mapper, + axis=0, + copy=False) + renamed_nocopy['foo'] = 3. 
+ self.assertTrue((self.panel4d['l1'].values == 3).all()) def test_get_attr(self): assert_panel_equal(self.panel4d['l1'], self.panel4d.l1) - def test_from_frame_level1_unsorted(self): - raise nose.SkipTest("skipping for now") - - def test_to_excel(self): - raise nose.SkipTest("skipping for now") - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_panelnd.py b/pandas/tests/test_panelnd.py index ac497bc580585..2a69a65e8d55e 100644 --- a/pandas/tests/test_panelnd.py +++ b/pandas/tests/test_panelnd.py @@ -15,31 +15,35 @@ def setUp(self): def test_4d_construction(self): - # create a 4D - Panel4D = panelnd.create_nd_panel_factory( - klass_name='Panel4D', - orders=['labels', 'items', 'major_axis', 'minor_axis'], - slices={'items': 'items', 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer=Panel, - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) - p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa def test_4d_construction_alt(self): - # create a 4D - Panel4D = panelnd.create_nd_panel_factory( - klass_name='Panel4D', - orders=['labels', 'items', 'major_axis', 'minor_axis'], - slices={'items': 'items', 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer='Panel', - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer='Panel', + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa def test_4d_construction_error(self): @@ -59,40 +63,44 @@ def test_4d_construction_error(self): def test_5d_construction(self): - # create a 4D - Panel4D = panelnd.create_nd_panel_factory( - klass_name='Panel4D', - orders=['labels1', 'items', 'major_axis', 'minor_axis'], - slices={'items': 'items', 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer=Panel, - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) - - p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) - - # create a 5D - Panel5D = panelnd.create_nd_panel_factory( - klass_name='Panel5D', - orders=['cool1', 'labels1', 'items', 'major_axis', - 'minor_axis'], - slices={'labels1': 'labels1', 'items': 'items', - 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer=Panel4D, - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) - - p5d = Panel5D(dict(C1=p4d)) - - # slice back to 4d - results = p5d.ix['C1', :, :, 0:3, :] - expected = p4d.ix[:, :, 0:3, :] - assert_panel_equal(results['L1'], expected['L1']) - - # test a transpose - # results = p5d.transpose(1,2,3,4,0) - # expected = + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels1', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + # deprecation GH13564 + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) + + # create a 5D + Panel5D = panelnd.create_nd_panel_factory( + klass_name='Panel5D', + orders=['cool1', 'labels1', 'items', 'major_axis', + 'minor_axis'], + slices={'labels1': 'labels1', 'items': 'items', + 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel4D, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + # deprecation GH13564 + p5d = Panel5D(dict(C1=p4d)) + + # slice back to 4d + results = p5d.ix['C1', :, :, 0:3, :] + expected = p4d.ix[:, :, 0:3, :] + assert_panel_equal(results['L1'], expected['L1']) + + # test a transpose + # results = p5d.transpose(1,2,3,4,0) + # expected = if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index edcb69de7bfad..b0e1eb72bd791 100644 --- a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -73,10 +73,11 @@ def test_isnull(): tm.assert_panel_equal(result, expected) # panel 4d - for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel4d_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel4d_equal(result, expected) def test_isnull_lists(): diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 13c6b72ade27b..2a8b0a47c283a 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -704,35 +704,37 @@ def df(): concat([panel1, panel3], axis=1, verify_integrity=True) def test_panel4d_concat(self): - p4d = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() - p1 = p4d.ix[:, :, :5, :] - p2 = p4d.ix[:, :, 5:, :] + p1 = p4d.ix[:, :, :5, :] + p2 = p4d.ix[:, :, 5:, :] - result = concat([p1, p2], axis=2) - tm.assert_panel4d_equal(result, p4d) + result = concat([p1, p2], axis=2) + tm.assert_panel4d_equal(result, p4d) - p1 = p4d.ix[:, :, :, :2] - p2 = p4d.ix[:, :, :, 2:] + p1 = p4d.ix[:, :, :, :2] + p2 = p4d.ix[:, :, :, 2:] - result = concat([p1, p2], axis=3) - tm.assert_panel4d_equal(result, p4d) + result = concat([p1, p2], axis=3) + tm.assert_panel4d_equal(result, p4d) def test_panel4d_concat_mixed_type(self): - p4d = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() - # if things are a bit misbehaved - p1 = p4d.ix[:, :2, :, :2] - p2 = p4d.ix[:, :, :, 2:] - p1['L5'] = 'baz' + # if things are a bit misbehaved + p1 = p4d.ix[:, :2, :, :2] + p2 = p4d.ix[:, :, :, 2:] + p1['L5'] = 'baz' - result = concat([p1, p2], axis=3) + result = concat([p1, p2], axis=3) - p2['L5'] = np.nan - expected = concat([p1, p2], axis=3) - expected = expected.ix[result.labels] + p2['L5'] = np.nan + expected = concat([p1, p2], axis=3) + expected = 
expected.ix[result.labels] - tm.assert_panel4d_equal(result, expected) + tm.assert_panel4d_equal(result, expected) def test_concat_series(self): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 402613d3f1728..a7c66e18aa604 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -63,7 +63,6 @@ def set_testing_mode(): # set the testing mode filters testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None') if 'deprecate' in testing_mode: - warnings.simplefilter('always', _testing_mode_warnings) From 3fdcea6f89a5a9f9fdb98452343a21e576430bd9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 25 Jul 2016 19:40:54 -0400 Subject: [PATCH 171/359] PERF: RangeIndex.is_monotonic_inc/dec Author: sinhrks Closes #13749 from sinhrks/perf_range and squashes the following commits: 8e25563 [sinhrks] PERF: RangeIndex.is_monotonic_inc/dec --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/range.py | 8 ++++++++ pandas/tests/indexes/test_range.py | 10 ++++++++++ 3 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9aa206adc513f..e340d04416fe6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -659,6 +659,7 @@ Performance Improvements - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`) - Improved performance of ``Index.difference`` (:issue:`12044`) +- Improved performance of ``RangeIndex.is_monotonic_increasing`` and ``is_monotonic_decreasing`` (:issue:`13749`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) - Improved performance of hashing ``Period`` (:issue:`12817`) - Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`) diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index f680d2da0161e..a561cab30b472 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -221,6 +221,14 @@ def is_unique(self): """ return if the index has unique values """ return True + @cache_readonly + def is_monotonic_increasing(self): + return self._step > 0 or len(self) <= 1 + + @cache_readonly + def is_monotonic_decreasing(self): + return self._step < 0 or len(self) <= 1 + @property def has_duplicates(self): return False diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 99e4b72bcee37..329ffa9b7cc77 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -315,6 +315,16 @@ def test_is_monotonic(self): self.assertTrue(index.is_monotonic_increasing) self.assertTrue(index.is_monotonic_decreasing) + index = RangeIndex(2, 1) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + + index = RangeIndex(1, 1) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + def test_equals(self): equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), (RangeIndex(0), RangeIndex(1, -1, 3)), From a208c56cba2a73dce0a76173fed5ee6c445f869d Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 26 Jul 2016 10:46:33 +0200 Subject: [PATCH 172/359] DOC: contributing: fix asv usage instructions (#13794) By default asv continuous uses head^ as the commit to compare to, whereas for PRs you usually want to compare to master. 
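As a quick illustration of the ``RangeIndex.is_monotonic_*`` change above (a minimal sketch, assuming pandas 0.19.0, where monotonicity is now derived from the range's step rather than by scanning the values), the behavior pinned down by the new tests is::

    import pandas as pd

    # a non-empty increasing range is monotonic increasing but not decreasing
    idx = pd.RangeIndex(0, 10, 2)
    assert idx.is_monotonic_increasing
    assert not idx.is_monotonic_decreasing

    # empty ranges such as RangeIndex(2, 1) or RangeIndex(1, 1) count as both
    # monotonic increasing and monotonic decreasing
    empty = pd.RangeIndex(2, 1)
    assert empty.is_monotonic_increasing
    assert empty.is_monotonic_decreasing
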
--- doc/source/contributing.rst | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 51fa2a9de953b..666111470811f 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -575,42 +575,34 @@ To install asv:: pip install git+https://github.com/spacetelescope/asv -If you need to run a benchmark, change your directory to ``/asv_bench/`` and run -the following if you have been developing on ``master``:: +If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: - asv continuous master + asv continuous upstream/master HEAD -This command uses ``conda`` by default for creating the benchmark +You can replace ``HEAD`` with the name of the branch you are working on. +The command uses ``conda`` by default for creating the benchmark environments. If you want to use virtualenv instead, write:: - asv continuous -E virtualenv master + asv continuous -E virtualenv upstream/master HEAD The ``-E virtualenv`` option should be added to all ``asv`` commands that run benchmarks. The default value is defined in ``asv.conf.json``. -If you are working on another branch, either of the following can be used:: +Running the full test suite can take up to one hour and use up to 3GB of RAM. +Usually it is sufficient to paste only a subset of the results into the pull +request to show that the committed changes do not cause unexpected performance +regressions. You can run specific benchmarks using the ``-b`` flag, which +takes a regular expression. For example, this will only run tests from a +``pandas/asv_bench/benchmarks/groupby.py`` file:: - asv continuous master HEAD - asv continuous master your_branch - -This will check out the master revision and run the suite on both master and -your commit. Running the full test suite can take up to one hour and use up -to 3GB of RAM. Usually it is sufficient to paste only a subset of the results into -the pull request to show that the committed changes do not cause unexpected -performance regressions. - -You can run specific benchmarks using the ``-b`` flag, which takes a regular expression. -For example, this will only run tests from a ``pandas/asv_bench/benchmarks/groupby.py`` -file:: - - asv continuous master -b groupby + asv continuous upstream/master HEAD -b groupby If you want to only run a specific group of tests from a file, you can do it using ``.`` as a separator. For example:: - asv continuous master -b groupby.groupby_agg_builtins1 + asv continuous upstream/master HEAD -b groupby.groupby_agg_builtins -will only run a ``groupby_agg_builtins1`` test defined in a ``groupby`` file. +will only run the ``groupby_agg_builtins`` benchmark defined in ``groupby.py``. You can also run the benchmark suite using the version of ``pandas`` already installed in your current Python environment. This can be From 690d52cf6b37ce1c3c896d789750cc5c7f8174d2 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 26 Jul 2016 05:58:59 -0400 Subject: [PATCH 173/359] CLN: Removed SparsePanel Title is self-explanatory. Picks up where #11157 left off. 
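The user-visible effect of this removal (a minimal sketch, assuming the 0.19.0 behavior introduced here, where only ``Series`` and ``DataFrame`` keep sparse counterparts) is that sparsifying a ``Panel`` now raises instead of returning a ``SparsePanel``::

    import pandas as pd
    import pandas.util.testing as tm

    df = tm.makeDataFrame()
    sdf = df.to_sparse()       # still supported: returns a SparseDataFrame

    p = pd.Panel({'ItemA': df})
    try:
        p.to_sparse()          # SparsePanel is gone
    except NotImplementedError as err:
        print(err)             # "sparsifying is not supported for Panel objects"
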
Author: gfyoung Closes #13778 from gfyoung/remove-sparse-panel and squashes the following commits: f3fa93b [gfyoung] CLN: Removed SparsePanel --- bench/bench_sparse.py | 92 ----- doc/source/sparse.rst | 20 +- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/api/tests/test_api.py | 2 +- pandas/core/panel.py | 22 +- pandas/core/sparse.py | 1 - pandas/io/packers.py | 14 +- pandas/io/pytables.py | 37 +- pandas/io/tests/test_packers.py | 20 -- pandas/io/tests/test_pytables.py | 17 - pandas/sparse/api.py | 1 - pandas/sparse/panel.py | 563 ------------------------------ pandas/sparse/tests/test_panel.py | 279 --------------- pandas/stats/plm.py | 4 +- pandas/tests/test_panel.py | 54 +-- pandas/util/testing.py | 16 - 16 files changed, 32 insertions(+), 1112 deletions(-) delete mode 100644 bench/bench_sparse.py delete mode 100644 pandas/sparse/panel.py delete mode 100644 pandas/sparse/tests/test_panel.py diff --git a/bench/bench_sparse.py b/bench/bench_sparse.py deleted file mode 100644 index 0aa705118d970..0000000000000 --- a/bench/bench_sparse.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np - -from pandas import * -import pandas.core.sparse as spm -import pandas.compat as compat -reload(spm) -from pandas.core.sparse import * - -N = 10000. - -arr1 = np.arange(N) -index = Index(np.arange(N)) - -off = N // 10 -arr1[off: 2 * off] = np.NaN -arr1[4 * off: 5 * off] = np.NaN -arr1[8 * off: 9 * off] = np.NaN - -arr2 = np.arange(N) -arr2[3 * off // 2: 2 * off + off // 2] = np.NaN -arr2[8 * off + off // 2: 9 * off + off // 2] = np.NaN - -s1 = SparseSeries(arr1, index=index) -s2 = SparseSeries(arr2, index=index) - -is1 = SparseSeries(arr1, kind='integer', index=index) -is2 = SparseSeries(arr2, kind='integer', index=index) - -s1_dense = s1.to_dense() -s2_dense = s2.to_dense() - -if compat.is_platform_linux(): - pth = '/home/wesm/code/pandas/example' -else: - pth = '/Users/wesm/code/pandas/example' - -dm = DataFrame.load(pth) - -sdf = dm.to_sparse() - - -def new_data_like(sdf): - new_data = {} - for col, series in compat.iteritems(sdf): - new_data[col] = SparseSeries(np.random.randn(len(series.sp_values)), - index=sdf.index, - sparse_index=series.sp_index, - fill_value=series.fill_value) - - return SparseDataFrame(new_data) - -# data = {} -# for col, ser in dm.iteritems(): -# data[col] = SparseSeries(ser) - -dwp = Panel.fromDict({'foo': dm}) -# sdf = SparseDataFrame(data) - - -lp = stack_sparse_frame(sdf) - - -swp = SparsePanel({'A': sdf}) -swp = SparsePanel({'A': sdf, - 'B': sdf, - 'C': sdf, - 'D': sdf}) - -y = sdf -x = SparsePanel({'x1': sdf + new_data_like(sdf) / 10, - 'x2': sdf + new_data_like(sdf) / 10}) - -dense_y = sdf -dense_x = x.to_dense() - -# import hotshot, hotshot.stats -# prof = hotshot.Profile('test.prof') - -# benchtime, stones = prof.runcall(ols, y=y, x=x) - -# prof.close() - -# stats = hotshot.stats.load('test.prof') - -dense_model = ols(y=dense_y, x=dense_x) - -import pandas.stats.plm as plm -import pandas.stats.interface as face -reload(plm) -reload(face) - -# model = face.ols(y=y, x=x) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 257fb2909d42c..41ed0bf16ebae 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -15,13 +15,14 @@ Sparse data structures ********************** -We have implemented "sparse" versions of Series, DataFrame, and Panel. These -are not sparse in the typical "mostly 0". You can view these objects as being -"compressed" where any data matching a specific value (NaN/missing by default, -though any value can be chosen) is omitted. 
A special ``SparseIndex`` object -tracks where data has been "sparsified". This will make much more sense in an -example. All of the standard pandas data structures have a ``to_sparse`` -method: +.. note:: The ``SparsePanel`` class has been removed in 0.19.0 + +We have implemented "sparse" versions of Series and DataFrame. These are not sparse +in the typical "mostly 0". Rather, you can view these objects as being "compressed" +where any data matching a specific value (``NaN`` / missing value, though any value +can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been +"sparsified". This will make much more sense in an example. All of the standard pandas +data structures have a ``to_sparse`` method: .. ipython:: python @@ -77,9 +78,8 @@ distinct from the ``fill_value``: sparr = pd.SparseArray(arr) sparr -Like the indexed objects (SparseSeries, SparseDataFrame, SparsePanel), a -``SparseArray`` can be converted back to a regular ndarray by calling -``to_dense``: +Like the indexed objects (SparseSeries, SparseDataFrame), a ``SparseArray`` +can be converted back to a regular ndarray by calling ``to_dense``: .. ipython:: python diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e340d04416fe6..375bbd79fd29b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -330,6 +330,7 @@ API changes ~~~~~~~~~~~ +- ``Panel.to_sparse`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) - ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) @@ -619,6 +620,7 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- The ``SparsePanel`` class has been removed (:issue:`13778`) - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of the `pandas-datareader package `__ (:issue:`13724`). diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 8143f925af3e0..fda81ee6c9045 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -57,7 +57,7 @@ class TestPDApi(Base, tm.TestCase): 'TimedeltaIndex', 'Timestamp'] # these are already deprecated; awaiting removal - deprecated_classes = ['SparsePanel', 'TimeSeries', 'WidePanel', + deprecated_classes = ['TimeSeries', 'WidePanel', 'SparseTimeSeries', 'Panel4D'] # these should be deperecated in the future diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 1d49ac5e2be86..b8cd9b90e7989 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -393,25 +393,15 @@ def _get_plane_axes(self, axis): fromDict = from_dict - def to_sparse(self, fill_value=None, kind='block'): + def to_sparse(self, *args, **kwargs): """ - Convert to SparsePanel - - Parameters - ---------- - fill_value : float, default NaN - kind : {'block', 'integer'} + NOT IMPLEMENTED: do not call this method, as sparsifying is not + supported for Panel objects and will raise an error. 
- Returns - ------- - y : SparseDataFrame + Convert to SparsePanel """ - from pandas.core.sparse import SparsePanel - frames = dict(self.iteritems()) - return SparsePanel(frames, items=self.items, - major_axis=self.major_axis, - minor_axis=self.minor_axis, default_kind=kind, - default_fill_value=fill_value) + raise NotImplementedError("sparsifying is not supported " + "for Panel objects") def to_excel(self, path, na_rep='', engine=None, **kwargs): """ diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py index 701e6b1102b05..4fc329844d616 100644 --- a/pandas/core/sparse.py +++ b/pandas/core/sparse.py @@ -8,4 +8,3 @@ from pandas.sparse.series import SparseSeries from pandas.sparse.frame import SparseDataFrame -from pandas.sparse.panel import SparsePanel diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 94f390955dddd..1838d9175e597 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -56,7 +56,7 @@ Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, Categorical) from pandas.tslib import NaTType -from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame from pandas.core.common import PerformanceWarning @@ -447,18 +447,6 @@ def encode(obj): # d['data'] = dict([(name, ss) # for name, ss in compat.iteritems(obj)]) # return d - elif isinstance(obj, SparsePanel): - raise NotImplementedError( - 'msgpack sparse frame is not implemented' - ) - # d = {'typ': 'sparse_panel', - # 'klass': obj.__class__.__name__, - # 'items': obj.items} - # for f in ['default_fill_value', 'default_kind']: - # d[f] = getattr(obj, f, None) - # d['data'] = dict([(name, df) - # for name, df in compat.iteritems(obj)]) - # return d else: data = obj._data diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 038ca7ac7775b..7503b21160250 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -29,7 +29,7 @@ MultiIndex, Int64Index, isnull) from pandas.core import config from pandas.io.common import _stringify_path -from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex @@ -169,7 +169,6 @@ class DuplicateWarning(Warning): SparseDataFrame: u('sparse_frame'), Panel: u('wide'), Panel4D: u('ndim'), - SparsePanel: u('sparse_panel') } # storer class map @@ -183,7 +182,6 @@ class DuplicateWarning(Warning): u('frame'): 'FrameFixed', u('sparse_frame'): 'SparseFrameFixed', u('wide'): 'PanelFixed', - u('sparse_panel'): 'SparsePanelFixed', } # table class map @@ -2777,39 +2775,6 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) -class SparsePanelFixed(SparseFixed): - pandas_kind = u('sparse_panel') - attributes = ['default_kind', 'default_fill_value'] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - items = self.read_index('items') - - sdict = {} - for name in items: - key = 'sparse_frame_%s' % name - s = SparseFrameFixed(self.parent, getattr(self.group, key)) - s.infer_axes() - sdict[name] = s.read() - return SparsePanel(sdict, items=items, default_kind=self.default_kind, - default_fill_value=self.default_fill_value) - - def write(self, obj, **kwargs): - super(SparsePanelFixed, self).write(obj, **kwargs) - self.attrs.default_fill_value = 
obj.default_fill_value - self.attrs.default_kind = obj.default_kind - self.write_index('items', obj.items) - - for name, sdf in obj.iteritems(): - key = 'sparse_frame_%s' % name - if key not in self.group._v_children: - node = self._handle.create_group(self.group, key) - else: - node = getattr(self.group, key) - s = SparseFrameFixed(self.parent, node) - s.write(sdf) - - class BlockManagerFixed(GenericFixed): attributes = ['ndim', 'nblocks'] is_shape_reversed = False diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index fe5972d35d5ec..cf61ad9a35935 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -542,26 +542,6 @@ def test_sparse_frame(self): self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - def test_sparse_panel(self): - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - - items = ['x', 'y', 'z'] - p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) - sp = p.to_sparse() - - self._check_roundtrip(sp, tm.assert_panel_equal, - check_panel_type=True) - - sp2 = p.to_sparse(kind='integer') - self._check_roundtrip(sp2, tm.assert_panel_equal, - check_panel_type=True) - - sp3 = p.to_sparse(fill_value=0) - self._check_roundtrip(sp3, tm.assert_panel_equal, - check_panel_type=True) - class TestCompression(TestPackers): """See https://github.com/pydata/pandas/pull/9783 diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 89d2f13f256fe..f95e764ad4da3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2688,23 +2688,6 @@ def test_sparse_frame(self): self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - def test_sparse_panel(self): - - items = ['x', 'y', 'z'] - p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) - sp = p.to_sparse() - - self._check_double_roundtrip(sp, assert_panel_equal, - check_panel_type=True) - - sp2 = p.to_sparse(kind='integer') - self._check_double_roundtrip(sp2, assert_panel_equal, - check_panel_type=True) - - sp3 = p.to_sparse(fill_value=0) - self._check_double_roundtrip(sp3, assert_panel_equal, - check_panel_type=True) - def test_float_index(self): # GH #454 diff --git a/pandas/sparse/api.py b/pandas/sparse/api.py index b4d874e6a1ab9..55841fbeffa2d 100644 --- a/pandas/sparse/api.py +++ b/pandas/sparse/api.py @@ -4,4 +4,3 @@ from pandas.sparse.list import SparseList from pandas.sparse.series import SparseSeries, SparseTimeSeries from pandas.sparse.frame import SparseDataFrame -from pandas.sparse.panel import SparsePanel diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py deleted file mode 100644 index 4370d040d8eaf..0000000000000 --- a/pandas/sparse/panel.py +++ /dev/null @@ -1,563 +0,0 @@ -""" -Data structures for sparse float data. 
Life is made simpler by dealing only -with float64 data -""" - -# pylint: disable=E1101,E1103,W0231 - -import warnings -from pandas.compat import lrange, zip -from pandas import compat -import numpy as np - -from pandas.types.common import is_list_like, is_scalar -from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.frame import DataFrame -from pandas.core.panel import Panel -from pandas.sparse.frame import SparseDataFrame -from pandas.util.decorators import deprecate - -import pandas.core.common as com -import pandas.core.ops as ops - - -class SparsePanelAxis(object): - def __init__(self, cache_field, frame_attr): - self.cache_field = cache_field - self.frame_attr = frame_attr - - def __get__(self, obj, type=None): - return getattr(obj, self.cache_field, None) - - def __set__(self, obj, value): - value = _ensure_index(value) - - if isinstance(value, MultiIndex): - raise NotImplementedError("value cannot be a MultiIndex") - - for v in compat.itervalues(obj._frames): - setattr(v, self.frame_attr, value) - - setattr(obj, self.cache_field, value) - - -class SparsePanel(Panel): - """ - Sparse version of Panel - - Parameters - ---------- - frames : dict of DataFrame objects - items : array-like - major_axis : array-like - minor_axis : array-like - default_kind : {'block', 'integer'}, default 'block' - Default sparse kind for converting Series to SparseSeries. Will not - override SparseSeries passed into constructor - default_fill_value : float - Default fill_value for converting Series to SparseSeries. Will not - override SparseSeries passed in - - Notes - ----- - """ - ndim = 3 - _typ = 'panel' - _subtyp = 'sparse_panel' - - def __init__(self, frames=None, items=None, major_axis=None, - minor_axis=None, default_fill_value=np.nan, - default_kind='block', copy=False): - - # deprecation #11157 - warnings.warn("SparsePanel is deprecated and will be removed in a " - "future version", FutureWarning, stacklevel=3) - - if frames is None: - frames = {} - - if isinstance(frames, np.ndarray): - new_frames = {} - for item, vals in zip(items, frames): - new_frames[item] = SparseDataFrame( - vals, index=major_axis, columns=minor_axis, - default_fill_value=default_fill_value, - default_kind=default_kind) - frames = new_frames - - if not isinstance(frames, dict): - raise TypeError('input must be a dict, a %r was passed' % - type(frames).__name__) - - self.default_fill_value = fill_value = default_fill_value - self.default_kind = kind = default_kind - - # pre-filter, if necessary - if items is None: - items = Index(sorted(frames.keys())) - items = _ensure_index(items) - - (clean_frames, major_axis, - minor_axis) = _convert_frames(frames, major_axis, minor_axis, - kind=kind, fill_value=fill_value) - - self._frames = clean_frames - - # do we want to fill missing ones? 
- for item in items: - if item not in clean_frames: - raise ValueError('column %r not found in data' % item) - - self._items = items - self.major_axis = major_axis - self.minor_axis = minor_axis - - def _consolidate_inplace(self): # pragma: no cover - # do nothing when DataFrame calls this method - pass - - def __array_wrap__(self, result): - return SparsePanel(result, items=self.items, - major_axis=self.major_axis, - minor_axis=self.minor_axis, - default_kind=self.default_kind, - default_fill_value=self.default_fill_value) - - @classmethod - def from_dict(cls, data): - """ - Analogous to Panel.from_dict - """ - return SparsePanel(data) - - def to_dense(self): - """ - Convert SparsePanel to (dense) Panel - - Returns - ------- - dense : Panel - """ - return Panel(self.values, self.items, self.major_axis, self.minor_axis) - - def as_matrix(self): - return self.values - - @property - def values(self): - # return dense values - return np.array([self._frames[item].values for item in self.items]) - - # need a special property for items to make the field assignable - - _items = None - - def _get_items(self): - return self._items - - def _set_items(self, new_items): - new_items = _ensure_index(new_items) - if isinstance(new_items, MultiIndex): - raise NotImplementedError("itemps cannot be a MultiIndex") - - # need to create new frames dict - - old_frame_dict = self._frames - old_items = self._items - self._frames = dict((new_k, old_frame_dict[old_k]) - for new_k, old_k in zip(new_items, old_items)) - self._items = new_items - - items = property(fget=_get_items, fset=_set_items) - - # DataFrame's index - major_axis = SparsePanelAxis('_major_axis', 'index') - - # DataFrame's columns / "items" - minor_axis = SparsePanelAxis('_minor_axis', 'columns') - - def _ixs(self, i, axis=0): - """ - for compat as we don't support Block Manager here - i : int, slice, or sequence of integers - axis : int - """ - - key = self._get_axis(axis)[i] - - # xs cannot handle a non-scalar key, so just reindex here - if is_list_like(key): - return self.reindex(**{self._get_axis_name(axis): key}) - - return self.xs(key, axis=axis) - - def _slice(self, slobj, axis=0, kind=None): - """ - for compat as we don't support Block Manager here - """ - axis = self._get_axis_name(axis) - index = self._get_axis(axis) - - return self.reindex(**{axis: index[slobj]}) - - def _get_item_cache(self, key): - return self._frames[key] - - def __setitem__(self, key, value): - if isinstance(value, DataFrame): - value = value.reindex(index=self.major_axis, - columns=self.minor_axis) - if not isinstance(value, SparseDataFrame): - value = value.to_sparse(fill_value=self.default_fill_value, - kind=self.default_kind) - else: - raise ValueError('only DataFrame objects can be set currently') - - self._frames[key] = value - - if key not in self.items: - self._items = Index(list(self.items) + [key]) - - def set_value(self, item, major, minor, value): - """ - Quickly set single value at (item, major, minor) location - - Parameters - ---------- - item : item label (panel item) - major : major axis label (panel item row) - minor : minor axis label (panel item column) - value : scalar - - Notes - ----- - This method *always* returns a new object. 
It is not particularly - efficient but is provided for API compatibility with Panel - - Returns - ------- - panel : SparsePanel - """ - dense = self.to_dense().set_value(item, major, minor, value) - return dense.to_sparse(kind=self.default_kind, - fill_value=self.default_fill_value) - - def __delitem__(self, key): - loc = self.items.get_loc(key) - indices = lrange(loc) + lrange(loc + 1, len(self.items)) - del self._frames[key] - self._items = self._items.take(indices) - - def __getstate__(self): - # pickling - from pandas.io.pickle import _pickle_array - return (self._frames, _pickle_array(self.items), - _pickle_array(self.major_axis), - _pickle_array(self.minor_axis), self.default_fill_value, - self.default_kind) - - def __setstate__(self, state): - frames, items, major, minor, fv, kind = state - - from pandas.io.pickle import _unpickle_array - self.default_fill_value = fv - self.default_kind = kind - self._items = _ensure_index(_unpickle_array(items)) - self._major_axis = _ensure_index(_unpickle_array(major)) - self._minor_axis = _ensure_index(_unpickle_array(minor)) - self._frames = frames - - def copy(self, deep=True): - """ - Make a copy of the sparse panel - - Returns - ------- - copy : SparsePanel - """ - - d = self._construct_axes_dict() - if deep: - new_data = dict((k, v.copy(deep=True)) - for k, v in compat.iteritems(self._frames)) - d = dict((k, v.copy(deep=True)) for k, v in compat.iteritems(d)) - else: - new_data = self._frames.copy() - d['default_fill_value'] = self.default_fill_value - d['default_kind'] = self.default_kind - - return SparsePanel(new_data, **d) - - def to_frame(self, filter_observations=True): - """ - Convert SparsePanel to (dense) DataFrame - - Returns - ------- - frame : DataFrame - """ - if not filter_observations: - raise TypeError('filter_observations=False not supported for ' - 'SparsePanel.to_long') - - I, N, K = self.shape - counts = np.zeros(N * K, dtype=int) - - d_values = {} - d_indexer = {} - - for item in self.items: - frame = self[item] - - values, major, minor = _stack_sparse_info(frame) - - # values are stacked column-major - indexer = minor * N + major - counts.put(indexer, counts.take(indexer) + 1) # cuteness - - d_values[item] = values - d_indexer[item] = indexer - - # have full set of observations for each item - mask = counts == I - - # for each item, take mask values at index locations for those sparse - # values, and use that to select values - values = np.column_stack([d_values[item][mask.take(d_indexer[item])] - for item in self.items]) - - inds, = mask.nonzero() - - # still column major - major_labels = inds % N - minor_labels = inds // N - - index = MultiIndex(levels=[self.major_axis, self.minor_axis], - labels=[major_labels, minor_labels], - verify_integrity=False) - - df = DataFrame(values, index=index, columns=self.items) - return df.sortlevel(level=0) - - to_long = deprecate('to_long', to_frame) - toLong = deprecate('toLong', to_frame) - - def reindex(self, major=None, items=None, minor=None, major_axis=None, - minor_axis=None, copy=False): - """ - Conform / reshape panel axis labels to new input labels - - Parameters - ---------- - major : array-like, default None - items : array-like, default None - minor : array-like, default None - copy : boolean, default False - Copy underlying SparseDataFrame objects - - Returns - ------- - reindexed : SparsePanel - """ - major = com._mut_exclusive(major=major, major_axis=major_axis) - minor = com._mut_exclusive(minor=minor, minor_axis=minor_axis) - - if com._all_none(items, major, minor): 
- raise ValueError('Must specify at least one axis') - - major = self.major_axis if major is None else major - minor = self.minor_axis if minor is None else minor - - if items is not None: - new_frames = {} - for item in items: - if item in self._frames: - new_frames[item] = self._frames[item] - else: - raise NotImplementedError('Reindexing with new items not ' - 'yet supported') - else: - new_frames = self._frames - - if copy: - new_frames = dict((k, v.copy()) - for k, v in compat.iteritems(new_frames)) - - return SparsePanel(new_frames, items=items, major_axis=major, - minor_axis=minor, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) - - def _combine(self, other, func, axis=0): - if isinstance(other, DataFrame): - return self._combineFrame(other, func, axis=axis) - elif isinstance(other, Panel): - return self._combinePanel(other, func) - elif is_scalar(other): - new_frames = dict((k, func(v, other)) - for k, v in self.iteritems()) - return self._new_like(new_frames) - - def _combineFrame(self, other, func, axis=0): - index, columns = self._get_plane_axes(axis) - axis = self._get_axis_number(axis) - - other = other.reindex(index=index, columns=columns) - - if axis == 0: - new_values = func(self.values, other.values) - elif axis == 1: - new_values = func(self.values.swapaxes(0, 1), other.values.T) - new_values = new_values.swapaxes(0, 1) - elif axis == 2: - new_values = func(self.values.swapaxes(0, 2), other.values) - new_values = new_values.swapaxes(0, 2) - - # TODO: make faster! - new_frames = {} - for item, item_slice in zip(self.items, new_values): - old_frame = self[item] - ofv = old_frame.default_fill_value - ok = old_frame.default_kind - new_frames[item] = SparseDataFrame(item_slice, - index=self.major_axis, - columns=self.minor_axis, - default_fill_value=ofv, - default_kind=ok) - - return self._new_like(new_frames) - - def _new_like(self, new_frames): - return SparsePanel(new_frames, self.items, self.major_axis, - self.minor_axis, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) - - def _combinePanel(self, other, func): - items = self.items.union(other.items) - major = self.major_axis.union(other.major_axis) - minor = self.minor_axis.union(other.minor_axis) - - # could check that everything's the same size, but forget it - - this = self.reindex(items=items, major=major, minor=minor) - other = other.reindex(items=items, major=major, minor=minor) - - new_frames = {} - for item in items: - new_frames[item] = func(this[item], other[item]) - - if not isinstance(other, SparsePanel): - new_default_fill = self.default_fill_value - else: - # maybe unnecessary - new_default_fill = func(self.default_fill_value, - other.default_fill_value) - - return SparsePanel(new_frames, items, major, minor, - default_fill_value=new_default_fill, - default_kind=self.default_kind) - - def major_xs(self, key): - """ - Return slice of panel along major axis - - Parameters - ---------- - key : object - Major axis label - - Returns - ------- - y : DataFrame - index -> minor axis, columns -> items - """ - slices = dict((k, v.xs(key)) for k, v in self.iteritems()) - return DataFrame(slices, index=self.minor_axis, columns=self.items) - - def minor_xs(self, key): - """ - Return slice of panel along minor axis - - Parameters - ---------- - key : object - Minor axis label - - Returns - ------- - y : SparseDataFrame - index -> major axis, columns -> items - """ - slices = dict((k, v[key]) for k, v in self.iteritems()) - return SparseDataFrame(slices, 
index=self.major_axis, - columns=self.items, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) - - # TODO: allow SparsePanel to work with flex arithmetic. - # pow and mod only work for scalars for now - def pow(self, val, *args, **kwargs): - """wrapper around `__pow__` (only works for scalar values)""" - return self.__pow__(val) - - def mod(self, val, *args, **kwargs): - """wrapper around `__mod__` (only works for scalar values""" - return self.__mod__(val) - -# Sparse objects opt out of numexpr -SparsePanel._add_aggregate_operations(use_numexpr=False) -ops.add_special_arithmetic_methods(SparsePanel, use_numexpr=False, ** - ops.panel_special_funcs) -SparseWidePanel = SparsePanel - - -def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'): - from pandas.core.panel import _get_combined_index - output = {} - for item, df in compat.iteritems(frames): - if not isinstance(df, SparseDataFrame): - df = SparseDataFrame(df, default_kind=kind, - default_fill_value=fill_value) - - output[item] = df - - if index is None: - all_indexes = [x.index for x in output.values()] - index = _get_combined_index(all_indexes) - if columns is None: - all_columns = [x.columns for x in output.values()] - columns = _get_combined_index(all_columns) - - index = _ensure_index(index) - columns = _ensure_index(columns) - - for item, df in compat.iteritems(output): - if not (df.index.equals(index) and df.columns.equals(columns)): - output[item] = df.reindex(index=index, columns=columns) - - return output, index, columns - - -def _stack_sparse_info(frame): - lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)] - - # this is pretty fast - minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) - - inds_to_concat = [] - vals_to_concat = [] - for col in frame.columns: - series = frame[col] - - if not np.isnan(series.fill_value): - raise TypeError('This routine assumes NaN fill value') - - int_index = series.sp_index.to_int_index() - inds_to_concat.append(int_index.indices) - vals_to_concat.append(series.sp_values) - - major_labels = np.concatenate(inds_to_concat) - sparse_values = np.concatenate(vals_to_concat) - - return sparse_values, major_labels, minor_labels diff --git a/pandas/sparse/tests/test_panel.py b/pandas/sparse/tests/test_panel.py deleted file mode 100644 index 09d861fe0a9ac..0000000000000 --- a/pandas/sparse/tests/test_panel.py +++ /dev/null @@ -1,279 +0,0 @@ -# pylint: disable-msg=E1101,W0612 - -import nose # noqa -from numpy import nan -import pandas as pd - -from pandas import DataFrame, bdate_range, Panel -from pandas.core.index import Index -import pandas.util.testing as tm -from pandas.sparse.api import SparseSeries, SparsePanel -import pandas.tests.test_panel as test_panel - - -def panel_data1(): - index = bdate_range('1/1/2011', periods=8) - - return DataFrame({ - 'A': [nan, nan, nan, 0, 1, 2, 3, 4], - 'B': [0, 1, 2, 3, 4, nan, nan, nan], - 'C': [0, 1, 2, nan, nan, nan, 3, 4], - 'D': [nan, 0, 1, nan, 2, 3, 4, nan] - }, index=index) - - -def panel_data2(): - index = bdate_range('1/1/2011', periods=9) - - return DataFrame({ - 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5], - 'B': [0, 1, 2, 3, 4, 5, nan, nan, nan], - 'C': [0, 1, 2, nan, nan, nan, 3, 4, 5], - 'D': [nan, 0, 1, nan, 2, 3, 4, 5, nan] - }, index=index) - - -def panel_data3(): - index = bdate_range('1/1/2011', periods=10).shift(-2) - - return DataFrame({ - 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, 3, 4, 5, 6, nan, nan, nan], - 'C': [0, 1, 2, nan, nan, nan, 3, 4, 5, 
6], - 'D': [nan, 0, 1, nan, 2, 3, 4, 5, 6, nan] - }, index=index) - - -class TestSparsePanel(tm.TestCase, test_panel.SafeForLongAndSparse, - test_panel.SafeForSparse): - _multiprocess_can_split_ = True - - def setUp(self): - self.data_dict = { - 'ItemA': panel_data1(), - 'ItemB': panel_data2(), - 'ItemC': panel_data3(), - 'ItemD': panel_data1(), - } - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.panel = SparsePanel(self.data_dict) - - @staticmethod - def _test_op(panel, op): - # arithmetic tests - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = op(panel, 1) - tm.assert_sp_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) - - def test_constructor(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertRaises(ValueError, SparsePanel, self.data_dict, - items=['Item0', 'ItemA', 'ItemB']) - with tm.assertRaisesRegexp(TypeError, - "input must be a dict, a 'list' was " - "passed"): - SparsePanel(['a', 'b', 'c']) - - # deprecation GH11157 - def test_deprecation(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - SparsePanel() - - # GH 9272 - def test_constructor_empty(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - sp = SparsePanel() - self.assertEqual(len(sp.items), 0) - self.assertEqual(len(sp.major_axis), 0) - self.assertEqual(len(sp.minor_axis), 0) - - def test_from_dict(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - fd = SparsePanel.from_dict(self.data_dict) - tm.assert_sp_panel_equal(fd, self.panel) - - def test_pickle(self): - def _test_roundtrip(panel): - result = self.round_trip_pickle(panel) - tm.assertIsInstance(result.items, Index) - tm.assertIsInstance(result.major_axis, Index) - tm.assertIsInstance(result.minor_axis, Index) - tm.assert_sp_panel_equal(panel, result) - - _test_roundtrip(self.panel) - - def test_dense_to_sparse(self): - wp = Panel.from_dict(self.data_dict) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - dwp = wp.to_sparse() - tm.assertIsInstance(dwp['ItemA']['A'], SparseSeries) - - def test_to_dense(self): - dwp = self.panel.to_dense() - dwp2 = Panel.from_dict(self.data_dict) - tm.assert_panel_equal(dwp, dwp2) - - def test_to_frame(self): - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - - def _compare_with_dense(panel): - slp = panel.to_frame() - dlp = panel.to_dense().to_frame() - - self.assert_numpy_array_equal(slp.values, dlp.values) - self.assert_index_equal(slp.index, dlp.index, - check_names=False) - - _compare_with_dense(self.panel) - _compare_with_dense(self.panel.reindex(items=['ItemA'])) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - zero_panel = SparsePanel(self.data_dict, default_fill_value=0) - self.assertRaises(Exception, zero_panel.to_frame) - - self.assertRaises(Exception, self.panel.to_frame, - filter_observations=False) - - def test_long_to_wide_sparse(self): - pass - - def test_values(self): - pass - - def test_setitem(self): - self.panel['ItemE'] = self.panel['ItemC'] - self.panel['ItemF'] = self.panel['ItemC'].to_dense() - - tm.assert_sp_frame_equal(self.panel['ItemE'], self.panel['ItemC']) - tm.assert_sp_frame_equal(self.panel['ItemF'], self.panel['ItemC']) - - expected = pd.Index(['ItemA', 'ItemB', 'ItemC', - 'ItemD', 'ItemE', 'ItemF']) - tm.assert_index_equal(self.panel.items, expected) - - self.assertRaises(Exception, self.panel.__setitem__, 'item6', 
1) - - def test_set_value(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - def _check_loc(item, major, minor, val=1.5): - res = self.panel.set_value(item, major, minor, val) - self.assertIsNot(res, self.panel) - self.assertEqual(res.get_value(item, major, minor), val) - - _check_loc('ItemA', self.panel.major_axis[4], - self.panel.minor_axis[3]) - _check_loc('ItemF', self.panel.major_axis[4], - self.panel.minor_axis[3]) - _check_loc('ItemF', 'foo', self.panel.minor_axis[3]) - _check_loc('ItemE', 'foo', 'bar') - - def test_delitem_pop(self): - del self.panel['ItemB'] - tm.assert_index_equal(self.panel.items, - pd.Index(['ItemA', 'ItemC', 'ItemD'])) - crackle = self.panel['ItemC'] - pop = self.panel.pop('ItemC') - self.assertIs(pop, crackle) - tm.assert_almost_equal(self.panel.items, pd.Index(['ItemA', 'ItemD'])) - - self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') - - def test_copy(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - cop = self.panel.copy() - tm.assert_sp_panel_equal(cop, self.panel) - - def test_reindex(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - - def _compare_with_dense(swp, items, major, minor): - swp_re = swp.reindex(items=items, major=major, minor=minor) - dwp_re = swp.to_dense().reindex(items=items, major=major, - minor=minor) - tm.assert_panel_equal(swp_re.to_dense(), dwp_re) - - _compare_with_dense(self.panel, self.panel.items[:2], - self.panel.major_axis[::2], - self.panel.minor_axis[::2]) - _compare_with_dense(self.panel, None, self.panel.major_axis[::2], - self.panel.minor_axis[::2]) - - self.assertRaises(ValueError, self.panel.reindex) - - # TODO: do something about this later... - self.assertRaises(Exception, self.panel.reindex, - items=['item0', 'ItemA', 'ItemB']) - - # test copying - cp = self.panel.reindex(self.panel.major_axis, copy=True) - cp['ItemA']['E'] = cp['ItemA']['A'] - self.assertNotIn('E', self.panel['ItemA']) - - def test_operators(self): - def _check_ops(panel): - def _dense_comp(op): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - dense = panel.to_dense() - sparse_result = op(panel) - dense_result = op(dense) - tm.assert_panel_equal(sparse_result.to_dense(), - dense_result) - - def _mixed_comp(op): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = op(panel, panel.to_dense()) - expected = op(panel.to_dense(), panel.to_dense()) - tm.assert_panel_equal(result, expected) - - op1 = lambda x: x + 2 - - _dense_comp(op1) - op2 = lambda x: x.add(x.reindex(major=x.major_axis[::2])) - _dense_comp(op2) - op3 = lambda x: x.subtract(x.mean(0), axis=0) - _dense_comp(op3) - op4 = lambda x: x.subtract(x.mean(1), axis=1) - _dense_comp(op4) - op5 = lambda x: x.subtract(x.mean(2), axis=2) - _dense_comp(op5) - - _mixed_comp(Panel.multiply) - _mixed_comp(Panel.subtract) - - # TODO: this case not yet supported! 
- # op6 = lambda x: x.add(x.to_frame()) - # _dense_comp(op6) - - _check_ops(self.panel) - - def test_major_xs(self): - def _dense_comp(sparse): - dense = sparse.to_dense() - - for idx in sparse.major_axis: - dslice = dense.major_xs(idx) - sslice = sparse.major_xs(idx) - tm.assert_frame_equal(dslice, sslice) - - _dense_comp(self.panel) - - def test_minor_xs(self): - def _dense_comp(sparse): - dense = sparse.to_dense() - - for idx in sparse.minor_axis: - dslice = dense.minor_xs(idx) - sslice = sparse.minor_xs(idx).to_dense() - tm.assert_frame_equal(dslice, sslice) - - _dense_comp(self.panel) - - -if __name__ == '__main__': - import nose # noqa - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index baa30cde9344e..099c45d5ec60b 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -18,7 +18,6 @@ from pandas.core.frame import DataFrame from pandas.core.reshape import get_dummies from pandas.core.series import Series -from pandas.core.sparse import SparsePanel from pandas.stats.ols import OLS, MovingOLS import pandas.stats.common as com import pandas.stats.math as math @@ -137,8 +136,7 @@ def _filter_data(self): if isinstance(data, Panel): data = data.copy() - if not isinstance(data, SparsePanel): - data, cat_mapping = self._convert_x(data) + data, cat_mapping = self._convert_x(data) if not isinstance(data, Panel): data = Panel.from_dict(data, intersect=True) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 46eba1772c47a..a93f2ae5651b4 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -5,7 +5,6 @@ import operator import nose -from functools import wraps import numpy as np import pandas as pd @@ -20,37 +19,16 @@ from pandas.formats.printing import pprint_thing from pandas import compat from pandas.compat import range, lrange, StringIO, OrderedDict, signature -from pandas import SparsePanel from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, - assert_produces_warning, ensure_clean, - assertRaisesRegexp, makeCustomDataframe as - mkdf, makeMixedDataFrame) + ensure_clean, assertRaisesRegexp, + makeCustomDataframe as mkdf, + makeMixedDataFrame) import pandas.core.panel as panelm import pandas.util.testing as tm -def ignore_sparse_panel_future_warning(func): - """ - decorator to ignore FutureWarning if we have a SparsePanel - - can be removed when SparsePanel is fully removed - """ - - @wraps(func) - def wrapper(self, *args, **kwargs): - - if isinstance(self.panel, SparsePanel): - with assert_produces_warning(FutureWarning, - check_stacklevel=False): - return func(self, *args, **kwargs) - else: - return func(self, *args, **kwargs) - - return wrapper - - class PanelTests(object): panel = None @@ -78,7 +56,6 @@ class SafeForLongAndSparse(object): def test_repr(self): repr(self.panel) - @ignore_sparse_panel_future_warning def test_copy_names(self): for attr in ('major_axis', 'minor_axis'): getattr(self.panel, attr).name = None @@ -261,7 +238,6 @@ def test_get_plane_axes(self): index, columns = self.panel._get_plane_axes('minor_axis') index, columns = self.panel._get_plane_axes(0) - @ignore_sparse_panel_future_warning def test_truncate(self): dates = self.panel.major_axis start, end = dates[1], dates[5] @@ -322,7 +298,6 @@ def test_iteritems(self): self.assertEqual(len(list(self.panel.iteritems())), len(self.panel.items)) - @ignore_sparse_panel_future_warning def test_combineFrame(self): def 
check_op(op, name): # items @@ -352,18 +327,9 @@ def check_op(op, name): assert_frame_equal(result.minor_xs(idx), op(self.panel.minor_xs(idx), xs)) - ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow', 'mod'] if not compat.PY3: ops.append('div') - # pow, mod not supported for SparsePanel as flex ops (for now) - if not isinstance(self.panel, SparsePanel): - ops.extend(['pow', 'mod']) - else: - idx = self.panel.minor_axis[1] - with assertRaisesRegexp(ValueError, "Simple arithmetic.*scalar"): - self.panel.pow(self.panel.minor_xs(idx), axis='minor') - with assertRaisesRegexp(ValueError, "Simple arithmetic.*scalar"): - self.panel.mod(self.panel.minor_xs(idx), axis='minor') for op in ops: try: @@ -378,12 +344,10 @@ def check_op(op, name): pprint_thing("Failing operation: %r" % 'div') raise - @ignore_sparse_panel_future_warning def test_combinePanel(self): result = self.panel.add(self.panel) self.assert_panel_equal(result, self.panel * 2) - @ignore_sparse_panel_future_warning def test_neg(self): self.assert_panel_equal(-self.panel, self.panel * -1) @@ -399,7 +363,6 @@ def test_raise_when_not_implemented(self): with self.assertRaises(NotImplementedError): getattr(p, op)(d, axis=0) - @ignore_sparse_panel_future_warning def test_select(self): p = self.panel @@ -431,7 +394,6 @@ def test_get_value(self): expected = self.panel[item][mnr][mjr] assert_almost_equal(result, expected) - @ignore_sparse_panel_future_warning def test_abs(self): result = self.panel.abs() @@ -1654,7 +1616,6 @@ def test_transpose_copy(self): panel.values[0, 1, 1] = np.nan self.assertTrue(notnull(result.values[1, 0, 1])) - @ignore_sparse_panel_future_warning def test_to_frame(self): # filtered filtered = self.panel.to_frame() @@ -2432,7 +2393,12 @@ def test_to_string(self): buf = StringIO() self.panel.to_string(buf) - @ignore_sparse_panel_future_warning + def test_to_sparse(self): + if isinstance(self.panel, Panel): + msg = 'sparsifying is not supported' + tm.assertRaisesRegexp(NotImplementedError, msg, + self.panel.to_sparse) + def test_truncate(self): dates = self.panel.index.levels[0] start, end = dates[1], dates[5] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a7c66e18aa604..e49d92e4ab202 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1452,22 +1452,6 @@ def assert_sp_frame_equal(left, right, exact_indices=True, assert (col in left) -def assert_sp_panel_equal(left, right, exact_indices=True): - assertIsInstance(left, pd.SparsePanel, '[SparsePanel]') - assertIsInstance(right, pd.SparsePanel, '[SparsePanel]') - - for item, frame in left.iteritems(): - assert (item in right) - # trade-off? 
- assert_sp_frame_equal(frame, right[item], exact_indices=exact_indices) - - assert_almost_equal(left.default_fill_value, right.default_fill_value) - assert (left.default_kind == right.default_kind) - - for item in right: - assert (item in left) - - def assert_sp_list_equal(left, right): assertIsInstance(left, pd.SparseList, '[SparseList]') assertIsInstance(right, pd.SparseList, '[SparseList]') From 98c5b88d6f9f7bb0afa4fbae49a045d93a1cb33f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 26 Jul 2016 06:48:45 -0400 Subject: [PATCH 174/359] BLD: Use tempita for cython templating closes #13399 Author: sinhrks Closes #13716 from sinhrks/tempita and squashes the following commits: 09f63b6 [sinhrks] BLD: Use tempita for cython templating --- ci/lint.sh | 14 +- pandas/algos.pyx | 15 +- pandas/src/algos_common_helper.pxi | 2925 +++++++ pandas/src/algos_common_helper.pxi.in | 603 ++ pandas/src/algos_groupby_helper.pxi | 1369 +++ pandas/src/algos_groupby_helper.pxi.in | 713 ++ pandas/src/algos_join_helper.pxi | 1899 ++++ pandas/src/algos_join_helper.pxi.in | 407 + pandas/src/algos_take_helper.pxi | 4949 +++++++++++ pandas/src/algos_take_helper.pxi.in | 261 + pandas/src/generate_code.py | 2182 ----- pandas/src/generated.pyx | 10522 ----------------------- setup.py | 36 + 13 files changed, 13188 insertions(+), 12707 deletions(-) create mode 100644 pandas/src/algos_common_helper.pxi create mode 100644 pandas/src/algos_common_helper.pxi.in create mode 100644 pandas/src/algos_groupby_helper.pxi create mode 100644 pandas/src/algos_groupby_helper.pxi.in create mode 100644 pandas/src/algos_join_helper.pxi create mode 100644 pandas/src/algos_join_helper.pxi.in create mode 100644 pandas/src/algos_take_helper.pxi create mode 100644 pandas/src/algos_take_helper.pxi.in delete mode 100644 pandas/src/generate_code.py delete mode 100644 pandas/src/generated.pyx diff --git a/ci/lint.sh b/ci/lint.sh index 144febcfcece5..3adfa8d1e3d33 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -23,7 +23,7 @@ if [ "$LINT" ]; then for path in 'window.pyx' do echo "linting -> pandas/$path" - flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126,E128 + flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126 if [ $? -ne "0" ]; then RET=1 fi @@ -31,6 +31,18 @@ if [ "$LINT" ]; then done echo "Linting *.pyx DONE" + echo "Linting *.pxi.in" + for path in 'src' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename '*.pxi.in' --select=E501,E302,E203,E111,E114,E221,E303,E231,E126 + if [ $? -ne "0" ]; then + RET=1 + fi + + done + echo "Linting *.pxi.in DONE" + echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas if [ $? 
= "0" ]; then diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 8e659a8566adb..cccc5377d0dec 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -13,6 +13,7 @@ cdef float64_t FP_ERR = 1e-13 cimport util from libc.stdlib cimport malloc, free +from libc.string cimport memmove from numpy cimport NPY_INT8 as NPY_int8 from numpy cimport NPY_INT16 as NPY_int16 @@ -41,10 +42,14 @@ cdef extern from "src/headers/math.h": double fabs(double) nogil # this is our util.pxd -from util cimport numeric +from util cimport numeric, get_nat +cimport lib +from lib cimport is_null_datetimelike from pandas import lib +cdef int64_t iNaT = get_nat() + cdef: int TIEBREAK_AVERAGE = 0 int TIEBREAK_MIN = 1 @@ -1334,5 +1339,11 @@ cdef inline float64_t _median_linear(float64_t* a, int n): return result + include "join.pyx" -include "generated.pyx" + +# generated from template +include "algos_common_helper.pxi" +include "algos_groupby_helper.pxi" +include "algos_join_helper.pxi" +include "algos_take_helper.pxi" diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi new file mode 100644 index 0000000000000..59b3ddff46dec --- /dev/null +++ b/pandas/src/algos_common_helper.pxi @@ -0,0 +1,2925 @@ +""" +Template for each `dtype` helper function using 1-d template + +# 1-d template +- map_indices +- pad +- pad_1d +- pad_2d +- backfill +- backfill_1d +- backfill_2d +- is_monotonic +- groupby +- arrmap + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# 1-d template +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float64(ndarray[float64_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + float64_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) 
+@cython.boundscheck(False) +def groupby_float64(ndarray[float64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float64(ndarray[float64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float32(ndarray[float32_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') 
+ lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + float32_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + 
is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float32(ndarray[float32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float32(ndarray[float32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_object(ndarray[object] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + 
val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + object prev, cur + bint is_monotonic_inc 
= 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_object(ndarray[object] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_object(ndarray[object] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int32(ndarray[int32_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + int32_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
groupby_int32(ndarray[int32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int32(ndarray[int32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int64(ndarray[int64_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = 
values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + int64_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + 
elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int64(ndarray[int64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int64(ndarray[int64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_bool(ndarray[uint8_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + uint8_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != 
arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_bool(ndarray[uint8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_bool(ndarray[uint8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +#---------------------------------------------------------------------- +# put template +#---------------------------------------------------------------------- + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float64(ndarray[float64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_float64_float64(ndarray[float64_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float64_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float32(ndarray[float32_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, 
sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_float32_float32(ndarray[float32_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float32_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int8(ndarray[int8_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int8_float32(ndarray[int8_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float32_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int16(ndarray[int16_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int16_float32(ndarray[int16_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float32_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + 
for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int32(ndarray[int32_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int32_float64(ndarray[int32_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float64_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int64(ndarray[int64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float64_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + +#---------------------------------------------------------------------- +# ensure_dtype +#---------------------------------------------------------------------- + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + elif hasattr(arr, 'asobject'): + return arr.asobject + else: + return np.array(arr, dtype=np.object_) + +cpdef ensure_float64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT64: + return arr + else: + return 
arr.astype(np.float64)
+    else:
+        return np.array(arr, dtype=np.float64)
+
+cpdef ensure_float32(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_FLOAT32:
+            return arr
+        else:
+            return arr.astype(np.float32)
+    else:
+        return np.array(arr, dtype=np.float32)
+
+cpdef ensure_int8(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT8:
+            return arr
+        else:
+            return arr.astype(np.int8)
+    else:
+        return np.array(arr, dtype=np.int8)
+
+cpdef ensure_int16(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT16:
+            return arr
+        else:
+            return arr.astype(np.int16)
+    else:
+        return np.array(arr, dtype=np.int16)
+
+cpdef ensure_int32(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT32:
+            return arr
+        else:
+            return arr.astype(np.int32)
+    else:
+        return np.array(arr, dtype=np.int32)
+
+cpdef ensure_int64(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT64:
+            return arr
+        else:
+            return arr.astype(np.int64)
+    else:
+        return np.array(arr, dtype=np.int64)
diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in
new file mode 100644
index 0000000000000..2327f10389cb5
--- /dev/null
+++ b/pandas/src/algos_common_helper.pxi.in
@@ -0,0 +1,603 @@
+"""
+Template for each `dtype` helper function using 1-d template
+
+# 1-d template
+- map_indices
+- pad
+- pad_1d
+- pad_2d
+- backfill
+- backfill_1d
+- backfill_2d
+- is_monotonic
+- groupby
+- arrmap
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# 1-d template
+#----------------------------------------------------------------------
+
+{{py:
+
+# name, c_type, dtype, can_hold_na, nogil
+dtypes = [('float64', 'float64_t', 'np.float64', True, True),
+          ('float32', 'float32_t', 'np.float32', True, True),
+          ('object', 'object', 'object', True, False),
+          ('int32', 'int32_t', 'np.int32', False, True),
+          ('int64', 'int64_t', 'np.int64', False, True),
+          ('bool', 'uint8_t', 'np.bool', False, True)]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dtype, can_hold_na, nogil in dtypes:
+
+        nogil_str = 'with nogil:' if nogil else ''
+        tab = '    ' if nogil else ''
+        yield name, c_type, dtype, can_hold_na, nogil_str, tab
+}}
+
+{{for name, c_type, dtype, can_hold_na, nogil_str, tab
+      in get_dispatch(dtypes)}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_{{name}}(ndarray[{{c_type}}] index):
+    """
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+ """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef {{c_type}} cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_{{name}}(ndarray[{{c_type}}] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef {{c_type}} cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_{{name}}(ndarray[{{c_type}}] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + {{c_type}} prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + {{nogil_str}} + {{tab}}prev = arr[0] + {{tab}}for i in range(1, n): + {{tab}} cur = arr[i] + {{tab}} if timelike and cur == iNaT: + {{tab}} is_monotonic_inc = 0 + {{tab}} is_monotonic_dec = 0 + {{tab}} break + {{tab}} if cur < prev: + {{tab}} is_monotonic_inc = 0 + {{tab}} elif cur > prev: + {{tab}} is_monotonic_dec = 0 + {{tab}} elif cur == prev: + {{tab}} pass # is_unique = 0 + {{tab}} else: + {{tab}} # cur or prev is NaN + {{tab}} is_monotonic_inc = 0 + {{tab}} is_monotonic_dec = 0 + {{tab}} break + {{tab}} if not is_monotonic_inc and not 
is_monotonic_dec: + {{tab}} is_monotonic_inc = 0 + {{tab}} is_monotonic_dec = 0 + {{tab}} break + {{tab}} prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_{{name}}(ndarray[{{c_type}}] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_{{name}}(ndarray[{{c_type}}] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +{{endfor}} + +#---------------------------------------------------------------------- +# put template +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type, dest_dtype +dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32'), + ('int8', 'int8_t', 'float32_t', 'np.float32'), + ('int16', 'int16_t', 'float32_t', 'np.float32'), + ('int32', 'int32_t', 'float64_t', 'np.float64'), + ('int64', 'int64_t', 'float64_t', 'np.float64')] + +def get_dispatch(dtypes): + + for name, c_type, dest_type, dest_dtype, in dtypes: + + dest_type2 = dest_type + dest_type = dest_type.replace('_t', '') + + yield name, c_type, dest_type, dest_type2, dest_dtype + +}} + +{{for name, c_type, dest_type, dest_type2, dest_dtype + in get_dispatch(dtypes)}} + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, + ndarray[{{dest_type2}}, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[{{dest_type2}}] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + +{{endfor}} + +#---------------------------------------------------------------------- +# ensure_dtype +#---------------------------------------------------------------------- + +cdef int PLATFORM_INT = ( 
<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
+
+cpdef ensure_platform_int(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == PLATFORM_INT:
+            return arr
+        else:
+            return arr.astype(np.int_)
+    else:
+        return np.array(arr, dtype=np.int_)
+
+cpdef ensure_object(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_OBJECT:
+            return arr
+        else:
+            return arr.astype(np.object_)
+    elif hasattr(arr, 'asobject'):
+        return arr.asobject
+    else:
+        return np.array(arr, dtype=np.object_)
+
+{{py:
+
+# name, c_type, dtype
+dtypes = [('float64', 'FLOAT64', 'float64'),
+          ('float32', 'FLOAT32', 'float32'),
+          ('int8', 'INT8', 'int8'),
+          ('int16', 'INT16', 'int16'),
+          ('int32', 'INT32', 'int32'),
+          ('int64', 'INT64', 'int64'),
+          # ('platform_int', 'INT', 'int_'),
+          # ('object', 'OBJECT', 'object_'),
+]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dtype in dtypes:
+        yield name, c_type, dtype
+}}
+
+{{for name, c_type, dtype in get_dispatch(dtypes)}}
+
+cpdef ensure_{{name}}(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_{{c_type}}:
+            return arr
+        else:
+            return arr.astype(np.{{dtype}})
+    else:
+        return np.array(arr, dtype=np.{{dtype}})
+
+{{endfor}}
\ No newline at end of file
diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi
new file mode 100644
index 0000000000000..fb86c4efb7314
--- /dev/null
+++ b/pandas/src/algos_groupby_helper.pxi
@@ -0,0 +1,1369 @@
+"""
+Template for each `dtype` helper function using groupby
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+cdef extern from "numpy/npy_math.h":
+    double NAN "NPY_NAN"
+_int64_max = np.iinfo(np.int64).max
+
+#----------------------------------------------------------------------
+# group_add, group_prod, group_var, group_mean, group_ohlc
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_add_float64(ndarray[float64_t, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[float64_t, ndim=2] values,
+                      ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+
+        if K > 1:
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    # not nan
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+
+        else:
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+
+                # not nan
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_prod_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] prodx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    N, K = (<object> values).shape
+
+    with
nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def group_var_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, ct, oldmean + ndarray[float64_t, ndim=2] nobs, mean + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = ( values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) +# add passing bin edges, instead of labels + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + Py_ssize_t ngroups = len(counts) + + if len(labels) == 0: + return + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out.fill(np.nan) + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + 
out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_add_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_prod_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def group_var_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, ct, oldmean + ndarray[float32_t, ndim=2] nobs, mean + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = ( values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) +# add passing bin edges, instead of labels + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = 
len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + Py_ssize_t ngroups = len(counts) + + if len(labels) == 0: + return + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out.fill(np.nan) + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + +#---------------------------------------------------------------------- +# group_nth, group_last +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab 
< 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] resx + 
ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + +#---------------------------------------------------------------------- +# group_min, group_max +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs + + if not len(values) == 
len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-_int64_max) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != iNaT: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + 
raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(_int64_max) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != iNaT: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = minx[i, j] + +#---------------------------------------------------------------------- +# other grouping functions not needing a template +#---------------------------------------------------------------------- + + +def group_median_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = _median_linear(ptr, size) + ptr += size + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumprod_float64(float64_t[:, :] out, + float64_t[:, :] values, + int64_t[:] labels, + float64_t[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + float64_t val + int64_t lab + + N, K = ( values).shape + accum = np.ones_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumsum(numeric[:, :] out, + numeric[:, :] values, + int64_t[:] labels, + numeric[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + numeric val + int64_t lab + + N, K = ( values).shape + accum = np.zeros_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_shift_indexer(int64_t[:] out, int64_t[:] labels, + int ngroups, int periods): + cdef: + Py_ssize_t N, i, j, ii + int offset, sign + int64_t lab, idxer, idxer_slot + int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) + int64_t[:, :] label_indexer + + N, = ( labels).shape + + if periods < 0: + periods = -periods + offset = N - 1 + sign = -1 + elif periods > 0: + offset = 0 + sign = 1 + + if periods == 0: + with nogil: + for i in range(N): + out[i] = i + else: + # array of each previous indexer seen + label_indexer = np.zeros((ngroups, periods), dtype=np.int64) + with nogil: + for i in range(N): + ## reverse iterator if shifting backwards + ii = offset + sign * i + lab = labels[ii] + label_seen[lab] += 
1 + + idxer_slot = label_seen[lab] % periods + idxer = label_indexer[lab, idxer_slot] + + if label_seen[lab] > periods: + out[ii] = idxer + else: + out[ii] = -1 + + label_indexer[lab, idxer_slot] = ii diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in new file mode 100644 index 0000000000000..6b9d8f07587bc --- /dev/null +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -0,0 +1,713 @@ +""" +Template for each `dtype` helper function using groupby + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +cdef extern from "numpy/npy_math.h": + double NAN "NPY_NAN" +_int64_max = np.iinfo(np.int64).max + +#---------------------------------------------------------------------- +# group_add, group_prod, group_var, group_mean, group_ohlc +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type, dest_dtype +dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32')] + +def get_dispatch(dtypes): + + for name, c_type, dest_type, dest_dtype in dtypes: + + dest_type2 = dest_type + dest_type = dest_type.replace('_t', '') + + yield name, c_type, dest_type, dest_type2, dest_dtype +}} + +{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, 
j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, ct, oldmean + ndarray[{{dest_type2}}, ndim=2] nobs, mean + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = ( values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) +# add passing bin edges, instead of labels + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + {{dest_type2}} val, count + Py_ssize_t ngroups = len(counts) + + if len(labels) == 0: + return + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out.fill(np.nan) + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + +{{endfor}} + +#---------------------------------------------------------------------- +# group_nth, group_last +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type2, nan_val +dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), + ('float32', 'float32_t', 'float32_t', 'NAN'), + ('int64', 'int64_t', 'int64_t', 'iNaT')] + +def get_dispatch(dtypes): + 
+ for name, c_type, dest_type2, nan_val in dtypes: + + yield name, c_type, dest_type2, nan_val +}} + + +{{for name, c_type, dest_type2, nan_val in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = resx[i, j] + +{{endfor}} + +#---------------------------------------------------------------------- +# group_min, group_max +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type2, nan_val +dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'), + ('float32', 'float32_t', 'NAN', 'np.inf'), + ('int64', 'int64_t', 'iNaT', '_int64_max')] + +def get_dispatch(dtypes): + + for name, dest_type2, nan_val, inf_val in dtypes: + yield name, dest_type2, nan_val, inf_val +}} + + +{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-{{inf_val}}) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + 
nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill({{inf_val}}) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = minx[i, j] + +{{endfor}} + +#---------------------------------------------------------------------- +# other grouping functions not needing a template +#---------------------------------------------------------------------- + + +def group_median_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = _median_linear(ptr, size) + ptr += size + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumprod_float64(float64_t[:, :] out, + float64_t[:, :] values, + int64_t[:] labels, + float64_t[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + float64_t val + int64_t lab + + N, K = ( values).shape + accum = np.ones_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumsum(numeric[:, :] out, + numeric[:, :] values, + int64_t[:] labels, + numeric[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + numeric val + int64_t lab + + N, K = ( values).shape + accum = np.zeros_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue 
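As with the aggregation kernels, the cumprod/cumsum transforms above keep one running accumulator per (group, column) pair and write it back row by row. A small Python sketch of the cumulative-sum case, with the name and NaN handling simplified (mine, not the patch's):

import numpy as np

def group_cumsum_sketch(values, labels, ngroups):
    # values: (N, K) array; labels: (N,) group codes, -1 means "skip this row".
    # Shape-preserving transform: row i receives the running sum of its group.
    N, K = values.shape
    accum = np.zeros((ngroups, K))
    out = np.full((N, K), np.nan)

    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue
        for j in range(K):
            val = values[i, j]
            if val == val:                 # NaN leaves the accumulator alone
                accum[lab, j] += val
                out[i, j] = accum[lab, j]
    return out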
+ for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_shift_indexer(int64_t[:] out, int64_t[:] labels, + int ngroups, int periods): + cdef: + Py_ssize_t N, i, j, ii + int offset, sign + int64_t lab, idxer, idxer_slot + int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) + int64_t[:, :] label_indexer + + N, = ( labels).shape + + if periods < 0: + periods = -periods + offset = N - 1 + sign = -1 + elif periods > 0: + offset = 0 + sign = 1 + + if periods == 0: + with nogil: + for i in range(N): + out[i] = i + else: + # array of each previous indexer seen + label_indexer = np.zeros((ngroups, periods), dtype=np.int64) + with nogil: + for i in range(N): + ## reverse iterator if shifting backwards + ii = offset + sign * i + lab = labels[ii] + label_seen[lab] += 1 + + idxer_slot = label_seen[lab] % periods + idxer = label_indexer[lab, idxer_slot] + + if label_seen[lab] > periods: + out[ii] = idxer + else: + out[ii] = -1 + + label_indexer[lab, idxer_slot] = ii diff --git a/pandas/src/algos_join_helper.pxi b/pandas/src/algos_join_helper.pxi new file mode 100644 index 0000000000000..44b8159351492 --- /dev/null +++ b/pandas/src/algos_join_helper.pxi @@ -0,0 +1,1899 @@ +""" +Template for each `dtype` helper function for join + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# left_join_indexer, inner_join_indexer, outer_join_indexer +#---------------------------------------------------------------------- + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + 
lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + 
lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_object(ndarray[object] left, + ndarray[object] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + 
rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + 
rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + 
rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer diff --git a/pandas/src/algos_join_helper.pxi.in b/pandas/src/algos_join_helper.pxi.in new file mode 100644 index 0000000000000..5b55ec2b1bf6d --- /dev/null +++ b/pandas/src/algos_join_helper.pxi.in @@ -0,0 +1,407 @@ +""" +Template for each `dtype` helper function for join + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# left_join_indexer, inner_join_indexer, outer_join_indexer +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dtype +dtypes = [('float64', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'np.float32'), + ('object', 'object', 'object'), + ('int32', 'int32_t', 'np.int32'), + ('int64', 'int64_t', 'np.int64')] + +def get_dispatch(dtypes): + + for name, c_type, dtype in dtypes: + yield name, c_type, dtype + +}} + +{{for name, c_type, dtype in get_dispatch(dtypes)}} + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + {{c_type}} lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + {{c_type}} lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[{{c_type}}] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype={{dtype}}) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + {{c_type}} lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[{{c_type}}] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype={{dtype}}) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + {{c_type}} lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[{{c_type}}] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype={{dtype}}) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + 
lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +{{endfor}} \ No newline at end of file diff --git a/pandas/src/algos_take_helper.pxi b/pandas/src/algos_take_helper.pxi new file mode 100644 index 0000000000000..d8fb05804d4e5 --- /dev/null +++ b/pandas/src/algos_take_helper.pxi @@ -0,0 +1,4949 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# take_1d, take_2d +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_bool_bool_memview(uint8_t[:] values, + int64_t[:] indexer, + uint8_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + uint8_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values, + int64_t[:] indexer, + uint8_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + uint8_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, + int64_t[:] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + uint8_t *v + uint8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(uint8_t) and + sizeof(uint8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(uint8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
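The left/inner/outer join indexer kernels above all share the same two-pass shape: a first walk over the two sorted inputs that only counts the output length, then an identical walk that fills lindexer, rindexer and result once they can be allocated exactly. A condensed Python sketch of the inner-join case, under the simplifying assumption (mine, not the patch's) that the right side is unique:

import numpy as np

def inner_join_indexer_sketch(left, right):
    # Pass 1 counts the matches; pass 2 allocates exactly that much and fills
    # lindexer / rindexer / result, mirroring the structure of the kernels.
    left, right = np.asarray(left), np.asarray(right)

    def walk(populate=False):
        i = j = count = 0
        while i < len(left) and j < len(right):
            if left[i] == right[j]:
                if populate:
                    lindexer[count], rindexer[count] = i, j
                    result[count] = left[i]
                count += 1
                i += 1          # keep j: the next left value may match it too
            elif left[i] < right[j]:
                i += 1
            else:
                j += 1
        return count

    n = walk()                                   # first pass: size only
    lindexer = np.empty(n, dtype=np.int64)
    rindexer = np.empty(n, dtype=np.int64)
    result = np.empty(n, dtype=left.dtype)
    walk(populate=True)                          # second pass: fill
    return result, lindexer, rindexer

# inner_join_indexer_sketch([1, 2, 2, 4], [2, 3, 4])
# -> (array([2, 2, 4]), array([1, 2, 3]), array([0, 0, 2]))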
+ cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + uint8_t *v + uint8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(uint8_t) and + sizeof(uint8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(uint8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, + int64_t[:] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, + indexer, + ndarray[uint8_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + uint8_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_bool_object_memview(uint8_t[:] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = True if values[idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_bool_object(ndarray[uint8_t, ndim=1] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
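Every take_1d/take_2d kernel in this file implements the same contract: fancy indexing where an indexer value of -1 marks a missing position that receives fill_value. Ignoring the dtype templating and the fast paths, the observable behaviour is roughly the NumPy sketch below (helper name mine):

import numpy as np

def take_1d_sketch(values, indexer, fill_value=np.nan):
    # indexer == -1 marks a missing position that gets fill_value;
    # every other position is a plain values[idx] lookup.
    values = np.asarray(values, dtype=np.float64)
    indexer = np.asarray(indexer, dtype=np.int64)
    out = values.take(indexer)                    # -1 grabs the last element,
    out[indexer == -1] = fill_value               # so overwrite those slots
    return out

# take_1d_sketch([10.0, 20.0, 30.0], [2, -1, 0]) -> array([30., nan, 10.])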
+ + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = True if values[idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = True if values[idx, j] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = True if values[idx, j] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
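The *_bool_object variants just above differ from the plain bool kernels in one respect: a boolean column cannot hold NaN, so the uint8 storage is re-boxed into an object array of Python True/False, letting the fill value sit alongside real data. In plain Python terms (sketch only, name mine):

import numpy as np

def take_1d_bool_object_sketch(values, indexer, fill_value=np.nan):
    # uint8 0/1 storage comes back out as Python bools in an object array,
    # which is what allows a NaN fill_value to coexist with boolean data.
    values = np.asarray(values, dtype=np.uint8)
    out = np.empty(len(indexer), dtype=object)
    for i, idx in enumerate(indexer):
        out[i] = fill_value if idx == -1 else bool(values[idx])
    return out

# take_1d_bool_object_sketch([1, 0, 1], [0, -1, 2])
# -> array([True, nan, True], dtype=object)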
+ cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, + indexer, + ndarray[object, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + object fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = True if values[idx, idx1[j]] > 0 else False + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_int8_memview(int8_t[:] values, + int64_t[:] indexer, + int8_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int8_t fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_int8(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + int8_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int8_t fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, + int64_t[:] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int8_t *v + int8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int8_t *v + int8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, + int64_t[:] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int8_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int8_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_int32_memview(int8_t[:] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_int32(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_int64_memview(int8_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_int64(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_float64_memview(int8_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_float64(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_int16_memview(int16_t[:] values, + int64_t[:] indexer, + int16_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int16_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_int16(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + int16_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int16_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, + int64_t[:] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int16_t *v + int16_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int16_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int16_t *v + int16_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int16_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, + int64_t[:] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int16_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int16_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_int32_memview(int16_t[:] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_int32(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_int64_memview(int16_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_int64(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_float64_memview(int16_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_float64(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int32_int32_memview(int32_t[:] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int32_int32(ndarray[int32_t, ndim=1] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int32_int64_memview(int32_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int32_int64(ndarray[int32_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int32_float64_memview(int32_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int32_float64(ndarray[int32_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int64_int64_memview(int64_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int64_int64(ndarray[int64_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int64_float64_memview(int64_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int64_float64(ndarray[int64_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_float32_float32_memview(float32_t[:] values, + int64_t[:] indexer, + float32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_float32_float32(ndarray[float32_t, ndim=1] values, + int64_t[:] indexer, + float32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + float32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, + int64_t[:] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, + int64_t[:] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, + indexer, + ndarray[float32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_float32_float64_memview(float32_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_float32_float64(ndarray[float32_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_float64_float64_memview(float64_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_float64_float64(ndarray[float64_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. 
Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_object_object_memview(object[:] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_object_object(ndarray[object, ndim=1] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_object_object_memview(object[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
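The values.flags.writeable check in these wrapper functions exists because, as the comment above notes, Cython typed memoryviews cannot be acquired from read-only buffers; a minimal Python/NumPy illustration (not part of the patch)::

    import numpy as np

    arr = np.arange(4, dtype=np.float64)
    arr.setflags(write=False)     # e.g. a broadcast or memory-mapped array

    print(arr.flags.writeable)    # False, so the plain-ndarray fallback path is taken
    # In Cython, acquiring a typed memoryview of this array
    # (cdef float64_t[:] view = arr) would raise
    # "ValueError: buffer source array is read-only".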
+ cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_object_object_memview(object[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_object_object(ndarray[object, ndim=2] values, + indexer, + ndarray[object, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + object fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/src/algos_take_helper.pxi.in new file mode 100644 index 0000000000000..e9abbcd13f499 --- /dev/null +++ b/pandas/src/algos_take_helper.pxi.in @@ -0,0 +1,261 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# take_1d, take_2d +#---------------------------------------------------------------------- + +{{py: + +# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil +dtypes = [ + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), + ('bool', 'object', 'uint8_t', 'object', + 'True if ', ' > 0 else False', False, False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), + ('int8', 'float64', 'int8_t', 'float64_t', 
'', '', False, True), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), + ('object', 'object', 'object', 'object', '', '', False, False)] + + +def get_dispatch(dtypes): + + inner_take_1d_template = """ + cdef: + Py_ssize_t i, n, idx + %(c_type_out)s fv + + n = indexer.shape[0] + + fv = fill_value + + %(nogil_str)s + %(tab)sfor i from 0 <= i < n: + %(tab)s idx = indexer[i] + %(tab)s if idx == -1: + %(tab)s out[i] = fv + %(tab)s else: + %(tab)s out[i] = %(preval)svalues[idx]%(postval)s +""" + + inner_take_2d_axis0_template = """\ + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF %(can_copy)s: + cdef: + %(c_type_out)s *v + %(c_type_out)s *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(%(c_type_out)s) and + sizeof(%(c_type_out)s) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(%(c_type_out)s) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = %(preval)svalues[idx, j]%(postval)s +""" + + inner_take_2d_axis1_template = """\ + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = %(preval)svalues[i, idx]%(postval)s +""" + + for (name, dest, c_type_in, c_type_out, preval, postval, + can_copy, nogil) in dtypes: + if nogil: + nogil_str = "with nogil:" + tab = ' ' + else: + nogil_str = '' + tab = '' + + args = dict(name=name, dest=dest, c_type_in=c_type_in, + c_type_out=c_type_out, preval=preval, postval=postval, + can_copy=can_copy, nogil_str=nogil_str, tab=tab) + + inner_take_1d = inner_take_1d_template % args + inner_take_2d_axis0 = inner_take_2d_axis0_template % args + inner_take_2d_axis1 = inner_take_2d_axis1_template % args + + yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy, + inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1) + +}} + + +{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy, + inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1 + in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, + int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + +{{inner_take_1d}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, + 
int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_1d}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, + int64_t[:] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): +{{inner_take_2d_axis0}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_2d_axis0}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, + int64_t[:] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): +{{inner_take_2d_axis1}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_2d_axis1}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + indexer, + ndarray[{{c_type_out}}, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + {{c_type_out}} fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} + +{{endfor}} \ No newline at end of file diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py deleted file mode 100644 index 309a81b38f4e1..0000000000000 --- a/pandas/src/generate_code.py +++ /dev/null @@ -1,2182 +0,0 @@ -""" -This file generates `generated.pyx` which is then included in `../algos.pyx` -during building. To regenerate `generated.pyx`, just run: - - `python generate_code.py`. 
- -""" - -# flake8: noqa - -from __future__ import print_function -import os -from pandas.compat import StringIO -import numpy as np - -_int64_max = np.iinfo(np.int64).max - -warning_to_new_contributors = """ -# DO NOT EDIT THIS FILE: This file was autogenerated from generate_code.py, so -# please edit that file and then run `python2 generate_code.py` to re-generate -# this file. -""" - -header = """ -cimport numpy as np -cimport cython - -from libc.string cimport memmove - -from numpy cimport * - -from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, - PyDict_Contains, PyDict_Keys, - Py_INCREF, PyTuple_SET_ITEM, - PyTuple_SetItem, - PyTuple_New) -from cpython cimport PyFloat_Check -cimport cpython - -cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" - -import numpy as np -isnan = np.isnan - -from datetime import datetime as pydatetime - -# this is our datetime.pxd -from datetime cimport * - -from khash cimport * - -ctypedef unsigned char UChar - -cimport util -from util cimport is_array, _checknull, _checknan, get_nat -cimport lib -from lib cimport is_null_datetimelike - -cdef int64_t iNaT = get_nat() - -# import datetime C API -PyDateTime_IMPORT - -# initialize numpy -import_array() -import_ufunc() - -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num - -cpdef ensure_platform_int(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: - return arr - else: - return arr.astype(np.int_) - else: - return np.array(arr, dtype=np.int_) - -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - elif hasattr(arr,'asobject'): - return arr.asobject - else: - return np.array(arr, dtype=np.object_) -""" - - -inner_take_1d_template = """\ - cdef: - Py_ssize_t i, n, idx - %(c_type_out)s fv - - n = indexer.shape[0] - - fv = fill_value - - %(nogil)s - %(tab)sfor i from 0 <= i < n: - %(tab)s idx = indexer[i] - %(tab)s if idx == -1: - %(tab)s out[i] = fv - %(tab)s else: - %(tab)s out[i] = %(preval)svalues[idx]%(postval)s -""" - -take_1d_template = """\ -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_%(name)s_%(dest)s_memview(%(c_type_in)s[:] values, - int64_t[:] indexer, - %(c_type_out)s[:] out, - fill_value=np.nan): -""" + inner_take_1d_template + """ - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=1] values, - int64_t[:] indexer, - %(c_type_out)s[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_%(name)s_%(dest)s_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
-""" + inner_take_1d_template - -inner_take_2d_axis0_template = """\ - cdef: - Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF %(can_copy)s: - cdef: - %(c_type_out)s *v - %(c_type_out)s *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(%(c_type_out)s) and - sizeof(%(c_type_out)s) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(%(c_type_out)s) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = %(preval)svalues[idx, j]%(postval)s -""" - -take_2d_axis0_template = """\ -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values, - int64_t[:] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): -""" + inner_take_2d_axis0_template + """ - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, - ndarray[int64_t] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_%(name)s_%(dest)s_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -""" + inner_take_2d_axis0_template - - -inner_take_2d_axis1_template = """\ - cdef: - Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = %(preval)svalues[i, idx]%(postval)s -""" - -take_2d_axis1_template = """\ -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values, - int64_t[:] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): -""" + inner_take_2d_axis1_template + """ - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, - ndarray[int64_t] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_%(name)s_%(dest)s_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
-""" + inner_take_2d_axis1_template - - -take_2d_multi_template = """@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, - indexer, - ndarray[%(c_type_out)s, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - %(c_type_out)s fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s -""" - - - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - -backfill_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef %(c_type)s cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer -""" - - -pad_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef %(c_type)s cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer -""" - -pad_1d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_%(name)s(ndarray[%(c_type)s] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if 
N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -""" - -pad_2d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -""" - -backfill_2d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -""" - -backfill_1d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_%(name)s(ndarray[%(c_type)s] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -""" - - -diff_2d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_%(name)s(ndarray[%(c_type)s, ndim=2] arr, - ndarray[%(dest_type2)s, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -""" - -is_monotonic_template = '''@cython.boundscheck(False) -@cython.wraparound(False) -def 
is_monotonic_%(name)s(ndarray[%(c_type)s] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - %(c_type)s prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - %(nogil)s - %(tab)sprev = arr[0] - %(tab)sfor i in range(1, n): - %(tab)s cur = arr[i] - %(tab)s if timelike and cur == iNaT: - %(tab)s is_monotonic_inc = 0 - %(tab)s is_monotonic_dec = 0 - %(tab)s break - %(tab)s if cur < prev: - %(tab)s is_monotonic_inc = 0 - %(tab)s elif cur > prev: - %(tab)s is_monotonic_dec = 0 - %(tab)s elif cur == prev: - %(tab)s pass # is_unique = 0 - %(tab)s else: - %(tab)s # cur or prev is NaN - %(tab)s is_monotonic_inc = 0 - %(tab)s is_monotonic_dec = 0 - %(tab)s break - %(tab)s if not is_monotonic_inc and not is_monotonic_dec: - %(tab)s is_monotonic_inc = 0 - %(tab)s is_monotonic_dec = 0 - %(tab)s break - %(tab)s prev = cur - return is_monotonic_inc, is_monotonic_dec -''' - -map_indices_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_%(name)s(ndarray[%(c_type)s] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result -''' - -groupby_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result -''' - -group_last_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] -''' - -group_nth_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - 
ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] -''' - -group_add_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] -''' - -group_prod_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] -''' - -group_var_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, ct, oldmean - ndarray[%(dest_type2)s, ndim=2] nobs, mean - - if not len(values) == len(labels): - 
raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) - -''' - -# add passing bin edges, instead of labels - - -#---------------------------------------------------------------------- -# group_min, group_max - -group_max_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-%(inf_val)s) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = maxx[i, j] -''' - -group_min_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(%(inf_val)s) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = minx[i, j] -''' - - -group_mean_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t 
i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count -''' - -group_ohlc_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - %(dest_type2)s val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val -''' - -arrmap_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) -''' - -#---------------------------------------------------------------------- -# Joins on ordered, unique indices - -# right might contain non-unique values - -left_join_unique_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - %(c_type)s lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer -''' - -# @cython.wraparound(False) -# @cython.boundscheck(False) - -left_join_template = '''def left_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - """ - Two-pass algorithm for monotonic indexes. 
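The group_ohlc template above fills a 4-column output (open, high, low, close) per group in a single pass, seeding all four fields from the first non-NaN value of a group and then updating high/low/close. A simplified sketch under the same assumptions (one value column, label -1 skipped); names are illustrative::

    import math

    def group_ohlc(values, labels, ngroups):
        """Per-group open/high/low/close in one pass; NaN rows are ignored."""
        out = [[math.nan] * 4 for _ in range(ngroups)]
        for val, lab in zip(values, labels):
            if lab < 0 or val != val:
                continue
            o, h, l, c = out[lab]
            if o != o:                      # group not seen yet: seed all four
                out[lab] = [val, val, val, val]
            else:
                out[lab] = [o, max(h, val), min(l, val), val]
        return out

    # group_ohlc([2.0, 5.0, 1.0], [0, 0, 0], 1) -> [[2.0, 5.0, 1.0, 1.0]]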
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer -''' - - -inner_join_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer -''' - - -outer_join_template2 = '''@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - 
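The join indexer templates above run two passes over a pair of monotonically sorted key arrays: the first pass only counts the output length, the second allocates exact-size indexers and fills them. A simplified sketch of the inner-join case, assuming both inputs are sorted and free of duplicates (the real templates also handle many-to-one matches); names are illustrative::

    def inner_join_indexer(left, right):
        """Positions of matching keys in two sorted, duplicate-free sequences."""
        def walk(emit):
            i = j = count = 0
            while i < len(left) and j < len(right):
                if left[i] == right[j]:
                    if emit is not None:
                        emit(count, i, j)
                    count += 1
                    i += 1
                    j += 1
                elif left[i] < right[j]:
                    i += 1
                else:
                    j += 1
            return count

        n = walk(None)                       # pass 1: size the output
        lidx, ridx, keys = [0] * n, [0] * n, [0] * n

        def fill(k, i, j):
            lidx[k], ridx[k], keys[k] = i, j, left[i]

        walk(fill)                           # pass 2: populate
        return keys, lidx, ridx

    # inner_join_indexer([1, 3, 5], [3, 4, 5]) -> ([3, 5], [1, 2], [0, 2])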
count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer -''' - -outer_join_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - while True: - if i == nleft: - if j == nright: - # we are done - break - else: - while j < nright: - j += 1 - count += 1 - break - elif j == nright: - while i < nleft: - i += 1 - count += 1 - break - else: - if left[i] == right[j]: - i += 1 - j += 1 - elif left[i] < right[j]: - i += 1 - else: - j += 1 - - count += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - while True: - if i == nleft: - if j == nright: - # we are done - break - else: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - j += 1 - count += 1 - break - elif j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - else: - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - i += 1 - j += 1 - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - j += 1 - - count += 1 - - return result, lindexer, rindexer -''' - -# ensure_dtype functions - -ensure_dtype_template = """ -cpdef ensure_%(name)s(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_%(ctype)s: - return arr - else: - return arr.astype(np.%(dtype)s) - else: - return np.array(arr, dtype=np.%(dtype)s) -""" - -ensure_functions = [ - ('float64', 'FLOAT64', 'float64'), - ('float32', 'FLOAT32', 'float32'), - ('int8', 'INT8', 'int8'), - ('int16', 'INT16', 'int16'), - ('int32', 'INT32', 'int32'), - ('int64', 'INT64', 'int64'), - # ('platform_int', 'INT', 'int_'), - #('object', 'OBJECT', 'object_'), -] - -def generate_ensure_dtypes(): - output = StringIO() - for name, ctype, dtype in ensure_functions: - filled = ensure_dtype_template % locals() - output.write(filled) - return output.getvalue() - -#---------------------------------------------------------------------- -# Fast "put" logic for speeding up interleaving logic - -put2d_template = """ -def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[%(dest_type2)s] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] -""" - -#---------------------------------------------------------------------- -# other grouping functions not needing a template -grouping_no_template = '''def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - ngroups = len(counts) - N, K = ( values).shape - - indexer, _counts = 
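The ensure_* helpers generated from the template above avoid a copy when an array already has the requested dtype and otherwise cast (or build an array from a non-array input). A minimal NumPy sketch of the same idea, not the generated cpdef functions::

    import numpy as np

    def ensure_int64(arr):
        """Return `arr` unchanged if it is already an int64 ndarray, else convert."""
        if isinstance(arr, np.ndarray):
            return arr if arr.dtype == np.int64 else arr.astype(np.int64)
        return np.array(arr, dtype=np.int64)

    # ensure_int64(np.arange(3, dtype=np.int64)) is returned without copying;
    # ensure_int64([1, 2, 3]) builds a new int64 array.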
groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:,:] out, - float64_t[:,:] values, - int64_t[:] labels, - float64_t[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:,:] out, - numeric[:,:] values, - int64_t[:] labels, - numeric[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i,j] - if val == val: - accum[lab,j] += val - out[i,j] = accum[lab,j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:,:] label_indexer - - N, = ( labels).shape - - if periods < 0: - periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii -''' - - -#------------------------------------------------------------------------- -# Generators - -def generate_put_template(template, use_ints=True, use_floats=True, - use_objects=False, use_datelikes=False): - floats_list = [ - ('float64', 'float64_t', 'float64_t', 'np.float64', True), - ('float32', 'float32_t', 'float32_t', 'np.float32', True), - ] - ints_list = [ - ('int8', 'int8_t', 'float32_t', 'np.float32', True), - ('int16', 'int16_t', 'float32_t', 'np.float32', True), - ('int32', 'int32_t', 'float64_t', 'np.float64', True), - ('int64', 'int64_t', 'float64_t', 'np.float64', True), - ] - date_like_list = [ - ('int64', 'int64_t', 'float64_t', 'np.float64', True), - ] - object_list = [('object', 'object', 'object', 'np.object_', False)] - function_list = [] - if use_floats: - function_list.extend(floats_list) - if use_ints: - function_list.extend(ints_list) - if use_objects: - function_list.extend(object_list) - if use_datelikes: - function_list.extend(date_like_list) - - output = StringIO() - for name, c_type, dest_type, dest_dtype, nogil in function_list: - func = template % {'name': name, - 'c_type': c_type, - 'dest_type': 
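The group_cumsum and group_cumprod routines above keep a per-group accumulator and write the running total back row by row, again skipping NaN through the self-comparison trick. A plain-Python sketch for a single value column; in this simplification, skipped rows simply come out as NaN, and the name is illustrative::

    def group_cumsum(values, labels, ngroups):
        """Running per-group sum, aligned with the input rows."""
        accum = [0.0] * ngroups
        out = []
        for val, lab in zip(values, labels):
            if lab < 0 or val != val:
                out.append(float('nan'))
                continue
            accum[lab] += val
            out.append(accum[lab])
        return out

    # group_cumsum([1.0, 2.0, 3.0, 4.0], [0, 1, 0, 1], 2) -> [1.0, 2.0, 4.0, 6.0]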
dest_type.replace('_t', ''), - 'dest_type2': dest_type, - 'dest_dtype': dest_dtype, - 'nogil' : 'with nogil:' if nogil else '', - 'tab' : ' ' if nogil else '' } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_put_min_max_template(template, use_ints=True, use_floats=True, - use_objects=False, use_datelikes=False): - floats_list = [ - ('float64', 'float64_t', 'NAN', 'np.inf', True), - ('float32', 'float32_t', 'NAN', 'np.inf', True), - ] - ints_list = [ - ('int64', 'int64_t', 'iNaT', _int64_max, True), - ] - date_like_list = [ - ('int64', 'int64_t', 'iNaT', _int64_max, True), - ] - object_list = [('object', 'object', 'np.nan', 'np.inf', False)] - function_list = [] - if use_floats: - function_list.extend(floats_list) - if use_ints: - function_list.extend(ints_list) - if use_objects: - function_list.extend(object_list) - if use_datelikes: - function_list.extend(date_like_list) - - output = StringIO() - for name, dest_type, nan_val, inf_val, nogil in function_list: - func = template % {'name': name, - 'dest_type2': dest_type, - 'nan_val': nan_val, - 'inf_val': inf_val, - 'nogil' : "with nogil:" if nogil else '', - 'tab' : ' ' if nogil else '' } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_put_selection_template(template, use_ints=True, use_floats=True, - use_objects=False, use_datelikes=False): - floats_list = [ - ('float64', 'float64_t', 'float64_t', 'NAN', True), - ('float32', 'float32_t', 'float32_t', 'NAN', True), - ] - ints_list = [ - ('int64', 'int64_t', 'int64_t', 'iNaT', True), - ] - date_like_list = [ - ('int64', 'int64_t', 'int64_t', 'iNaT', True), - ] - object_list = [('object', 'object', 'object', 'np.nan', False)] - function_list = [] - if use_floats: - function_list.extend(floats_list) - if use_ints: - function_list.extend(ints_list) - if use_objects: - function_list.extend(object_list) - if use_datelikes: - function_list.extend(date_like_list) - - output = StringIO() - for name, c_type, dest_type, nan_val, nogil in function_list: - - if nogil: - nogil = "with nogil:" - tab = ' ' - else: - nogil = '' - tab = '' - - func = template % {'name': name, - 'c_type': c_type, - 'dest_type2': dest_type, - 'nan_val': nan_val, - 'nogil' : nogil, - 'tab' : tab } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_take_template(template, exclude=None): - # name, dest, ctypein, ctypeout, preval, postval, cancopy, nogil - function_list = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), - ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False, False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, 
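The generate_put_* / generate_take_* helpers above specialize each %-style template for a list of (name, c_type, ...) tuples and concatenate the results into the generated module. A toy illustration of that mechanism with a made-up one-line template; both the template and the names here are hypothetical, not the real ones::

    from io import StringIO

    # Hypothetical miniature template in the same %(key)s style as the real ones.
    toy_template = '''def first_%(name)s(values):
        "Return the first element of a %(c_type)s array."
        return values[0]
    '''

    def generate(template, specs):
        out = StringIO()
        for name, c_type in specs:
            out.write(template % {'name': name, 'c_type': c_type})
            out.write("\n")
        return out.getvalue()

    print(generate(toy_template, [('int64', 'int64_t'), ('float64', 'float64_t')]))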
True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), - ('object', 'object', 'object', 'object', '', '', False, False), - ] - - output = StringIO() - for (name, dest, c_type_in, c_type_out, - preval, postval, can_copy, nogil) in function_list: - - if exclude is not None and name in exclude: - continue - - if nogil: - nogil = "with nogil:" - tab = ' ' - else: - nogil = '' - tab = '' - - func = template % {'name': name, 'dest': dest, - 'c_type_in': c_type_in, 'c_type_out': c_type_out, - 'preval': preval, 'postval': postval, - 'can_copy': 'True' if can_copy else 'False', - 'nogil' : nogil, - 'tab' : tab } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_from_template(template, exclude=None): - # name, ctype, capable of holding NA - function_list = [ - ('float64', 'float64_t', 'np.float64', True, True), - ('float32', 'float32_t', 'np.float32', True, True), - ('object', 'object', 'object', True, False), - ('int32', 'int32_t', 'np.int32', False, True), - ('int64', 'int64_t', 'np.int64', False, True), - ('bool', 'uint8_t', 'np.bool', False, True) - ] - - output = StringIO() - for name, c_type, dtype, can_hold_na, nogil in function_list: - if exclude is not None and name in exclude: - continue - - func = template % {'name': name, 'c_type': c_type, - 'dtype': dtype, - 'raise_on_na': 'False' if can_hold_na else 'True', - 'nogil' : 'with nogil:' if nogil else '', - 'tab' : ' ' if nogil else '' } - output.write(func) - output.write("\n") - return output.getvalue() - -put_2d = [diff_2d_template] - -groupbys = [group_add_template, - group_prod_template, - group_var_template, - group_mean_template, - group_ohlc_template] - -groupby_selection = [group_last_template, - group_nth_template] - -groupby_min_max = [group_min_template, - group_max_template] - -templates_1d = [map_indices_template, - pad_template, - backfill_template, - pad_1d_template, - backfill_1d_template, - pad_2d_template, - backfill_2d_template, - is_monotonic_template, - groupby_template, - arrmap_template] - -nobool_1d_templates = [left_join_unique_template, - left_join_template, - outer_join_template2, - inner_join_template] - -take_templates = [take_1d_template, - take_2d_axis0_template, - take_2d_axis1_template, - take_2d_multi_template] - - -def generate_take_cython_file(): - # Put `generated.pyx` in the same directory as this file - directory = os.path.dirname(os.path.realpath(__file__)) - filename = 'generated.pyx' - path = os.path.join(directory, filename) - - with open(path, 'w') as f: - print(warning_to_new_contributors, file=f) - print(header, file=f) - - print(generate_ensure_dtypes(), file=f) - - for template in templates_1d: - print(generate_from_template(template), file=f) - - for template in take_templates: - print(generate_take_template(template), file=f) - - for template in put_2d: - print(generate_put_template(template), file=f) - - for template in groupbys: - print(generate_put_template(template, use_ints=False), file=f) - - for template in groupby_selection: - print(generate_put_selection_template(template, use_ints=True), - file=f) - - for template in groupby_min_max: - print(generate_put_min_max_template(template, use_ints=True), - file=f) - - print(grouping_no_template, file=f) - - for template in nobool_1d_templates: - print(generate_from_template(template, exclude=['bool']), file=f) - - -if __name__ == '__main__': - generate_take_cython_file() diff --git a/pandas/src/generated.pyx 
b/pandas/src/generated.pyx deleted file mode 100644 index c6dcd609a2c6e..0000000000000 --- a/pandas/src/generated.pyx +++ /dev/null @@ -1,10522 +0,0 @@ - -# DO NOT EDIT THIS FILE: This file was autogenerated from generate_code.py, so -# please edit that file and then run `python2 generate_code.py` to re-generate -# this file. - - -cimport numpy as np -cimport cython - -from libc.string cimport memmove - -from numpy cimport * - -from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, - PyDict_Contains, PyDict_Keys, - Py_INCREF, PyTuple_SET_ITEM, - PyTuple_SetItem, - PyTuple_New) -from cpython cimport PyFloat_Check -cimport cpython - -cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" - -import numpy as np -isnan = np.isnan - -from datetime import datetime as pydatetime - -# this is our datetime.pxd -from datetime cimport * - -from khash cimport * - -ctypedef unsigned char UChar - -cimport util -from util cimport is_array, _checknull, _checknan, get_nat -cimport lib -from lib cimport is_null_datetimelike - -cdef int64_t iNaT = get_nat() - -# import datetime C API -PyDateTime_IMPORT - -# initialize numpy -import_array() -import_ufunc() - -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num - -cpdef ensure_platform_int(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: - return arr - else: - return arr.astype(np.int_) - else: - return np.array(arr, dtype=np.int_) - -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - elif hasattr(arr,'asobject'): - return arr.asobject - else: - return np.array(arr, dtype=np.object_) - - -cpdef ensure_float64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT64: - return arr - else: - return arr.astype(np.float64) - else: - return np.array(arr, dtype=np.float64) - -cpdef ensure_float32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT32: - return arr - else: - return arr.astype(np.float32) - else: - return np.array(arr, dtype=np.float32) - -cpdef ensure_int8(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT8: - return arr - else: - return arr.astype(np.int8) - else: - return np.array(arr, dtype=np.int8) - -cpdef ensure_int16(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT16: - return arr - else: - return arr.astype(np.int16) - else: - return np.array(arr, dtype=np.int16) - -cpdef ensure_int32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT32: - return arr - else: - return arr.astype(np.int32) - else: - return np.array(arr, dtype=np.int32) - -cpdef ensure_int64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT64: - return arr - else: - return arr.astype(np.int64) - else: - return np.array(arr, dtype=np.int64) - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float64(ndarray[float64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float32(ndarray[float32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_object(ndarray[object] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int32(ndarray[int32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int64(ndarray[int64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_bool(ndarray[uint8_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def 
pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = 
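The pad_* routines above build a reindexing indexer between two sorted indexes: each position of ``new`` is mapped to the last position of ``old`` whose value is less than or equal to it, exact matches are always kept, and at most ``limit`` consecutive forward fills are allowed past a match; positions that cannot be filled get -1. A simplified sketch under those assumptions (sorted inputs, illustrative name)::

    def pad_indexer(old, new, limit=None):
        """Forward-fill indexer from sorted `old` onto sorted `new`."""
        lim = len(new) if limit is None else limit
        indexer = [-1] * len(new)
        i = 0
        fill_count = 0
        for j, x in enumerate(new):
            # advance to the last old value that is <= x
            while i + 1 < len(old) and old[i + 1] <= x:
                i += 1
                fill_count = 0
            if old and old[i] <= x:
                if old[i] == x:
                    indexer[j] = i
                    fill_count = 0
                elif fill_count < lim:
                    indexer[j] = i
                    fill_count += 1
        return indexer

    # pad_indexer([1, 5, 10], [0, 1, 2, 3, 5, 6, 11], limit=1)
    # -> [-1, 0, 0, -1, 1, 1, 2]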
len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise 
ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def pad_inplace_float32(ndarray[float32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float32(ndarray[float32_t] 
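The pad_inplace_* family above forward-fills a values array in place wherever a boolean mask flags a missing entry, stopping after ``limit`` consecutive fills within one run of missing values. A minimal Python sketch of that loop, with an illustrative name::

    def pad_inplace(values, mask, limit=None):
        """Forward-fill `values` in place where `mask` is True, at most
        `limit` consecutive positions per run of missing values."""
        n = len(values)
        if n == 0:
            return
        lim = n if limit is None else limit
        fill_count = 0
        val = values[0]
        for i in range(n):
            if mask[i]:
                if fill_count >= lim:
                    continue
                fill_count += 1
                values[i] = val
            else:
                fill_count = 0
                val = values[i]

    # vals = [1.0, None, None, 4.0]
    # pad_inplace(vals, [False, True, True, False], limit=1)
    # -> vals == [1.0, 1.0, None, 4.0]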
values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - 
if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - 
continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - float64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - float32_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - object prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - int32_t prev, cur - bint 
is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - int64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - uint8_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float64(ndarray[float64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_float32(ndarray[float32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_object(ndarray[object] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_int32(ndarray[int32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_int64(ndarray[int64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_bool(ndarray[uint8_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_float64(ndarray[float64_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_float32(ndarray[float32_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in
range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_object(ndarray[object] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int32(ndarray[int32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int64(ndarray[int64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_bool(ndarray[uint8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_bool_memview(uint8_t[:] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_object_memview(uint8_t[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_object(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int8_memview(int8_t[:] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int8(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int32_memview(int8_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int32(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int64_memview(int8_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_float64_memview(int8_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_float64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int16_memview(int16_t[:] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int16(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int32_memview(int16_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int32(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int64_memview(int16_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_float64_memview(int16_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_float64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int32_memview(int32_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int32(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int64_memview(int32_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_float64_memview(int32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_float64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_int64_memview(int64_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_int64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_float64_memview(int64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_float64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float32_memview(float32_t[:] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float32(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float64_memview(float32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float64(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float64_float64_memview(float64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float64_float64(ndarray[float64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_object_object_memview(object[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_object_object(ndarray[object, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to 
- # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
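[Editor's note, not part of the patch] The take_2d_axis0_* specializations removed above all implement the same contract: gather rows of `values` according to `indexer`, writing `fill_value` wherever the index is -1, with a `memmove` fast path for contiguous rows (GH3130). As a rough illustration only, here is a pure-NumPy sketch of that contract; the name `take_2d_axis0_sketch` is invented for this example and is not pandas API.

```python
import numpy as np

def take_2d_axis0_sketch(values, indexer, fill_value=np.nan):
    # The result dtype must hold both the input values and the fill value.
    dtype = np.result_type(values.dtype, fill_value)
    out = np.empty((len(indexer), values.shape[1]), dtype=dtype)
    for i, idx in enumerate(indexer):
        if idx == -1:
            out[i, :] = fill_value      # missing row -> fill value
        else:
            out[i, :] = values[idx, :]  # plain row copy (memmove fast path in the Cython version)
    return out
```

For example, `take_2d_axis0_sketch(np.eye(3), np.array([2, -1, 0]))` returns the third row, a row of NaN, then the first row.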
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
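[Editor's note, not part of the patch] Every removed pair above follows the same dispatch: the public take_2d_* function checks `values.flags.writeable` and only calls the `_memview` variant for writable buffers, because Cython typed memoryviews (at the time of this patch) could not accept read-only buffers, hence the slower ndarray fallback. A small self-contained illustration of the condition being tested — plain NumPy, nothing here is pandas API:

```python
import numpy as np

values = np.arange(6, dtype=np.float64).reshape(3, 2)

readonly = values.copy()
readonly.setflags(write=False)   # e.g. a memory-mapped or otherwise read-only buffer

for arr in (values, readonly):
    # The removed functions branch on exactly this flag to pick the fast path.
    path = "memoryview fast path" if arr.flags.writeable else "ndarray fallback"
    print(arr.flags.writeable, "->", path)
```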
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[uint8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - uint8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = True if values[idx, idx1[j]] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t 
i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int16_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int16_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - 
float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i 
from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_object_object(ndarray[object, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float64(ndarray[float64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float32(ndarray[float32_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - 
start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int8(ndarray[int8_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int16(ndarray[int16_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int32(ndarray[int32_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - 
start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int64(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - 
ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, ct, oldmean - ndarray[float64_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, ct, oldmean - ndarray[float32_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - 
- N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = 
min(out[lab, 2], val) - out[lab, 3] = val - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - 
counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - 
ndarray[float64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = minx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = minx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(9223372036854775807) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = minx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count 
- ndarray[float64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-9223372036854775807) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = maxx[i, j] - - -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - ngroups = len(counts) - N, 
K = ( values).shape - - indexer, _counts = groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:,:] out, - float64_t[:,:] values, - int64_t[:] labels, - float64_t[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:,:] out, - numeric[:,:] values, - int64_t[:] labels, - numeric[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i,j] - if val == val: - accum[lab,j] += val - out[i,j] = accum[lab,j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:,:] label_indexer - - N, = ( labels).shape - - if periods < 0: - periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while 
True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - object lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -def left_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - 
else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - # do it 
again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - 
cdef: - Py_ssize_t i, j, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - diff --git a/setup.py b/setup.py index 937b3509cf493..86777f5579a09 100755 --- a/setup.py +++ b/setup.py @@ -90,11 +90,47 @@ def is_platform_mac(): except ImportError: cython = False + +if cython: + try: + try: + from Cython import Tempita as tempita + except ImportError: + import tempita + except ImportError: + raise ImportError('Building pandas requires Tempita: ' + 'pip install Tempita') + + from os.path import join as pjoin +_pxipath = pjoin('pandas', 'src') +_pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', + 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in'] + + class build_ext(_build_ext): def build_extensions(self): + + for _pxifile in _pxifiles: + # build pxifiles first, template extention must be .pxi.in + assert _pxifile.endswith('.pxi.in') + pxifile = pjoin(_pxipath, _pxifile) + outfile = pxifile[:-3] + + if (os.path.exists(outfile) and + os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime): + # if .pxi.in is not updated, no need to output .pxi + continue + + with open(pxifile, "r") as f: + tmpl = f.read() + pyxcontent = tempita.sub(tmpl) + + with open(outfile, "w") as f: + f.write(pyxcontent) + numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') for ext in self.extensions: From 8f64ad700bb254e17fdb006b08199a2cc6ee5f36 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Tue, 26 Jul 2016 13:25:33 -0500 Subject: [PATCH 175/359] DOC: show failing string on numeric parse (#13773) --- pandas/src/inference.pyx | 4 ++-- pandas/src/parse_helper.h | 2 +- pandas/tools/tests/test_util.py | 11 +++++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index fe4748eb0eba0..039e0df4193b3 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -686,9 +686,9 @@ def maybe_convert_numeric(object[:] values, set na_values, raise ValueError('integer out of range') else: seen_float = True - except: + except (TypeError, ValueError) as e: if not coerce_numeric: - raise + raise type(e)(str(e) + ' at position {}'.format(i)) floats[i] = nan seen_float = True diff --git 
a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index fd5089dd8963d..e565f02f27c88 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -66,7 +66,7 @@ int floatify(PyObject* str, double *result, int *maybe_int) { return 0; parsingerror: - PyErr_SetString(PyExc_ValueError, "Unable to parse string"); + PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data); Py_XDECREF(tmp); return -1; diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 5b738086a1ad4..7532997ef9d8e 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -120,7 +120,8 @@ def test_series_numeric(self): def test_error(self): s = pd.Series([1, -3.14, 'apple']) - with tm.assertRaises(ValueError): + msg = 'Unable to parse string "apple" at position 2' + with tm.assertRaisesRegexp(ValueError, msg): to_numeric(s, errors='raise') res = to_numeric(s, errors='ignore') @@ -131,9 +132,15 @@ def test_error(self): expected = pd.Series([1, -3.14, np.nan]) tm.assert_series_equal(res, expected) + s = pd.Series(['orange', 1, -3.14, 'apple']) + msg = 'Unable to parse string "orange" at position 0' + with tm.assertRaisesRegexp(ValueError, msg): + to_numeric(s, errors='raise') + def test_error_seen_bool(self): s = pd.Series([True, False, 'apple']) - with tm.assertRaises(ValueError): + msg = 'Unable to parse string "apple" at position 2' + with tm.assertRaisesRegexp(ValueError, msg): to_numeric(s, errors='raise') res = to_numeric(s, errors='ignore') From d06355d86b11ea4fcd87a80c7e1c99b6248e5ec1 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 26 Jul 2016 18:44:16 -0400 Subject: [PATCH 176/359] TST: Add test for skipfooter + decimal in read_csv (#13800) --- pandas/io/tests/parser/python_parser_only.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 6f0ea75c4da93..0408401672a2f 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -185,3 +185,19 @@ def test_temporary_file(self): result = self.read_csv(new_file, sep=r"\s*", header=None) expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) + + def test_skipfooter_with_decimal(self): + # see gh-6971 + data = '1#2\n3#4' + expected = DataFrame({'a': [1.2, 3.4]}) + + result = self.read_csv(StringIO(data), names=['a'], + decimal='#') + tm.assert_frame_equal(result, expected) + + # the stray footer line should not mess with the + # casting of the first t wo lines if we skip it + data = data + '\nFooter' + result = self.read_csv(StringIO(data), names=['a'], + decimal='#', skipfooter=1) + tm.assert_frame_equal(result, expected) From a3cddfa9c4302c13747509caef7a056159010adf Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 26 Jul 2016 18:47:23 -0400 Subject: [PATCH 177/359] BUG: TypeError in merge with timedelta64 column closes #13389 Author: sinhrks Closes #13802 from sinhrks/isnull_dateunit and squashes the following commits: 8dbfde2 [sinhrks] BUG: TypeError in merge with timedelta64 column --- doc/source/whatsnew/v0.19.0.txt | 4 + pandas/tests/types/test_missing.py | 255 ++++++++++++++++++---------- pandas/tools/tests/test_merge.py | 41 +++++ pandas/tseries/tests/test_period.py | 18 ++ pandas/types/common.py | 3 +- pandas/types/missing.py | 2 +- 6 files changed, 227 insertions(+), 96 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 375bbd79fd29b..06625e09d70a1 
100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -768,6 +768,10 @@ Bug Fixes - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) +- Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) +- Bug in ``.merge`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) + + - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index b0e1eb72bd791..fa2bd535bb8d5 100644 --- a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -5,6 +5,7 @@ from datetime import datetime from pandas.util import testing as tm +import pandas as pd from pandas.core import config as cf from pandas.compat import u from pandas.tslib import iNaT @@ -45,100 +46,6 @@ def test_notnull(): assert (isinstance(isnull(s), Series)) -def test_isnull(): - assert not isnull(1.) - assert isnull(None) - assert isnull(np.NaN) - assert not isnull(np.inf) - assert not isnull(-np.inf) - - # series - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - # frame - for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), - tm.makeMixedDataFrame()]: - result = isnull(df) - expected = df.apply(isnull) - tm.assert_frame_equal(result, expected) - - # panel - for p in [tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) - ]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel_equal(result, expected) - - # panel 4d - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel4d_equal(result, expected) - - -def test_isnull_lists(): - result = isnull([[False]]) - exp = np.array([[False]]) - assert (np.array_equal(result, exp)) - - result = isnull([[1], [2]]) - exp = np.array([[False], [False]]) - assert (np.array_equal(result, exp)) - - # list of strings / unicode - result = isnull(['foo', 'bar']) - assert (not result.any()) - - result = isnull([u('foo'), u('bar')]) - assert (not result.any()) - - -def test_isnull_nat(): - result = isnull([NaT]) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - result = isnull(np.array([NaT], dtype=object)) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - -def test_isnull_numpy_nat(): - arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), - np.datetime64('NaT', 's')]) - result = isnull(arr) - expected = np.array([True] * 4) - tm.assert_numpy_array_equal(result, expected) - - -def test_isnull_datetime(): - assert (not isnull(datetime.now())) - assert notnull(datetime.now()) - - idx = date_range('1/1/1990', periods=20) - assert (notnull(idx).all()) - - idx = np.asarray(idx) - idx[0] = iNaT - idx 
= DatetimeIndex(idx) - mask = isnull(idx) - assert (mask[0]) - assert (not mask[1:].any()) - - # GH 9129 - pidx = idx.to_period(freq='M') - mask = isnull(pidx) - assert (mask[0]) - assert (not mask[1:].any()) - - mask = isnull(pidx[1:]) - assert (not mask.any()) - - class TestIsNull(tm.TestCase): def test_0d_array(self): @@ -150,6 +57,166 @@ def test_0d_array(self): self.assertFalse(isnull(np.array(0.0, dtype=object))) self.assertFalse(isnull(np.array(0, dtype=object))) + def test_isnull(self): + self.assertFalse(isnull(1.)) + self.assertTrue(isnull(None)) + self.assertTrue(isnull(np.NaN)) + self.assertTrue(float('nan')) + self.assertFalse(isnull(np.inf)) + self.assertFalse(isnull(-np.inf)) + + # series + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + self.assertIsInstance(isnull(s), Series) + + # frame + for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), + tm.makeMixedDataFrame()]: + result = isnull(df) + expected = df.apply(isnull) + tm.assert_frame_equal(result, expected) + + # panel + for p in [tm.makePanel(), tm.makePeriodPanel(), + tm.add_nans(tm.makePanel())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel_equal(result, expected) + + # panel 4d + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel4d_equal(result, expected) + + def test_isnull_lists(self): + result = isnull([[False]]) + exp = np.array([[False]]) + tm.assert_numpy_array_equal(result, exp) + + result = isnull([[1], [2]]) + exp = np.array([[False], [False]]) + tm.assert_numpy_array_equal(result, exp) + + # list of strings / unicode + result = isnull(['foo', 'bar']) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = isnull([u('foo'), u('bar')]) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + def test_isnull_nat(self): + result = isnull([NaT]) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + result = isnull(np.array([NaT], dtype=object)) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + def test_isnull_numpy_nat(self): + arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), + np.datetime64('NaT', 's')]) + result = isnull(arr) + expected = np.array([True] * 4) + tm.assert_numpy_array_equal(result, expected) + + def test_isnull_datetime(self): + self.assertFalse(isnull(datetime.now())) + self.assertTrue(notnull(datetime.now())) + + idx = date_range('1/1/1990', periods=20) + exp = np.ones(len(idx), dtype=bool) + tm.assert_numpy_array_equal(notnull(idx), exp) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isnull(idx) + self.assertTrue(mask[0]) + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + self.assert_numpy_array_equal(mask, exp) + + # GH 9129 + pidx = idx.to_period(freq='M') + mask = isnull(pidx) + self.assertTrue(mask[0]) + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + self.assert_numpy_array_equal(mask, exp) + + mask = isnull(pidx[1:]) + exp = np.zeros(len(mask), dtype=bool) + self.assert_numpy_array_equal(mask, exp) + + def test_datetime_other_units(self): + idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02']) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(idx), exp) + tm.assert_numpy_array_equal(notnull(idx), ~exp) + 
tm.assert_numpy_array_equal(isnull(idx.values), exp) + tm.assert_numpy_array_equal(notnull(idx.values), ~exp) + + for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', + 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', + 'datetime64[ns]']: + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(values), exp) + tm.assert_numpy_array_equal(notnull(values), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(values) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + s = pd.Series(values, dtype=object) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + + def test_timedelta_other_units(self): + idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days']) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(idx), exp) + tm.assert_numpy_array_equal(notnull(idx), ~exp) + tm.assert_numpy_array_equal(isnull(idx.values), exp) + tm.assert_numpy_array_equal(notnull(idx.values), ~exp) + + for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', + 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]', + 'timedelta64[ns]']: + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(values), exp) + tm.assert_numpy_array_equal(notnull(values), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(values) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + s = pd.Series(values, dtype=object) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + + def test_period(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M') + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(idx), exp) + tm.assert_numpy_array_equal(notnull(idx), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(idx) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + s = pd.Series(idx, dtype=object) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + def test_array_equivalent(): assert array_equivalent(np.array([np.nan, np.nan]), diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 396b095fabbd6..6e36100ddd0b4 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -473,6 +473,47 @@ def test_join_append_timedeltas(self): '0r': Series([td, NaT], index=list('AB'))}) assert_frame_equal(result, expected) + def test_other_datetime_unit(self): + # GH 13389 + df1 = pd.DataFrame({'entity_id': [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name='days') + + for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', + 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', + 'datetime64[ns]']: + + df2 = s.astype(dtype).to_frame('days') + # coerces to datetime64[ns], thus sholuld not be affected + self.assertEqual(df2['days'].dtype, 'datetime64[ns]') + + result = df1.merge(df2, left_on='entity_id', right_index=True) + + exp = pd.DataFrame({'entity_id': [101, 102], + 'days': np.array(['nat', 'nat'], + dtype='datetime64[ns]')}, + columns=['entity_id', 'days']) + tm.assert_frame_equal(result, exp) + + def test_other_timedelta_unit(self): + # GH 13389 + df1 = pd.DataFrame({'entity_id': [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name='days') + + for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', + 'timedelta64[s]', 
'timedelta64[ms]', 'timedelta64[us]', + 'timedelta64[ns]']: + + df2 = s.astype(dtype).to_frame('days') + self.assertEqual(df2['days'].dtype, dtype) + + result = df1.merge(df2, left_on='entity_id', right_index=True) + + exp = pd.DataFrame({'entity_id': [101, 102], + 'days': np.array(['nat', 'nat'], + dtype=dtype)}, + columns=['entity_id', 'days']) + tm.assert_frame_equal(result, exp) + def test_overlapping_columns_error_message(self): df = DataFrame({'key': [1, 2, 3], 'v1': [4, 5, 6], diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 88ab239790aa1..c3d0ee28540e1 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1663,6 +1663,24 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + def test_view(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + def test_constructor_empty(self): idx = pd.PeriodIndex([], freq='M') tm.assertIsInstance(idx, PeriodIndex) diff --git a/pandas/types/common.py b/pandas/types/common.py index 9d0ccaac843ef..bffff0357f329 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -230,7 +230,8 @@ def is_object(x): def needs_i8_conversion(arr_or_dtype): return (is_datetime_or_timedelta_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) + is_datetime64tz_dtype(arr_or_dtype) or + isinstance(arr_or_dtype, ABCPeriodIndex)) def is_numeric_dtype(arr_or_dtype): diff --git a/pandas/types/missing.py b/pandas/types/missing.py index 8b4193d02beb7..a4af127e0c381 100644 --- a/pandas/types/missing.py +++ b/pandas/types/missing.py @@ -140,7 +140,7 @@ def _isnull_ndarraylike(obj): vec = lib.isnullobj(values.ravel()) result[...] = vec.reshape(shape) - elif is_datetimelike(obj): + elif needs_i8_conversion(obj): # this is the NaT pattern result = values.view('i8') == iNaT else: From fcf2d8683f919efbcfbdcc3e4f0887250a2fadae Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Wed, 27 Jul 2016 11:37:43 +0200 Subject: [PATCH 178/359] DOC: contributing: explain how to set the asv reporting threshold (#13807) The default asv reporting threshold is 100% change (2x), which probably is too large for pandas, so instruct how to set it lower. --- doc/source/contributing.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 666111470811f..54de4d86a48d9 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -577,13 +577,14 @@ To install asv:: If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: - asv continuous upstream/master HEAD + asv continuous -f 1.1 upstream/master HEAD -You can replace ``HEAD`` with the name of the branch you are working on. +You can replace ``HEAD`` with the name of the branch you are working on, +and report benchmarks that changed by more than 10%. The command uses ``conda`` by default for creating the benchmark environments. 
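For the ``isnull``/``needs_i8_conversion`` fix above (GH 13389), the user-visible effect is roughly the following. This is an illustrative sketch based on the new tests, not an excerpt from the patch::

    import numpy as np
    import pandas as pd

    values = np.array(['2011-01-01', 'NaT', '2011-01-02'],
                      dtype='datetime64[s]')

    # previously raised TypeError because the values are not datetime64[ns];
    # with this fix the NaT is detected regardless of the unit
    pd.isnull(values)    # -> array([False,  True, False])

    # merging on a non-ns datetime64/timedelta64 column likewise no longer
    # raises TypeError (see the new tests in test_merge.py above)
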
If you want to use virtualenv instead, write:: - asv continuous -E virtualenv upstream/master HEAD + asv continuous -f 1.1 -E virtualenv upstream/master HEAD The ``-E virtualenv`` option should be added to all ``asv`` commands that run benchmarks. The default value is defined in ``asv.conf.json``. @@ -595,12 +596,12 @@ regressions. You can run specific benchmarks using the ``-b`` flag, which takes a regular expression. For example, this will only run tests from a ``pandas/asv_bench/benchmarks/groupby.py`` file:: - asv continuous upstream/master HEAD -b groupby + asv continuous -f 1.1 upstream/master HEAD -b ^groupby If you want to only run a specific group of tests from a file, you can do it using ``.`` as a separator. For example:: - asv continuous upstream/master HEAD -b groupby.groupby_agg_builtins + asv continuous -f 1.1 upstream/master HEAD -b groupby.groupby_agg_builtins will only run the ``groupby_agg_builtins`` benchmark defined in ``groupby.py``. From 2d3ede6b1e0f37ad8a1a9a54dc0b1ea526468a68 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jul 2016 12:19:31 +0200 Subject: [PATCH 179/359] ASV: fix params to be strings (for better repr) (#13805) --- asv_bench/benchmarks/inference.py | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index ee9d3104be4b1..0f9689dadcbb0 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -139,19 +139,26 @@ def time_dtype_infer_uint32(self): class to_numeric(object): + + param_names = ['dtype', 'downcast'] + params = [['string-float', 'string-int', 'string-nint', 'datetime64', + 'int-list', 'int32'], + [None, 'integer', 'signed', 'unsigned', 'float']] + N = 500000 - param_names = ['data', 'downcast'] - params = [ - [(['1'] * (N / 2)) + ([2] * (N / 2)), - (['-1'] * (N / 2)) + ([2] * (N / 2)), - np.repeat(np.array(['1970-01-01', '1970-01-02'], - dtype='datetime64[D]'), N), - (['1.1'] * (N / 2)) + ([2] * (N / 2)), - ([1] * (N / 2)) + ([2] * (N / 2)), - np.repeat(np.int32(1), N)], - [None, 'integer', 'signed', 'unsigned', 'float'], - ] - - def time_to_numeric(self, data, downcast): - pd.to_numeric(data, downcast=downcast) + data_dict = { + 'string-int': (['1'] * (N / 2)) + ([2] * (N / 2)), + 'string-nint': (['-1'] * (N / 2)) + ([2] * (N / 2)), + 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], + dtype='datetime64[D]'), N), + 'string-float': (['1.1'] * (N / 2)) + ([2] * (N / 2)), + 'int-list': ([1] * (N / 2)) + ([2] * (N / 2)), + 'int32': np.repeat(np.int32(1), N) + } + + def setup(self, dtype, downcast): + self.data = self.data_dict[dtype] + + def time_downcast(self, dtype, downcast): + pd.to_numeric(self.data, downcast=downcast) From 10da3ae14d25f28d1c6bcfe368a03f0b0b754cc5 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 27 Jul 2016 06:22:12 -0400 Subject: [PATCH 180/359] BUG: RangeIndex can be created without args closes #13793 Author: sinhrks Closes #13803 from sinhrks/range_none and squashes the following commits: aab6ae6 [sinhrks] BUG: RangeIndex can be created without args --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/range.py | 10 ++++++- pandas/tests/indexes/test_range.py | 45 +++++++++++++++++++----------- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 06625e09d70a1..1aed3595f9c59 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt 
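As background for the benchmark names used above (for example ``groupby.groupby_agg_builtins``): an asv benchmark is simply a class or function in a module under ``asv_bench/benchmarks/``. The sketch below is a hypothetical example of the pattern (file name and identifiers invented for illustration, not taken from the repository); with this layout it could be selected with something like ``-b example.example_value_counts``::

    # asv_bench/benchmarks/example.py   (hypothetical module)
    import numpy as np
    import pandas as pd

    class example_value_counts(object):
        # string params keep the parametrised benchmark names readable in
        # the asv output, as done for the to_numeric benchmarks above
        params = ['int64', 'float64']
        param_names = ['dtype']

        def setup(self, dtype):
            self.s = pd.Series(np.arange(100000).astype(dtype))

        def time_value_counts(self, dtype):
            self.s.value_counts()
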
@@ -766,6 +766,7 @@ Bug Fixes - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) +- Bug in ``RangeIndex`` can be created without no arguments rather than raises ``TypeError`` (:issue:`13793`) - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) - Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index a561cab30b472..7094f8d589036 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -70,7 +70,10 @@ def _ensure_int(value, field): return new_value - if start is None: + if start is None and stop is None and step is None: + msg = "RangeIndex(...) must be called with integers" + raise TypeError(msg) + elif start is None: start = 0 else: start = _ensure_int(start, 'start') @@ -122,8 +125,13 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result = object.__new__(cls) # handle passed None, non-integers + if start is None and stop is None: + # empty + start, stop, step = 0, 0, 1 + if start is None or not is_integer(start): try: + return RangeIndex(start, stop, step, name=name, **kwargs) except TypeError: return Index(start, stop, step, name=name, **kwargs) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 329ffa9b7cc77..8a036def0be1b 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -74,17 +74,28 @@ def test_constructor(self): self.assertEqual(index._step, 2) tm.assert_index_equal(Index(expected), index) - index = RangeIndex() - expected = np.empty(0, dtype=np.int64) - self.assertIsInstance(index, RangeIndex) - self.assertEqual(index._start, 0) - self.assertEqual(index._stop, 0) - self.assertEqual(index._step, 1) - tm.assert_index_equal(Index(expected), index) - - index = RangeIndex(name='Foo') - self.assertIsInstance(index, RangeIndex) - self.assertEqual(index.name, 'Foo') + msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers" + with tm.assertRaisesRegexp(TypeError, msg): + RangeIndex() + + for index in [RangeIndex(0), RangeIndex(start=0), RangeIndex(stop=0), + RangeIndex(0, 0)]: + expected = np.empty(0, dtype=np.int64) + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index._start, 0) + self.assertEqual(index._stop, 0) + self.assertEqual(index._step, 1) + tm.assert_index_equal(Index(expected), index) + + with tm.assertRaisesRegexp(TypeError, msg): + RangeIndex(name='Foo') + + for index in [RangeIndex(0, name='Foo'), + RangeIndex(start=0, name='Foo'), + RangeIndex(stop=0, name='Foo'), + RangeIndex(0, 0, name='Foo')]: + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index.name, 'Foo') # we don't allow on a bare Index self.assertRaises(TypeError, lambda: Index(0, 1000)) @@ -210,10 +221,10 @@ def test_numeric_compat2(self): RangeIndex(0, 1000, 1)._int64index // 2), (RangeIndex(0, 100, 1), 2.0, RangeIndex(0, 100, 1)._int64index // 2.0), - (RangeIndex(), 50, RangeIndex()), + (RangeIndex(0), 50, RangeIndex(0)), (RangeIndex(2, 4, 2), 3, RangeIndex(0, 1, 1)), (RangeIndex(-5, -10, -6), 4, RangeIndex(-2, -1, 1)), - 
(RangeIndex(-100, -200, 3), 2, RangeIndex())] + (RangeIndex(-100, -200, 3), 2, RangeIndex(0))] for idx, div, expected in cases_exact: tm.assert_index_equal(idx // div, expected, exact=True) @@ -288,7 +299,7 @@ def test_delete(self): def test_view(self): super(TestRangeIndex, self).test_view() - i = RangeIndex(name='Foo') + i = RangeIndex(0, name='Foo') i_view = i.view() self.assertEqual(i_view.name, 'Foo') @@ -612,8 +623,8 @@ def test_union(self): (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5)), (RI(0, -100, -5), RI(5, -100, -20), RI(-95, 10, 5)), (RI(0, -11, -1), RI(1, -12, -4), RI(-11, 2, 1)), - (RI(), RI(), RI()), - (RI(0, -10, -2), RI(), RI(0, -10, -2)), + (RI(0), RI(0), RI(0)), + (RI(0, -10, -2), RI(0), RI(0, -10, -2)), (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2)), (RI(0, -100, -2), RI(-100, 50, 102), RI(-100, 4, 2)), (RI(0, -100, -1), RI(0, -50, -3), RI(-99, 1, 1)), @@ -621,7 +632,7 @@ def test_union(self): (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5)), (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4])), (RI(0, 10, 1), I64([]), RI(0, 10, 1)), - (RI(), I64([1, 5, 6]), I64([1, 5, 6]))] + (RI(0), I64([1, 5, 6]), I64([1, 5, 6]))] for idx1, idx2, expected in cases: res1 = idx1.union(idx2) res2 = idx2.union(idx1) From 63285a4dbb50f139f6996c94ca6d473e7b42ae0f Mon Sep 17 00:00:00 2001 From: wcwagner Date: Wed, 27 Jul 2016 06:38:57 -0400 Subject: [PATCH 181/359] DOC: Added note to io.rst regarding reading in mixed dtypes closes #13746 Author: wcwagner Closes #13782 from wcwagner/doc/13746 and squashes the following commits: 7400607 [wcwagner] DOC: Added refs to basics.dtypes and basics.object_conversion, added whatsnew entry 8112ad5 [wcwagner] DOC: Shortened note, moved alternatives to main text ba4c2ce [wcwagner] DOC: Added short commentary on alternatives b6e2b64 [wcwagner] DOC: Swtiched Counter to value_counts, added low_memory alternative example, clarified type inference process 335d043 [wcwagner] DOC: Added note to io.rst regarding reading in mixed dtypes --- doc/source/basics.rst | 2 ++ doc/source/io.rst | 60 +++++++++++++++++++++++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 ++ 3 files changed, 64 insertions(+) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 63a7c8fded2db..1f670fb7fb593 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1751,6 +1751,8 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype` dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) dft.dtypes +.. _basics.object_conversion: + object conversion ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 86da2561a36be..e3b03b5a39b37 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -435,11 +435,71 @@ individual columns: df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) df.dtypes +Fortunately, ``pandas`` offers more than one way to ensure that your column(s) +contain only one ``dtype``. If you're unfamiliar with these concepts, you can +see :ref:`here` to learn more about dtypes, and +:ref:`here` to learn more about ``object`` conversion in +``pandas``. + + +For instance, you can use the ``converters`` argument +of :func:`~pandas.read_csv`: + +.. ipython:: python + + data = "col_1\n1\n2\n'A'\n4.22" + df = pd.read_csv(StringIO(data), converters={'col_1':str}) + df + df['col_1'].apply(type).value_counts() + +Or you can use the :func:`~pandas.to_numeric` function to coerce the +dtypes after reading in the data, + +.. 
ipython:: python + + df2 = pd.read_csv(StringIO(data)) + df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce') + df2 + df2['col_1'].apply(type).value_counts() + +which would convert all valid parsing to floats, leaving the invalid parsing +as ``NaN``. + +Ultimately, how you deal with reading in columns containing mixed dtypes +depends on your specific needs. In the case above, if you wanted to ``NaN`` out +the data anomalies, then :func:`~pandas.to_numeric` is probably your best option. +However, if you wanted for all the data to be coerced, no matter the type, then +using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be +worth trying. + .. note:: The ``dtype`` option is currently only supported by the C engine. Specifying ``dtype`` with ``engine`` other than 'c' raises a ``ValueError``. +.. note:: + In some cases, reading in abnormal data with columns containing mixed dtypes + will result in an inconsistent dataset. If you rely on pandas to infer the + dtypes of your columns, the parsing engine will go and infer the dtypes for + different chunks of the data, rather than the whole dataset at once. Consequently, + you can end up with column(s) with mixed dtypes. For example, + + .. ipython:: python + :okwarning: + + df = pd.DataFrame({'col_1':range(500000) + ['a', 'b'] + range(500000)}) + df.to_csv('foo') + mixed_df = pd.read_csv('foo') + mixed_df['col_1'].apply(type).value_counts() + mixed_df['col_1'].dtype + + will result with `mixed_df` containing an ``int`` dtype for certain chunks + of the column, and ``str`` for others due to the mixed dtypes from the + data that was read in. It is important to note that the overall column will be + marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. + + + Naming and Using Columns '''''''''''''''''''''''' diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 1aed3595f9c59..6f3e8da56ce5c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -323,6 +323,8 @@ Other enhancements index=['row1', 'row2']) df.sort_values(by='row2', axis=1) +- Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) + .. 
_whatsnew_0190.api: From 31f8e4dc8af8f0d109f366d0b726aef210bf7904 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Wed, 27 Jul 2016 06:45:01 -0400 Subject: [PATCH 182/359] BUG: parser_trim_buffers properly initializes word pointers in read_csv closes #13703 Author: Ivan Nazarov Closes #13788 from ivannz/parser_trim_fix and squashes the following commits: d59624e [Ivan Nazarov] Moved the test to 'c_parser_only' 9b521f6 [Ivan Nazarov] Improved the clarity and logic of the test 629198d [Ivan Nazarov] Referenced issue in the test, rewrote the bugfix description 834c851 [Ivan Nazarov] Improved readability of bugfix description; minor style fixes of the test e0b4c83 [Ivan Nazarov] flake8 style test correction 020d706 [Ivan Nazarov] Updated WHATSNEW with the bug fix information bdba66f [Ivan Nazarov] Rewritten the 'parser_trim_buffers' test 5ab3636 [Ivan Nazarov] Expanded the explanation of the patch a831dbb [Ivan Nazarov] Moved 'parser_trim_buffers' test to its proper place 07b4647 [Ivan Nazarov] praser_trim_fix: More stressful test 2120719 [Ivan Nazarov] A memory 'stress' test of parser.pyx to cause corruption or segfault 434f1e0 [Ivan Nazarov] FIX: 'parser_trim_buffers' properly initializes word pointers --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/tests/parser/c_parser_only.py | 70 +++++++++++++++++++++++++ pandas/src/parser/tokenizer.c | 44 +++++++++++----- 3 files changed, 101 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 6f3e8da56ce5c..685c214454719 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -675,6 +675,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index b6048051edc4d..103c9fa2b7ce8 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -381,3 +381,73 @@ def test_empty_header_read(count): for count in range(1, 101): test_empty_header_read(count) + + def test_parse_trim_buffers(self): + # This test is part of a bugfix for issue #13703. It attmepts to + # to stress the system memory allocator, to cause it to move the + # stream buffer and either let the OS reclaim the region, or let + # other memory requests of parser otherwise modify the contents + # of memory space, where it was formely located. + # This test is designed to cause a `segfault` with unpatched + # `tokenizer.c`. Sometimes the test fails on `segfault`, other + # times it fails due to memory corruption, which causes the + # loaded DataFrame to differ from the expected one. + + # Generate a large mixed-type CSV file on-the-fly (one record is + # approx 1.5KiB). 
+ record_ = \ + """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \ + """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \ + """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \ + """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \ + """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \ + """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \ + """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \ + """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \ + """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \ + """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \ + """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \ + """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \ + """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \ + """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ + """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \ + """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \ + """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \ + """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \ + """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \ + """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \ + """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \ + """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ + """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \ + """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \ + """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \ + """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \ + """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \ + """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \ + """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" + + # Set the number of lines so that a call to `parser_trim_buffers` + # is triggered: after a couple of full chunks are consumed a + # relatively small 'residual' chunk would cause reallocation + # within the parser. + chunksize, n_lines = 128, 2 * 128 + 15 + csv_data = "\n".join([record_] * n_lines) + "\n" + + # We will use StringIO to load the CSV from this text buffer. + # pd.read_csv() will iterate over the file in chunks and will + # finally read a residual chunk of really small size. + + # Generate the expected output: manually create the dataframe + # by splitting by comma and repeating the `n_lines` times. 
+ row = tuple(val_ if val_ else float("nan") + for val_ in record_.split(",")) + expected = pd.DataFrame([row for _ in range(n_lines)], + dtype=object, columns=None, index=None) + + # Iterate over the CSV file in chunks of `chunksize` lines + chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize) + result = pd.concat(chunks_, axis=0, ignore_index=True) + + # Check for data corruption if there was no segfault + tm.assert_frame_equal(result, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 6091c79e2b4fc..ac909f2c8bfdb 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1221,20 +1221,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", - new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { - TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); - newptr = safe_realloc((void*) self->stream, new_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->stream = newptr; - self->stream_cap = new_cap; - } - } + int i; /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; @@ -1255,6 +1242,35 @@ int parser_trim_buffers(parser_t *self) { } } + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + new_cap, self->stream_cap, self->lines_cap)); + if (new_cap < self->stream_cap) { + TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); + newptr = safe_realloc((void*) self->stream, new_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + // Update the pointers in the self->words array (char **) if `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar block in + // `make_stream_space`. 
+ if (self->stream != newptr) { + /* TRACE(("Moving word pointers\n")) */ + self->pword_start = newptr + self->word_start; + + for (i = 0; i < self->words_len; ++i) + { + self->words[i] = newptr + self->word_starts[i]; + } + } + + self->stream = newptr; + self->stream_cap = new_cap; + + } + } + /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { From cc216ada82a2b06a52524da7b1559a7f5e31b1c1 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Thu, 28 Jul 2016 06:31:55 +0900 Subject: [PATCH 183/359] TST: AmbiguousTimeError with set_index() (#13814) --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/tests/frame/test_alter_axes.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 685c214454719..11d2fab464d1f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -756,6 +756,8 @@ Bug Fixes - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) - Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`) +- Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`) + - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 3b50dd2c1d49f..66b14995e6d3c 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -268,6 +268,7 @@ def test_set_index_cast_datetimeindex(self): lambda d: pd.Timestamp(d, tz=tz)) assert_frame_equal(df.reset_index(), expected) + def test_set_index_timezone(self): # GH 12358 # tz-aware Series should retain the tz i = pd.to_datetime(["2014-01-01 10:10:10"], @@ -277,6 +278,25 @@ def test_set_index_cast_datetimeindex(self): self.assertEqual(pd.DatetimeIndex(pd.Series(df.i))[0].hour, 11) self.assertEqual(df.set_index(df.i).index[0].hour, 11) + def test_set_index_dst(self): + di = pd.date_range('2006-10-29 00:00:00', periods=3, + req='H', tz='US/Pacific') + + df = pd.DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, + index=di).reset_index() + # single level + res = df.set_index('index') + exp = pd.DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, + index=pd.Index(di, name='index')) + tm.assert_frame_equal(res, exp) + + # GH 12920 + res = df.set_index(['index', 'a']) + exp_index = pd.MultiIndex.from_arrays([di, [0, 1, 2]], + names=['index', 'a']) + exp = pd.DataFrame({'b': [3, 4, 5]}, index=exp_index) + tm.assert_frame_equal(res, exp) + def test_set_index_multiindexcolumns(self): columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) From 12c8ce661826c748f675badbd24098e398d193a2 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Fri, 29 Jul 2016 08:41:41 +0900 Subject: [PATCH 184/359] BLD: use tempita for hashtable (#13815) --- pandas/core/algorithms.py | 6 +- pandas/core/groupby.py | 2 +- pandas/hashtable.pyx | 952 +---------------------- pandas/src/hashtable_class_helper.pxi | 860 ++++++++++++++++++++ pandas/src/hashtable_class_helper.pxi.in | 642 +++++++++++++++ pandas/src/hashtable_func_helper.pxi | 197 +++++ 
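A user-level sketch of the scenario addressed by the ``parser_trim_buffers`` fix in PATCH 182 above. The CSV built here is only a small stand-in (the regression test uses rows of roughly 1.5KiB to actually trigger the reallocation), so this snippet shows the call pattern rather than reproducing the crash::

    import pandas as pd
    from pandas.compat import StringIO

    # stand-in data; the real test generates ~1.5KiB mixed-type records
    csv_data = "\n".join(["1,abc,4.5,,xyz"] * (2 * 128 + 15)) + "\n"

    # reading in chunks leaves a small 'residual' final chunk, which is what
    # makes the parser shrink (trim) its buffers; before the fix the word
    # pointers could be left pointing into the old, freed stream buffer,
    # causing a segfault or silently corrupted rows
    chunks = pd.read_csv(StringIO(csv_data), header=None,
                         dtype=object, chunksize=128)
    result = pd.concat(chunks, axis=0, ignore_index=True)
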
pandas/src/hashtable_func_helper.pxi.in | 114 +++ setup.py | 3 +- 8 files changed, 1831 insertions(+), 945 deletions(-) create mode 100644 pandas/src/hashtable_class_helper.pxi create mode 100644 pandas/src/hashtable_class_helper.pxi.in create mode 100644 pandas/src/hashtable_func_helper.pxi create mode 100644 pandas/src/hashtable_func_helper.pxi.in diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 52b1a3aae788c..7920f05b5e7a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -413,7 +413,7 @@ def _value_counts_arraylike(values, dropna=True): freq = values.freq values = values.view(np.int64) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_int64(values, dropna) if dropna: msk = keys != iNaT @@ -434,10 +434,10 @@ def _value_counts_arraylike(values, dropna=True): elif is_integer_dtype(dtype): values = _ensure_int64(values) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_int64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_float64(values, dropna) else: values = _ensure_object(values) mask = isnull(values) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6179857978b7b..f14f9b5dd24af 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4399,7 +4399,7 @@ def _groupby_indices(values): # bit better than factorizing again reverse = dict(enumerate(values.categories)) codes = values.codes.astype('int64') - _, counts = _hash.value_count_scalar64(codes, False) + _, counts = _hash.value_count_int64(codes, False) else: reverse, codes, counts = _algos.group_labels( _values_from_object(_ensure_object(values))) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 18e54621e8bf5..d1b6b326d7de6 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -35,773 +35,9 @@ cdef extern from "Python.h": cdef size_t _INIT_VEC_CAP = 32 -cdef class ObjectVector: - - cdef: - PyObject **data - size_t n, m - ndarray ao - - def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = self.ao.data - - def __len__(self): - return self.n - - cdef inline append(self, object o): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data - - Py_INCREF(o) - self.data[self.n] = o - self.n += 1 - - def to_array(self): - self.ao.resize(self.n) - self.m = self.n - return self.ao - -ctypedef struct Int64VectorData: - int64_t *data - size_t n, m - -ctypedef struct Float64VectorData: - float64_t *data - size_t n, m - -ctypedef fused vector_data: - Int64VectorData - Float64VectorData - -ctypedef fused sixty_four_bit_scalar: - int64_t - float64_t - -cdef bint needs_resize(vector_data *data) nogil: - return data.n == data.m - -cdef void append_data(vector_data *data, sixty_four_bit_scalar x) nogil: - - # compile time specilization of the fused types - # as the cross-product is generated, but we cannot assign float->int - # the types that don't pass are pruned - if (vector_data is Int64VectorData and sixty_four_bit_scalar is int64_t) or ( - vector_data is Float64VectorData and sixty_four_bit_scalar is float64_t): - - data.data[data.n] = x - data.n += 1 - -cdef class Int64Vector: - - cdef: - Int64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc(sizeof(Int64VectorData)) - if not 
self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.int64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, int64_t x): - - if needs_resize(self.data): - self.resize() - - append_data(self.data, x) - -cdef class Float64Vector: - - cdef: - Float64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc(sizeof(Float64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.float64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, float64_t x): - - if needs_resize(self.data): - self.resize() - - append_data(self.data, x) - -cdef class HashTable: - pass - -cdef class StringHashTable(HashTable): - cdef kh_str_t *table - - def __cinit__(self, int size_hint=1): - self.table = kh_init_str() - if size_hint is not None: - kh_resize_str(self.table, size_hint) - - def __dealloc__(self): - kh_destroy_str(self.table) - - cpdef get_item(self, object val): - cdef khiter_t k - k = kh_get_str(self.table, util.get_c_string(val)) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - for i in range(iterations): - k = kh_get_str(self.table, util.get_c_string(key)) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - buf = util.get_c_string(key) - - k = kh_put_str(self.table, buf, &ret) - self.table.keys[k] = key - if kh_exist_str(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def get_indexer(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - char *buf - int64_t *resbuf = labels.data - khiter_t k - kh_str_t *table = self.table - - for i in range(n): - buf = util.get_c_string(values[i]) - k = kh_get_str(table, buf) - if k != table.n_buckets: - resbuf[i] = table.vals[k] - else: - resbuf[i] = -1 - return labels - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - char *buf - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k == self.table.n_buckets: - kh_put_str(self.table, buf, &ret) - uniques.append(val) - - return uniques.to_array() - - def factorize(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 - int ret = 0 - object val - char *buf - khiter_t k - - for i in range(n): - val = values[i] - buf = 
util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 - - return reverse, labels - -cdef class Int64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_int64() - if size_hint is not None: - kh_resize_int64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_int64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_int64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, int64_t val): - cdef khiter_t k - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, int64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, int64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_int64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_int64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, int64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_int64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[int64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, ndarray[object] values): - reverse = {} - labels = self.get_labels(values, reverse, 0, 0) - return reverse, labels - - @cython.boundscheck(False) - def get_labels(self, int64_t[:] values, Int64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - int64_t val - khiter_t k - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - - if check_null and val == iNaT: - labels[i] = na_sentinel - continue - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, 
count = 0 - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k == self.table.n_buckets: - kh_put_int64(self.table, val, &ret) - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - - return uniques.to_array() - - -cdef class Float64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_float64() - if size_hint is not None: - kh_resize_float64(self.table, size_hint) - - def __len__(self): - return self.table.size - - cpdef get_item(self, float64_t val): - cdef khiter_t k - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - cpdef set_item(self, float64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_float64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_float64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def __dealloc__(self): - kh_destroy_float64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_float64(self.table, key) - return k != self.table.n_buckets - - def factorize(self, float64_t[:] values): - uniques = Float64Vector() - labels = self.get_labels(values, uniques, 0, -1, 1) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, float64_t[:] values, - Float64Vector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - float64_t val - khiter_t k - Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val != val: - labels[i] = na_sentinel - continue - - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def map_locations(self, ndarray[float64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - khiter_t k - - with nogil: - for i in range(n): - k = kh_put_float64(self.table, values[i], &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - int64_t[:] locs = np.empty(n, 
dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - @cython.boundscheck(False) - def unique(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - bint seen_na = 0 - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(self.table, val) - if k == self.table.n_buckets: - kh_put_float64(self.table, val, &ret) - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - - elif not seen_na: - seen_na = 1 - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, NAN) - - return uniques.to_array() - -na_sentinel = object - -cdef class PyObjectHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_pymap() - kh_resize_pymap(self.table, size_hint) - - def __dealloc__(self): - if self.table is not NULL: - self.destroy() - - def __len__(self): - return self.table.size - - def __contains__(self, object key): - cdef khiter_t k - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_get_pymap(self.table, key) - return k != self.table.n_buckets - - def destroy(self): - kh_destroy_pymap(self.table) - self.table = NULL - - cpdef get_item(self, object val): - cdef khiter_t k - if val != val or val is None: - val = na_sentinel - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key - if kh_exist_pymap(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def map_locations(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = i - - def lookup(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - bint seen_na = 0 - - for i in range(n): - val = values[i] - hash(val) - if not _checknan(val): - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(nan) - - return uniques.to_array() - - def get_labels(self, ndarray[object] values, 
ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - - labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - - if check_null and val != val or val is None: - labels[i] = na_sentinel - continue - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 - - return np.asarray(labels) +include "hashtable_class_helper.pxi" +include "hashtable_func_helper.pxi" cdef class Factorizer: cdef public PyObjectHashTable table @@ -876,94 +112,9 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels -ctypedef fused kh_scalar64: - kh_int64_t - kh_float64_t - -@cython.boundscheck(False) -cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values, - kh_scalar64 *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - sixty_four_bit_scalar val - int ret = 0 - - if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t: - with nogil: - kh_resize_float64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_float64(table, val, &ret) - table.vals[k] = 1 - elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t: - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - else: - raise ValueError("Table type must match scalar type.") - - +@cython.wraparound(False) @cython.boundscheck(False) -cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): - cdef: - Py_ssize_t i - kh_float64_t *ftable - kh_int64_t *itable - sixty_four_bit_scalar[:] result_keys - int64_t[:] result_counts - int k - - i = 0 - - if sixty_four_bit_scalar is float64_t: - ftable = kh_init_float64() - build_count_table_scalar64(values, ftable, dropna) - - result_keys = np.empty(ftable.n_occupied, dtype=np.float64) - result_counts = np.zeros(ftable.n_occupied, dtype=np.int64) - - with nogil: - for k in range(ftable.n_buckets): - if kh_exist_float64(ftable, k): - result_keys[i] = ftable.keys[k] - result_counts[i] = ftable.vals[k] - i += 1 - kh_destroy_float64(ftable) - - elif sixty_four_bit_scalar is int64_t: - itable = kh_init_int64() - build_count_table_scalar64(values, itable, dropna) - - result_keys = np.empty(itable.n_occupied, dtype=np.int64) - result_counts = np.zeros(itable.n_occupied, dtype=np.int64) - - with nogil: - for k in range(itable.n_buckets): - if kh_exist_int64(itable, k): - result_keys[i] = itable.keys[k] - result_counts[i] = itable.vals[k] - i += 1 - kh_destroy_int64(itable) - - return np.asarray(result_keys), np.asarray(result_counts) - - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): @@ -987,6 +138,8 @@ cdef build_count_table_object(ndarray[object] values, table.vals[k] = 1 +@cython.wraparound(False) +@cython.boundscheck(False) cpdef value_count_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: @@ -1010,6 +163,8 @@ cpdef value_count_object(ndarray[object] values, 
return result_keys, result_counts +@cython.wraparound(False) +@cython.boundscheck(False) def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: int count, max_count = 2 @@ -1040,6 +195,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): return modes[:j+1] +@cython.wraparound(False) @cython.boundscheck(False) def mode_int64(int64_t[:] values): cdef: @@ -1051,7 +207,7 @@ def mode_int64(int64_t[:] values): table = kh_init_int64() - build_count_table_scalar64(values, table, 0) + build_count_table_int64(values, table, 0) modes = np.empty(table.n_buckets, dtype=np.int64) @@ -1074,6 +230,8 @@ def mode_int64(int64_t[:] values): return modes[:j+1] +@cython.wraparound(False) +@cython.boundscheck(False) def duplicated_object(ndarray[object] values, object keep='first'): cdef: Py_ssize_t i, n @@ -1114,92 +272,6 @@ def duplicated_object(ndarray[object] values, object keep='first'): return result.view(np.bool_) -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_float64(ndarray[float64_t, ndim=1] values, - object keep='first'): - cdef: - int ret = 0, k - float64_t value - Py_ssize_t i, n = len(values) - kh_float64_t * table = kh_init_float64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_float64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_float64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_float64(table) - return out - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_int64(ndarray[int64_t, ndim=1] values, - object keep='first'): - cdef: - int ret = 0, k - int64_t value - Py_ssize_t i, n = len(values) - kh_int64_t * table = kh_init_int64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_int64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_int64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_int64(table) - return out - - @cython.wraparound(False) @cython.boundscheck(False) def unique_label_indices(ndarray[int64_t, ndim=1] labels): @@ -1225,7 +297,7 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): if needs_resize(ud): with gil: idx.resize() - append_data(ud, i) + append_data_int64(ud, i) kh_destroy_int64(table) diff --git a/pandas/src/hashtable_class_helper.pxi b/pandas/src/hashtable_class_helper.pxi new file mode 100644 index 0000000000000..da0c76aeca86f --- /dev/null +++ 
b/pandas/src/hashtable_class_helper.pxi @@ -0,0 +1,860 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + + +ctypedef struct Float64VectorData: + float64_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_float64(Float64VectorData *data, + float64_t x) nogil: + + data.data[data.n] = x + data.n += 1 + + +ctypedef struct Int64VectorData: + int64_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_int64(Int64VectorData *data, + int64_t x) nogil: + + data.data[data.n] = x + data.n += 1 + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +#---------------------------------------------------------------------- +# Vector +#---------------------------------------------------------------------- + +cdef class Float64Vector: + + cdef: + Float64VectorData *data + ndarray ao + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(Float64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.float64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, float64_t x): + + if needs_resize(self.data): + self.resize() + + append_data_float64(self.data, x) + +cdef class Int64Vector: + + cdef: + Int64VectorData *data + ndarray ao + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(Int64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.int64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, int64_t x): + + if needs_resize(self.data): + self.resize() + + append_data_int64(self.data, x) + + +cdef class ObjectVector: + + cdef: + PyObject **data + size_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + +#---------------------------------------------------------------------- +# HashTable +#---------------------------------------------------------------------- + + +cdef class HashTable: + pass + +cdef class Float64HashTable(HashTable): + + def 
__cinit__(self, size_hint=1): + self.table = kh_init_float64() + if size_hint is not None: + kh_resize_float64(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_float64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_float64(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, float64_t val): + cdef khiter_t k + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, float64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, float64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_float64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_float64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, float64_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_float64(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[float64_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, float64_t values): + uniques = Float64Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, float64_t[:] values, Float64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + float64_t val + khiter_t k + Float64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and val != val: + labels[i] = na_sentinel + continue + + k = kh_get_float64(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + float64_t val + khiter_t k + Float64Vector uniques = Float64Vector() + Float64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k 
= kh_get_float64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + bint seen_na = 0 + Float64Vector uniques = Float64Vector() + Float64VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(self.table, val) + if k == self.table.n_buckets: + kh_put_float64(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, NAN) + + return uniques.to_array() + +cdef class Int64HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_int64() + if size_hint is not None: + kh_resize_int64(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_int64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_int64(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, int64_t val): + cdef khiter_t k + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_int64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, int64_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_int64(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[int64_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, int64_t values): + uniques = Int64Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, int64_t[:] values, Int64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + 
int ret = 0 + int64_t val + khiter_t k + Int64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and val == iNaT: + labels[i] = na_sentinel + continue + + k = kh_get_int64(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + int64_t val + khiter_t k + Int64Vector uniques = Int64Vector() + Int64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + bint seen_na = 0 + Int64Vector uniques = Int64Vector() + Int64VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + k = kh_get_int64(self.table, val) + if k == self.table.n_buckets: + kh_put_int64(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + + return uniques.to_array() + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + 
+ for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + kh_put_str(self.table, buf, &ret) + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + return reverse, labels + + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(nan) + + return uniques.to_array() + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t 
count_prior, int64_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if check_null and val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in new file mode 100644 index 0000000000000..14e5363eee20c --- /dev/null +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -0,0 +1,642 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + +{{py: + +# name, dtype +dtypes = [('Float64', 'float64'), ('Int64', 'int64')] + +}} + +{{for name, dtype in dtypes}} + + +ctypedef struct {{name}}VectorData: + {{dtype}}_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_{{dtype}}({{name}}VectorData *data, + {{dtype}}_t x) nogil: + + data.data[data.n] = x + data.n += 1 + +{{endfor}} + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +#---------------------------------------------------------------------- +# Vector +#---------------------------------------------------------------------- + +{{py: + +# name, dtype +dtypes = [('Float64', 'float64'), ('Int64', 'int64')] + +}} + +{{for name, dtype in dtypes}} + +cdef class {{name}}Vector: + + cdef: + {{name}}VectorData *data + ndarray ao + + def __cinit__(self): + self.data = <{{name}}VectorData *>PyMem_Malloc( + sizeof({{name}}VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.data = <{{dtype}}_t*> self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = <{{dtype}}_t*> self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, {{dtype}}_t x): + + if needs_resize(self.data): + self.resize() + + append_data_{{dtype}}(self.data, x) + +{{endfor}} + + +cdef class ObjectVector: + + cdef: + PyObject **data + size_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + +#---------------------------------------------------------------------- +# 
HashTable +#---------------------------------------------------------------------- + + +cdef class HashTable: + pass + +{{py: + +# name, dtype, null_condition, float_group +dtypes = [('Float64', 'float64', 'val != val', True), + ('Int64', 'int64', 'val == iNaT', False)] + +}} + + +{{for name, dtype, null_condition, float_group in dtypes}} + +cdef class {{name}}HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_{{dtype}}() + if size_hint is not None: + kh_resize_{{dtype}}(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_{{dtype}}(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_{{dtype}}(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, {{dtype}}_t val): + cdef khiter_t k + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, {{dtype}}_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_{{dtype}}(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, {{dtype}}_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, {{dtype}}_t values): + uniques = {{name}}Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and {{null_condition}}: + labels[i] = na_sentinel + continue + + k = kh_get_{{dtype}}(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + 
@cython.boundscheck(False) + def get_labels_groupby(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + bint seen_na = 0 + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + {{if float_group}} + if val == val: + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, NAN) + {{else}} + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + {{endif}} + + return uniques.to_array() + +{{endfor}} + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + 
kh_put_str(self.table, buf, &ret) + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + return reverse, labels + + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(nan) + + return uniques.to_array() + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, 
count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if check_null and val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_func_helper.pxi b/pandas/src/hashtable_func_helper.pxi new file mode 100644 index 0000000000000..d05b81acc5dd5 --- /dev/null +++ b/pandas/src/hashtable_func_helper.pxi @@ -0,0 +1,197 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_float64(float64_t[:] values, + kh_float64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + float64_t val + int ret = 0 + + with nogil: + kh_resize_float64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_float64(float64_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_float64_t *table + float64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_float64() + build_count_table_float64(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.float64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_float64(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_float64(float64_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + float64_t value + Py_ssize_t i, n = len(values) + kh_float64_t * table = kh_init_float64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_float64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_float64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_float64(table) + return out + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_int64(int64_t[:] values, + kh_int64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + int64_t val + int ret = 0 + + with nogil: + 
kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_int64(int64_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_int64_t *table + int64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_int64() + build_count_table_int64(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.int64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_int64(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_int64(int64_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + int64_t value + Py_ssize_t i, n = len(values) + kh_int64_t * table = kh_init_int64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_int64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_int64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_int64(table) + return out diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in new file mode 100644 index 0000000000000..1840b914f3328 --- /dev/null +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -0,0 +1,114 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + +{{py: + +# name +dtypes = ['float64', 'int64'] + +}} + +{{for dtype in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, + kh_{{dtype}}_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + {{dtype}}_t val + int ret = 0 + + with nogil: + kh_resize_{{dtype}}(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_{{dtype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{dtype}}(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_{{dtype}}_t *table + {{dtype}}_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_{{dtype}}() + build_count_table_{{dtype}}(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.{{dtype}}) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in 
range(table.n_buckets): + if kh_exist_{{dtype}}(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_{{dtype}}(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_{{dtype}}({{dtype}}_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + {{dtype}}_t value + Py_ssize_t i, n = len(values) + kh_{{dtype}}_t * table = kh_init_{{dtype}}() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_{{dtype}}(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_{{dtype}}(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_{{dtype}}(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_{{dtype}}(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{dtype}}(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_{{dtype}}(table) + return out + +{{endfor}} diff --git a/setup.py b/setup.py index 86777f5579a09..e81cae633427d 100755 --- a/setup.py +++ b/setup.py @@ -107,7 +107,8 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in'] + 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in', + 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'] class build_ext(_build_ext): From 28b4b01a1eb48286812141f78d91151e71354d49 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 28 Jul 2016 19:43:39 -0400 Subject: [PATCH 185/359] BUG, COMPAT: Fix read_csv for multi-char sep and non-utf8 data in Python 2.x (#13812) Closes gh-3404. 
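As a rough usage sketch (not part of the patch itself; the sample payload is invented, and it mirrors the regression test added below), the case this fixes is reading data whose separator is longer than one character and whose bytes are not UTF-8:

    # Hypothetical example: a multi-character sep forces the Python engine,
    # and the raw line must be decoded with the caller's encoding before the
    # regex split is applied.
    from io import BytesIO
    import pandas as pd

    payload = '1::2'.encode('utf-16')   # non-UTF-8 bytes, '::'-separated
    df = pd.read_csv(BytesIO(payload), sep='::', names=['a', 'b'],
                     encoding='utf-16', engine='python')
    # df is expected to hold a single row with a=1 and b=2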
[ci skip] --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/parsers.py | 4 ++++ pandas/io/tests/parser/python_parser_only.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 11d2fab464d1f..59a1a5f063f3a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -788,3 +788,4 @@ Bug Fixes - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) +- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bedf21318aa83..090d826a5c085 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1871,6 +1871,10 @@ class MyDialect(csv.Dialect): else: def _read(): line = f.readline() + + if compat.PY2 and self.encoding: + line = line.decode(self.encoding) + pat = re.compile(sep) yield pat.split(line.strip()) for line in f: diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 0408401672a2f..ad81dbb9f6e0f 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -201,3 +201,19 @@ def test_skipfooter_with_decimal(self): result = self.read_csv(StringIO(data), names=['a'], decimal='#', skipfooter=1) tm.assert_frame_equal(result, expected) + + def test_encoding_non_utf8_multichar_sep(self): + # see gh-3404 + expected = DataFrame({'a': [1], 'b': [2]}) + + for sep in ['::', '#####', '!!!', '123', '#1!c5', + '%!c!d', '@@#4:2', '_!pd#_']: + data = '1' + sep + '2' + + for encoding in ['utf-16', 'utf-16-be', 'utf-16-le', + 'utf-32', 'cp037']: + encoded_data = data.encode(encoding) + result = self.read_csv(BytesIO(encoded_data), + sep=sep, names=['a', 'b'], + encoding=encoding) + tm.assert_frame_equal(result, expected) From dcb7bf722f619ec39a8766fc570ba82c4197fe14 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 28 Jul 2016 19:59:19 -0400 Subject: [PATCH 186/359] ENH: add Index.dropna closes #6194 Author: sinhrks Closes #13791 from sinhrks/index_dropna and squashes the following commits: 1672f26 [sinhrks] ENH: add Index.dropna --- doc/source/api.rst | 2 ++ doc/source/whatsnew/v0.19.0.txt | 57 +++++++++++++++++++++--------- pandas/indexes/base.py | 23 ++++++++++++ pandas/indexes/multi.py | 13 +++++++ pandas/tests/indexes/test_base.py | 41 +++++++++++++++++++++ pandas/tests/indexes/test_multi.py | 21 +++++++++++ 6 files changed, 141 insertions(+), 16 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 7b9fbb9b41a79..a510f663d19ee 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1349,6 +1349,8 @@ Modifying and Computations Index.unique Index.nunique Index.value_counts + Index.fillna + Index.dropna Conversion ~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 59a1a5f063f3a..496d3f91c27e1 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -259,6 +259,45 @@ Using the anchoring suffix, you can also specify the day of month to use instead pd.date_range('2015-01-01', freq='SM-14', periods=4) +.. 
_whatsnew_0190.enhancements.index: + +New Index methods +^^^^^^^^^^^^^^^^^ + +Following methods and options are added to ``Index`` to be more consistent with ``Series`` and ``DataFrame``. + +- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) + + .. ipython:: python + + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) + + +- ``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`) + + .. ipython:: python + + idx = pd.Index([1, 2, np.nan, 4]) + idx.dropna() + +For ``MultiIndex``, values are dropped if any level is missing by default. Specifying +``how='all'`` only drops values where all levels are missing. + + midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4], + [1, 2, np.nan, np.nan]]) + midx + midx.dropna() + midx.dropna(how='all') + +- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here ` (:issue:`10008`, :issue:`13156`) + + .. ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P\d)") + .. _whatsnew_0190.enhancements.other: Other enhancements @@ -273,14 +312,8 @@ Other enhancements pd.to_numeric(s, downcast='unsigned') pd.to_numeric(s, downcast='integer') -- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here ` (:issue:`10008`, :issue:`13156`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) - .. ipython:: python - - idx = pd.Index(["a1a2", "b1", "c1"]) - idx.str.extractall("[ab](?P\d)") - - ``Timestamp`` can now accept positional and keyword parameters similar to :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) .. ipython:: python @@ -295,14 +328,6 @@ Other enhancements - The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) -- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - - .. ipython:: python - - idx = pd.Index(['a', 'b', 'c']) - idx.where([True, False, True]) - - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) @@ -310,7 +335,7 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) -- ``DataFrame.to_sql `` now allows a single value as the SQL type for all columns (:issue:`11886`). +- ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). 
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) @@ -350,7 +375,7 @@ API changes - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) -- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) +- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) .. _whatsnew_0190.api.tolist: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b5ce456bda254..32bcb0bcc732f 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -3243,6 +3243,29 @@ def fillna(self, value=None, downcast=None): return Index(result, name=self.name) return self._shallow_copy() + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + valid : Index + """ + + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) + + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + def _evaluate_with_timedelta_like(self, other, op, opstr): raise TypeError("can only perform ops with timedelta like values") diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 184744915bd8d..95ef18d23a037 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -597,6 +597,19 @@ def fillna(self, value=None, downcast=None): # isnull is not implemented for MultiIndex raise NotImplementedError('isnull is not defined for MultiIndex') + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + nans = [label == -1 for label in self.labels] + if how == 'any': + indexer = np.any(nans, axis=0) + elif how == 'all': + indexer = np.all(nans, axis=0) + else: + raise ValueError("invalid how option: {0}".format(how)) + + new_labels = [label[~indexer] for label in self.labels] + return self.copy(labels=new_labels, deep=True) + def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0ddc71b01c22a..88e49c4b55c8a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1837,6 +1837,47 @@ def test_logical_compat(self): self.assertEqual(idx.all(), idx.values.all()) self.assertEqual(idx.any(), idx.values.any()) + def test_dropna(self): + # GH 6194 + for dtype in [None, object, 'category']: + idx = pd.Index([1, 2, 3], dtype=dtype) + 
tm.assert_index_equal(idx.dropna(), idx) + + idx = pd.Index([1., 2., 3.], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.Index([1., 2., np.nan, 3.], dtype=dtype) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.Index(['A', 'B', 'C'], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.Index(['A', np.nan, 'B', 'C'], dtype=dtype) + tm.assert_index_equal(nanidx.dropna(), idx) + + tm.assert_index_equal(nanidx.dropna(how='any'), idx) + tm.assert_index_equal(nanidx.dropna(how='all'), idx) + + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03']) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03', pd.NaT]) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days']) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', + '3 days', pd.NaT]) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M') + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'], + freq='M') + tm.assert_index_equal(nanidx.dropna(), idx) + + msg = "invalid how option: xxx" + with tm.assertRaisesRegexp(ValueError, msg): + pd.Index([1, 2, 3]).dropna(how='xxx') + def test_get_combined_index(): from pandas.core.index import _get_combined_index diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 408f81fe1e982..809e1ab05ef6e 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2258,3 +2258,24 @@ def test_rangeindex_fallback_coercion_bug(self): result = df.index.get_level_values('buzz') expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz') tm.assert_index_equal(result, expected) + + def test_dropna(self): + # GH 6194 + idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5], + [1, 2, np.nan, np.nan, 5], + ['a', 'b', 'c', np.nan, 'e']]) + + exp = pd.MultiIndex.from_arrays([[1, 5], + [1, 5], + ['a', 'e']]) + tm.assert_index_equal(idx.dropna(), exp) + tm.assert_index_equal(idx.dropna(how='any'), exp) + + exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5], + [1, 2, np.nan, 5], + ['a', 'b', 'c', 'e']]) + tm.assert_index_equal(idx.dropna(how='all'), exp) + + msg = "invalid how option: xxx" + with tm.assertRaisesRegexp(ValueError, msg): + idx.dropna(how='xxx') From 5b0d947e8079a438bdac9490efa97227a9145f9f Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 28 Jul 2016 20:10:21 -0400 Subject: [PATCH 187/359] BUG: provide chunks with progressively numbered (default) indices closes #12185 Notice the test I fix was indeed wrong - I had written that line as a workaround waiting for this fix. 
Author: Pietro Battiston Closes #12289 from toobaz/csvstate and squashes the following commits: 381e3b3 [Pietro Battiston] BUG: provide chunks with progressively numbered (default) indices --- doc/source/whatsnew/v0.19.0.txt | 34 ++++++++++++++++++++++++++ pandas/io/parsers.py | 15 +++++++++++- pandas/io/tests/parser/common.py | 12 +++++++++ pandas/io/tests/parser/test_network.py | 4 --- pandas/io/tests/test_common.py | 1 - 5 files changed, 60 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 496d3f91c27e1..2c0c75e988a4d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -626,6 +626,40 @@ New Behavior: idx1.difference(idx2) idx1.symmetric_difference(idx2) +.. _whatsnew_0190.api.autogenerated_chunksize_index: + +:func:`read_csv` called with ``chunksize`` will progressively enumerate chunks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When :func:`read_csv` is called with ``chunksize='n'`` and without specifying an index, +each chunk used to have an independently generated index from `0`` to ``n-1``. +They are now given instead a progressive index, starting from ``0`` for the first chunk, +from ``n`` for the second, and so on, so that, when concatenated, they are identical to +the result of calling :func:`read_csv` without the ``chunksize=`` argument. +(:issue:`12185`) + +.. ipython :: python + + data = 'A,B\n0,1\n2,3\n4,5\n6,7' + +Previous behaviour: + +.. code-block:: ipython + + In [2]: pd.concat(pd.read_csv(StringIO(data), chunksize=2)) + Out[2]: + A B + 0 0 1 + 1 2 3 + 0 4 5 + 1 6 7 + +New behaviour: + +.. ipython :: python + + pd.concat(pd.read_csv(StringIO(data), chunksize=2)) + .. _whatsnew_0190.deprecations: Deprecations diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 090d826a5c085..353bddbed3566 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -16,7 +16,7 @@ is_list_like, is_integer_dtype, is_float, is_scalar) -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.frame import DataFrame from pandas.core.common import AbstractMethodError from pandas.core.config import get_option @@ -700,6 +700,7 @@ def __init__(self, f, engine=None, **kwds): # miscellanea self.engine = engine self._engine = None + self._currow = 0 options = self._get_options_with_defaults(engine) @@ -913,8 +914,20 @@ def read(self, nrows=None): # May alter columns / col_dict index, columns, col_dict = self._create_index(ret) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(compat.next(compat.itervalues(col_dict))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) + df = DataFrame(col_dict, columns=columns, index=index) + self._currow += new_rows + if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() return df diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 11eed79e03267..f3adb0e39982c 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -461,6 +461,18 @@ def test_get_chunk_passed_chunksize(self): piece = result.get_chunk() self.assertEqual(len(piece), 2) + def test_read_chunksize_generated_index(self): + # GH 12185 + reader = self.read_csv(StringIO(self.data1), chunksize=2) + df = self.read_csv(StringIO(self.data1)) + + tm.assert_frame_equal(pd.concat(reader), df) + + reader = 
self.read_csv(StringIO(self.data1), chunksize=2, index_col=0) + df = self.read_csv(StringIO(self.data1), index_col=0) + + tm.assert_frame_equal(pd.concat(reader), df) + def test_read_text_list(self): data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index d5370db4b55db..8b8a6de36fc03 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -122,8 +122,6 @@ def test_parse_public_s3_bucket_chunked(self): self.assertFalse(df.empty) true_df = local_tips.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] - # Chunking doesn't preserve row numbering - true_df = true_df.reset_index().drop('index', axis=1) tm.assert_frame_equal(true_df, df) @tm.network @@ -143,8 +141,6 @@ def test_parse_public_s3_bucket_chunked_python(self): self.assertFalse(df.empty) true_df = local_tips.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] - # Chunking doesn't preserve row numbering - true_df = true_df.reset_index().drop('index', axis=1) tm.assert_frame_equal(true_df, df) @tm.network diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 0acf3244fe8fa..a443df5dac586 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -86,7 +86,6 @@ def test_iterator(self): it = read_csv(StringIO(self.data1), chunksize=1) first = next(it) tm.assert_frame_equal(first, expected.iloc[[0]]) - expected.index = [0 for i in range(len(expected))] tm.assert_frame_equal(concat(it), expected.iloc[1:]) From e9087339cd6ee67418a54e11f2a88df4e8c0f690 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Mon, 25 Jul 2016 16:13:16 -0700 Subject: [PATCH 188/359] BUG: fix categories in HDFStore not filtering correctly closes #13322 closes #13792 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/computation/pytables.py | 5 +++++ pandas/io/tests/test_pytables.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2c0c75e988a4d..392d58b3ef98a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -816,6 +816,7 @@ Bug Fixes - Clean some compile time warnings in datetime parsing (:issue:`13607`) - Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`) - Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`) +- Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index e375716b0d606..a4dd03a0fa7ee 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -198,6 +198,11 @@ def stringify(value): elif meta == u('category'): metadata = com._values_from_object(self.metadata) result = metadata.searchsorted(v, side='left') + + # result returns 0 if v is first element or if v is not in metadata + # check that metadata contains v + if not result and v not in metadata: + result = -1 return TermValue(result, result, u('integer')) elif kind == u('integer'): v = int(float(v)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 
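The membership guard added to ``pytables.py`` above is needed because ``searchsorted`` returns ``0`` both for a genuine match on the first category and for a query value that sorts before every category, so a non-existent value could silently select the first category. A quick illustration using the values from the new test::

    import numpy as np

    cats = np.array(['ESP_012345_6789', 'ESP_987654_3210'])
    cats.searchsorted('ESP_012345_6789', side='left')   # 0 -> a real category
    cats.searchsorted('B', side='left')                  # also 0, but 'B' is not a category
    'B' in cats                                          # False, so the term is mapped to -1 instead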
f95e764ad4da3..e214ea5237f30 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4733,6 +4733,36 @@ def test_categorical(self): self.assertRaises( KeyError, lambda: store.select('df3/meta/s/meta')) + def test_categorical_conversion(self): + + # GH13322 + # Check that read_hdf with categorical columns doesn't return rows if + # where criteria isn't met. + obsids = ['ESP_012345_6789', 'ESP_987654_3210'] + imgids = ['APF00006np', 'APF0001imm'] + data = [4.3, 9.8] + + # Test without categories + df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data)) + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table', data_columns=True) + result = read_hdf(path, 'df', where='obsids=B') + tm.assert_frame_equal(result, expected) + + # Test with categories + df.obsids = df.obsids.astype('category') + df.imgids = df.imgids.astype('category') + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table', data_columns=True) + result = read_hdf(path, 'df', where='obsids=B') + tm.assert_frame_equal(result, expected) + def test_duplicate_column_name(self): df = DataFrame(columns=["a", "a"], data=[[0, 0]]) From aa88215a3f6390096a022bdc9401e7d215677006 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 28 Jul 2016 20:21:55 -0400 Subject: [PATCH 189/359] API: Deprecate skip_footer in read_csv Title is self-explanatory. Closes gh-13349 and partially undoes this commit back in `v0.9.0`. With such a massive API now, having duplicate arguments makes managing it way less practical. Author: gfyoung Closes #13386 from gfyoung/deprecate-dup-skipfooter and squashes the following commits: d21345f [gfyoung] API: Deprecate skip_footer in read_csv --- doc/source/io.rst | 4 +- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/excel.py | 2 +- pandas/io/parsers.py | 43 +++++++++++--------- pandas/io/tests/parser/common.py | 12 +++--- pandas/io/tests/parser/python_parser_only.py | 4 +- pandas/io/tests/parser/test_unsupported.py | 13 ++++-- pandas/parser.pyx | 16 ++++---- 8 files changed, 54 insertions(+), 41 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index e3b03b5a39b37..ee5734aaf9494 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -175,6 +175,8 @@ skiprows : list-like or integer, default ``None`` of the file. skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). +skip_footer : int, default ``0`` + DEPRECATED: use the ``skipfooter`` parameter instead, as they are identical nrows : int, default ``None`` Number of rows of file to read. Useful for reading pieces of large files. low_memory : boolean, default ``True`` @@ -1411,7 +1413,7 @@ back to python if C-unsupported options are specified. Currently, C-unsupported options include: - ``sep`` other than a single character (e.g. 
regex separators) -- ``skip_footer`` +- ``skipfooter`` - ``sep=None`` with ``delim_whitespace=False`` Specifying any of the above options will produce a ``ParserWarning`` unless the diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 392d58b3ef98a..03f8dbc20b52e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -671,6 +671,7 @@ Deprecations - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) +- ``skip_footer`` has been deprecated in ``pd.read_csv()`` in favor of ``skipfooter`` and will be removed in a future version (:issue:`13349`) - top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 703cdbeaa7a8f..b415661c99438 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -473,7 +473,7 @@ def _parse_cell(cell_contents, cell_typ): parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, - skip_footer=skip_footer, + skipfooter=skip_footer, squeeze=squeeze, **kwds) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 353bddbed3566..abbe7bdf18461 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -125,6 +125,8 @@ at the start of the file skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c') +skip_footer : int, default 0 + DEPRECATED: use the `skipfooter` parameter instead, as they are identical nrows : int, default None Number of rows of file to read. Useful for reading pieces of large files na_values : str or list-like or dict, default None @@ -341,9 +343,6 @@ def _validate_nrows(nrows): def _read(filepath_or_buffer, kwds): "Generic reader of line files." encoding = kwds.get('encoding', None) - skipfooter = kwds.pop('skipfooter', None) - if skipfooter is not None: - kwds['skip_footer'] = skipfooter # If the input could be a filename, check for a recognizable compression # extension. 
If we're reading from a URL, the `get_filepath_or_buffer` @@ -411,8 +410,8 @@ def _read(filepath_or_buffer, kwds): 'na_values': None, 'true_values': None, 'false_values': None, - 'skip_footer': 0, 'converters': None, + 'skipfooter': 0, 'keep_default_na': True, 'thousands': None, @@ -461,7 +460,7 @@ def _read(filepath_or_buffer, kwds): 'widths': None, } -_c_unsupported = set(['skip_footer']) +_c_unsupported = set(['skipfooter']) _python_unsupported = set([ 'low_memory', 'buffer_lines', @@ -503,7 +502,6 @@ def parser_f(filepath_or_buffer, false_values=None, skipinitialspace=False, skiprows=None, - skipfooter=None, nrows=None, # NA and Missing Data Handling @@ -541,8 +539,8 @@ def parser_f(filepath_or_buffer, error_bad_lines=True, warn_bad_lines=True, - # Deprecated - skip_footer=0, + skipfooter=0, + skip_footer=0, # deprecated # Internal doublequote=True, @@ -570,6 +568,13 @@ def parser_f(filepath_or_buffer, engine = 'c' engine_specified = False + if skip_footer != 0: + warnings.warn("The 'skip_footer' argument has " + "been deprecated and will be removed " + "in a future version. Please use the " + "'skipfooter' argument instead.", + FutureWarning, stacklevel=2) + kwds = dict(delimiter=delimiter, engine=engine, dialect=dialect, @@ -768,9 +773,9 @@ def _clean_options(self, options, engine): # C engine not supported yet if engine == 'c': - if options['skip_footer'] > 0: + if options['skipfooter'] > 0: fallback_reason = "the 'c' engine does not support"\ - " skip_footer" + " skipfooter" engine = 'python' if sep is None and not delim_whitespace: @@ -903,8 +908,8 @@ def _failover_to_python(self): def read(self, nrows=None): if nrows is not None: - if self.options.get('skip_footer'): - raise ValueError('skip_footer not supported for iteration') + if self.options.get('skipfooter'): + raise ValueError('skipfooter not supported for iteration') ret = self._engine.read(nrows) @@ -1591,7 +1596,7 @@ def TextParser(*args, **kwds): date_parser : function, default None skiprows : list of integers Row numbers to skip - skip_footer : int + skipfooter : int Number of line at bottom of file to skip converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -1704,7 +1709,7 @@ def __init__(self, f, **kwds): self.memory_map = kwds['memory_map'] self.skiprows = kwds['skiprows'] - self.skip_footer = kwds['skip_footer'] + self.skipfooter = kwds['skipfooter'] self.delimiter = kwds['delimiter'] self.quotechar = kwds['quotechar'] @@ -2340,7 +2345,7 @@ def _rows_to_cols(self, content): content, min_width=col_len).T) zip_len = len(zipped_content) - if self.skip_footer < 0: + if self.skipfooter < 0: raise ValueError('skip footer cannot be negative') # Loop through rows to verify lengths are correct. 
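The net effect of the renames running through these hunks is that both spellings still parse a footer for now, but only ``skipfooter`` survives. A usage sketch (Python engine assumed, since the C engine rejects the option outright)::

    import pandas as pd
    from pandas.compat import StringIO

    data = 'a,b\n1,2\n3,4\nignore this trailing line'
    pd.read_csv(StringIO(data), engine='python', skipfooter=1)    # preferred spelling
    pd.read_csv(StringIO(data), engine='python', skip_footer=1)   # still works, emits a FutureWarning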
@@ -2353,8 +2358,8 @@ def _rows_to_cols(self, content): break footers = 0 - if self.skip_footer: - footers = self.skip_footer + if self.skipfooter: + footers = self.skipfooter row_num = self.pos - (len(content) - i + footers) @@ -2440,8 +2445,8 @@ def _get_lines(self, rows=None): else: lines = new_rows - if self.skip_footer: - lines = lines[:-self.skip_footer] + if self.skipfooter: + lines = lines[:-self.skipfooter] lines = self._check_comments(lines) if self.skip_blank_lines: diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f3adb0e39982c..7e9513c0bcff3 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -218,9 +218,9 @@ def test_malformed(self): skiprows=[2]) it.read() - # skip_footer is not supported with the C parser yet + # skipfooter is not supported with the C parser yet if self.engine == 'python': - # skip_footer + # skipfooter data = """ignore A,B,C 1,2,3 # comment @@ -232,7 +232,7 @@ def test_malformed(self): with tm.assertRaisesRegexp(Exception, msg): self.read_table(StringIO(data), sep=',', header=1, comment='#', - skip_footer=1) + skipfooter=1) def test_quoting(self): bad_line_small = """printer\tresult\tvariant_name @@ -536,11 +536,11 @@ def test_iterator(self): self.assertEqual(len(result), 3) tm.assert_frame_equal(pd.concat(result), expected) - # skip_footer is not supported with the C parser yet + # skipfooter is not supported with the C parser yet if self.engine == 'python': - # test bad parameter (skip_footer) + # test bad parameter (skipfooter) reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True, skip_footer=True) + iterator=True, skipfooter=True) self.assertRaises(ValueError, reader.read, 3) def test_pass_names_with_index(self): diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index ad81dbb9f6e0f..619b6b63568f3 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -98,7 +98,7 @@ def test_single_line(self): finally: sys.stdout = sys.__stdout__ - def test_skip_footer(self): + def test_skipfooter(self): # see gh-6607 data = """A,B,C 1,2,3 @@ -107,7 +107,7 @@ def test_skip_footer(self): want to skip this also also skip this """ - result = self.read_csv(StringIO(data), skip_footer=2) + result = self.read_csv(StringIO(data), skipfooter=2) no_footer = '\n'.join(data.split('\n')[:-3]) expected = self.read_csv(StringIO(no_footer)) tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index c8ad46af10795..ef8f7967193ff 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -52,7 +52,7 @@ def test_c_engine(self): with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), sep='\s', dtype={'a': float}) with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), skip_footer=1, dtype={'a': float}) + read_table(StringIO(data), skipfooter=1, dtype={'a': float}) # specify C engine with unsupported options (raise) with tm.assertRaisesRegexp(ValueError, msg): @@ -61,7 +61,7 @@ def test_c_engine(self): with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', sep='\s') with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), engine='c', skip_footer=1) + read_table(StringIO(data), engine='c', skipfooter=1) # specify C-unsupported options without python-unsupported options with 
tm.assert_produces_warning(parsers.ParserWarning): @@ -69,7 +69,7 @@ def test_c_engine(self): with tm.assert_produces_warning(parsers.ParserWarning): read_table(StringIO(data), sep='\s') with tm.assert_produces_warning(parsers.ParserWarning): - read_table(StringIO(data), skip_footer=1) + read_table(StringIO(data), skipfooter=1) text = """ A B C D E one two three four @@ -127,6 +127,7 @@ def test_deprecated_args(self): 'as_recarray': True, 'buffer_lines': True, 'compact_ints': True, + 'skip_footer': True, 'use_unsigned': True, } @@ -134,8 +135,12 @@ def test_deprecated_args(self): for engine in engines: for arg, non_default_val in deprecated.items(): + if engine == 'c' and arg == 'skip_footer': + # unsupported --> exception is raised + continue + if engine == 'python' and arg == 'buffer_lines': - # unsupported --> exception is raised first + # unsupported --> exception is raised continue with tm.assert_produces_warning( diff --git a/pandas/parser.pyx b/pandas/parser.pyx index b5d1c8b7acf2c..e72e2f90a5213 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -165,7 +165,7 @@ cdef extern from "parser/tokenizer.h": void *skipset int64_t skip_first_N_rows - int skip_footer + int skipfooter double (*converter)(const char *, char **, char, char, char, int) nogil # error handling @@ -270,7 +270,7 @@ cdef class TextReader: kh_str_t *true_set cdef public: - int leading_cols, table_width, skip_footer, buffer_lines + int leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -338,7 +338,7 @@ cdef class TextReader: low_memory=False, buffer_lines=None, skiprows=None, - skip_footer=0, + skipfooter=0, verbose=False, mangle_dupe_cols=True, tupleize_cols=False, @@ -418,7 +418,7 @@ cdef class TextReader: if skiprows is not None: self._make_skiprow_set() - self.skip_footer = skip_footer + self.skipfooter = skipfooter # suboptimal if usecols is not None: @@ -426,7 +426,7 @@ cdef class TextReader: self.usecols = set(usecols) # XXX - if skip_footer > 0: + if skipfooter > 0: self.parser.error_bad_lines = 0 self.parser.warn_bad_lines = 0 @@ -912,8 +912,8 @@ cdef class TextReader: if buffered_lines < irows: self._tokenize_rows(irows - buffered_lines) - if self.skip_footer > 0: - raise ValueError('skip_footer can only be used to read ' + if self.skipfooter > 0: + raise ValueError('skipfooter can only be used to read ' 'the whole file') else: with nogil: @@ -926,7 +926,7 @@ cdef class TextReader: if status < 0: raise_parser_error('Error tokenizing data', self.parser) - footer = self.skip_footer + footer = self.skipfooter if self.parser_start == self.parser.lines: raise StopIteration From 2c55f28fa7e6468d347894f26d6ecddcfada9cd9 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Fri, 29 Jul 2016 09:28:21 +0900 Subject: [PATCH 190/359] DOC: Fix groupby nth (#13810) --- pandas/core/groupby.py | 55 ++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f14f9b5dd24af..c2ab406e1da65 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1205,32 +1205,55 @@ def nth(self, n, dropna=None): Examples -------- - >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 
'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) >>> g = df.groupby('A') >>> g.nth(0) - A B - 0 1 NaN - 2 5 6 + B + A + 1 NaN + 2 3.0 >>> g.nth(1) - A B - 1 1 4 + B + A + 1 2.0 + 2 5.0 >>> g.nth(-1) - A B - 1 1 4 - 2 5 6 + B + A + 1 4.0 + 2 5.0 + >>> g.nth([0, 1]) + B + A + 1 NaN + 1 2.0 + 2 3.0 + 2 5.0 + + Specifying ``dropna`` allows count ignoring NaN + >>> g.nth(0, dropna='any') - B - A - 1 4 - 5 6 + B + A + 1 2.0 + 2 3.0 NaNs denote group exhausted when using dropna - >>> g.nth(1, dropna='any') + >>> g.nth(3, dropna='any') B - A + A 1 NaN - 5 NaN + 2 NaN + + Specifying ``as_index=False`` in ``groupby`` keeps the original index. + + >>> df.groupby('A', as_index=False).nth(1) + A B + 1 1 2.0 + 4 2 5.0 """ if isinstance(n, int): From 748787dc079d7f8588bc3064c6b94cc49b6e5036 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 29 Jul 2016 03:56:20 -0400 Subject: [PATCH 191/359] CLN: Removed the kwds param in to_csv (#13804) Closes gh-8206. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 4 +--- pandas/core/series.py | 6 +++--- pandas/tests/formats/test_format.py | 27 --------------------------- pandas/tests/frame/test_to_csv.py | 2 +- 5 files changed, 6 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 03f8dbc20b52e..96f25144225ed 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -693,6 +693,7 @@ Removal of prior version deprecations/changes - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) - ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) +- ``Series.to_csv`` has dropped the ``nanRep`` parameter in favor of ``na_rep`` (:issue:`13804`) - ``Series.xs``, ``DataFrame.xs``, ``Panel.xs``, ``Panel.major_xs``, and ``Panel.minor_xs`` have dropped the ``copy`` parameter (:issue:`13781`) - ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) - Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4ffd9c5466b6c..0bf59403075af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1305,7 +1305,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, - escapechar=None, decimal='.', **kwds): + escapechar=None, decimal='.'): r"""Write DataFrame to a comma-separated values (csv) file Parameters @@ -1332,8 +1332,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, sequence should be given if the DataFrame uses MultiIndex. If False do not print fields for index names. 
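Because ``**kwds`` disappears from the ``to_csv`` signature here, keyword arguments that used to be silently swallowed now fail loudly. A sketch of the consequence (the ``engine`` keyword below is exactly the kind of stray argument the removed tests relied on)::

    import pandas as pd

    df = pd.DataFrame({'col': [1, 2]})
    df.to_csv('out.csv')                    # unchanged
    df.to_csv('out.csv', engine='python')   # now raises TypeError: unexpected keyword argument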
Use index_label=False for easier importing in R - nanRep : None - deprecated, use na_rep mode : str Python write mode, default 'w' encoding : string, optional diff --git a/pandas/core/series.py b/pandas/core/series.py index e1cff96b9741e..e388683012a66 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2534,8 +2534,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, return result def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, - header=False, index_label=None, mode='w', nanRep=None, - encoding=None, date_format=None, decimal='.'): + header=False, index_label=None, mode='w', encoding=None, + date_format=None, decimal='.'): """ Write Series to a comma-separated values (csv) file @@ -2572,7 +2572,7 @@ def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, # result is only a string if no path provided, otherwise None result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep, float_format=float_format, header=header, - index_label=index_label, mode=mode, nanRep=nanRep, + index_label=index_label, mode=mode, encoding=encoding, date_format=date_format, decimal=decimal) if path is None: diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 7a282e7eb14ad..1580a33fb9456 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3139,10 +3139,6 @@ def test_to_csv_quotechar(self): df.to_csv(path, quoting=1) # 1=QUOTE_ALL with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) expected = """\ $$,$col$ @@ -3154,17 +3150,10 @@ def test_to_csv_quotechar(self): df.to_csv(path, quoting=1, quotechar="$") with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, quotechar="$", engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) with tm.ensure_clean('test.csv') as path: with tm.assertRaisesRegexp(TypeError, 'quotechar'): df.to_csv(path, quoting=1, quotechar=None) - with tm.ensure_clean('test.csv') as path: - with tm.assertRaisesRegexp(TypeError, 'quotechar'): - df.to_csv(path, quoting=1, quotechar=None, engine='python') def test_to_csv_doublequote(self): df = DataFrame({'col': ['a"a', '"bb"']}) @@ -3178,18 +3167,11 @@ def test_to_csv_doublequote(self): df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, doublequote=True, engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) from _csv import Error with tm.ensure_clean('test.csv') as path: with tm.assertRaisesRegexp(Error, 'escapechar'): df.to_csv(path, doublequote=False) # no escapechar set - with tm.ensure_clean('test.csv') as path: - with tm.assertRaisesRegexp(Error, 'escapechar'): - df.to_csv(path, doublequote=False, engine='python') def test_to_csv_escapechar(self): df = DataFrame({'col': ['a"a', '"bb"']}) @@ -3203,11 +3185,6 @@ def test_to_csv_escapechar(self): df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, doublequote=False, escapechar='\\', - engine='python') - with open(path, 'r') as f: - 
self.assertEqual(f.read(), expected) df = DataFrame({'col': ['a,a', ',bb,']}) expected = """\ @@ -3220,10 +3197,6 @@ def test_to_csv_escapechar(self): df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=3, escapechar='\\', engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) def test_csv_to_string(self): df = DataFrame({'col': [1, 2]}) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 55c7ebb183ce5..43c8d6f25ab01 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -896,7 +896,7 @@ def test_to_csv_path_is_none(self): # GH 8215 # Make sure we return string for consistency with # Series.to_csv() - csv_str = self.frame.to_csv(path=None) + csv_str = self.frame.to_csv(path_or_buf=None) self.assertIsInstance(csv_str, str) recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) From 54b2777089df8d723fabfb28d8a1759a388b95a3 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Fri, 29 Jul 2016 06:10:22 -0400 Subject: [PATCH 192/359] BUG: group_shift_indexer checks for null group keys closes #13813 Author: Ivan Nazarov Closes #13819 from ivannz/issue13813fix and squashes the following commits: bddf799 [Ivan Nazarov] Switched from float('nan') to np.nan eab8038 [Ivan Nazarov] Added bugfix description [ci skip] d92cf3c [Ivan Nazarov] minor flake8 style corrections 94bae0b [Ivan Nazarov] Patched the template, and added a test for '.shift()' fe2f0ec [Ivan Nazarov] Treat incomplete group keys as distinct when shifting 966d5c6 [Ivan Nazarov] BUG: group_shift_indexer checks for null group keys --- doc/source/whatsnew/v0.19.0.txt | 38 ++++++++++++++------------ pandas/src/algos_groupby_helper.pxi | 6 ++++ pandas/src/algos_groupby_helper.pxi.in | 6 ++++ pandas/tests/test_groupby.py | 21 ++++++++++++++ 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 96f25144225ed..62091d7ff03ff 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -266,37 +266,40 @@ New Index methods Following methods and options are added to ``Index`` to be more consistent with ``Series`` and ``DataFrame``. -- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) +``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - .. ipython:: python +.. ipython:: python - idx = pd.Index(['a', 'b', 'c']) - idx.where([True, False, True]) + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) -- ``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`) +``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`) - .. ipython:: python +.. ipython:: python - idx = pd.Index([1, 2, np.nan, 4]) - idx.dropna() + idx = pd.Index([1, 2, np.nan, 4]) + idx.dropna() For ``MultiIndex``, values are dropped if any level is missing by default. Specifying ``how='all'`` only drops values where all levels are missing. - midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4], +.. 
ipython:: python + + midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4], [1, 2, np.nan, np.nan]]) - midx - midx.dropna() - midx.dropna(how='all') + midx + midx.dropna() + midx.dropna(how='all') -- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here ` (:issue:`10008`, :issue:`13156`) +``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here ` (:issue:`10008`, :issue:`13156`) - .. ipython:: python +.. ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P\d)") - idx = pd.Index(["a1a2", "b1", "c1"]) - idx.str.extractall("[ab](?P\d)") +``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) .. _whatsnew_0190.enhancements.other: @@ -736,6 +739,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi index fb86c4efb7314..013a03f719bbd 100644 --- a/pandas/src/algos_groupby_helper.pxi +++ b/pandas/src/algos_groupby_helper.pxi @@ -1356,6 +1356,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, ## reverse iterator if shifting backwards ii = offset + sign * i lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + label_seen[lab] += 1 idxer_slot = label_seen[lab] % periods diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 6b9d8f07587bc..5c704436ce3a0 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -700,6 +700,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, ## reverse iterator if shifting backwards ii = offset + sign * i lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + label_seen[lab] += 1 idxer_slot = label_seen[lab] % periods diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 3f5b4152afe31..268dcfc5744c1 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6560,6 +6560,27 @@ def test_grouping_string_repr(self): expected = "Grouping(('A', 'a'))" tm.assert_equal(result, expected) + def test_group_shift_with_null_key(self): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places, where the group-by key is partilly missing. 
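In user-facing terms, the ``lab == -1`` branch added above means rows whose group key is missing simply get ``NaN`` back from ``shift`` instead of corrupting the indexer. A minimal sketch of the intended behaviour (values are arbitrary)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, np.nan, np.nan, 2],
                       'B': [10, 20, 30, 40, 50]})
    df.groupby('A').shift(1)
    # B is NaN wherever A is NaN, and shifted within each real group otherwise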
+ df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + g = df.groupby(["A", "B"]) + + expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 + else np.nan) + for i in range(n_rows)], dtype=float, + columns=["Z"], index=None) + result = g.shift(-1) + + assert_frame_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From 59f25575f42e19f066dc3cd2245983071e2d7813 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 29 Jul 2016 06:23:43 -0400 Subject: [PATCH 193/359] ENH: union_categorical supports identical categories with ordered xref #13410, #13524 Author: sinhrks Closes #13763 from sinhrks/union_categoricals_ordered and squashes the following commits: 9cadc4e [sinhrks] ENH: union_categorical supports identical categories with ordered --- doc/source/categorical.rst | 7 ++-- pandas/tools/tests/test_concat.py | 61 +++++++++++++++++++++++++++---- pandas/types/concat.py | 23 ++++++++++-- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index f0e01ddc3fc2d..da9c707e07552 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -669,9 +669,10 @@ will be the union of the categories being combined. .. note:: - `union_categoricals` only works with unordered categoricals - and will raise if any are ordered. - + In addition to the "easy" case of combining two categoricals of the same + categories and order information (e.g. what you could also ``append`` for), + ``union_categoricals`` only works with unordered categoricals and will + raise if any are ordered. Getting Data In/Out ------------------- diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 2a8b0a47c283a..dd5b4936c70bb 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -872,23 +872,26 @@ def test_union_categorical(self): # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) - result = union_categoricals([s, s2]).categories - expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) - tm.assert_index_equal(result, expected) + result = union_categoricals([s, s2]) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) - # can't be ordered s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) - with tm.assertRaises(TypeError): - union_categoricals([s, s2]) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) + tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) - with tm.assertRaises(TypeError): + msg = 'dtype of categories must be the same' + with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([s, s2]) - with tm.assertRaises(ValueError): + msg = 'No Categoricals to union' + with tm.assertRaisesRegexp(ValueError, msg): union_categoricals([]) def test_union_categoricals_nan(self): @@ -944,6 +947,48 @@ def test_union_categoricals_empty(self): pd.Categorical([])]) tm.assert_categorical_equal(res, nanc) + def test_union_categorical_same_category(self): + # check fastpath + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, 4, 3, 2, 
1, np.nan], + categories=[1, 2, 3, 4]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) + c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) + res = union_categoricals([c1, c2]) + exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], + categories=['x', 'y', 'z']) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_ordered(self): + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + res = union_categoricals([c1, c1]) + exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index c8af0ec62db86..e860ba3e201e9 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -231,8 +231,9 @@ def union_categoricals(to_union): Raises ------ TypeError - If any of the categoricals are ordered or all do not - have the same dtype + - all inputs do not have the same dtype + - all inputs do not have the same ordered property + - all inputs are ordered and their categories are not identical ValueError Emmpty list of categoricals passed """ @@ -242,13 +243,27 @@ def union_categoricals(to_union): raise ValueError('No Categoricals to union') first = to_union[0] - if any(c.ordered for c in to_union): - raise TypeError("Can only combine unordered Categoricals") if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") + if all(first.is_dtype_equal(other) for other in to_union[1:]): + return Categorical(np.concatenate([c.codes for c in to_union]), + categories=first.categories, ordered=first.ordered, + fastpath=True) + elif all(not c.ordered for c in to_union): + # not ordered + pass + else: + # to show a proper error message + if all(c.ordered for c in to_union): + msg = ("to union ordered Categoricals, " + "all categories must be the same") + raise TypeError(msg) + else: + raise TypeError('Categorical.ordered must be the same') + cats = first.categories unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats) From 28ec4cc08620c7d19818ebf7b7154c9a2bf99e42 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Tue, 2 Aug 2016 04:20:30 +0900 Subject: [PATCH 194/359] TST: use types.api in plotting (#13867) --- pandas/tests/plotting/common.py | 6 +++--- pandas/tests/plotting/test_frame.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index d80eb891c5bd6..faf16430fc94f 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -8,7 +8,7 @@ from pandas import DataFrame from pandas.compat import zip, iteritems, OrderedDict from 
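Stepping back from the diff, the ``union_categoricals`` rules after this patch are: identical categories (ordered or not) take a fast path, unordered inputs get their categories unioned, and ordered inputs with differing categories raise. A short sketch (import path as used in this series)::

    import pandas as pd
    from pandas.types.concat import union_categoricals

    a = pd.Categorical([1, 2, 3], ordered=True)
    b = pd.Categorical([3, 2], categories=[1, 2, 3], ordered=True)
    union_categoricals([a, b])      # ok: same categories in the same order

    c = pd.Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
    union_categoricals([a, c])      # TypeError: all categories must be the same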
pandas.util.decorators import cache_readonly -import pandas.core.common as com +from pandas.types.api import is_list_like import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_is_valid_plot_return_object) @@ -157,7 +157,7 @@ def _check_visible(self, collections, visible=True): """ from matplotlib.collections import Collection if not isinstance(collections, - Collection) and not com.is_list_like(collections): + Collection) and not is_list_like(collections): collections = [collections] for patch in collections: @@ -242,7 +242,7 @@ def _check_text_labels(self, texts, expected): expected : str or list-like which has the same length as texts expected text label, or its list """ - if not com.is_list_like(texts): + if not is_list_like(texts): self.assertEqual(texts.get_text(), expected) else: labels = [t.get_text() for t in texts] diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 311da4a92e45a..11180c3e9b4f7 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1152,7 +1152,8 @@ def test_boxplot(self): # different warning on py3 if not PY3: - axes = _check_plot_works(df.plot.box, subplots=True, logy=True) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.box, subplots=True, logy=True) self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) self._check_ax_scales(axes, yaxis='log') From 78eadb7f61fa35c35fc26da3d7ba0e19421ecbb4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 1 Aug 2016 15:21:39 -0400 Subject: [PATCH 195/359] CLN: Removed / filled stub read_csv tests (#13864) --- pandas/io/tests/parser/common.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 7e9513c0bcff3..f8fc6c2bf78c3 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -149,11 +149,6 @@ def test_squeeze_no_view(self): result = self.read_csv(StringIO(data), index_col='time', squeeze=True) self.assertFalse(result._is_view) - def test_multiple_skts_example(self): - # TODO: Complete this - data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." 
# noqa - pass - def test_malformed(self): # see gh-6607 @@ -290,8 +285,11 @@ def test_csv_mixed_type(self): b,3,4 c,4,5 """ - # TODO: complete this - df = self.read_csv(StringIO(data)) # noqa + expected = DataFrame({'A': ['a', 'b', 'c'], + 'B': [1, 3, 4], + 'C': [2, 4, 5]}) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) From 51e6adb0e34f344ab8014c56577ad4fa16c828ef Mon Sep 17 00:00:00 2001 From: iamsimha Date: Tue, 2 Aug 2016 00:52:42 +0530 Subject: [PATCH 196/359] BUG: disallow 'w' mode in pd.read_hdf (#13623) (#13858) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/pytables.py | 4 ++++ pandas/io/tests/test_pytables.py | 11 ++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 62091d7ff03ff..1d69579df21df 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -379,6 +379,7 @@ API changes - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) +- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) .. _whatsnew_0190.api.tolist: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7503b21160250..b2da4218db99b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -303,6 +303,10 @@ def read_hdf(path_or_buf, key=None, **kwargs): """ + if kwargs.get('mode', 'a') not in ['r', 'r+', 'a']: + raise ValueError('mode {0} is not allowed while performing a read. ' + 'Allowed modes are r, r+ and a.' 
+ .format(kwargs.get('mode'))) # grab the scope if 'where' in kwargs: kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e214ea5237f30..e9ba80c3a026a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -531,16 +531,25 @@ def f(): # conv read if mode in ['w']: - self.assertRaises(KeyError, read_hdf, + self.assertRaises(ValueError, read_hdf, path, 'df', mode=mode) else: result = read_hdf(path, 'df', mode=mode) assert_frame_equal(result, df) + def check_default_mode(): + + # read_hdf uses default mode + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='w') + result = read_hdf(path, 'df') + assert_frame_equal(result, df) + check('r') check('r+') check('a') check('w') + check_default_mode() def test_reopen_handle(self): From 49243d6efcaf4c267b0b1bbab73d9f202f00557c Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Sun, 31 Jul 2016 20:10:15 +0300 Subject: [PATCH 197/359] BLD: Fixed pointer ambiguity in 'tokenizer.c' closes #13865 closes #13863 --- pandas/src/parser/tokenizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index ac909f2c8bfdb..cc89fc51792dd 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1257,11 +1257,11 @@ int parser_trim_buffers(parser_t *self) { // `make_stream_space`. if (self->stream != newptr) { /* TRACE(("Moving word pointers\n")) */ - self->pword_start = newptr + self->word_start; + self->pword_start = (char*) newptr + self->word_start; for (i = 0; i < self->words_len; ++i) { - self->words[i] = newptr + self->word_starts[i]; + self->words[i] = (char*) newptr + self->word_starts[i]; } } From 9b2797d2aa7f0e810281daf8b0f4caab7438e6db Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Mon, 1 Aug 2016 16:56:54 -0400 Subject: [PATCH 198/359] BUG: Fix edge cases in merge_asof() by comparing factorized keys (#13709) (#13836) Also removes unnecessary check_duplicates. 
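The new check turns a confusing failure into an explicit one: previously ``mode='w'`` fell through to ``HDFStore`` and only surfaced later as a ``KeyError``; now it is rejected up front. Expected behaviour in a nutshell (file name is illustrative)::

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2]})
    df.to_hdf('store.h5', 'df', mode='w')

    pd.read_hdf('store.h5', 'df')             # default mode 'r' (and 'r+', 'a') is fine
    pd.read_hdf('store.h5', 'df', mode='w')   # ValueError: mode w is not allowed while performing a read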
Added asv benchmarks for merge_asof() --- asv_bench/benchmarks/join_merge.py | 37 ++++++ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/src/join.pyx | 170 ++++++-------------------- pandas/tools/merge.py | 25 +--- pandas/tools/tests/test_merge_asof.py | 55 +++------ 5 files changed, 100 insertions(+), 189 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index dcd07911f2ff0..86d5f84cb9b36 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -293,6 +293,43 @@ def time_join_dataframe_integer_key(self): merge(self.df, self.df2, on='key1') +class merge_asof_noby(object): + + def setup(self): + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), + 'value2': np.random.randn(two_count)}) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + def time_merge_asof_noby(self): + merge_asof(self.df1, self.df2, on='time') + + +class merge_asof_by(object): + + def setup(self): + import string + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), + 'key': np.random.choice(list(string.uppercase), one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), + 'key': np.random.choice(list(string.uppercase), two_count), + 'value2': np.random.randn(two_count)}) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + def time_merge_asof_by(self): + merge_asof(self.df1, self.df2, on='time', by='key') + + class join_non_unique_equal(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 1d69579df21df..824bfa2c43b9a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -47,7 +47,7 @@ The following are now part of this API: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to -support asof style joining of time-series. (:issue:`1870`, :issue:`13695`). Full documentation is +support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). 
Full documentation is :ref:`here ` The :func:`merge_asof` performs an asof merge, which is similar to a left-join diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index ad3b1d4e4a90e..fbbef8a31071f 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -126,150 +126,56 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, - Py_ssize_t max_groups, sort=True, + Py_ssize_t max_groups, # ignored bint allow_exact_matches=1, - left_distance=None, - right_distance=None, + left_values=None, + right_values=None, tolerance=None): cdef: - Py_ssize_t i, j, k, count = 0 - Py_ssize_t loc, left_pos, right_pos, position - Py_ssize_t offset - ndarray[int64_t] left_count, right_count - ndarray left_sorter, right_sorter, rev + Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc, tol, left_val, right_val, diff, indexer - ndarray[int64_t] ld, rd - bint has_tol = 0 + bint has_tolerance = 0 + ndarray[int64_t] left_values_, right_values_ + int64_t tolerance_ # if we are using tolerance, set our objects - if left_distance is not None and right_distance is not None and tolerance is not None: - has_tol = 1 - ld = left_distance - rd = right_distance - tol = tolerance + if left_values is not None and right_values is not None and tolerance is not None: + has_tolerance = 1 + left_values_ = left_values + right_values_ = right_values + tolerance_ = tolerance - # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_size = len(left) + right_size = len(right) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - if right_count[i] > 0: - count += left_count[i] * right_count[i] - else: - count += left_count[i] + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) - # group 0 is the NA group - left_pos = 0 right_pos = 0 - position = 0 - - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] - - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) - - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] - - if rc == 0: - for j in range(lc): - indexer = position + j - left_indexer[indexer] = left_pos + j - - # take the most recent value - # if we are not the first - if right_pos: - - if has_tol: - - left_val = ld[left_pos + j] - right_val = rd[right_pos - 1] - diff = left_val - right_val - - # do we allow exact matches - if allow_exact_matches: - if diff > tol: - right_indexer[indexer] = -1 - continue - elif not allow_exact_matches: - if diff >= tol or lc == rc: - right_indexer[indexer] = -1 - continue - - right_indexer[indexer] = right_pos - 1 - else: - right_indexer[indexer] = -1 - position += lc + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and right[right_pos] <= left[left_pos]: + right_pos += 1 else: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - - indexer = offset + k - left_indexer[indexer] = left_pos + j - - if has_tol: - - left_val = ld[left_pos + j] - right_val = rd[right_pos + k] - diff = left_val - 
right_val - - # do we allow exact matches - if allow_exact_matches: - if diff > tol: - right_indexer[indexer] = -1 - continue - - # we don't allow exact matches - elif not allow_exact_matches: - if diff >= tol or lc == rc: - right_indexer[indexer] = -1 - else: - right_indexer[indexer] = right_pos - 1 - continue - - else: - - # do we allow exact matches - if not allow_exact_matches: - - if right_pos: - right_indexer[indexer] = right_pos - 1 - else: - right_indexer[indexer] = -1 - continue - - right_indexer[indexer] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc - - left_indexer = _get_result_indexer(left_sorter, left_indexer) - right_indexer = _get_result_indexer(right_sorter, right_indexer) - - if not sort: # if not asked to sort, revert to original order - if len(left) == len(left_indexer): - # no multiple matches for any row on the left - # this is a short-cut to avoid groupsort_indexer - # otherwise, the `else` path also works in this case - if left_sorter.dtype != np.int_: - left_sorter = left_sorter.astype(np.int_) - - rev = np.empty(len(left), dtype=np.int_) - rev.put(left_sorter, np.arange(len(left))) - else: - rev, _ = groupsort_indexer(left_indexer, len(left)) - - if rev.dtype != np.int_: - rev = rev.astype(np.int_) - right_indexer = right_indexer.take(rev) - left_indexer = left_indexer.take(rev) + while right_pos < right_size and right[right_pos] < left[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 return left_indexer, right_indexer diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index e7d165354ec6c..9f8e27c4d8176 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -258,8 +258,7 @@ def merge_asof(left, right, on=None, by=None, suffixes=('_x', '_y'), tolerance=None, - allow_exact_matches=True, - check_duplicates=True): + allow_exact_matches=True): """Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. @@ -304,14 +303,6 @@ def merge_asof(left, right, on=None, - If False, don't match the same 'on' value (i.e., stricly less-than) - check_duplicates : boolean, default True - - - If True, check and remove duplicates for the right - DataFrame, on the [by, on] combination, keeping the last value. - - If False, no check for duplicates. If you *know* that - you don't have duplicates, then turning off the check for duplicates - can be more performant. 
- Returns ------- merged : DataFrame @@ -436,7 +427,7 @@ def _merger(x, y): if by is not None: result, groupby = _groupby_and_merge(by, on, left, right, lambda x, y: _merger(x, y), - check_duplicates=check_duplicates) + check_duplicates=False) # we want to preserve the original order # we had grouped, so need to reverse this @@ -446,20 +437,12 @@ def _merger(x, y): sorter = _ensure_platform_int( np.concatenate([groupby.indices[g] for g, _ in groupby])) if len(result) != len(sorter): - if check_duplicates: - raise AssertionError("invalid reverse grouping") return result rev = np.empty(len(sorter), dtype=np.int_) rev.put(sorter, np.arange(len(sorter))) return result.take(rev).reset_index(drop=True) - if check_duplicates: - if on is None: - on = [] - elif not isinstance(on, (list, tuple)): - on = [on] - if right.duplicated(on).any(): right = right.drop_duplicates(on, keep='last') @@ -1067,8 +1050,8 @@ def _get_join_indexers(self): lt = lt.view('i8') t = t.value rt = rt.view('i8') - kwargs['left_distance'] = lt - kwargs['right_distance'] = rt + kwargs['left_values'] = lt + kwargs['right_values'] = rt kwargs['tolerance'] = t return _get_join_indexers(self.left_join_keys, diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index bcbb0f0fadb49..e0c50cf3baaf7 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -184,43 +184,8 @@ def test_with_duplicates(self): expected = self.read_data('asof.csv') assert_frame_equal(result, expected) - result = merge_asof(self.trades, q, - on='time', - by='ticker', - check_duplicates=False) - expected = self.read_data('asof.csv') - expected = pd.concat([expected, expected]).sort_values( - ['time', 'ticker']).reset_index(drop=True) - - # the results are not ordered in a meaningful way - # nor are the exact matches duplicated, so comparisons - # are pretty tricky here, however the uniques are the same - - def aligner(x, ticker): - return (x[x.ticker == ticker] - .sort_values(['time', 'ticker', 'quantity', 'price', - 'marketCenter', 'bid', 'ask']) - .drop_duplicates(keep='last') - .reset_index(drop=True) - ) - - for ticker in expected.ticker.unique(): - r = aligner(result, ticker) - e = aligner(expected, ticker) - assert_frame_equal(r, e) - def test_with_duplicates_no_on(self): - df1 = pd.DataFrame({'key': [1, 1, 3], - 'left_val': [1, 2, 3]}) - df2 = pd.DataFrame({'key': [1, 3, 3], - 'right_val': [1, 2, 3]}) - result = merge_asof(df1, df2, on='key', check_duplicates=False) - expected = pd.DataFrame({'key': [1, 1, 3, 3], - 'left_val': [1, 2, 3, 3], - 'right_val': [1, 1, 2, 3]}) - assert_frame_equal(result, expected) - df1 = pd.DataFrame({'key': [1, 1, 3], 'left_val': [1, 2, 3]}) df2 = pd.DataFrame({'key': [1, 2, 2], @@ -379,6 +344,26 @@ def test_allow_exact_matches_and_tolerance2(self): 'version': [np.nan]}) assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance3(self): + # GH 13709 + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030', + '2016-07-15 13:30:00.030']), + 'username': ['bob', 'charlie']}) + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.000', + '2016-07-15 13:30:00.030']), + 'version': [1, 2]}) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, + tolerance=pd.Timedelta('10ms')) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030', + '2016-07-15 13:30:00.030']), + 'username': ['bob', 'charlie'], + 'version': [np.nan, np.nan]}) + assert_frame_equal(result, 
expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From b2a14459cf97af53ecc5edf51659ce3a9f1125b4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 1 Aug 2016 16:58:38 -0400 Subject: [PATCH 199/359] CLN: Removed colSpace parameter (#13857) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 31 ++++++++++--------------------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 824bfa2c43b9a..425b8daec6081 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -671,6 +671,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) - ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0bf59403075af..4416213817ab4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1556,12 +1556,11 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, return result @Appender(fmt.docstring_to_string, indents=1) - def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, bold_rows=True, classes=None, escape=True, - max_rows=None, max_cols=None, show_dimensions=False, - notebook=False, decimal='.'): + def to_html(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, justify=None, bold_rows=True, + classes=None, escape=True, max_rows=None, max_cols=None, + show_dimensions=False, notebook=False, decimal='.'): """ Render a DataFrame as an HTML table. @@ -1585,11 +1584,6 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, .. 
versionadded:: 0.18.0 """ - if colSpace is not None: # pragma: no cover - warnings.warn("colSpace is deprecated, use col_space", - FutureWarning, stacklevel=2) - col_space = colSpace - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, col_space=col_space, na_rep=na_rep, formatters=formatters, @@ -1609,11 +1603,11 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, return formatter.buf.getvalue() @Appender(fmt.common_docstring + fmt.return_docstring, indents=1) - def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - bold_rows=True, column_format=None, longtable=None, - escape=None, encoding=None, decimal='.'): + def to_latex(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, bold_rows=True, + column_format=None, longtable=None, escape=None, + encoding=None, decimal='.'): """ Render a DataFrame to a tabular environment table. You can splice this into a LaTeX document. Requires \\usepackage{booktabs}. @@ -1642,11 +1636,6 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, .. versionadded:: 0.18.0 """ - - if colSpace is not None: # pragma: no cover - warnings.warn("colSpace is deprecated, use col_space", - FutureWarning, stacklevel=2) - col_space = colSpace # Get defaults from the pandas config if longtable is None: longtable = get_option("display.latex.longtable") From d4f95fdebb74db5edb09c743b48c4b03e0940591 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 1 Aug 2016 17:17:11 -0400 Subject: [PATCH 200/359] MAINT: Nicer error msg for NULL byte in read_csv (#13859) Provides a nicer error message for the Python engine in read_csv when the data contains a NULL byte. Closes gh-2741. --- pandas/io/parsers.py | 12 +++++++++++- pandas/io/tests/parser/common.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index abbe7bdf18461..8c615741679b5 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2190,7 +2190,17 @@ def _next_line(self): next(self.data) while True: - orig_line = next(self.data) + try: + orig_line = next(self.data) + except csv.Error as e: + if 'NULL byte' in str(e): + raise csv.Error( + 'NULL byte detected. 
This byte ' + 'cannot be processed in Python\'s ' + 'native csv library at the moment, ' + 'so please pass in engine=\'c\' instead.') + else: + raise line = self._check_comments([orig_line])[0] self.pos += 1 if (not self.skip_blank_lines and diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f8fc6c2bf78c3..129e925e38d5b 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1501,3 +1501,19 @@ def test_memory_map(self): out = self.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(out, expected) + + def test_null_byte_char(self): + # see gh-2741 + data = '\x00,foo' + cols = ['a', 'b'] + + expected = DataFrame([[np.nan, 'foo']], + columns=cols) + + if self.engine == 'c': + out = self.read_csv(StringIO(data), names=cols) + tm.assert_frame_equal(out, expected) + else: + msg = "NULL byte detected" + with tm.assertRaisesRegexp(csv.Error, msg): + self.read_csv(StringIO(data), names=cols) From 299fb759b742b06cff6baed898e50936d0f55c7e Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Tue, 2 Aug 2016 19:14:08 +0900 Subject: [PATCH 201/359] BUG: Series creation with datetime64 with non-ns unit as object dtype (#13876) --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/tests/series/test_constructors.py | 10 +++++++++- pandas/tests/test_algos.py | 7 ++++--- pandas/types/cast.py | 2 ++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 425b8daec6081..d069a25c58143 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -842,6 +842,8 @@ Bug Fixes - Bug in ``RangeIndex`` can be created without no arguments rather than raises ``TypeError`` (:issue:`13793`) - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) +- Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`) + - Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) - Bug in ``.merge`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c8e04f1ffd75f..ed7b0fda19cb7 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -381,13 +381,21 @@ def test_constructor_dtype_datetime64(self): # coerce datetime64 non-ns properly dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') values2 = dates.view(np.ndarray).astype('datetime64[ns]') - expected = Series(values2, dates) + expected = Series(values2, index=dates) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) + # GH 13876 + # coerce to non-ns to object properly + expected = Series(values2, index=dates, dtype=object) + for dtype in ['s', 'D', 'ms', 'us', 'ns']: + values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + result = Series(values1, index=dates, dtype=object) + assert_series_equal(result, expected) + # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) diff --git a/pandas/tests/test_algos.py 
b/pandas/tests/test_algos.py index 9535a3f97955c..94c67ac7dd61a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -777,7 +777,6 @@ def test_datetime_likes(self): exp_false = exp_first | exp_last for case in cases: - print(case) res_first = algos.duplicated(case, keep='first') tm.assert_numpy_array_equal(res_first, exp_first) @@ -788,7 +787,8 @@ def test_datetime_likes(self): tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [pd.Index(case), pd.Index(case, dtype='category')]: + for idx in [pd.Index(case), pd.Index(case, dtype='category'), + pd.Index(case, dtype=object)]: res_first = idx.duplicated(keep='first') tm.assert_numpy_array_equal(res_first, exp_first) @@ -799,7 +799,8 @@ def test_datetime_likes(self): tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [pd.Series(case), pd.Series(case, dtype='category')]: + for s in [pd.Series(case), pd.Series(case, dtype='category'), + pd.Series(case, dtype=object)]: res_first = s.duplicated(keep='first') tm.assert_series_equal(res_first, pd.Series(exp_first)) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index ca23d8d26a426..f4cb476672ec7 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -829,6 +829,8 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): if is_object_dtype(dtype): + if value.dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) ints = np.asarray(value).view('i8') return tslib.ints_to_pydatetime(ints) From 1f55e919a3b85cd0891a7a7a1d6913ca06f6bd9e Mon Sep 17 00:00:00 2001 From: agraboso Date: Tue, 2 Aug 2016 06:18:47 -0400 Subject: [PATCH 202/359] DEPR: Remove legacy offsets Follow-up to #13590. Remove legacy offset aliases that remained in `pandas/tseries/frequencies.py`: `_period_alias_dictionary()` and `_period_alias_dict`. 
Author: agraboso Closes #13868 from agraboso/follow-13590 and squashes the following commits: 25d932d [agraboso] DEPR: Remove legacy offsets --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/src/period.pyx | 5 +- pandas/tseries/frequencies.py | 107 ---------------------------- pandas/tseries/tests/test_period.py | 10 ++- 4 files changed, 9 insertions(+), 115 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d069a25c58143..d440ff748292e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -701,7 +701,7 @@ Removal of prior version deprecations/changes - ``Series.to_csv`` has dropped the ``nanRep`` parameter in favor of ``na_rep`` (:issue:`13804`) - ``Series.xs``, ``DataFrame.xs``, ``Panel.xs``, ``Panel.major_xs``, and ``Panel.minor_xs`` have dropped the ``copy`` parameter (:issue:`13781`) - ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) -- Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`) +- Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`, :issue:`13868`) Previous Behavior: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 965ed53a4b802..0435b01920504 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -681,10 +681,7 @@ cdef class _Period(object): @classmethod def _maybe_convert_freq(cls, object freq): - if isinstance(freq, compat.string_types): - freq = freq.upper() - freq = frequencies._period_alias_dict.get(freq, freq) - elif isinstance(freq, (int, tuple)): + if isinstance(freq, (int, tuple)): code, stride = frequencies.get_freq_code(freq) freq = frequencies._get_freq_str(code, stride) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8b3785d78d260..eaf826230e772 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -620,113 +620,6 @@ def get_standard_freq(freq): }) -def _period_alias_dictionary(): - """ - Build freq alias dictionary to support freqs from original c_dates.c file - of the scikits.timeseries library. 
- """ - alias_dict = {} - - M_aliases = ["M", "MTH", "MONTH", "MONTHLY"] - B_aliases = ["B", "BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"] - D_aliases = ["D", "DAY", "DLY", "DAILY"] - H_aliases = ["H", "HR", "HOUR", "HRLY", "HOURLY"] - T_aliases = ["T", "MIN", "MINUTE", "MINUTELY"] - S_aliases = ["S", "SEC", "SECOND", "SECONDLY"] - L_aliases = ["L", "ms", "MILLISECOND", "MILLISECONDLY"] - U_aliases = ["U", "US", "MICROSECOND", "MICROSECONDLY"] - N_aliases = ["N", "NS", "NANOSECOND", "NANOSECONDLY"] - - for k in M_aliases: - alias_dict[k] = 'M' - - for k in B_aliases: - alias_dict[k] = 'B' - - for k in D_aliases: - alias_dict[k] = 'D' - - for k in H_aliases: - alias_dict[k] = 'H' - - for k in T_aliases: - alias_dict[k] = 'T' - - for k in S_aliases: - alias_dict[k] = 'S' - - for k in L_aliases: - alias_dict[k] = 'L' - - for k in U_aliases: - alias_dict[k] = 'U' - - for k in N_aliases: - alias_dict[k] = 'N' - - A_prefixes = ["A", "Y", "ANN", "ANNUAL", "ANNUALLY", "YR", "YEAR", - "YEARLY"] - - Q_prefixes = ["Q", "QTR", "QUARTER", "QUARTERLY", "Q-E", - "QTR-E", "QUARTER-E", "QUARTERLY-E"] - - month_names = [ - ["DEC", "DECEMBER"], - ["JAN", "JANUARY"], - ["FEB", "FEBRUARY"], - ["MAR", "MARCH"], - ["APR", "APRIL"], - ["MAY", "MAY"], - ["JUN", "JUNE"], - ["JUL", "JULY"], - ["AUG", "AUGUST"], - ["SEP", "SEPTEMBER"], - ["OCT", "OCTOBER"], - ["NOV", "NOVEMBER"]] - - seps = ["@", "-"] - - for k in A_prefixes: - alias_dict[k] = 'A' - for m_tup in month_names: - for sep in seps: - m1, m2 = m_tup - alias_dict[k + sep + m1] = 'A-' + m1 - alias_dict[k + sep + m2] = 'A-' + m1 - - for k in Q_prefixes: - alias_dict[k] = 'Q' - for m_tup in month_names: - for sep in seps: - m1, m2 = m_tup - alias_dict[k + sep + m1] = 'Q-' + m1 - alias_dict[k + sep + m2] = 'Q-' + m1 - - W_prefixes = ["W", "WK", "WEEK", "WEEKLY"] - - day_names = [ - ["SUN", "SUNDAY"], - ["MON", "MONDAY"], - ["TUE", "TUESDAY"], - ["WED", "WEDNESDAY"], - ["THU", "THURSDAY"], - ["FRI", "FRIDAY"], - ["SAT", "SATURDAY"]] - - for k in W_prefixes: - alias_dict[k] = 'W' - for d_tup in day_names: - for sep in ["@", "-"]: - d1, d2 = d_tup - alias_dict[k + sep + d1] = 'W-' + d1 - alias_dict[k + sep + d2] = 'W-' + d1 - - return alias_dict - - -_period_alias_dict = _period_alias_dictionary() - - def _period_str_to_code(freqstr): freqstr = _lite_rule_alias.get(freqstr, freqstr) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c3d0ee28540e1..290c11bd8d79c 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -457,10 +457,14 @@ def test_period_deprecated_freq(self): for freq in freqs: with self.assertRaisesRegexp(ValueError, msg): Period('2016-03-01 09:00', freq=freq) + with self.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq=freq) - # check supported freq-aliases still works - p = Period('2016-03-01 09:00', freq=exp) - tm.assertIsInstance(p, Period) + # check supported freq-aliases still works + p1 = Period('2016-03-01 09:00', freq=exp) + p2 = Period(ordinal=1, freq=exp) + tm.assertIsInstance(p1, Period) + tm.assertIsInstance(p2, Period) def test_hash(self): self.assertEqual(hash(Period('2011-01', freq='M')), From a7f7e1d18baa36afe9317aa48fbcf170b0375318 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sa=C5=A1o=20Stanovnik?= Date: Mon, 25 Jul 2016 16:58:27 +0200 Subject: [PATCH 203/359] BUG: Fix slicing subclasses of SparseDataFrames. 
Use proper subclassing behaviour so subclasses work properly: this fixes an issue where a multi-element slice of a subclass of SparseDataFrame returned the SparseDataFrame type instead of the subclass type. closes #13787 --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/io/tests/test_pickle.py | 8 +++ pandas/sparse/frame.py | 23 +++++---- pandas/sparse/series.py | 12 ++--- pandas/tests/frame/test_subclass.py | 30 ++++++++++++ pandas/tests/series/test_subclass.py | 24 +++++++++ pandas/util/testing.py | 73 ++++++++++++++++++++++++++-- 7 files changed, 151 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d440ff748292e..6de22272c65e6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -380,6 +380,8 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) + .. _whatsnew_0190.api.tolist: diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 6019144d59698..94885d90d3c4a 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -86,6 +86,14 @@ def compare(self, vf, version): comparator(result, expected, typ, version) return data + def compare_sp_series_ts(self, res, exp, typ, version): + # SparseTimeSeries integrated into SparseSeries in 0.12.0 + # and deprecated in 0.17.0 + if version and LooseVersion(version) <= "0.12.0": + tm.assert_sp_series_equal(res, exp, check_series_type=False) + else: + tm.assert_sp_series_equal(res, exp) + def compare_series_ts(self, result, expected, typ, version): # GH 7748 tm.assert_series_equal(result, expected) diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 811d8019c7fee..2ea0536ca4fbb 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -188,7 +188,7 @@ def _init_matrix(self, data, index, columns, dtype=None): return self._init_dict(data, index, columns, dtype) def __array_wrap__(self, result): - return SparseDataFrame( + return self._constructor( result, index=self.index, columns=self.columns, default_kind=self._default_kind, default_fill_value=self._default_fill_value).__finalize__(self) @@ -407,7 +407,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): raise NotImplementedError("'level' argument is not supported") if self.empty and other.empty: - return SparseDataFrame(index=new_index).__finalize__(self) + return self._constructor(index=new_index).__finalize__(self) new_data = {} new_fill_value = None @@ -519,7 +519,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return self if len(self.index) == 0: - return SparseDataFrame(index=index, columns=self.columns) + return self._constructor( + index=index, columns=self.columns).__finalize__(self) indexer = self.index.get_indexer(index, method, limit=limit) indexer = _ensure_platform_int(indexer) @@ -540,8 +541,9 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, new_series[col] = new - return SparseDataFrame(new_series, index=index, 
columns=self.columns, - default_fill_value=self._default_fill_value) + return self._constructor( + new_series, index=index, columns=self.columns, + default_fill_value=self._default_fill_value).__finalize__(self) def _reindex_columns(self, columns, copy, level, fill_value, limit=None, takeable=False): @@ -556,8 +558,9 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, # TODO: fill value handling sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns) - return SparseDataFrame(sdict, index=self.index, columns=columns, - default_fill_value=self._default_fill_value) + return self._constructor( + sdict, index=self.index, columns=columns, + default_fill_value=self._default_fill_value).__finalize__(self) def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None, copy=False, allow_dups=False): @@ -586,8 +589,8 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, else: new_arrays[col] = self[col] - return SparseDataFrame(new_arrays, index=index, - columns=columns).__finalize__(self) + return self._constructor(new_arrays, index=index, + columns=columns).__finalize__(self) def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): @@ -644,7 +647,7 @@ def transpose(self, *args, **kwargs): Returns a DataFrame with the rows/columns switched. """ nv.validate_transpose(args, kwargs) - return SparseDataFrame( + return self._constructor( self.values.T, index=self.columns, columns=self.index, default_fill_value=self._default_fill_value, default_kind=self._default_kind).__finalize__(self) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 951c2ae0c0d5a..6c4392dbf7cb4 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -63,11 +63,11 @@ def wrapper(self, other): new_fill_value = op(np.float64(self.fill_value), np.float64(other)) - return SparseSeries(op(self.sp_values, other), - index=self.index, - sparse_index=self.sp_index, - fill_value=new_fill_value, - name=self.name) + return self._constructor(op(self.sp_values, other), + index=self.index, + sparse_index=self.sp_index, + fill_value=new_fill_value, + name=self.name) else: # pragma: no cover raise TypeError('operation with %s not supported' % type(other)) @@ -85,7 +85,7 @@ def _sparse_series_op(left, right, op, name): new_name = _maybe_match_name(left, right) result = _sparse_array_op(left, right, op, name) - return SparseSeries(result, index=new_index, name=new_name) + return left._constructor(result, index=new_index, name=new_name) class SparseSeries(Series): diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index ee12d9e84511c..0e0ee75a30c84 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -210,3 +210,33 @@ def test_subclass_align_combinations(self): tm.assert_series_equal(res1, exp2) tm.assertIsInstance(res2, tm.SubclassedDataFrame) tm.assert_frame_equal(res2, exp1) + + def test_subclass_sparse_slice(self): + rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + ssdf = tm.SubclassedSparseDataFrame(rows) + ssdf.testattr = "testattr" + + tm.assert_sp_frame_equal(ssdf.loc[:2], + tm.SubclassedSparseDataFrame(rows[:3])) + tm.assert_sp_frame_equal(ssdf.iloc[:2], + tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_sp_frame_equal(ssdf[:2], + tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_equal(ssdf.loc[:2].testattr, "testattr") + tm.assert_equal(ssdf.iloc[:2].testattr, "testattr") + 
tm.assert_equal(ssdf[:2].testattr, "testattr") + + tm.assert_sp_series_equal(ssdf.loc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False) + tm.assert_sp_series_equal(ssdf.iloc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False) + + def test_subclass_sparse_transpose(self): + ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], + [4, 5, 6]]) + essdf = tm.SubclassedSparseDataFrame([[1, 4], + [2, 5], + [3, 6]]) + tm.assert_sp_frame_equal(ossdf.T, essdf) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 016113961ec74..dabecefaee9d1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -31,3 +31,27 @@ def test_to_frame(self): exp = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd')) tm.assert_frame_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedDataFrame) + + def test_subclass_sparse_slice(self): + s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) + tm.assert_sp_series_equal(s.loc[1:3], + tm.SubclassedSparseSeries([2.0, 3.0, 4.0], + index=[1, 2, 3])) + tm.assert_sp_series_equal(s.iloc[1:3], + tm.SubclassedSparseSeries([2.0, 3.0], + index=[1, 2])) + tm.assert_sp_series_equal(s[1:3], + tm.SubclassedSparseSeries([2.0, 3.0], + index=[1, 2])) + + def test_subclass_sparse_addition(self): + s1 = tm.SubclassedSparseSeries([1, 3, 5]) + s2 = tm.SubclassedSparseSeries([-2, 5, 12]) + tm.assert_sp_series_equal(s1 + s2, + tm.SubclassedSparseSeries([-1.0, 8.0, 17.0])) + + def test_subclass_sparse_to_frame(self): + s = tm.SubclassedSparseSeries([1, 2], index=list('abcd'), name='xxx') + res = s.to_frame() + exp = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, index=list('abcd')) + tm.assert_sp_frame_equal(res, exp) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index e49d92e4ab202..e4a84ea4ae296 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1322,7 +1322,8 @@ def assert_panelnd_equal(left, right, check_less_precise=False, assert_func=assert_frame_equal, check_names=False, - by_blocks=False): + by_blocks=False, + obj='Panel'): """Check that left and right Panels are equal. Parameters @@ -1343,6 +1344,9 @@ def assert_panelnd_equal(left, right, by_blocks : bool, default False Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. + obj : str, default 'Panel' + Specify the object name being compared, internally used to show + the appropriate assertion message. """ if check_panel_type: @@ -1404,10 +1408,30 @@ def assert_sp_array_equal(left, right): def assert_sp_series_equal(left, right, exact_indices=True, - check_names=True, obj='SparseSeries'): + check_series_type=True, + check_names=True, + obj='SparseSeries'): + """Check that the left and right SparseSeries are equal. + + Parameters + ---------- + left : SparseSeries + right : SparseSeries + exact_indices : bool, default True + check_series_type : bool, default True + Whether to check the SparseSeries class is identical. + check_names : bool, default True + Whether to check the SparseSeries name attribute. + obj : str, default 'SparseSeries' + Specify the object name being compared, internally used to show + the appropriate assertion message. 
+ """ assertIsInstance(left, pd.SparseSeries, '[SparseSeries]') assertIsInstance(right, pd.SparseSeries, '[SparseSeries]') + if check_series_type: + assert_class_equal(left, right, obj=obj) + assert_index_equal(left.index, right.index, obj='{0}.index'.format(obj)) @@ -1421,14 +1445,29 @@ def assert_sp_series_equal(left, right, exact_indices=True, def assert_sp_frame_equal(left, right, exact_indices=True, + check_frame_type=True, obj='SparseDataFrame'): - """ - exact: Series SparseIndex objects must be exactly the same, otherwise just - compare dense representations + """Check that the left and right SparseDataFrame are equal. + + Parameters + ---------- + left : SparseDataFrame + right : SparseDataFrame + exact_indices : bool, default True + SparseSeries SparseIndex objects must be exactly the same, + otherwise just compare dense representations. + check_frame_type : bool, default True + Whether to check the SparseDataFrame class is identical. + obj : str, default 'SparseDataFrame' + Specify the object name being compared, internally used to show + the appropriate assertion message. """ assertIsInstance(left, pd.SparseDataFrame, '[SparseDataFrame]') assertIsInstance(right, pd.SparseDataFrame, '[SparseDataFrame]') + if check_frame_type: + assert_class_equal(left, right, obj=obj) + assert_index_equal(left.index, right.index, obj='{0}.index'.format(obj)) assert_index_equal(left.columns, right.columns, @@ -2607,6 +2646,30 @@ def _constructor_sliced(self): return SubclassedSeries +class SubclassedSparseSeries(pd.SparseSeries): + _metadata = ['testattr'] + + @property + def _constructor(self): + return SubclassedSparseSeries + + @property + def _constructor_expanddim(self): + return SubclassedSparseDataFrame + + +class SubclassedSparseDataFrame(pd.SparseDataFrame): + _metadata = ['testattr'] + + @property + def _constructor(self): + return SubclassedSparseDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSparseSeries + + @contextmanager def patch(ob, attr, value): """Temporarily patch an attribute of an object. From 2f8fea7f00349231d4d0a6dc6b4a2c550ef3f0ef Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 2 Aug 2016 06:45:10 -0400 Subject: [PATCH 204/359] CLN: Removed trellis rplot xref https://github.com/pydata/pandas/pull/9357. Title is self- explanatory. Author: gfyoung Closes #13855 from gfyoung/trellis-plot-remove and squashes the following commits: 5427086 [gfyoung] CLN: Removed trellis rplot --- doc/source/visualization.rst | 248 +------- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/tests/test_rplot.py | 316 ---------- pandas/tools/rplot.py | 984 -------------------------------- 4 files changed, 7 insertions(+), 1543 deletions(-) delete mode 100644 pandas/tests/test_rplot.py delete mode 100644 pandas/tools/rplot.py diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 7840ae29298b0..16ef76638ec5b 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -1615,246 +1615,8 @@ Trellis plotting interface .. warning:: - The ``rplot`` trellis plotting interface is **deprecated and will be removed - in a future version**. We refer to external packages like - `seaborn `_ for similar but more - refined functionality. - - The docs below include some example on how to convert your existing code to - ``seaborn``. - -.. ipython:: python - :suppress: - - tips_data = pd.read_csv('data/tips.csv') - iris_data = pd.read_csv('data/iris.data') - plt.close('all') - - -.. 
note:: - - The tips data set can be downloaded `here - `__. Once you download it execute - - .. code-block:: python - - tips_data = pd.read_csv('tips.csv') - - from the directory where you downloaded the file. - -We import the rplot API: - -.. ipython:: python - :okwarning: - - import pandas.tools.rplot as rplot - -Examples -~~~~~~~~ - -RPlot was an API for producing Trellis plots. These plots allow you to -arrange data in a rectangular grid by values of certain attributes. -In the example below, data from the tips data set is arranged by the attributes -'sex' and 'smoker'. Since both of those attributes can take on one of two -values, the resulting grid has two columns and two rows. A histogram is -displayed for each cell of the grid. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot1_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -A similar plot can be made with ``seaborn`` using the ``FacetGrid`` object, -resulting in the following image: - -.. code-block:: python - - import seaborn as sns - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(plt.hist, "total_bill") - -.. image:: _static/rplot-seaborn-example1.png - - -Example below is the same as previous except the plot is set to kernel density -estimation. A ``seaborn`` example is included beneath. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomDensity()) - - @savefig rplot2_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(sns.kdeplot, "total_bill") - -.. image:: _static/rplot-seaborn-example2.png - -The plot below shows that it is possible to have two or more plots for the same -data displayed on the same Trellis grid cell. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomScatter()) - plot.add(rplot.GeomPolyFit(degree=2)) - - @savefig rplot3_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -A seaborn equivalent for a simple scatter plot: - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(plt.scatter, "total_bill", "tip") - -.. image:: _static/rplot-seaborn-example3.png - -and with a regression line, using the dedicated ``seaborn`` ``regplot`` function: - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker", margin_titles=True) - g.map(sns.regplot, "total_bill", "tip", order=2) - -.. image:: _static/rplot-seaborn-example3b.png - - -Below is a similar plot but with 2D kernel density estimation plot superimposed, -followed by a ``seaborn`` equivalent: - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomScatter()) - plot.add(rplot.GeomDensity2D()) - - @savefig rplot4_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -.. 
code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(plt.scatter, "total_bill", "tip") - g.map(sns.kdeplot, "total_bill", "tip") - -.. image:: _static/rplot-seaborn-example4.png - -It is possible to only use one attribute for grouping data. The example above -only uses 'sex' attribute. If the second grouping attribute is not specified, -the plots will be arranged in a column. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', '.'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot5_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -If the first grouping attribute is not specified the plots will be arranged in a row. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['.', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot6_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -In ``seaborn``, this can also be done by only specifying one of the ``row`` -and ``col`` arguments. - -In the example below the colour and shape of the scatter plot graphical -objects is mapped to 'day' and 'size' attributes respectively. You use -scale objects to specify these mappings. The list of scale classes is -given below with initialization arguments for quick reference. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='tip', y='total_bill') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomPoint(size=80.0, colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'), alpha=1.0)) - - @savefig rplot7_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -This can also be done in ``seaborn``, at least for 3 variables: - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker", hue="day") - g.map(plt.scatter, "tip", "total_bill") - g.add_legend() - -.. image:: _static/rplot-seaborn-example6.png + The ``rplot`` trellis plotting interface has been **removed**. Please use + external packages like `seaborn `_ for + similar but more refined functionality and refer to our 0.18.1 documentation + `here `__ + for how to convert to using it. diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 6de22272c65e6..2eadde8af4912 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -693,6 +693,8 @@ Removal of prior version deprecations/changes - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of the `pandas-datareader package `__ (:issue:`13724`). 
+- The ``pandas.tools.rplot`` module has been removed in favor of + the `seaborn package `__ (:issue:`13855`) - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) diff --git a/pandas/tests/test_rplot.py b/pandas/tests/test_rplot.py deleted file mode 100644 index 6be6c53cbb201..0000000000000 --- a/pandas/tests/test_rplot.py +++ /dev/null @@ -1,316 +0,0 @@ -# -*- coding: utf-8 -*- -from pandas.compat import range -import pandas.util.testing as tm -from pandas import read_csv -import os - -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - import pandas.tools.rplot as rplot - - -def curpath(): - pth, _ = os.path.split(os.path.abspath(__file__)) - return pth - - -def between(a, b, x): - """Check if x is in the somewhere between a and b. - - Parameters: - ----------- - a: float, interval start - b: float, interval end - x: float, value to test for - - Returns: - -------- - True if x is between a and b, False otherwise - """ - if a < b: - return x >= a and x <= b - else: - return x <= a and x >= b - - -@tm.mplskip -class TestUtilityFunctions(tm.TestCase): - """ - Tests for RPlot utility functions. - """ - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - - def test_make_aes1(self): - aes = rplot.make_aes() - self.assertTrue(aes['x'] is None) - self.assertTrue(aes['y'] is None) - self.assertTrue(aes['size'] is None) - self.assertTrue(aes['colour'] is None) - self.assertTrue(aes['shape'] is None) - self.assertTrue(aes['alpha'] is None) - self.assertTrue(isinstance(aes, dict)) - - def test_make_aes2(self): - self.assertRaises(ValueError, rplot.make_aes, - size=rplot.ScaleShape('test')) - self.assertRaises(ValueError, rplot.make_aes, - colour=rplot.ScaleShape('test')) - self.assertRaises(ValueError, rplot.make_aes, - shape=rplot.ScaleSize('test')) - self.assertRaises(ValueError, rplot.make_aes, - alpha=rplot.ScaleShape('test')) - - def test_dictionary_union(self): - dict1 = {1: 1, 2: 2, 3: 3} - dict2 = {1: 1, 2: 2, 4: 4} - union = rplot.dictionary_union(dict1, dict2) - self.assertEqual(len(union), 4) - keys = list(union.keys()) - self.assertTrue(1 in keys) - self.assertTrue(2 in keys) - self.assertTrue(3 in keys) - self.assertTrue(4 in keys) - self.assertEqual(rplot.dictionary_union(dict1, {}), dict1) - self.assertEqual(rplot.dictionary_union({}, dict1), dict1) - self.assertEqual(rplot.dictionary_union({}, {}), {}) - - def test_merge_aes(self): - layer1 = rplot.Layer(size=rplot.ScaleSize('test')) - layer2 = rplot.Layer(shape=rplot.ScaleShape('test')) - rplot.merge_aes(layer1, layer2) - self.assertTrue(isinstance(layer2.aes['size'], rplot.ScaleSize)) - self.assertTrue(isinstance(layer2.aes['shape'], rplot.ScaleShape)) - self.assertEqual(layer2.aes['size'], layer1.aes['size']) - for key in layer2.aes.keys(): - if key != 'size' and key != 'shape': - self.assertTrue(layer2.aes[key] is None) - - def test_sequence_layers(self): - layer1 = rplot.Layer(self.data) - layer2 = rplot.GeomPoint(x='SepalLength', y='SepalWidth', - size=rplot.ScaleSize('PetalLength')) - layer3 = rplot.GeomPolyFit(2) - result = rplot.sequence_layers([layer1, layer2, layer3]) - self.assertEqual(len(result), 3) - last = result[-1] - 
self.assertEqual(last.aes['x'], 'SepalLength') - self.assertEqual(last.aes['y'], 'SepalWidth') - self.assertTrue(isinstance(last.aes['size'], rplot.ScaleSize)) - self.assertTrue(self.data is last.data) - self.assertTrue(rplot.sequence_layers([layer1])[0] is layer1) - - -@tm.mplskip -class TestTrellis(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/tips.csv') - self.data = read_csv(path, sep=',') - layer1 = rplot.Layer(self.data) - layer2 = rplot.GeomPoint(x='total_bill', y='tip') - layer3 = rplot.GeomPolyFit(2) - self.layers = rplot.sequence_layers([layer1, layer2, layer3]) - self.trellis1 = rplot.TrellisGrid(['sex', 'smoker']) - self.trellis2 = rplot.TrellisGrid(['sex', '.']) - self.trellis3 = rplot.TrellisGrid(['.', 'smoker']) - self.trellised1 = self.trellis1.trellis(self.layers) - self.trellised2 = self.trellis2.trellis(self.layers) - self.trellised3 = self.trellis3.trellis(self.layers) - - def test_grid_sizes(self): - self.assertEqual(len(self.trellised1), 3) - self.assertEqual(len(self.trellised2), 3) - self.assertEqual(len(self.trellised3), 3) - self.assertEqual(len(self.trellised1[0]), 2) - self.assertEqual(len(self.trellised1[0][0]), 2) - self.assertEqual(len(self.trellised2[0]), 2) - self.assertEqual(len(self.trellised2[0][0]), 1) - self.assertEqual(len(self.trellised3[0]), 1) - self.assertEqual(len(self.trellised3[0][0]), 2) - self.assertEqual(len(self.trellised1[1]), 2) - self.assertEqual(len(self.trellised1[1][0]), 2) - self.assertEqual(len(self.trellised2[1]), 2) - self.assertEqual(len(self.trellised2[1][0]), 1) - self.assertEqual(len(self.trellised3[1]), 1) - self.assertEqual(len(self.trellised3[1][0]), 2) - self.assertEqual(len(self.trellised1[2]), 2) - self.assertEqual(len(self.trellised1[2][0]), 2) - self.assertEqual(len(self.trellised2[2]), 2) - self.assertEqual(len(self.trellised2[2][0]), 1) - self.assertEqual(len(self.trellised3[2]), 1) - self.assertEqual(len(self.trellised3[2][0]), 2) - - def test_trellis_cols_rows(self): - self.assertEqual(self.trellis1.cols, 2) - self.assertEqual(self.trellis1.rows, 2) - self.assertEqual(self.trellis2.cols, 1) - self.assertEqual(self.trellis2.rows, 2) - self.assertEqual(self.trellis3.cols, 2) - self.assertEqual(self.trellis3.rows, 1) - - -@tm.mplskip -class TestScaleGradient(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.gradient = rplot.ScaleGradient("SepalLength", colour1=(0.2, 0.3, - 0.4), - colour2=(0.8, 0.7, 0.6)) - - def test_gradient(self): - for index in range(len(self.data)): - # row = self.data.iloc[index] - r, g, b = self.gradient(self.data, index) - r1, g1, b1 = self.gradient.colour1 - r2, g2, b2 = self.gradient.colour2 - self.assertTrue(between(r1, r2, r)) - self.assertTrue(between(g1, g2, g)) - self.assertTrue(between(b1, b2, b)) - - -@tm.mplskip -class TestScaleGradient2(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.gradient = rplot.ScaleGradient2("SepalLength", colour1=( - 0.2, 0.3, 0.4), colour2=(0.8, 0.7, 0.6), colour3=(0.5, 0.5, 0.5)) - - def test_gradient2(self): - for index in range(len(self.data)): - row = self.data.iloc[index] - r, g, b = self.gradient(self.data, index) - r1, g1, b1 = self.gradient.colour1 - r2, g2, b2 = self.gradient.colour2 - r3, g3, b3 = self.gradient.colour3 - value = row[self.gradient.column] - a_ = min(self.data[self.gradient.column]) - b_ = max(self.data[self.gradient.column]) - scaled = (value - 
a_) / (b_ - a_) - if scaled < 0.5: - self.assertTrue(between(r1, r2, r)) - self.assertTrue(between(g1, g2, g)) - self.assertTrue(between(b1, b2, b)) - else: - self.assertTrue(between(r2, r3, r)) - self.assertTrue(between(g2, g3, g)) - self.assertTrue(between(b2, b3, b)) - - -@tm.mplskip -class TestScaleRandomColour(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.colour = rplot.ScaleRandomColour('SepalLength') - - def test_random_colour(self): - for index in range(len(self.data)): - colour = self.colour(self.data, index) - self.assertEqual(len(colour), 3) - r, g, b = colour - self.assertTrue(r >= 0.0) - self.assertTrue(g >= 0.0) - self.assertTrue(b >= 0.0) - self.assertTrue(r <= 1.0) - self.assertTrue(g <= 1.0) - self.assertTrue(b <= 1.0) - - -@tm.mplskip -class TestScaleConstant(tm.TestCase): - - def test_scale_constant(self): - scale = rplot.ScaleConstant(1.0) - self.assertEqual(scale(None, None), 1.0) - scale = rplot.ScaleConstant("test") - self.assertEqual(scale(None, None), "test") - - -class TestScaleSize(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.scale1 = rplot.ScaleShape('Name') - self.scale2 = rplot.ScaleShape('PetalLength') - - def test_scale_size(self): - for index in range(len(self.data)): - marker = self.scale1(self.data, index) - self.assertTrue( - marker in ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x']) - - def test_scale_overflow(self): - def f(): - for index in range(len(self.data)): - self.scale2(self.data, index) - - self.assertRaises(ValueError, f) - - -@tm.mplskip -class TestRPlot(tm.TestCase): - - def test_rplot1(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/tips.csv') - plt.figure() - self.data = read_csv(path, sep=',') - self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') - self.plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour( - 'day'), shape=rplot.ScaleShape('size'))) - self.fig = plt.gcf() - self.plot.render(self.fig) - - def test_rplot2(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/tips.csv') - plt.figure() - self.data = read_csv(path, sep=',') - self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') - self.plot.add(rplot.TrellisGrid(['.', 'smoker'])) - self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour( - 'day'), shape=rplot.ScaleShape('size'))) - self.fig = plt.gcf() - self.plot.render(self.fig) - - def test_rplot3(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/tips.csv') - plt.figure() - self.data = read_csv(path, sep=',') - self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') - self.plot.add(rplot.TrellisGrid(['sex', '.'])) - self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour( - 'day'), shape=rplot.ScaleShape('size'))) - self.fig = plt.gcf() - self.plot.render(self.fig) - - def test_rplot_iris(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/iris.csv') - plt.figure() - self.data = read_csv(path, sep=',') - plot = rplot.RPlot(self.data, x='SepalLength', y='SepalWidth') - plot.add(rplot.GeomPoint( - colour=rplot.ScaleGradient('PetalLength', - colour1=(0.0, 1.0, 0.5), - colour2=(1.0, 0.0, 0.5)), - size=rplot.ScaleSize('PetalWidth', min_size=10.0, - max_size=200.0), - shape=rplot.ScaleShape('Name'))) - self.fig = plt.gcf() - plot.render(self.fig) - - -if __name__ 
== '__main__': - import unittest - unittest.main() diff --git a/pandas/tools/rplot.py b/pandas/tools/rplot.py deleted file mode 100644 index 5a748b60aae9c..0000000000000 --- a/pandas/tools/rplot.py +++ /dev/null @@ -1,984 +0,0 @@ -import random -import warnings -from copy import deepcopy -from pandas.core.common import _values_from_object - -import numpy as np -from pandas.compat import range, zip -# -# TODO: -# * Make sure legends work properly -# - - -warnings.warn("\n" - "The rplot trellis plotting interface is deprecated and will be " - "removed in a future version. We refer to external packages " - "like seaborn for similar but more refined functionality. \n\n" - "See our docs http://pandas.pydata.org/pandas-docs/stable" - "/visualization.html#rplot " - "for some example how to convert your existing code to these " - "packages.", FutureWarning, stacklevel=2) - - -class Scale: - """ - Base class for mapping between graphical and data attributes. - """ - pass - - -class ScaleGradient(Scale): - """ - A mapping between a data attribute value and a - point in colour space between two specified colours. - """ - - def __init__(self, column, colour1, colour2): - """Initialize ScaleGradient instance. - - Parameters: - ----------- - column: string, pandas DataFrame column name - colour1: tuple - 3 element tuple with float values representing an RGB colour - colour2: tuple - 3 element tuple with float values representing an RGB colour - """ - self.column = column - self.colour1 = colour1 - self.colour2 = colour2 - self.categorical = False - - def __call__(self, data, index): - """Return a colour corresponding to data attribute value. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - A three element tuple representing an RGB somewhere between colour1 and - colour2 - """ - x = data[self.column].iget(index) - a = min(data[self.column]) - b = max(data[self.column]) - r1, g1, b1 = self.colour1 - r2, g2, b2 = self.colour2 - x_scaled = (x - a) / (b - a) - return (r1 + (r2 - r1) * x_scaled, - g1 + (g2 - g1) * x_scaled, - b1 + (b2 - b1) * x_scaled) - - -class ScaleGradient2(Scale): - """ - Create a mapping between a data attribute value and a - point in colour space in a line of three specified colours. - """ - - def __init__(self, column, colour1, colour2, colour3): - """Initialize ScaleGradient2 instance. - - Parameters: - ----------- - column: string, pandas DataFrame column name - colour1: tuple - 3 element tuple with float values representing an RGB colour - colour2: tuple - 3 element tuple with float values representing an RGB colour - colour3: tuple - 3 element tuple with float values representing an RGB colour - """ - self.column = column - self.colour1 = colour1 - self.colour2 = colour2 - self.colour3 = colour3 - self.categorical = False - - def __call__(self, data, index): - """Return a colour corresponding to data attribute value. 
- - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - A three element tuple representing an RGB somewhere along the line - of colour1, colour2 and colour3 - """ - x = data[self.column].iget(index) - a = min(data[self.column]) - b = max(data[self.column]) - r1, g1, b1 = self.colour1 - r2, g2, b2 = self.colour2 - r3, g3, b3 = self.colour3 - x_scaled = (x - a) / (b - a) - if x_scaled < 0.5: - x_scaled *= 2.0 - return (r1 + (r2 - r1) * x_scaled, - g1 + (g2 - g1) * x_scaled, - b1 + (b2 - b1) * x_scaled) - else: - x_scaled = (x_scaled - 0.5) * 2.0 - return (r2 + (r3 - r2) * x_scaled, - g2 + (g3 - g2) * x_scaled, - b2 + (b3 - b2) * x_scaled) - - -class ScaleSize(Scale): - """ - Provide a mapping between a DataFrame column and matplotlib - scatter plot shape size. - """ - - def __init__(self, column, min_size=5.0, max_size=100.0, - transform=lambda x: x): - """Initialize ScaleSize instance. - - Parameters: - ----------- - column: string, a column name - min_size: float, minimum point size - max_size: float, maximum point size - transform: function - a one argument function of form float -> float (e.g. lambda x: - log(x)) - """ - self.column = column - self.min_size = min_size - self.max_size = max_size - self.transform = transform - self.categorical = False - - def __call__(self, data, index): - """Return matplotlib scatter plot marker shape size. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - """ - x = data[self.column].iget(index) - a = float(min(data[self.column])) - b = float(max(data[self.column])) - return self.transform(self.min_size + ((x - a) / (b - a)) * - (self.max_size - self.min_size)) - - -class ScaleShape(Scale): - """ - Provides a mapping between matplotlib marker shapes - and attribute values. - """ - - def __init__(self, column): - """Initialize ScaleShape instance. - - Parameters: - ----------- - column: string, pandas DataFrame column name - """ - self.column = column - self.shapes = ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x'] - self.legends = set([]) - self.categorical = True - - def __call__(self, data, index): - """Returns a matplotlib marker identifier. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - a matplotlib marker identifier - """ - values = sorted(list(set(data[self.column]))) - if len(values) > len(self.shapes): - raise ValueError("Too many different values of the categorical " - "attribute for ScaleShape") - x = data[self.column].iget(index) - return self.shapes[values.index(x)] - - -class ScaleRandomColour(Scale): - """ - Maps a random colour to a DataFrame attribute. - """ - - def __init__(self, column): - """Initialize ScaleRandomColour instance. - - Parameters: - ----------- - column: string, pandas DataFrame column name - """ - self.column = column - self.categorical = True - - def __call__(self, data, index): - """Return a tuple of three floats, representing - an RGB colour. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - """ - random.seed(data[self.column].iget(index)) - return [random.random() for _ in range(3)] - - -class ScaleConstant(Scale): - """ - Constant returning scale. Usually used automatically. - """ - - def __init__(self, value): - """Initialize ScaleConstant instance. 
- - Parameters: - ----------- - value: any Python value to be returned when called - """ - self.value = value - self.categorical = False - - def __call__(self, data, index): - """Return the constant value. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - A constant value specified during initialisation - """ - return self.value - - -def default_aes(x=None, y=None): - """Create the default aesthetics dictionary. - - Parameters: - ----------- - x: string, DataFrame column name - y: string, DataFrame column name - - Returns: - -------- - a dictionary with aesthetics bindings - """ - return { - 'x': x, - 'y': y, - 'size': ScaleConstant(40.0), - 'colour': ScaleConstant('grey'), - 'shape': ScaleConstant('o'), - 'alpha': ScaleConstant(1.0), - } - - -def make_aes(x=None, y=None, size=None, colour=None, shape=None, alpha=None): - """Create an empty aesthetics dictionary. - - Parameters: - ----------- - x: string, DataFrame column name - y: string, DataFrame column name - size: function, binding for size attribute of Geoms - colour: function, binding for colour attribute of Geoms - shape: function, binding for shape attribute of Geoms - alpha: function, binding for alpha attribute of Geoms - - Returns: - -------- - a dictionary with aesthetics bindings - """ - if not hasattr(size, '__call__') and size is not None: - size = ScaleConstant(size) - if not hasattr(colour, '__call__') and colour is not None: - colour = ScaleConstant(colour) - if not hasattr(shape, '__call__') and shape is not None: - shape = ScaleConstant(shape) - if not hasattr(alpha, '__call__') and alpha is not None: - alpha = ScaleConstant(alpha) - if any([isinstance(size, scale) - for scale in [ScaleConstant, ScaleSize]]) or size is None: - pass - else: - raise ValueError( - 'size mapping should be done through ScaleConstant or ScaleSize') - if (any([isinstance(colour, scale) - for scale in [ScaleConstant, ScaleGradient, - ScaleGradient2, ScaleRandomColour]]) or - colour is None): - pass - else: - raise ValueError('colour mapping should be done through ' - 'ScaleConstant, ScaleRandomColour, ScaleGradient ' - 'or ScaleGradient2') - if (any([isinstance(shape, scale) - for scale in [ScaleConstant, ScaleShape]]) or - shape is None): - pass - else: - raise ValueError('shape mapping should be done through ScaleConstant ' - 'or ScaleShape') - if (any([isinstance(alpha, scale) for scale in [ScaleConstant]]) or - alpha is None): - pass - else: - raise ValueError('alpha mapping should be done through ScaleConstant') - return { - 'x': x, - 'y': y, - 'size': size, - 'colour': colour, - 'shape': shape, - 'alpha': alpha, - } - - -class Layer: - """ - Layer object representing a single plot layer. - """ - - def __init__(self, data=None, **kwds): - """Initialize layer object. - - Parameters: - ----------- - data: pandas DataFrame instance - aes: aesthetics dictionary with bindings - """ - self.data = data - self.aes = make_aes(**kwds) - self.legend = {} - - def work(self, fig=None, ax=None): - """Do the drawing (usually) work. - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis object - - Returns: - -------- - a tuple with the same figure and axis instances - """ - return fig, ax - - -class GeomPoint(Layer): - - def work(self, fig=None, ax=None): - """Render the layer on a matplotlib axis. - You can specify either a figure or an axis to draw on. 
- - Parameters: - ----------- - fig: matplotlib figure object - ax: matplotlib axis object to draw on - - Returns: - -------- - fig, ax: matplotlib figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - for index in range(len(self.data)): - row = self.data.iloc[index] - x = row[self.aes['x']] - y = row[self.aes['y']] - size_scaler = self.aes['size'] - colour_scaler = self.aes['colour'] - shape_scaler = self.aes['shape'] - alpha = self.aes['alpha'] - size_value = size_scaler(self.data, index) - colour_value = colour_scaler(self.data, index) - marker_value = shape_scaler(self.data, index) - alpha_value = alpha(self.data, index) - patch = ax.scatter(x, y, - s=size_value, - c=colour_value, - marker=marker_value, - alpha=alpha_value) - label = [] - if colour_scaler.categorical: - label += [colour_scaler.column, row[colour_scaler.column]] - if shape_scaler.categorical: - label += [shape_scaler.column, row[shape_scaler.column]] - self.legend[tuple(label)] = patch - ax.set_xlabel(self.aes['x']) - ax.set_ylabel(self.aes['y']) - return fig, ax - - -class GeomPolyFit(Layer): - """ - Draw a polynomial fit of specified degree. - """ - - def __init__(self, degree, lw=2.0, colour='grey'): - """Initialize GeomPolyFit object. - - Parameters: - ----------- - degree: an integer, polynomial degree - lw: line width - colour: matplotlib colour - """ - self.degree = degree - self.lw = lw - self.colour = colour - Layer.__init__(self) - - def work(self, fig=None, ax=None): - """Draw the polynomial fit on matplotlib figure or axis - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis - - Returns: - -------- - a tuple with figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - from numpy.polynomial.polynomial import polyfit - from numpy.polynomial.polynomial import polyval - x = self.data[self.aes['x']] - y = self.data[self.aes['y']] - min_x = min(x) - max_x = max(x) - c = polyfit(x, y, self.degree) - x_ = np.linspace(min_x, max_x, len(x)) - y_ = polyval(x_, c) - ax.plot(x_, y_, lw=self.lw, c=self.colour) - return fig, ax - - -class GeomScatter(Layer): - """ - An efficient scatter plot, use this instead of GeomPoint for speed. - """ - - def __init__(self, marker='o', colour='lightblue', alpha=1.0): - """Initialize GeomScatter instance. - - Parameters: - ----------- - marker: matplotlib marker string - colour: matplotlib colour - alpha: matplotlib alpha - """ - self.marker = marker - self.colour = colour - self.alpha = alpha - Layer.__init__(self) - - def work(self, fig=None, ax=None): - """Draw a scatter plot on matplotlib figure or axis - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis - - Returns: - -------- - a tuple with figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - x = self.data[self.aes['x']] - y = self.data[self.aes['y']] - ax.scatter(x, y, marker=self.marker, c=self.colour, alpha=self.alpha) - return fig, ax - - -class GeomHistogram(Layer): - """ - An efficient histogram, use this instead of GeomBar for speed. - """ - - def __init__(self, bins=10, colour='lightblue'): - """Initialize GeomHistogram instance. 
- - Parameters: - ----------- - bins: integer, number of histogram bins - colour: matplotlib colour - """ - self.bins = bins - self.colour = colour - Layer.__init__(self) - - def work(self, fig=None, ax=None): - """Draw a histogram on matplotlib figure or axis - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis - - Returns: - -------- - a tuple with figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - x = self.data[self.aes['x']] - ax.hist(_values_from_object(x), self.bins, facecolor=self.colour) - ax.set_xlabel(self.aes['x']) - return fig, ax - - -class GeomDensity(Layer): - """ - A kernel density estimation plot. - """ - - def work(self, fig=None, ax=None): - """Draw a one dimensional kernel density plot. - You can specify either a figure or an axis to draw on. - - Parameters: - ----------- - fig: matplotlib figure object - ax: matplotlib axis object to draw on - - Returns: - -------- - fig, ax: matplotlib figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - from scipy.stats import gaussian_kde - x = self.data[self.aes['x']] - gkde = gaussian_kde(x) - ind = np.linspace(x.min(), x.max(), 200) - ax.plot(ind, gkde.evaluate(ind)) - return fig, ax - - -class GeomDensity2D(Layer): - - def work(self, fig=None, ax=None): - """Draw a two dimensional kernel density plot. - You can specify either a figure or an axis to draw on. - - Parameters: - ----------- - fig: matplotlib figure object - ax: matplotlib axis object to draw on - - Returns: - -------- - fig, ax: matplotlib figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - x = self.data[self.aes['x']] - y = self.data[self.aes['y']] - - # TODO: unused? - # rvs = np.array([x, y]) - - x_min = x.min() - x_max = x.max() - y_min = y.min() - y_max = y.max() - X, Y = np.mgrid[x_min:x_max:200j, y_min:y_max:200j] - positions = np.vstack([X.ravel(), Y.ravel()]) - values = np.vstack([x, y]) - import scipy.stats as stats - kernel = stats.gaussian_kde(values) - Z = np.reshape(kernel(positions).T, X.shape) - ax.contour(Z, extent=[x_min, x_max, y_min, y_max]) - return fig, ax - - -class TrellisGrid(Layer): - - def __init__(self, by): - """Initialize TreelisGrid instance. - - Parameters: - ----------- - by: column names to group by - """ - if len(by) != 2: - raise ValueError("You must give a list of length 2 to group by") - elif by[0] == '.' and by[1] == '.': - raise ValueError( - "At least one of grouping attributes must be not a dot") - self.by = by - - def trellis(self, layers): - """ - Create a trellis structure for a list of layers. Each layer will be - cloned with different data in to a two dimensional grid. - - Parameters: - ----------- - layers: a list of Layer objects - - Returns: - -------- - trellised_layers: Clones of each layer in the list arranged in a - trellised latice - """ - trellised_layers = [] - for layer in layers: - data = layer.data - if self.by[0] == '.': - grouped = data.groupby(self.by[1]) - elif self.by[1] == '.': - grouped = data.groupby(self.by[0]) - else: - grouped = data.groupby(self.by) - groups = list(grouped.groups.keys()) - if self.by[0] == '.' 
or self.by[1] == '.': - shingle1 = set([g for g in groups]) - else: - shingle1 = set([g[0] for g in groups]) - shingle2 = set([g[1] for g in groups]) - if self.by[0] == '.': - self.rows = 1 - self.cols = len(shingle1) - elif self.by[1] == '.': - self.rows = len(shingle1) - self.cols = 1 - else: - self.rows = len(shingle1) - self.cols = len(shingle2) - trellised = [[None for _ in range(self.cols)] - for _ in range(self.rows)] - self.group_grid = [[None for _ in range( - self.cols)] for _ in range(self.rows)] - row = 0 - col = 0 - for group, data in grouped: - new_layer = deepcopy(layer) - new_layer.data = data - trellised[row][col] = new_layer - self.group_grid[row][col] = group - col += 1 - if col >= self.cols: - col = 0 - row += 1 - trellised_layers.append(trellised) - return trellised_layers - - -def dictionary_union(dict1, dict2): - """Take two dictionaries, return dictionary union. - - Parameters: - ----------- - dict1: Python dictionary - dict2: Python dictionary - - Returns: - -------- - A union of the dictionaries. It assumes that values - with the same keys are identical. - """ - keys1 = list(dict1.keys()) - keys2 = list(dict2.keys()) - result = {} - for key1 in keys1: - result[key1] = dict1[key1] - for key2 in keys2: - result[key2] = dict2[key2] - return result - - -def merge_aes(layer1, layer2): - """Merges the aesthetics dictionaries for the two layers. - Look up sequence_layers function. Which layer is first and which - one is second is important. - - Parameters: - ----------- - layer1: Layer object - layer2: Layer object - """ - for key in layer2.aes.keys(): - if layer2.aes[key] is None: - layer2.aes[key] = layer1.aes[key] - - -def sequence_layers(layers): - """ - Go through the list of layers and fill in the missing bits of information. - The basic rules are this: - - * If the current layer has data set to None, take the data from previous - layer. - * For each aesthetic mapping, if that mapping is set to None, take it from - previous layer. - - Parameters: - ----------- - layers: a list of Layer objects - """ - for layer1, layer2 in zip(layers[:-1], layers[1:]): - if layer2.data is None: - layer2.data = layer1.data - merge_aes(layer1, layer2) - return layers - - -def sequence_grids(layer_grids): - """ - Go through the list of layer girds and perform the same thing as - sequence_layers. - - Parameters: - ----------- - layer_grids: a list of two dimensional layer grids - """ - for grid1, grid2 in zip(layer_grids[:-1], layer_grids[1:]): - for row1, row2 in zip(grid1, grid2): - for layer1, layer2 in zip(row1, row2): - if layer2.data is None: - layer2.data = layer1.data - merge_aes(layer1, layer2) - return layer_grids - - -def work_grid(grid, fig): - """ - Take a two dimensional grid, add subplots to a figure for each cell and do - layer work. - - Parameters: - ----------- - grid: a two dimensional grid of layers - fig: matplotlib figure to draw on - - Returns: - -------- - axes: a two dimensional list of matplotlib axes - """ - nrows = len(grid) - ncols = len(grid[0]) - axes = [[None for _ in range(ncols)] for _ in range(nrows)] - for row in range(nrows): - for col in range(ncols): - axes[row][col] = fig.add_subplot( - nrows, ncols, ncols * row + col + 1) - grid[row][col].work(ax=axes[row][col]) - return axes - - -def adjust_subplots(fig, axes, trellis, layers): - """Adjust the subtplots on matplotlib figure with the - fact that we have a trellis plot in mind. 
- - Parameters: - ----------- - fig: matplotlib figure - axes: a two dimensional grid of matplotlib axes - trellis: TrellisGrid object - layers: last grid of layers in the plot - """ - # Flatten the axes grid - axes = [ax for row in axes for ax in row] - min_x = min([ax.get_xlim()[0] for ax in axes]) - max_x = max([ax.get_xlim()[1] for ax in axes]) - min_y = min([ax.get_ylim()[0] for ax in axes]) - max_y = max([ax.get_ylim()[1] for ax in axes]) - [ax.set_xlim(min_x, max_x) for ax in axes] - [ax.set_ylim(min_y, max_y) for ax in axes] - for index, axis in enumerate(axes): - if index % trellis.cols == 0: - pass - else: - axis.get_yaxis().set_ticks([]) - axis.set_ylabel('') - if index / trellis.cols == trellis.rows - 1: - pass - else: - axis.get_xaxis().set_ticks([]) - axis.set_xlabel('') - if trellis.by[0] == '.': - label1 = "%s = %s" % (trellis.by[1], trellis.group_grid[ - index // trellis.cols][index % trellis.cols]) - label2 = None - elif trellis.by[1] == '.': - label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[ - index // trellis.cols][index % trellis.cols]) - label2 = None - else: - label1 = "%s = %s" % ( - trellis.by[0], - trellis.group_grid[index // trellis.cols] - [index % trellis.cols][0]) - label2 = "%s = %s" % ( - trellis.by[1], - trellis.group_grid[index // trellis.cols] - [index % trellis.cols][1]) - if label2 is not None: - axis.table(cellText=[[label1], [label2]], - loc='top', cellLoc='center', - cellColours=[['lightgrey'], ['lightgrey']]) - else: - axis.table(cellText=[[label1]], loc='top', - cellLoc='center', cellColours=[['lightgrey']]) - # Flatten the layer grid - layers = [layer for row in layers for layer in row] - legend = {} - for layer in layers: - legend = dictionary_union(legend, layer.legend) - patches = [] - labels = [] - if len(list(legend.keys())) == 0: - key_function = lambda tup: tup - elif len(list(legend.keys())[0]) == 2: - key_function = lambda tup: (tup[1]) - else: - key_function = lambda tup: (tup[1], tup[3]) - for key in sorted(list(legend.keys()), key=key_function): - value = legend[key] - patches.append(value) - if len(key) == 2: - col, val = key - labels.append("%s" % str(val)) - elif len(key) == 4: - col1, val1, col2, val2 = key - labels.append("%s, %s" % (str(val1), str(val2))) - else: - raise ValueError( - "Maximum 2 categorical attributes to display a lengend of") - if len(legend): - fig.legend(patches, labels, loc='upper right') - fig.subplots_adjust(wspace=0.05, hspace=0.2) - - -class RPlot: - """ - The main plot object. Add layers to an instance of this object to create a - plot. - """ - - def __init__(self, data, x=None, y=None): - """Initialize RPlot instance. - - Parameters: - ----------- - data: pandas DataFrame instance - x: string, DataFrame column name - y: string, DataFrame column name - """ - self.layers = [Layer(data, **default_aes(x=x, y=y))] - - def add(self, layer): - """Add a layer to RPlot instance. - - Parameters: - ----------- - layer: Layer instance - """ - if not isinstance(layer, Layer): - raise TypeError( - "The operand on the right side of + must be a Layer instance") - self.layers.append(layer) - - def render(self, fig=None): - """Render all the layers on a matplotlib figure. 
- - Parameters: - ----------- - fig: matplotlib figure - """ - import matplotlib.pyplot as plt - if fig is None: - fig = plt.gcf() - # Look for the last TrellisGrid instance in the layer list - last_trellis = None - for layer in self.layers: - if isinstance(layer, TrellisGrid): - last_trellis = layer - if last_trellis is None: - # We have a simple, non-trellised plot - new_layers = sequence_layers(self.layers) - for layer in new_layers: - layer.work(fig=fig) - legend = {} - for layer in new_layers: - legend = dictionary_union(legend, layer.legend) - patches = [] - labels = [] - if len(list(legend.keys())) == 0: - key_function = lambda tup: tup - elif len(list(legend.keys())[0]) == 2: - key_function = lambda tup: (tup[1]) - else: - key_function = lambda tup: (tup[1], tup[3]) - for key in sorted(list(legend.keys()), key=key_function): - value = legend[key] - patches.append(value) - if len(key) == 2: - col, val = key - labels.append("%s" % str(val)) - elif len(key) == 4: - col1, val1, col2, val2 = key - labels.append("%s, %s" % (str(val1), str(val2))) - else: - raise ValueError("Maximum 2 categorical attributes to " - "display a lengend of") - if len(legend): - fig.legend(patches, labels, loc='upper right') - else: - # We have a trellised plot. First let's remove all other - # TrellisGrid instances from the layer list, including this one. - new_layers = [] - for layer in self.layers: - if not isinstance(layer, TrellisGrid): - new_layers.append(layer) - new_layers = sequence_layers(new_layers) - # Now replace the old layers by their trellised versions - new_layers = last_trellis.trellis(new_layers) - # Prepare the subplots and draw on them - new_layers = sequence_grids(new_layers) - axes_grids = [work_grid(grid, fig) for grid in new_layers] - axes_grid = axes_grids[-1] - adjust_subplots(fig, axes_grid, last_trellis, new_layers[-1]) - # And we're done - return fig From 768bf495b9b1f2e6a51708ca6ba83da239cbe504 Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Tue, 2 Aug 2016 06:50:10 -0400 Subject: [PATCH 205/359] =?UTF-8?q?BUG:=20fixes=2013822,=20incorrect=20Key?= =?UTF-8?q?Error=20string=20with=20non-unique=20columns=20w=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #13822 Author: Shawn Heide Closes #13845 from shawnheide/BUG_13822 and squashes the following commits: ae56be0 [Shawn Heide] BUG: fixes 13822, incorrect KeyError string with non-unique columns when missing column is accessed --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/indexing.py | 4 +++- pandas/tests/indexing/test_indexing.py | 9 +++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2eadde8af4912..64e6bc0ab307c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -863,3 +863,4 @@ Bug Fixes - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) +- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0cba8308c1c53..933ecd1b8de86 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1217,7 +1217,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): else: (indexer, missing) = 
labels.get_indexer_non_unique(objarr) - check = indexer + # 'indexer' has dupes, create 'check' using 'missing' + check = np.zeros_like(objarr) + check[missing] = -1 mask = check == -1 if mask.any(): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 44c7f2277293d..a96e4acfad89b 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1332,6 +1332,15 @@ def f(): self.assertEqual(result, 3) self.assertRaises(ValueError, lambda: df.at['a', 0]) + # GH 13822, incorrect error string with non-unique columns when missing + # column is accessed + df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]}) + df.columns = ['x', 'x', 'z'] + + # Check that we get the correct value in the KeyError + self.assertRaisesRegexp(KeyError, "\['y'\] not in index", + lambda: df[['x', 'y', 'z']]) + def test_loc_getitem_label_slice(self): # label slices (with ints) From 66c3b46f603d635ac5b4310a9da5672cd2417330 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 3 Aug 2016 06:15:47 -0400 Subject: [PATCH 206/359] TST: Add first line comment tests in read_csv (#13881) Closes gh-4623. --- pandas/io/tests/parser/comment.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/io/tests/parser/comment.py b/pandas/io/tests/parser/comment.py index f7cd1e190ec16..9987a017cf985 100644 --- a/pandas/io/tests/parser/comment.py +++ b/pandas/io/tests/parser/comment.py @@ -104,3 +104,15 @@ def test_custom_comment_char(self): result = self.read_csv(StringIO(data), comment='#') expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) tm.assert_frame_equal(result, expected) + + def test_commment_first_line(self): + # see gh-4623 + data = '# notes\na,b,c\n# more notes\n1,2,3' + + expected = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']) + result = self.read_csv(StringIO(data), comment='#') + tm.assert_frame_equal(result, expected) + + expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']}) + result = self.read_csv(StringIO(data), comment='#', header=None) + tm.assert_frame_equal(result, expected) From caf69d501e7322c1763bb0377ff50623f0a27977 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Wed, 3 Aug 2016 19:18:04 +0900 Subject: [PATCH 207/359] BUG: SparseDataFrame may not preserve passed dtype (#13866) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/sparse/array.py | 7 ++----- pandas/sparse/frame.py | 2 +- pandas/sparse/tests/test_frame.py | 22 ++++++++++++++++++++++ pandas/sparse/tests/test_series.py | 9 +++++++++ pandas/tests/series/test_subclass.py | 19 ++++++++++++++++++- 6 files changed, 53 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 64e6bc0ab307c..7ed98bd3170c0 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -752,6 +752,7 @@ Bug Fixes - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) - Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second 
scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 35233d1b6ba94..a0dbb35bffe92 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -9,7 +9,6 @@ import pandas as pd from pandas.core.base import PandasObject -import pandas.core.common as com from pandas import compat, lib from pandas.compat import range @@ -577,11 +576,9 @@ def _maybe_to_dense(obj): def _maybe_to_sparse(array): + """ array must be SparseSeries or SparseArray """ if isinstance(array, ABCSparseSeries): - array = SparseArray(array.values, sparse_index=array.sp_index, - fill_value=array.fill_value, copy=True) - if not isinstance(array, SparseArray): - array = com._values_from_object(array) + array = array.values.copy() return array diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 2ea0536ca4fbb..985899e6c6b79 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -151,7 +151,7 @@ def _init_dict(self, data, index, columns, dtype=None): if not isinstance(v, SparseSeries): v = sp_maker(v.values) elif isinstance(v, SparseArray): - v = sp_maker(v.values) + v = v.copy() else: if isinstance(v, dict): v = [v.get(i, nan) for i in index] diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 43d35a4e7f72e..9514f9322f68e 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -192,6 +192,28 @@ def test_constructor_from_series(self): # without sparse value raises error # df2 = SparseDataFrame([x2_sparse, y]) + def test_constructor_preserve_attr(self): + # GH 13866 + arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + + df = pd.SparseDataFrame({'x': arr}) + self.assertEqual(df['x'].dtype, np.int64) + self.assertEqual(df['x'].fill_value, 0) + + s = pd.SparseSeries(arr, name='x') + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + + df = pd.SparseDataFrame(s) + self.assertEqual(df['x'].dtype, np.int64) + self.assertEqual(df['x'].fill_value, 0) + + df = pd.SparseDataFrame({'x': s}) + self.assertEqual(df['x'].dtype, np.int64) + self.assertEqual(df['x'].fill_value, 0) + def test_dtypes(self): df = DataFrame(np.random.randn(10000, 4)) df.ix[:9998] = np.nan diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 27112319ea915..f9ac7d9d34072 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -136,6 +136,15 @@ def test_construct_DataFrame_with_sp_series(self): result = df.ftypes tm.assert_series_equal(expected, result) + def test_constructor_preserve_attr(self): + arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + + s = pd.SparseSeries(arr, name='x') + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + def test_series_density(self): # GH2803 ts = Series(np.random.randn(10)) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index dabecefaee9d1..be7a0eccf6b7c 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,6 +1,8 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 +import numpy as np +import pandas as pd import pandas.util.testing as tm @@ -32,6 
+34,11 @@ def test_to_frame(self): tm.assert_frame_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedDataFrame) + +class TestSparseSeriesSubclassing(tm.TestCase): + + _multiprocess_can_split_ = True + def test_subclass_sparse_slice(self): s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) tm.assert_sp_series_equal(s.loc[1:3], @@ -53,5 +60,15 @@ def test_subclass_sparse_addition(self): def test_subclass_sparse_to_frame(self): s = tm.SubclassedSparseSeries([1, 2], index=list('abcd'), name='xxx') res = s.to_frame() - exp = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, index=list('abcd')) + + exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block') + exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr}, + index=list('abcd')) + tm.assert_sp_frame_equal(res, exp) + + s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('abcd'), + name='xxx') + res = s.to_frame() + exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]}, + index=list('abcd')) tm.assert_sp_frame_equal(res, exp) From 3db33659805fbb991556bfd99b1903a7d85de455 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 3 Aug 2016 06:21:12 -0400 Subject: [PATCH 208/359] ENH: add sort_categories argument to union_categoricals - needed for #13406, follow-up to #13763 Author: Chris Author: sinhrks Closes #13846 from chris-b1/union_categoricals_ordered and squashes the following commits: 3a710f0 [Chris] lint fix ff0bb5e [Chris] add follow-up PRs to whatsnew ecb2ae9 [Chris] more tests; handle sorth with ordered eea1777 [Chris] skip r-esort when possible on fastpath c559662 [sinhrks] ENH: add sort_categories argument to union_categoricals --- doc/source/categorical.rst | 10 ++- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tools/tests/test_concat.py | 100 ++++++++++++++++++++++++++++++ pandas/types/concat.py | 68 +++++++++++--------- 4 files changed, 150 insertions(+), 30 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index da9c707e07552..d59ad68c9ea83 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -656,7 +656,7 @@ Unioning .. versionadded:: 0.19.0 If you want to combine categoricals that do not necessarily have -the same categories, the `union_categorical` function will +the same categories, the ``union_categoricals`` function will combine a list-like of categoricals. The new categories will be the union of the categories being combined. @@ -667,6 +667,14 @@ will be the union of the categories being combined. b = pd.Categorical(["a", "b"]) union_categoricals([a, b]) +By default, the resulting categories will be ordered as +they appear in the data. If you want the categories to +be lexsorted, use ``sort_categories=True`` argument. + +.. ipython:: python + + union_categoricals([a, b], sort_categories=True) + .. 
note:: In addition to the "easy" case of combining two categoricals of the same diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 7ed98bd3170c0..8b8ab505c9d44 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -336,7 +336,7 @@ Other enhancements - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846') - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index dd5b4936c70bb..968ea979f7c75 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -989,6 +989,106 @@ def test_union_categoricals_ordered(self): with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) + def test_union_categoricals_sort(self): + # GH 13846 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'x', 'y', 'z']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['b', 'x']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([c1, c2], sort_categories=True) + + def 
test_union_categoricals_sort_false(self): + # GH 13846 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['b', 'a', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['x', 'b']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['b', 'a', 'a', 'c'], + categories=['b', 'a', 'c'], ordered=True) + tm.assert_categorical_equal(result, expected) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index e860ba3e201e9..0a985dd6141ae 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -211,22 +211,23 @@ def convert_categorical(x): return Categorical(concatted, rawcats) -def union_categoricals(to_union): +def union_categoricals(to_union, sort_categories=False): """ Combine list-like of Categoricals, unioning categories. All - must have the same dtype, and none can be ordered. + categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categoricals + sort_categories : boolean, default False + If true, resulting categories will be lexsorted, otherwise + they will be ordered as they appear in the data. 
Returns ------- - Categorical - A single array, categories will be ordered as they - appear in the list + result : Categorical Raises ------ @@ -234,6 +235,7 @@ def union_categoricals(to_union): - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical + - sort_categories=True and Categoricals are ordered ValueError Emmpty list of categoricals passed """ @@ -244,19 +246,43 @@ def union_categoricals(to_union): first = to_union[0] - if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) - for c in to_union): + if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) + for other in to_union[1:]): raise TypeError("dtype of categories must be the same") + ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): - return Categorical(np.concatenate([c.codes for c in to_union]), - categories=first.categories, ordered=first.ordered, - fastpath=True) + # identical categories - fastpath + categories = first.categories + ordered = first.ordered + new_codes = np.concatenate([c.codes for c in to_union]) + + if sort_categories and ordered: + raise TypeError("Cannot use sort_categories=True with " + "ordered Categoricals") + + if sort_categories and not categories.is_monotonic_increasing: + categories = categories.sort_values() + indexer = first.categories.get_indexer(categories) + new_codes = take_1d(indexer, new_codes, fill_value=-1) elif all(not c.ordered for c in to_union): - # not ordered - pass + # different categories - union and recode + cats = first.categories.append([c.categories for c in to_union[1:]]) + categories = Index(cats.unique()) + if sort_categories: + categories = categories.sort_values() + + new_codes = [] + for c in to_union: + if len(c.categories) > 0: + indexer = categories.get_indexer(c.categories) + new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) + else: + # must be all NaN + new_codes.append(c.codes) + new_codes = np.concatenate(new_codes) else: - # to show a proper error message + # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") @@ -264,21 +290,7 @@ def union_categoricals(to_union): else: raise TypeError('Categorical.ordered must be the same') - cats = first.categories - unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() - categories = Index(unique_cats) - - new_codes = [] - for c in to_union: - if len(c.categories) > 0: - indexer = categories.get_indexer(c.categories) - new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) - else: - # must be all NaN - new_codes.append(c.codes) - - new_codes = np.concatenate(new_codes) - return Categorical(new_codes, categories=categories, ordered=False, + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) From 97de42abbea588fc5a46be5c58788958fd817b7f Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Wed, 3 Aug 2016 06:25:30 -0400 Subject: [PATCH 209/359] ENH: Allow users to specify whether gbq should use standard SQL closes #13615 Author: Anthonios Partheniou Closes #13850 from parthea/gbq-enable-standard-sql-dialect and squashes the following commits: e1fbb07 [Anthonios Partheniou] Allow users to specify whether gbq should use standard SQL #13615 --- doc/source/io.rst | 7 ++++++ doc/source/whatsnew/v0.19.0.txt | 6 +++++ pandas/io/gbq.py | 25 ++++++++++++++++--- pandas/io/tests/test_gbq.py | 44 
+++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ee5734aaf9494..2866371cce61a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4482,6 +4482,13 @@ destination DataFrame as well as a preferred column order as follows: You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``. +.. note:: + + The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL + or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``. For more information + on BigQuery's standard SQL, see `BigQuery SQL Reference + `__ + .. _io.bigquery_writer: diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8b8ab505c9d44..0c60aeeae333b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -301,6 +301,12 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +.. _whatsnew_0170.gbq: + +Google BigQuery Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). + .. _whatsnew_0190.enhancements.other: Other enhancements diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 140f5cc6bb6e3..6288fdb609962 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -145,13 +145,14 @@ class GbqConnector(object): scope = 'https://www.googleapis.com/auth/bigquery' def __init__(self, project_id, reauth=False, verbose=False, - private_key=None): + private_key=None, dialect='legacy'): _check_google_client_version() _test_google_api_imports() self.project_id = project_id self.reauth = reauth self.verbose = verbose self.private_key = private_key + self.dialect = dialect self.credentials = self.get_credentials() self.service = self.get_service() @@ -334,7 +335,8 @@ def run_query(self, query): job_data = { 'configuration': { 'query': { - 'query': query + 'query': query, + 'useLegacySql': self.dialect == 'legacy' # 'allowLargeResults', 'createDisposition', # 'preserveNulls', destinationTable, useQueryCache } @@ -563,7 +565,7 @@ def _parse_entry(field_value, field_type): def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, verbose=True, private_key=None): + reauth=False, verbose=True, private_key=None, dialect='legacy'): """Load data from Google BigQuery. THIS IS AN EXPERIMENTAL LIBRARY @@ -602,6 +604,17 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, or string contents. This is useful for remote server authentication (eg. jupyter iPython notebook on remote host) + .. versionadded:: 0.18.1 + + dialect : {'legacy', 'standard'}, default 'legacy' + 'legacy' : Use BigQuery's legacy SQL dialect. + 'standard' : Use BigQuery's standard SQL (beta), which is + compliant with the SQL 2011 standard. For more information + see `BigQuery SQL Reference + `__ + + .. 
versionadded:: 0.19.0 + Returns ------- df: DataFrame @@ -612,8 +625,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, if not project_id: raise TypeError("Missing required parameter: project_id") + if dialect not in ('legacy', 'standard'): + raise ValueError("'{0}' is not valid for dialect".format(dialect)) + connector = GbqConnector(project_id, reauth=reauth, verbose=verbose, - private_key=private_key) + private_key=private_key, + dialect=dialect) schema, pages = connector.run_query(query) dataframe_list = [] while len(pages) > 0: diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 278c5d7215624..0d8512ffb5524 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -557,6 +557,50 @@ def test_zero_rows(self): expected_result = DataFrame(page_array, columns=['title', 'id']) self.assert_frame_equal(df, expected_result) + def test_legacy_sql(self): + legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10" + + # Test that a legacy sql statement fails when + # setting dialect='standard' + with tm.assertRaises(gbq.GenericGBQException): + gbq.read_gbq(legacy_sql, project_id=PROJECT_ID, + dialect='standard') + + # Test that a legacy sql statement succeeds when + # setting dialect='legacy' + df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID, + dialect='legacy') + self.assertEqual(len(df.drop_duplicates()), 10) + + def test_standard_sql(self): + standard_sql = "SELECT DISTINCT id FROM " \ + "`publicdata.samples.wikipedia` LIMIT 10" + + # Test that a standard sql statement fails when using + # the legacy SQL dialect (default value) + with tm.assertRaises(gbq.GenericGBQException): + gbq.read_gbq(standard_sql, project_id=PROJECT_ID) + + # Test that a standard sql statement succeeds when + # setting dialect='standard' + df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID, + dialect='standard') + self.assertEqual(len(df.drop_duplicates()), 10) + + def test_invalid_option_for_sql_dialect(self): + sql_statement = "SELECT DISTINCT id FROM " \ + "`publicdata.samples.wikipedia` LIMIT 10" + + # Test that an invalid option for `dialect` raises ValueError + with tm.assertRaises(ValueError): + gbq.read_gbq(sql_statement, project_id=PROJECT_ID, + dialect='invalid') + + # Test that a correct option for dialect succeeds + # to make sure ValueError was due to invalid dialect + gbq.read_gbq(sql_statement, project_id=PROJECT_ID, + dialect='standard') + class TestToGBQIntegration(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 From e2cb79937d42f926dd0aae8680118f7ddfd70230 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Wed, 3 Aug 2016 23:51:53 +0900 Subject: [PATCH 210/359] DOC: small doc fixed (#13886) --- doc/source/whatsnew/v0.19.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0c60aeeae333b..0363d49333253 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -301,7 +301,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -.. _whatsnew_0170.gbq: +.. 
_whatsnew_0190.gbq: Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -342,7 +342,7 @@ Other enhancements - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846') +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) From 5f476082097fae2cf977187292ad704a825b3660 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 3 Aug 2016 18:39:13 -0400 Subject: [PATCH 211/359] BUG: ufunc is not applied to sparse.fill_value Author: sinhrks Closes #13853 from sinhrks/sparse_ufunc and squashes the following commits: a14f573 [sinhrks] BUG: ufunc is not applied to sparse.fill_value --- doc/source/sparse.rst | 22 ++++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/sparse/array.py | 28 ++++++++++++++++++ pandas/sparse/frame.py | 2 +- pandas/sparse/series.py | 19 ++++++++---- pandas/sparse/tests/test_array.py | 46 ++++++++++++++++++++++++++++++ pandas/sparse/tests/test_series.py | 16 +++++++++++ 7 files changed, 128 insertions(+), 7 deletions(-) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 41ed0bf16ebae..2496335dc7b71 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -130,6 +130,28 @@ keeps an arrays of all of the locations where the data are not equal to the fill value. The ``block`` format tracks only the locations and sizes of blocks of data. +.. _sparse.calculation: + +Sparse Calculation +------------------ + +You can apply NumPy *ufuncs* to ``SparseArray`` and get a ``SparseArray`` as a result. + +.. ipython:: python + + arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) + np.abs(arr) + + +The *ufunc* is also applied to ``fill_value``. This is needed to get +the correct dense result. + +.. ipython:: python + + arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) + np.abs(arr) + np.abs(arr).to_dense() + .. 
_sparse.scipysparse: Interaction with scipy.sparse diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0363d49333253..f53db81867377 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -759,6 +759,8 @@ Bug Fixes - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) - Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) - Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) +- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) +- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index a0dbb35bffe92..5e36bc514b419 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -212,6 +212,34 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + def __array_wrap__(self, out_arr, context=None): + """ + NumPy calls this method when ufunc is applied + + Parameters + ---------- + + out_arr : ndarray + ufunc result (note that ufunc is only applied to sp_values) + context : tuple of 3 elements (ufunc, signature, domain) + for example, following is a context when np.sin is applied to + SparseArray, + + (, (SparseArray,), 0)) + + See http://docs.scipy.org/doc/numpy/user/basics.subclassing.html + """ + if isinstance(context, tuple) and len(context) == 3: + ufunc, args, domain = context + # to apply ufunc only to fill_value (to avoid recursive call) + args = [getattr(a, 'fill_value', a) for a in args] + fill_value = ufunc(self.fill_value, *args[1:]) + else: + fill_value = self.fill_value + + return self._simple_new(out_arr, sp_index=self.sp_index, + fill_value=fill_value) + def __array_finalize__(self, obj): """ Gets called after any ufunc or other array operations, necessary diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 985899e6c6b79..b6a1e1e48c5c4 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -706,7 +706,7 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): new_series = {} for k, v in compat.iteritems(self): applied = func(v) - applied.fill_value = func(applied.fill_value) + applied.fill_value = func(v.fill_value) new_series[k] = applied return self._constructor( new_series, index=self.index, columns=self.columns, diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 6c4392dbf7cb4..e8f4feffb725f 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -307,13 +307,22 @@ def __unicode__(self): rep = '%s\n%s' % (series_rep, repr(self.sp_index)) return rep - def __array_wrap__(self, result): + def __array_wrap__(self, result, context=None): """ Gets called prior to a ufunc (and after) + + See SparseArray.__array_wrap__ for detail. 
""" + if isinstance(context, tuple) and len(context) == 3: + ufunc, args, domain = context + args = [getattr(a, 'fill_value', a) for a in args] + fill_value = ufunc(self.fill_value, *args[1:]) + else: + fill_value = self.fill_value + return self._constructor(result, index=self.index, sparse_index=self.sp_index, - fill_value=self.fill_value, + fill_value=fill_value, copy=False).__finalize__(self) def __array_finalize__(self, obj): @@ -434,10 +443,8 @@ def abs(self): ------- abs: type of caller """ - res_sp_values = np.abs(self.sp_values) - return self._constructor(res_sp_values, index=self.index, - sparse_index=self.sp_index, - fill_value=self.fill_value).__finalize__(self) + return self._constructor(np.abs(self.values), + index=self.index).__finalize__(self) def get(self, label, default=None): """ diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index dd2126d0f52d2..dcd5df3791fcb 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -829,6 +829,52 @@ def test_numpy_mean(self): tm.assertRaisesRegexp(ValueError, msg, np.mean, SparseArray(data), out=out) + def test_ufunc(self): + # GH 13853 make sure ufunc is applied to fill_value + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([1, np.nan, 2, np.nan, 2]) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, + fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=-1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, + fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2])) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + def test_ufunc_args(self): + # GH 13853 make sure ufunc is applied to fill_value, including its arg + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([2, np.nan, 3, np.nan, -1]) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([2, 0, 3, -1], fill_value=2) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray([2, 0, 1, -1], fill_value=1) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index f9ac7d9d34072..c5480973b46bc 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -589,6 +589,21 @@ def test_abs(self): tm.assert_sp_series_equal(result, expected) self.assertEqual(result.name, 'x') + s = SparseSeries([1, -2, 2, -3], fill_value=-2, name='x') + expected = SparseSeries([1, 2, 3], sparse_index=s.sp_index, + 
fill_value=2, name='x') + result = s.abs() + tm.assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = abs(s) + tm.assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = np.abs(s) + tm.assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + def test_reindex(self): def _compare_with_series(sps, new_index): spsre = sps.reindex(new_index) @@ -1288,6 +1303,7 @@ def test_numpy_func_call(self): for series in ('bseries', 'zbseries'): getattr(np, func)(getattr(self, series)) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 45d54d067ed9a0dfdb456f253fb0bede9969187c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 27 Jul 2016 19:57:38 +0900 Subject: [PATCH 212/359] ENH: add sparse op for other dtypes closes #13848 xref #667 --- doc/source/whatsnew/v0.19.0.txt | 32 +- pandas/sparse/array.py | 100 +- pandas/sparse/series.py | 14 +- pandas/sparse/tests/test_arithmetics.py | 346 ++ pandas/sparse/tests/test_array.py | 189 - pandas/sparse/tests/test_libsparse.py | 11 +- pandas/sparse/tests/test_series.py | 4 +- pandas/src/sparse.pyx | 342 +- pandas/src/sparse_op_helper.pxi | 5532 +++++++++++++++++++++++ pandas/src/sparse_op_helper.pxi.in | 337 ++ pandas/tests/series/test_subclass.py | 9 +- setup.py | 3 +- 12 files changed, 6337 insertions(+), 582 deletions(-) create mode 100644 pandas/sparse/tests/test_arithmetics.py create mode 100644 pandas/src/sparse_op_helper.pxi create mode 100644 pandas/src/sparse_op_helper.pxi.in diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f53db81867377..e67ca3b199369 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -307,6 +307,31 @@ Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). +.. _whatsnew_0190.sparse: + +Sparse changes +~~~~~~~~~~~~~~ + +These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. + +- Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) + +.. ipython:: python + + s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) + s.dtype + + s + 1 + + +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) +- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) +- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) +- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) + .. 
_whatsnew_0190.enhancements.other: Other enhancements @@ -754,13 +779,6 @@ Bug Fixes - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) -- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) -- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) -- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) -- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) -- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 5e36bc514b419..8aebb19d5b93e 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -48,16 +48,14 @@ def wrapper(self, other): raise AssertionError("length mismatch: %d vs. 
%d" % (len(self), len(other))) if not isinstance(other, ABCSparseArray): - other = SparseArray(other, fill_value=self.fill_value) - if name[0] == 'r': - return _sparse_array_op(other, self, op, name[1:]) - else: - return _sparse_array_op(self, other, op, name) + dtype = getattr(other, 'dtype', None) + other = SparseArray(other, fill_value=self.fill_value, + dtype=dtype) + return _sparse_array_op(self, other, op, name) elif is_scalar(other): - new_fill_value = op(np.float64(self.fill_value), np.float64(other)) - + fill = op(_get_fill(self), np.asarray(other)) return _wrap_result(name, op(self.sp_values, other), - self.sp_index, new_fill_value) + self.sp_index, fill) else: # pragma: no cover raise TypeError('operation with %s not supported' % type(other)) @@ -67,33 +65,74 @@ def wrapper(self, other): return wrapper -def _sparse_array_op(left, right, op, name): - if left.sp_index.equals(right.sp_index): - result = op(left.sp_values, right.sp_values) - result_index = left.sp_index +def _maybe_match_dtype(left, right): + if not hasattr(right, 'dtype'): + return left.dtype + elif left.dtype == right.dtype: + return getattr(left.dtype, '__name__', left.dtype) else: - sparse_op = getattr(splib, 'sparse_%s' % name) - result, result_index = sparse_op(left.sp_values, left.sp_index, - left.fill_value, right.sp_values, - right.sp_index, right.fill_value) + # ToDo: to be supported after GH 667 + raise NotImplementedError('dtypes must be identical') + + +def _get_fill(arr): + # coerce fill_value to arr dtype if possible + # int64 SparseArray can have NaN as fill_value if there is no missing try: - fill_value = op(left.fill_value, right.fill_value) - except: - fill_value = nan - return _wrap_result(name, result, result_index, fill_value) + return np.asarray(arr.fill_value, dtype=arr.dtype) + except ValueError: + return np.asarray(arr.fill_value) -def _wrap_result(name, data, sparse_index, fill_value): +def _sparse_array_op(left, right, op, name, series=False): + + if series and is_integer_dtype(left) and is_integer_dtype(right): + # series coerces to float64 if result should have NaN/inf + if name in ('floordiv', 'mod') and (right.values == 0).any(): + left = left.astype(np.float64) + right = right.astype(np.float64) + elif name in ('rfloordiv', 'rmod') and (left.values == 0).any(): + left = left.astype(np.float64) + right = right.astype(np.float64) + + dtype = _maybe_match_dtype(left, right) + + if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: + result = op(left.get_values(), right.get_values()) + + if left.sp_index.ngaps == 0: + index = left.sp_index + else: + index = right.sp_index + fill = op(_get_fill(left), _get_fill(right)) + elif left.sp_index.equals(right.sp_index): + result = op(left.sp_values, right.sp_values) + index = left.sp_index + fill = op(_get_fill(left), _get_fill(right)) + else: + if name[0] == 'r': + left, right = right, left + name = name[1:] + + opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) + sparse_op = getattr(splib, opname) + + result, index, fill = sparse_op(left.sp_values, left.sp_index, + left.fill_value, right.sp_values, + right.sp_index, right.fill_value) + return _wrap_result(name, result, index, fill, dtype=result.dtype) + + +def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): # ToDo: We can remove this condition when removing # SparseArray's dtype default when closing GH 667 - return SparseArray(data, sparse_index=sparse_index, - 
fill_value=fill_value, - dtype=np.bool) - else: - return SparseArray(data, sparse_index=sparse_index, - fill_value=fill_value) + dtype = np.bool + elif name == 'truediv': + dtype = np.float64 + return SparseArray(data, sparse_index=sparse_index, + fill_value=fill_value, dtype=dtype) class SparseArray(PandasObject, np.ndarray): @@ -447,7 +486,12 @@ def astype(self, dtype=None): dtype = np.dtype(dtype) if dtype is not None and dtype not in (np.float_, float): raise TypeError('Can only support floating point data for now') - return self.copy() + + if self.dtype == dtype: + return self.copy() + else: + return self._simple_new(self.sp_values.astype(dtype), + self.sp_index, float(self.fill_value)) def copy(self, deep=True): """ diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index e8f4feffb725f..9045784287d9c 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -57,16 +57,9 @@ def wrapper(self, other): elif isinstance(other, DataFrame): return NotImplemented elif is_scalar(other): - if isnull(other) or isnull(self.fill_value): - new_fill_value = np.nan - else: - new_fill_value = op(np.float64(self.fill_value), - np.float64(other)) - - return self._constructor(op(self.sp_values, other), + new_values = op(self.values, other) + return self._constructor(new_values, index=self.index, - sparse_index=self.sp_index, - fill_value=new_fill_value, name=self.name) else: # pragma: no cover raise TypeError('operation with %s not supported' % type(other)) @@ -84,7 +77,8 @@ def _sparse_series_op(left, right, op, name): new_index = left.index new_name = _maybe_match_name(left, right) - result = _sparse_array_op(left, right, op, name) + result = _sparse_array_op(left.values, right.values, op, name, + series=True) return left._constructor(result, index=new_index, name=new_name) diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py new file mode 100644 index 0000000000000..87efc362581cd --- /dev/null +++ b/pandas/sparse/tests/test_arithmetics.py @@ -0,0 +1,346 @@ +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestSparseArrayArithmetics(tm.TestCase): + + _multiprocess_can_split_ = True + + _base = np.array + _klass = pd.SparseArray + + def _assert(self, a, b): + tm.assert_numpy_array_equal(a, b) + + def _check_numeric_ops(self, a, b, a_dense, b_dense): + # sparse & sparse + self._assert((a + b).to_dense(), a_dense + b_dense) + self._assert((b + a).to_dense(), b_dense + a_dense) + + self._assert((a - b).to_dense(), a_dense - b_dense) + self._assert((b - a).to_dense(), b_dense - a_dense) + + self._assert((a * b).to_dense(), a_dense * b_dense) + self._assert((b * a).to_dense(), b_dense * a_dense) + + # pandas uses future division + self._assert((a / b).to_dense(), a_dense * 1.0 / b_dense) + self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) + + # ToDo: FIXME in GH 13843 + if not (self._base == pd.Series and a.dtype == 'int64'): + self._assert((a // b).to_dense(), a_dense // b_dense) + self._assert((b // a).to_dense(), b_dense // a_dense) + + self._assert((a % b).to_dense(), a_dense % b_dense) + self._assert((b % a).to_dense(), b_dense % a_dense) + + self._assert((a ** b).to_dense(), a_dense ** b_dense) + self._assert((b ** a).to_dense(), b_dense ** a_dense) + + # sparse & dense + self._assert((a + b_dense).to_dense(), a_dense + b_dense) + self._assert((b_dense + a).to_dense(), b_dense + a_dense) + + self._assert((a - b_dense).to_dense(), a_dense - b_dense) + self._assert((b_dense - a).to_dense(), 
b_dense - a_dense) + + self._assert((a * b_dense).to_dense(), a_dense * b_dense) + self._assert((b_dense * a).to_dense(), b_dense * a_dense) + + # pandas uses future division + self._assert((a / b_dense).to_dense(), a_dense * 1.0 / b_dense) + self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) + + # ToDo: FIXME in GH 13843 + if not (self._base == pd.Series and a.dtype == 'int64'): + self._assert((a // b_dense).to_dense(), a_dense // b_dense) + self._assert((b_dense // a).to_dense(), b_dense // a_dense) + + self._assert((a % b_dense).to_dense(), a_dense % b_dense) + self._assert((b_dense % a).to_dense(), b_dense % a_dense) + + self._assert((a ** b_dense).to_dense(), a_dense ** b_dense) + self._assert((b_dense ** a).to_dense(), b_dense ** a_dense) + + def _check_bool_result(self, res): + tm.assertIsInstance(res, self._klass) + self.assertEqual(res.dtype, np.bool) + self.assertIsInstance(res.fill_value, bool) + + def _check_comparison_ops(self, a, b, a_dense, b_dense): + # sparse & sparse + self._check_bool_result(a == b) + self._assert((a == b).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b) + self._assert((a != b).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b) + self._assert((a >= b).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b) + self._assert((a <= b).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b) + self._assert((a > b).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b) + self._assert((a < b).to_dense(), a_dense < b_dense) + + # sparse & dense + self._check_bool_result(a == b_dense) + self._assert((a == b_dense).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b_dense) + self._assert((a != b_dense).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b_dense) + self._assert((a >= b_dense).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b_dense) + self._assert((a <= b_dense).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b_dense) + self._assert((a > b_dense).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b_dense) + self._assert((a < b_dense).to_dense(), a_dense < b_dense) + + def test_float_scalar(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + self._check_numeric_ops(a, 1, values, 1) + self._check_numeric_ops(a, 0, values, 0) + self._check_numeric_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=0) + self._check_numeric_ops(a, 1, values, 1) + self._check_numeric_ops(a, 0, values, 0) + self._check_numeric_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=2) + self._check_numeric_ops(a, 1, values, 1) + self._check_numeric_ops(a, 0, values, 0) + self._check_numeric_ops(a, 3, values, 3) + + def test_float_scalar_comparison(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=0) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=2) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + 
self._check_comparison_ops(a, 3, values, 3) + + def test_float_same_index(self): + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues) + + values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) + rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues) + + def test_float_same_index_comparison(self): + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) + rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + def test_float_array(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues) + + def test_float_array_different_kind(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = self._klass(values, kind='integer') + b = self._klass(rvalues, kind='block') + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind='integer', fill_value=0) + b = self._klass(rvalues, kind='block') + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind='integer', fill_value=0) + b = self._klass(rvalues, kind='block', fill_value=0) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind='integer', fill_value=1) + b = self._klass(rvalues, kind='block', fill_value=2) + self._check_numeric_ops(a, b, values, rvalues) + + def test_float_array_comparison(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) 
+ self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_int_array(self): + # have to specify dtype explicitly until fixing GH 667 + dtype = np.int64 + + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + for kind in ['integer', 'block']: + a = self._klass(values, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, fill_value=1, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + self._check_numeric_ops(a, b, values, rvalues) + + def test_int_array_comparison(self): + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0]) + + dtype = np.int64 + + for kind in ['integer', 'block']: + a = self._klass(values, dtype=dtype, kind=kind) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + +class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): + + _base = pd.Series + _klass = pd.SparseSeries + + def _assert(self, a, b): + tm.assert_series_equal(a, b) + + def _check_bool_result(self, res): + # ToDo: Must return SparseSeries after GH 667 + tm.assertIsInstance(res, self._base) + self.assertEqual(res.dtype, np.bool) + + def test_alignment(self): + da = pd.Series(np.arange(4)) + db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) + sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], + dtype=np.int64, fill_value=0) + self._check_numeric_ops(sa, sb, da, db) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], + dtype=np.int64, fill_value=np.nan) + self._check_numeric_ops(sa, sb, da, db) + + da = pd.Series(np.arange(4)) + db = pd.Series(np.arange(4), 
index=[10, 11, 12, 13]) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) + sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], + dtype=np.int64, fill_value=0) + self._check_numeric_ops(sa, sb, da, db) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], + dtype=np.int64, fill_value=np.nan) + self._check_numeric_ops(sa, sb, da, db) diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index dcd5df3791fcb..2f12b9fba1842 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -539,195 +539,6 @@ def test_fillna_overlap(self): tm.assert_sp_array_equal(res, exp) -class TestSparseArrayArithmetic(tm.TestCase): - - _multiprocess_can_split_ = True - - def _check_numeric_ops(self, a, b, a_dense, b_dense): - tm.assert_numpy_array_equal((a + b).to_dense(), a_dense + b_dense) - tm.assert_numpy_array_equal((b + a).to_dense(), b_dense + a_dense) - - tm.assert_numpy_array_equal((a - b).to_dense(), a_dense - b_dense) - tm.assert_numpy_array_equal((b - a).to_dense(), b_dense - a_dense) - - tm.assert_numpy_array_equal((a * b).to_dense(), a_dense * b_dense) - tm.assert_numpy_array_equal((b * a).to_dense(), b_dense * a_dense) - - tm.assert_numpy_array_equal((a / b).to_dense(), a_dense / b_dense) - tm.assert_numpy_array_equal((b / a).to_dense(), b_dense / a_dense) - - tm.assert_numpy_array_equal((a // b).to_dense(), a_dense // b_dense) - tm.assert_numpy_array_equal((b // a).to_dense(), b_dense // a_dense) - - tm.assert_numpy_array_equal((a % b).to_dense(), a_dense % b_dense) - tm.assert_numpy_array_equal((b % a).to_dense(), b_dense % a_dense) - - tm.assert_numpy_array_equal((a ** b).to_dense(), a_dense ** b_dense) - tm.assert_numpy_array_equal((b ** a).to_dense(), b_dense ** a_dense) - - def _check_comparison_ops(self, a, b, a_dense, b_dense): - - def _check(res): - tm.assertIsInstance(res, SparseArray) - self.assertEqual(res.dtype, np.bool) - self.assertIsInstance(res.fill_value, bool) - - _check(a == b) - tm.assert_numpy_array_equal((a == b).to_dense(), a_dense == b_dense) - - _check(a != b) - tm.assert_numpy_array_equal((a != b).to_dense(), a_dense != b_dense) - - _check(a >= b) - tm.assert_numpy_array_equal((a >= b).to_dense(), a_dense >= b_dense) - - _check(a <= b) - tm.assert_numpy_array_equal((a <= b).to_dense(), a_dense <= b_dense) - - _check(a > b) - tm.assert_numpy_array_equal((a > b).to_dense(), a_dense > b_dense) - - _check(a < b) - tm.assert_numpy_array_equal((a < b).to_dense(), a_dense < b_dense) - - def test_float_scalar(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=0) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=2) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - def test_float_scalar_comparison(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - 
self._check_comparison_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=0) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=2) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) - - def test_float_same_index(self): - # when sp_index are the same - for kind in ['integer', 'block']: - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - - values = np.array([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = np.array([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) - - def test_float_same_index_comparison(self): - # when sp_index are the same - for kind in ['integer', 'block']: - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - - values = np.array([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = np.array([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) - - def test_float_array(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=1) - b = SparseArray(rvalues, kind=kind, fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) - - def test_float_array_different_kind(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - - a = SparseArray(values, kind='integer') - b = SparseArray(rvalues, kind='block') - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) - - a = SparseArray(values, kind='integer', fill_value=0) - b = SparseArray(rvalues, kind='block') - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind='integer', fill_value=0) - b = SparseArray(rvalues, kind='block', fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind='integer', fill_value=1) - b = SparseArray(rvalues, kind='block', fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) - - def test_float_array_comparison(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([2, 
np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - self._check_comparison_ops(a, b * 0, values, rvalues * 0) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=1) - b = SparseArray(rvalues, kind=kind, fill_value=2) - self._check_comparison_ops(a, b, values, rvalues) - - class TestSparseArrayAnalytics(tm.TestCase): def test_sum(self): data = np.arange(10).astype(float) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 11bf980a99fec..4417411403baa 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -486,13 +486,14 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xfill = 0 yfill = 2 - result_block_vals, rb_index = sparse_op(x, xindex, xfill, y, - yindex, yfill) - result_int_vals, ri_index = sparse_op(x, xdindex, xfill, y, - ydindex, yfill) + result_block_vals, rb_index, bfill = sparse_op(x, xindex, xfill, y, + yindex, yfill) + result_int_vals, ri_index, ifill = sparse_op(x, xdindex, xfill, y, + ydindex, yfill) self.assertTrue(rb_index.to_int_index().equals(ri_index)) tm.assert_numpy_array_equal(result_block_vals, result_int_vals) + self.assertEqual(bfill, ifill) # check versus Series... xseries = Series(x, xdindex.indices) @@ -517,7 +518,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): def make_optestf(op): def f(self): - sparse_op = getattr(splib, 'sparse_%s' % op) + sparse_op = getattr(splib, 'sparse_%s_float64' % op) python_op = getattr(operator, op) self._op_tests(sparse_op, python_op) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index c5480973b46bc..9c792b4171b49 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -512,6 +512,7 @@ def test_setslice(self): name=self.bseries.name)) def test_operators(self): + def _check_op(a, b, op): sp_result = op(a, b) adense = a.to_dense() if isinstance(a, SparseSeries) else a @@ -796,7 +797,7 @@ def test_fill_value_corner(self): cop2 = self.zbseries.copy() cop2.fill_value = 1 result = cop2 / cop - self.assertTrue(np.isnan(result.fill_value)) + self.assertEqual(result.fill_value, np.inf) def test_fill_value_when_combine_const(self): # GH12723 @@ -1254,6 +1255,7 @@ def _dense_series_compare(s, f): class TestSparseSeriesAnalytics(tm.TestCase): + def setUp(self): arr, index = _test_data1() self.bseries = SparseSeries(arr, index=index, kind='block', diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 94ae26e00f087..9908aef592ad3 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -1,4 +1,5 @@ -from numpy cimport ndarray, uint8_t, int32_t, float64_t +from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, + float64_t, float32_t, float16_t) cimport numpy as np cimport cython @@ -754,346 +755,9 @@ cdef class BlockUnion(BlockMerge): #------------------------------------------------------------------------------- # Sparse arithmetic -ctypedef float64_t (* double_func)(float64_t a, float64_t b) +include "sparse_op_helper.pxi" -cdef inline tuple sparse_combine(ndarray x, SparseIndex 
xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill, - double_func op): - if isinstance(xindex, BlockIndex): - return block_op(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill, op) - elif isinstance(xindex, IntIndex): - return int_op(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill, op) - - -@cython.boundscheck(False) -cdef inline tuple block_op(ndarray x_, BlockIndex xindex, float64_t xfill, - ndarray y_, BlockIndex yindex, float64_t yfill, - double_func op): - """ - Binary operator on BlockIndex objects with fill values - """ - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... - for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = op(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index - - -@cython.boundscheck(False) -cdef inline tuple int_op(ndarray x_, IntIndex xindex, float64_t xfill, - ndarray y_, IntIndex yindex, float64_t yfill, - double_func op): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = op(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - - return out, out_index - -cdef inline float64_t __add(float64_t a, float64_t b): - return a + b - -cdef inline float64_t __sub(float64_t a, float64_t b): - return a - b - -cdef inline float64_t __rsub(float64_t a, float64_t b): - return b - a - -cdef inline float64_t __div(float64_t a, float64_t b): - if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: - return NaN - else: - return a / b - -cdef inline float64_t __rdiv(float64_t a, float64_t b): - return __div(b, a) - -cdef inline float64_t __floordiv(float64_t a, float64_t b): - if b == 0: - # numpy >= 1.11 returns NaN - # for a // 0, rather than +-inf - if _np_version_under1p11: - if a > 0: - return INF - elif a < 0: - return -INF - return NaN - else: - return a // b - -cdef inline float64_t __rfloordiv(float64_t a, float64_t b): - return __floordiv(b, a) - -cdef inline float64_t __mul(float64_t a, float64_t b): - return a * b - -cdef inline float64_t __eq(float64_t a, float64_t b): - return a == b - -cdef inline float64_t __ne(float64_t a, float64_t b): - return a != b - -cdef inline float64_t __lt(float64_t a, float64_t b): - return a < b - -cdef inline float64_t __gt(float64_t a, float64_t b): - return a > b - -cdef inline float64_t __le(float64_t a, float64_t b): - return a <= b - -cdef inline float64_t __ge(float64_t a, float64_t b): - return a >= b - -cdef inline float64_t __mod(float64_t a, float64_t b): - if b == 0: - return NaN - else: - return a % b - -cdef inline float64_t __rmod(float64_t a, float64_t b): - return __mod(b, a) - -cdef inline float64_t __pow(float64_t a, float64_t b): - return a ** b - -cdef inline float64_t __rpow(float64_t a, float64_t b): - return __pow(b, a) - - -# This probably needs to be "templated" to achieve maximum performance. 
-# TODO: quantify performance boost to "templating" - -cpdef sparse_add(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __add) - -cpdef sparse_sub(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __sub) - -cpdef sparse_rsub(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rsub) - -cpdef sparse_mul(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __mul) - -cpdef sparse_div(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __div) - -cpdef sparse_rdiv(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rdiv) - -sparse_truediv = sparse_div -sparse_rtruediv = sparse_rdiv - -cpdef sparse_floordiv(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __floordiv) - -cpdef sparse_rfloordiv(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rfloordiv) - -cpdef sparse_mod(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __mod) - -cpdef sparse_rmod(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rmod) - -cpdef sparse_pow(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __pow) - -cpdef sparse_rpow(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rpow) - -cpdef sparse_eq(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __eq) - -cpdef sparse_ne(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __ne) - -cpdef sparse_lt(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __lt) - -cpdef sparse_gt(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __gt) - -cpdef sparse_le(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __le) - -cpdef sparse_ge(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __ge) - 
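# The float64-only kernels removed above are replaced by per-dtype variants
# generated from sparse_op_helper.pxi.in (sparse_add_float64,
# sparse_add_int64, ...), which is what lets sparse arithmetic keep its
# dtype.  A minimal sketch of the user-visible effect, assuming the 0.19.0
# sparse API shown in the whatsnew entry for this patch:
import numpy as np
import pandas as pd

s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64)
assert (s + 1).dtype == np.int64    # add now routes to an int64 kernel
assert (s == 0).dtype == np.bool_   # comparison results are wrapped as bool
assert (s / 2).dtype == np.float64  # true division still coerces to float64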
#------------------------------------------------------------------------------- # Indexing operations diff --git a/pandas/src/sparse_op_helper.pxi b/pandas/src/sparse_op_helper.pxi new file mode 100644 index 0000000000000..a49036d02896c --- /dev/null +++ b/pandas/src/sparse_op_helper.pxi @@ -0,0 +1,5532 @@ +""" +Template for each `dtype` helper function for sparse ops + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# Sparse op +#---------------------------------------------------------------------- + +cdef inline float64_t __div_float64(float64_t a, float64_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline float64_t __truediv_float64(float64_t a, float64_t b): + return __div_float64(a, b) + +cdef inline float64_t __floordiv_float64(float64_t a, float64_t b): + if b == 0: + # numpy >= 1.11 returns NaN + # for a // 0, rather than +-inf + if _np_version_under1p11: + if a > 0: + return INF + elif a < 0: + return -INF + return NaN + else: + return a // b + +cdef inline float64_t __mod_float64(float64_t a, float64_t b): + if b == 0: + return NaN + else: + return a % b + +cdef inline float64_t __div_int64(int64_t a, int64_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline float64_t __truediv_int64(int64_t a, int64_t b): + return __div_int64(a, b) + +cdef inline int64_t __floordiv_int64(int64_t a, int64_t b): + if b == 0: + return 0 + else: + return a // b + +cdef inline int64_t __mod_int64(int64_t a, int64_t b): + if b == 0: + return 0 + else: + return a % b + +#---------------------------------------------------------------------- +# sparse array op +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_add_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
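# The merge below walks both block indexes in lock step:
#   * out_index is the union of xindex and yindex, so each output slot has a
#     stored value on the x side, the y side, or both.
#   * xi/yi step through the stored sp_values, while xblock/xbp and
#     yblock/ybp track the current block and the offset inside it.
#   * whenever one side has no stored value at the current location, its fill
#     value is substituted; the kernel also returns op(xfill, yfill) so the
#     caller can set the result's fill_value.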
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_add_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
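# The IntIndex variant below is simpler: xindices/yindices are sorted arrays
# of positions, so the loop advances xi and yi like a two-pointer merge.
# When one input is exhausted, or its next position is larger than the
# other's, the other side's fill value is used for that output slot.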
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + return out, out_index, xfill + yfill + + +cpdef sparse_add_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_add_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_add_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_add_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = x[i] + y[i] + return out + + +cpdef sparse_fill_add_float64(float64_t xfill, + float64_t yfill): + return xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_add_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_add_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + return out, out_index, xfill + yfill + + +cpdef sparse_add_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_add_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_add_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_add_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[int64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.int64) + + for i in range(len(x)): + out[i] = x[i] + y[i] + return out + + +cpdef sparse_fill_add_int64(int64_t xfill, + int64_t yfill): + return xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_sub_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_sub_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + return out, out_index, xfill - yfill + + +cpdef sparse_sub_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_sub_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_sub_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_sub_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = x[i] - y[i] + return out + + +cpdef sparse_fill_sub_float64(float64_t xfill, + float64_t yfill): + return xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_sub_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_sub_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + return out, out_index, xfill - yfill + + +cpdef sparse_sub_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_sub_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_sub_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_sub_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[int64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.int64) + + for i in range(len(x)): + out[i] = x[i] - y[i] + return out + + +cpdef sparse_fill_sub_int64(int64_t xfill, + int64_t yfill): + return xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mul_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
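Each generated kernel above follows the same recipe: take the union of the two sparse indices, walk both operands with a pair of cursors, substitute the corresponding fill value wherever one side has no stored point, and return the new values, the union index, and ``op(xfill, yfill)`` as the result's fill. The following is only an illustrative plain-Python sketch of that walk, using a hypothetical ``sparse_combine`` helper over sorted position arrays rather than pandas' ``IntIndex``/``BlockIndex`` objects::

    import numpy as np

    def sparse_combine(xvals, xidx, xfill, yvals, yidx, yfill, op):
        """Combine two sparse vectors given as (values, sorted positions, fill)."""
        out_idx = np.union1d(xidx, yidx)              # result index = union of both
        out = np.empty(len(out_idx), dtype=np.result_type(xvals, yvals))
        xi = yi = 0
        for out_i, loc in enumerate(out_idx):
            has_x = xi < len(xidx) and xidx[xi] == loc
            has_y = yi < len(yidx) and yidx[yi] == loc
            xv = xvals[xi] if has_x else xfill        # missing on one side -> its fill
            yv = yvals[yi] if has_y else yfill
            out[out_i] = op(xv, yv)
            xi += has_x                               # advance the cursors that matched
            yi += has_y
        return out, out_idx, op(xfill, yfill)         # result fill = op of the fills

    # worked example with fill 0.0 on both sides
    x_vals, x_idx = np.array([1.0, 2.0]), np.array([0, 3])
    y_vals, y_idx = np.array([10.0, 20.0]), np.array([3, 5])
    out, out_idx, out_fill = sparse_combine(x_vals, x_idx, 0.0,
                                            y_vals, y_idx, 0.0,
                                            lambda a, b: a - b)
    # out_idx -> [0 3 5], out -> [ 1. -8. -20.], out_fill -> 0.0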
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mul_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + return out, out_index, xfill * yfill + + +cpdef sparse_mul_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mul_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mul_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_mul_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = x[i] * y[i] + return out + + +cpdef sparse_fill_mul_float64(float64_t xfill, + float64_t yfill): + return xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mul_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mul_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + return out, out_index, xfill * yfill + + +cpdef sparse_mul_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mul_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mul_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_mul_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[int64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.int64) + + for i in range(len(x)): + out[i] = x[i] * y[i] + return out + + +cpdef sparse_fill_mul_int64(int64_t xfill, + int64_t yfill): + return xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_div_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
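The ``block_op_*`` variants walk a ``BlockIndex``, which stores runs of consecutive positions as (start, length) pairs in ``locbuf``/``lenbuf``; the kernels keep a (block number, offset-within-block) cursor per operand instead of materializing the positions. A hedged sketch of what that representation expands to, via a hypothetical ``blocks_to_positions`` helper::

    import numpy as np

    def blocks_to_positions(locs, lens):
        """Expand BlockIndex-style (block start, block length) pairs into the
        flat, sorted positions they describe."""
        if len(locs) == 0:
            return np.array([], dtype=np.int64)
        return np.concatenate([np.arange(loc, loc + n)
                               for loc, n in zip(locs, lens)])

    # two blocks, [0, 1] and [4, 5, 6] -- the (block, offset) cursors in the
    # kernels above visit exactly these positions, in order
    print(blocks_to_positions(np.array([0, 4]), np.array([2, 3])))   # [0 1 4 5 6]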
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __div_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_div_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __div_float64(xfill, yfill) + + +cpdef sparse_div_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_div_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_div_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_div_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = __div_float64(x[i], y[i]) + return out + + +cpdef sparse_fill_div_float64(float64_t xfill, + float64_t yfill): + return __div_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_div_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __div_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_div_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __div_int64(xfill, yfill) + + +cpdef sparse_div_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_div_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_div_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_div_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = __div_int64(x[i], y[i]) + return out + + +cpdef sparse_fill_div_int64(int64_t xfill, + int64_t yfill): + return __div_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mod_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
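The division kernels delegate to ``__div_float64``/``__div_int64``, defined in an earlier hunk of this patch. Given that even the int64 variants allocate a ``float64`` output, they presumably mirror NumPy's true-division behaviour, where division by zero produces ±inf or NaN rather than raising; this is an assumption, checked here only against NumPy itself::

    import numpy as np

    with np.errstate(divide="ignore", invalid="ignore"):
        num = np.array([1.0, -1.0, 0.0])
        print(num / np.zeros(3))   # [ inf -inf  nan]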
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __mod_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mod_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __mod_float64(xfill, yfill) + + +cpdef sparse_mod_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mod_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mod_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_mod_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = __mod_float64(x[i], y[i]) + return out + + +cpdef sparse_fill_mod_float64(float64_t xfill, + float64_t yfill): + return __mod_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mod_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __mod_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mod_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __mod_int64(xfill, yfill) + + +cpdef sparse_mod_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mod_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mod_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_mod_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[int64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.int64) + + for i in range(len(x)): + out[i] = __mod_int64(x[i], y[i]) + return out + + +cpdef sparse_fill_mod_int64(int64_t xfill, + int64_t yfill): + return __mod_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_truediv_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __truediv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_truediv_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __truediv_float64(xfill, yfill) + + +cpdef sparse_truediv_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_truediv_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_truediv_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_truediv_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = __truediv_float64(x[i], y[i]) + return out + + +cpdef sparse_fill_truediv_float64(float64_t xfill, + float64_t yfill): + return __truediv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_truediv_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __truediv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_truediv_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __truediv_int64(xfill, yfill) + + +cpdef sparse_truediv_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_truediv_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_truediv_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_truediv_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = __truediv_int64(x[i], y[i]) + return out + + +cpdef sparse_fill_truediv_int64(int64_t xfill, + int64_t yfill): + return __truediv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_floordiv_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __floordiv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_floordiv_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __floordiv_float64(xfill, yfill) + + +cpdef sparse_floordiv_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_floordiv_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_floordiv_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_floordiv_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = __floordiv_float64(x[i], y[i]) + return out + + +cpdef sparse_fill_floordiv_float64(float64_t xfill, + float64_t yfill): + return __floordiv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_floordiv_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __floordiv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_floordiv_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __floordiv_int64(xfill, yfill) + + +cpdef sparse_floordiv_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_floordiv_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_floordiv_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_floordiv_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[int64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.int64) + + for i in range(len(x)): + out[i] = __floordiv_int64(x[i], y[i]) + return out + + +cpdef sparse_fill_floordiv_int64(int64_t xfill, + int64_t yfill): + return __floordiv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_pow_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_pow_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + return out, out_index, xfill ** yfill + + +cpdef sparse_pow_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_pow_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_pow_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_pow_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[float64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.float64) + + for i in range(len(x)): + out[i] = x[i] ** y[i] + return out + + +cpdef sparse_fill_pow_float64(float64_t xfill, + float64_t yfill): + return xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_pow_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_pow_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + return out, out_index, xfill ** yfill + + +cpdef sparse_pow_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_pow_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_pow_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_pow_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[int64_t, ndim=1] out + + out = np.empty(len(x), dtype=np.int64) + + for i in range(len(x)): + out[i] = x[i] ** y[i] + return out + + +cpdef sparse_fill_pow_int64(int64_t xfill, + int64_t yfill): + return xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_eq_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_eq_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + return out, out_index, xfill == yfill + + +cpdef sparse_eq_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_eq_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_eq_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_eq_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] == y[i] + return out + + +cpdef sparse_fill_eq_float64(float64_t xfill, + float64_t yfill): + return xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_eq_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_eq_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + return out, out_index, xfill == yfill + + +cpdef sparse_eq_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_eq_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_eq_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_eq_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] == y[i] + return out + + +cpdef sparse_fill_eq_int64(int64_t xfill, + int64_t yfill): + return xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ne_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ne_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + return out, out_index, xfill != yfill + + +cpdef sparse_ne_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ne_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ne_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_ne_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] != y[i] + return out + + +cpdef sparse_fill_ne_float64(float64_t xfill, + float64_t yfill): + return xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ne_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ne_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + return out, out_index, xfill != yfill + + +cpdef sparse_ne_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ne_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ne_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_ne_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] != y[i] + return out + + +cpdef sparse_fill_ne_int64(int64_t xfill, + int64_t yfill): + return xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_lt_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_lt_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + return out, out_index, xfill < yfill + + +cpdef sparse_lt_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_lt_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_lt_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_lt_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] < y[i] + return out + + +cpdef sparse_fill_lt_float64(float64_t xfill, + float64_t yfill): + return xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_lt_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_lt_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + return out, out_index, xfill < yfill + + +cpdef sparse_lt_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_lt_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_lt_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_lt_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] < y[i] + return out + + +cpdef sparse_fill_lt_int64(int64_t xfill, + int64_t yfill): + return xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_gt_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_gt_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + return out, out_index, xfill > yfill + + +cpdef sparse_gt_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_gt_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_gt_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_gt_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] > y[i] + return out + + +cpdef sparse_fill_gt_float64(float64_t xfill, + float64_t yfill): + return xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_gt_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_gt_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + return out, out_index, xfill > yfill + + +cpdef sparse_gt_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_gt_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_gt_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_gt_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] > y[i] + return out + + +cpdef sparse_fill_gt_int64(int64_t xfill, + int64_t yfill): + return xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_le_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_le_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + return out, out_index, xfill <= yfill + + +cpdef sparse_le_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_le_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_le_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_le_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] <= y[i] + return out + + +cpdef sparse_fill_le_float64(float64_t xfill, + float64_t yfill): + return xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_le_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_le_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + return out, out_index, xfill <= yfill + + +cpdef sparse_le_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_le_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_le_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_le_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] <= y[i] + return out + + +cpdef sparse_fill_le_int64(int64_t xfill, + int64_t yfill): + return xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ge_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ge_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + return out, out_index, xfill >= yfill + + +cpdef sparse_ge_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ge_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ge_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_ge_float64(ndarray[float64_t, ndim=1] x, + ndarray[float64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] >= y[i] + return out + + +cpdef sparse_fill_ge_float64(float64_t xfill, + float64_t yfill): + return xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ge_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ge_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + return out, out_index, xfill >= yfill + + +cpdef sparse_ge_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ge_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ge_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_ge_int64(ndarray[int64_t, ndim=1] x, + ndarray[int64_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[uint8_t, ndim=1] out + + out = np.empty(len(x), dtype=np.uint8) + + for i in range(len(x)): + out[i] = x[i] >= y[i] + return out + + +cpdef sparse_fill_ge_int64(int64_t xfill, + int64_t yfill): + return xfill >= yfill diff --git a/pandas/src/sparse_op_helper.pxi.in b/pandas/src/sparse_op_helper.pxi.in new file mode 100644 index 0000000000000..73fd5e46f46a6 --- /dev/null +++ b/pandas/src/sparse_op_helper.pxi.in @@ -0,0 +1,337 @@ +""" +Template for each `dtype` helper function for sparse ops + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# Sparse op +#---------------------------------------------------------------------- + +{{py: + +# dtype, float_group +dtypes = [('float64', True), ('int64', False)] + +}} + +{{for dtype, float_group in dtypes}} + +{{if float_group}} + +cdef inline {{dtype}}_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline {{dtype}}_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + return __div_{{dtype}}(a, b) + +cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + # numpy >= 1.11 returns NaN + # for a // 0, rather than +-inf + if _np_version_under1p11: + if a > 0: + return INF + elif a < 0: + return -INF + return NaN + else: + return a // b + +cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + return NaN + else: + return a % b + +{{else}} + +cdef inline float64_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline float64_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + return __div_{{dtype}}(a, b) + +cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + return 0 + else: + return a // b + +cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + return 0 + else: + return a % b + +{{endif}} + +{{endfor}} + +#---------------------------------------------------------------------- +# sparse array op 
+#---------------------------------------------------------------------- + +{{py: + +# dtype +dtypes = ['float64', 'int64'] + +def get_op(tup): + assert isinstance(tup, tuple) + assert len(tup) == 4 + + opname, lval, rval, dtype = tup + + ops_dict = {'add': '{0} + {1}', + 'sub': '{0} - {1}', + 'mul': '{0} * {1}', + 'div': '__div_{2}({0}, {1})', + 'mod': '__mod_{2}({0}, {1})', + 'truediv': '__truediv_{2}({0}, {1})', + 'floordiv': '__floordiv_{2}({0}, {1})', + 'pow': '{0} ** {1}', + 'eq': '{0} == {1}', + 'ne': '{0} != {1}', + 'lt': '{0} < {1}', + 'gt': '{0} > {1}', + 'le': '{0} <= {1}', + 'ge': '{0} >= {1}'} + + return ops_dict[opname].format(lval, rval, dtype) + + +def get_dispatch(dtypes): + + ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv', + 'floordiv', 'pow', 'eq', 'ne', 'lt', 'gt', 'le', 'ge'] + + for opname in ops_list: + for dtype in dtypes: + + if opname in ('div', 'truediv'): + rdtype = 'float64' + elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + rdtype = 'uint8' + else: + rdtype = dtype + + yield opname, dtype, rdtype + +}} + + +{{for opname, dtype, rdtype in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_{{opname}}_{{dtype}}(ndarray x_, + BlockIndex xindex, + {{dtype}}_t xfill, + ndarray y_, + BlockIndex yindex, + {{dtype}}_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[{{dtype}}_t, ndim=1] x, y + ndarray[{{rdtype}}_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.{{rdtype}}) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
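+    # (descriptive note) the merge loop below walks both sparse vectors in
+    # lockstep: when one side's blocks are exhausted, the other side's stored
+    # value is combined with the exhausted side's fill value; when both point
+    # at the same location, the two stored values are combined directly;
+    # otherwise the side that is "behind" supplies its stored value and the
+    # other side supplies its fill value.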
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}} + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_{{opname}}_{{dtype}}(ndarray x_, IntIndex xindex, + {{dtype}}_t xfill, + ndarray y_, IntIndex yindex, + {{dtype}}_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[{{dtype}}_t, ndim=1] x, y + ndarray[{{rdtype}}_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.{{rdtype}}) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
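+    # (descriptive note) same merge strategy as the block version above, but
+    # driven directly by the integer index arrays: an exhausted side
+    # contributes its fill value, matching locations combine both stored
+    # values, and otherwise the trailing side supplies its stored value
+    # against the other side's fill value.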
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}} + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + else: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}} + + +cpdef sparse_{{opname}}_{{dtype}}(ndarray[{{dtype}}_t, ndim=1] x, + SparseIndex xindex, {{dtype}}_t xfill, + ndarray[{{dtype}}_t, ndim=1] y, + SparseIndex yindex, {{dtype}}_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_align_{{opname}}_{{dtype}}(ndarray[{{dtype}}_t, ndim=1] x, + ndarray[{{dtype}}_t, ndim=1] y): + """ to return NumPy compat result """ + cdef: + Py_ssize_t i = 0 + ndarray[{{rdtype}}_t, ndim=1] out + + out = np.empty(len(x), dtype=np.{{rdtype}}) + + for i in range(len(x)): + out[i] = {{(opname, 'x[i]', 'y[i]', dtype) | get_op}} + return out + + +cpdef sparse_fill_{{opname}}_{{dtype}}({{dtype}}_t xfill, + {{dtype}}_t yfill): + return {{(opname, 'xfill', 'yfill', dtype) | get_op}} + +{{endfor}} diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index be7a0eccf6b7c..440e433ffd95c 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -54,8 +54,13 @@ def test_subclass_sparse_slice(self): def test_subclass_sparse_addition(self): s1 = tm.SubclassedSparseSeries([1, 3, 5]) s2 = tm.SubclassedSparseSeries([-2, 5, 12]) - tm.assert_sp_series_equal(s1 + s2, - tm.SubclassedSparseSeries([-1.0, 8.0, 17.0])) + exp = tm.SubclassedSparseSeries([-1, 8, 17]) + tm.assert_sp_series_equal(s1 + s2, exp) + + s1 = tm.SubclassedSparseSeries([4.0, 5.0, 6.0]) + s2 = tm.SubclassedSparseSeries([1.0, 2.0, 3.0]) + exp = tm.SubclassedSparseSeries([5., 7., 9.]) + tm.assert_sp_series_equal(s1 + s2, exp) def test_subclass_sparse_to_frame(self): s = tm.SubclassedSparseSeries([1, 2], index=list('abcd'), name='xxx') diff --git a/setup.py b/setup.py index e81cae633427d..7ef907aada6dc 100755 --- a/setup.py +++ b/setup.py @@ -108,7 +108,8 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in', - 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'] + 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in', + 'sparse_op_helper.pxi.in'] class build_ext(_build_ext): From 8ec740614ff91d00d59858eb38ddee67395b3430 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 4 Aug 2016 06:19:37 -0400 Subject: [PATCH 213/359] BLD: remove generated dep from algos.pyx build --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 
7ef907aada6dc..c985445a08155 100755 --- a/setup.py +++ b/setup.py @@ -465,8 +465,7 @@ def pxd(name): 'pandas/src/datetime/np_datetime_strings.c']}, algos={'pyxfile': 'algos', 'pxdfiles': ['src/util'], - 'depends': [srcpath('generated', suffix='.pyx'), - srcpath('join', suffix='.pyx')]}, + 'depends': [srcpath('join', suffix='.pyx')]}, _window={'pyxfile': 'window', 'pxdfiles': ['src/skiplist', 'src/util'], 'depends': ['pandas/src/skiplist.pyx', From 61b14b2aebb5c9334e96af5d3083fcf575793a96 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 4 Aug 2016 06:36:18 -0400 Subject: [PATCH 214/359] COMPAT: Categorical Subclassing xref #8640 Author: sinhrks Closes #13827 from sinhrks/categorical_subclass and squashes the following commits: 13c456c [sinhrks] COMPAT: Categorical Subclassing --- pandas/core/categorical.py | 62 +++++++++++++++++--------------- pandas/tests/test_categorical.py | 30 ++++++++++++++++ pandas/util/testing.py | 9 ++++- 3 files changed, 71 insertions(+), 30 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 39e140e962821..6ea0a5e96672d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -328,11 +328,16 @@ def __init__(self, values, categories=None, ordered=False, self._categories = categories self._codes = _coerce_indexer_dtype(codes, categories) + @property + def _constructor(self): + return Categorical + def copy(self): """ Copy constructor. """ - return Categorical(values=self._codes.copy(), - categories=self.categories, ordered=self.ordered, - fastpath=True) + return self._constructor(values=self._codes.copy(), + categories=self.categories, + ordered=self.ordered, + fastpath=True) def astype(self, dtype, copy=True): """ @@ -414,7 +419,7 @@ def from_array(cls, data, **kwargs): Can be an Index or array-like. The categories are assumed to be the unique values of `data`. 
""" - return Categorical(data, **kwargs) + return cls(data, **kwargs) @classmethod def from_codes(cls, codes, categories, ordered=False, name=None): @@ -458,8 +463,8 @@ def from_codes(cls, codes, categories, ordered=False, name=None): raise ValueError("codes need to be between -1 and " "len(categories)-1") - return Categorical(codes, categories=categories, ordered=ordered, - fastpath=True) + return cls(codes, categories=categories, ordered=ordered, + fastpath=True) _codes = None @@ -916,9 +921,9 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - return Categorical.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) + return self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) except ValueError: return np.take(new_categories, self._codes) @@ -968,8 +973,8 @@ def shift(self, periods): else: codes[periods:] = -1 - return Categorical.from_codes(codes, categories=self.categories, - ordered=self.ordered) + return self.from_codes(codes, categories=self.categories, + ordered=self.ordered) def __array__(self, dtype=None): """ @@ -1159,8 +1164,8 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = Categorical(ix, categories=cat, ordered=obj.ordered, - fastpath=True) + ix = self._constructor(ix, categories=cat, ordered=obj.ordered, + fastpath=True) return Series(count, index=CategoricalIndex(ix), dtype='int64') @@ -1313,8 +1318,8 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): self._codes = codes return else: - return Categorical(values=codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values=codes, categories=self.categories, + ordered=self.ordered, fastpath=True) def order(self, inplace=False, ascending=True, na_position='last'): """ @@ -1441,8 +1446,8 @@ def fillna(self, value=None, method=None, limit=None): values = values.copy() values[mask] = self.categories.get_loc(value) - return Categorical(values, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values, categories=self.categories, + ordered=self.ordered, fastpath=True) def take_nd(self, indexer, allow_fill=True, fill_value=None): """ Take the codes by the indexer, fill with the fill_value. @@ -1455,8 +1460,8 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): assert isnull(fill_value) codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) - result = Categorical(codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + result = self._constructor(codes, categories=self.categories, + ordered=self.ordered, fastpath=True) return result take = take_nd @@ -1476,8 +1481,8 @@ def _slice(self, slicer): slicer = slicer[1] _codes = self._codes[slicer] - return Categorical(values=_codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values=_codes, categories=self.categories, + ordered=self.ordered, fastpath=True) def __len__(self): """The length of this Categorical.""" @@ -1588,10 +1593,9 @@ def __getitem__(self, key): else: return self.categories[i] else: - return Categorical(values=self._codes[key], - categories=self.categories, - ordered=self.ordered, - fastpath=True) + return self._constructor(values=self._codes[key], + categories=self.categories, + ordered=self.ordered, fastpath=True) def __setitem__(self, key, value): """ Item assignment. 
@@ -1742,8 +1746,8 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 values = sorted(htable.mode_int64(_ensure_int64(self._codes[good]))) - result = Categorical(values=values, categories=self.categories, - ordered=self.ordered, fastpath=True) + result = self._constructor(values=values, categories=self.categories, + ordered=self.ordered, fastpath=True) return result def unique(self): @@ -1837,8 +1841,8 @@ def repeat(self, repeats, *args, **kwargs): """ nv.validate_repeat(args, kwargs) codes = self._codes.repeat(repeats) - return Categorical(values=codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values=codes, categories=self.categories, + ordered=self.ordered, fastpath=True) # The Series.cat accessor diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 42636c6330fba..0e37f5bf17405 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4415,6 +4415,36 @@ def test_concat_categorical(self): tm.assert_frame_equal(df_expected, df_concat) +class TestCategoricalSubclassing(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_constructor(self): + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + self.assertIsInstance(sc, tm.SubclassedCategorical) + tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) + + def test_from_array(self): + sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + self.assertIsInstance(sc, tm.SubclassedCategorical) + exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + tm.assert_categorical_equal(sc, exp) + + def test_map(self): + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + res = sc.map(lambda x: x.upper()) + self.assertIsInstance(res, tm.SubclassedCategorical) + exp = Categorical(['A', 'B', 'C']) + tm.assert_categorical_equal(res, exp) + + def test_map(self): + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + res = sc.map(lambda x: x.upper()) + self.assertIsInstance(res, tm.SubclassedCategorical) + exp = Categorical(['A', 'B', 'C']) + tm.assert_categorical_equal(res, exp) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/util/testing.py b/pandas/util/testing.py index e4a84ea4ae296..c6573934bff57 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -43,7 +43,7 @@ from pandas.computation import expressions as expr -from pandas import (bdate_range, CategoricalIndex, DatetimeIndex, +from pandas import (bdate_range, CategoricalIndex, Categorical, DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, Index, MultiIndex, Series, DataFrame, Panel, Panel4D) from pandas.util.decorators import deprecate @@ -2670,6 +2670,13 @@ def _constructor_sliced(self): return SubclassedSparseSeries +class SubclassedCategorical(Categorical): + + @property + def _constructor(self): + return SubclassedCategorical + + @contextmanager def patch(ob, attr, value): """Temporarily patch an attribute of an object. 
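The patch above routes every place that previously constructed a bare ``Categorical`` through the new ``_constructor`` property (and classmethod constructors through ``cls``), so subclasses such as the ``SubclassedCategorical`` helper added to ``pandas/util/testing.py`` keep their type across ``copy``, ``map``, ``shift``, ``fillna``, ``take_nd`` and slicing. It is the same idea ``Series`` and ``DataFrame`` already use for their own ``_constructor``. Below is a minimal standalone sketch of that pattern; the ``Box`` and ``LoudBox`` names are invented for illustration only and are not part of pandas.

    # Sketch of the ``_constructor`` pattern the Categorical patch applies.
    class Box(object):
        """Stand-in base class; plays the role of Categorical."""

        def __init__(self, values):
            self.values = list(values)

        @property
        def _constructor(self):
            return Box

        def copy(self):
            # built via ``self._constructor`` rather than ``Box(...)`` so
            # subclass instances produce subclass results
            return self._constructor(self.values)

        def map(self, func):
            return self._constructor([func(v) for v in self.values])


    class LoudBox(Box):
        """Stand-in subclass; plays the role of SubclassedCategorical."""

        @property
        def _constructor(self):
            return LoudBox


    b = LoudBox(['a', 'b'])
    assert isinstance(b.copy(), LoudBox)          # type preserved
    assert isinstance(b.map(str.upper), LoudBox)  # derived results too
    assert b.map(str.upper).values == ['A', 'B']

The design choice worth noting is that results are built through a property rather than a hard-coded class name, so a subclass only has to override ``_constructor`` (as ``SubclassedCategorical`` does) for every derived result to come back as the subclass.
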
From 9ee8c0d311cd22dba8cd804219a591bb1b90f7be Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 3 Aug 2016 20:14:26 -0500 Subject: [PATCH 215/359] BUG: union_categorical fastpath sort closes #13899 xref #13846 --- pandas/tools/tests/test_concat.py | 7 +++++++ pandas/types/concat.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 968ea979f7c75..225ba533161b3 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -1006,6 +1006,13 @@ def test_union_categoricals_sort(self): categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) + c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) + c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + # fastpath - skip resort c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 0a985dd6141ae..a7fd692cfb9cf 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -263,7 +263,7 @@ def union_categoricals(to_union, sort_categories=False): if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() - indexer = first.categories.get_indexer(categories) + indexer = categories.get_indexer(first.categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) elif all(not c.ordered for c in to_union): # different categories - union and recode From 5d163ceb8a39218aba61747cdd4c0b508aa510b5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 4 Aug 2016 06:43:39 -0500 Subject: [PATCH 216/359] ENH: DataFrame.style sparsified MultiIndex - [x] closes #11655 - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` - [x] whatsnew entry [Notebook comparing `DataFrame._html_repr_` to `DataFrame.style`](http s://gist.github.com/609c398f814b4a505bf4f406670e457e) I think we're identical for non-truncated DataFrames. That' has not been implemented in `Styler` yet. Along the way I noticed two other things that ended up needing fixing. 1. DataFrame.columns.names were not displayed 2. CSS classes weren't being assigned correctly to row labels. The fixes ended up being pretty intertwined, so I've put them in a single PR. Unfortunately, the commits are a bit jumbled as well :/ Author: Tom Augspurger Closes #13775 from TomAugspurger/style-sparse-mi-2 and squashes the following commits: 7c03a72 [Tom Augspurger] ENH: DataFrame.style column names ecba615 [Tom Augspurger] ENH: MultiIndex Structure for DataFrame.style --- doc/source/html-styling.ipynb | 21 ++++ doc/source/whatsnew/v0.19.0.txt | 4 +- pandas/formats/style.py | 128 +++++++++++++++++--- pandas/tests/formats/test_style.py | 186 ++++++++++++++++++++++++++--- 4 files changed, 301 insertions(+), 38 deletions(-) diff --git a/doc/source/html-styling.ipynb b/doc/source/html-styling.ipynb index 8668ee3de7470..e55712b2bb4f6 100644 --- a/doc/source/html-styling.ipynb +++ b/doc/source/html-styling.ipynb @@ -788,6 +788,27 @@ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CSS Classes\n", + "\n", + "Certain CSS classes are attached to cells.\n", + "\n", + "- Index and Column names include `index_name` and `level` where `k` is its level in a MultiIndex\n", + "- Index label cells include\n", + " + `row_heading`\n", + " + `row` where `n` is the numeric position of the row\n", + " + `level` where `k` is the level in a MultiIndex\n", + "- Column label cells include\n", + " + `col_heading`\n", + " + `col` where `n` is the numeric position of the column\n", + " + `level` where `k` is the level in a MultiIndex\n", + "- Blank cells include `blank`\n", + "- Data cells include `data`" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e67ca3b199369..f68fa957df133 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -373,6 +373,8 @@ Other enhancements - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) +- ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`) +- ``DataFrame.style`` will now show column level names (e.g. ``DataFrame.columns.names``) (:issue:`13775`) - ``DataFrame`` has gained support to re-order the columns based on the values in a row using ``df.sort_values(by='...', axis=1)`` (:issue:`10806`) @@ -884,10 +886,10 @@ Bug Fixes - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - Bug in ``df.groupby(...)[...]`` where getitem with ``Int64Index`` raised an error (:issue:`13731`) +- Bug in the CSS classes assigned to ``DataFrame.style`` for index names. Previously they were assigned ``"col_heading level col"`` where ``n`` was the number of levels + 1. Now they are assigned ``"index_name level"``, where ``n`` is the correct level for that MultiIndex. 
- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) diff --git a/pandas/formats/style.py b/pandas/formats/style.py index 472fd958d35eb..4d5e72a38bb98 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -21,7 +21,9 @@ import numpy as np import pandas as pd -from pandas.compat import lzip, range +from pandas.compat import range +from pandas.core.config import get_option +import pandas.core.common as com from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice try: import matplotlib.pyplot as plt @@ -79,6 +81,24 @@ class Styler(object): to automatically render itself. Otherwise call Styler.render to get the genterated HTML. + CSS classes are attached to the generated HTML + + * Index and Column names include ``index_name`` and ``level`` + where `k` is its level in a MultiIndex + * Index label cells include + + * ``row_heading`` + * ``row`` where `n` is the numeric position of the row + * ``level`` where `k` is the level in a MultiIndex + + * Column label cells include + * ``col_heading`` + * ``col`` where `n` is the numeric position of the column + * ``evel`` where `k` is the level in a MultiIndex + + * Blank cells include ``blank`` + * Data cells include ``data`` + See Also -------- pandas.DataFrame.style @@ -110,7 +130,10 @@ class Styler(object): {% for r in head %} {% for c in r %} - <{{c.type}} class="{{c.class}}">{{c.value}} + {% if c.is_visible != False %} + <{{c.type}} class="{{c.class}}" {{ c.attributes|join(" ") }}> + {{c.value}} + {% endif %} {% endfor %} {% endfor %} @@ -119,8 +142,11 @@ class Styler(object): {% for r in body %} {% for c in r %} - <{{c.type}} id="T_{{uuid}}{{c.id}}" class="{{c.class}}"> + {% if c.is_visible != False %} + <{{c.type}} id="T_{{uuid}}{{c.id}}" + class="{{c.class}}" {{ c.attributes|join(" ") }}> {{ c.display_value }} + {% endif %} {% endfor %} {% endfor %} @@ -148,7 +174,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self.table_styles = table_styles self.caption = caption if precision is None: - precision = pd.options.display.precision + precision = get_option('display.precision') self.precision = precision self.table_attributes = table_attributes # display_funcs maps (row, col) -> formatting function @@ -177,10 +203,19 @@ def _translate(self): uuid = self.uuid or str(uuid1()).replace("-", "_") ROW_HEADING_CLASS = "row_heading" COL_HEADING_CLASS = "col_heading" + INDEX_NAME_CLASS = "index_name" + DATA_CLASS = "data" BLANK_CLASS = "blank" BLANK_VALUE = "" + def format_attr(pair): + return "{key}={value}".format(**pair) + + # for sparsifying a MultiIndex + idx_lengths = _get_level_lengths(self.index) + col_lengths = _get_level_lengths(self.columns) + cell_context = dict() n_rlvls = self.data.index.nlevels @@ -188,10 +223,6 @@ def _translate(self): rlabels = 
self.data.index.tolist() clabels = self.data.columns.tolist() - idx_values = self.data.index.format(sparsify=False, adjoin=False, - names=False) - idx_values = lzip(*idx_values) - if n_rlvls == 1: rlabels = [[x] for x in rlabels] if n_clvls == 1: @@ -202,9 +233,24 @@ def _translate(self): head = [] for r in range(n_clvls): + # Blank for Index columns... row_es = [{"type": "th", "value": BLANK_VALUE, - "class": " ".join([BLANK_CLASS])}] * n_rlvls + "display_value": BLANK_VALUE, + "is_visible": True, + "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1) + + # ... except maybe the last for columns.names + name = self.data.columns.names[r] + cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, + "level%s" % r] + name = BLANK_VALUE if name is None else name + row_es.append({"type": "th", + "value": name, + "display_value": name, + "class": " ".join(cs), + "is_visible": True}) + for c in range(len(clabels[0])): cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] cs.extend(cell_context.get( @@ -213,16 +259,23 @@ def _translate(self): row_es.append({"type": "th", "value": value, "display_value": value, - "class": " ".join(cs)}) + "class": " ".join(cs), + "is_visible": _is_visible(c, r, col_lengths), + "attributes": [ + format_attr({"key": "colspan", + "value": col_lengths.get( + (r, c), 1)}) + ]}) head.append(row_es) - if self.data.index.names and self.data.index.names != [None]: + if self.data.index.names and not all(x is None + for x in self.data.index.names): index_header_row = [] for c, name in enumerate(self.data.index.names): - cs = [COL_HEADING_CLASS, - "level%s" % (n_clvls + 1), - "col%s" % c] + cs = [INDEX_NAME_CLASS, + "level%s" % c] + name = '' if name is None else name index_header_row.append({"type": "th", "value": name, "class": " ".join(cs)}) @@ -236,12 +289,17 @@ def _translate(self): body = [] for r, idx in enumerate(self.data.index): - cs = [ROW_HEADING_CLASS, "level%s" % c, "row%s" % r] - cs.extend( - cell_context.get("row_headings", {}).get(r, {}).get(c, [])) + # cs.extend( + # cell_context.get("row_headings", {}).get(r, {}).get(c, [])) row_es = [{"type": "th", + "is_visible": _is_visible(r, c, idx_lengths), + "attributes": [ + format_attr({"key": "rowspan", + "value": idx_lengths.get((c, r), 1)}) + ], "value": rlabels[r][c], - "class": " ".join(cs), + "class": " ".join([ROW_HEADING_CLASS, "level%s" % c, + "row%s" % r]), "display_value": rlabels[r][c]} for c in range(len(rlabels[r]))] @@ -893,6 +951,40 @@ def _highlight_extrema(data, color='yellow', max_=True): index=data.index, columns=data.columns) +def _is_visible(idx_row, idx_col, lengths): + """ + Index -> {(idx_row, idx_col): bool}) + """ + return (idx_col, idx_row) in lengths + + +def _get_level_lengths(index): + """ + Given an index, find the level lenght for each element. 
+ + Result is a dictionary of (level, inital_position): span + """ + sentinel = com.sentinel_factory() + levels = index.format(sparsify=sentinel, adjoin=False, names=False) + + if index.nlevels == 1: + return {(0, i): 1 for i, value in enumerate(levels)} + + lengths = {} + + for i, lvl in enumerate(levels): + for j, row in enumerate(lvl): + if not get_option('display.multi_sparse'): + lengths[(i, j)] = 1 + elif row != sentinel: + last_label = j + lengths[(i, last_label)] = 1 + else: + lengths[(i, last_label)] += 1 + + return lengths + + def _maybe_wrap_formatter(formatter): if is_string_like(formatter): return lambda x: formatter.format(x) diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py index 9a34f545bd119..3083750e582fc 100644 --- a/pandas/tests/formats/test_style.py +++ b/pandas/tests/formats/test_style.py @@ -8,7 +8,7 @@ from pandas.util.testing import TestCase import pandas.util.testing as tm -# this is a mess. Getting failures on a python 2.7 build with +# Getting failures on a python 2.7 build with # whenever we try to import jinja, whether it's installed or not. # so we're explicitly skipping that one *before* we try to import # jinja. We still need to export the imports as globals, @@ -22,7 +22,7 @@ import jinja2 # noqa except ImportError: raise SkipTest("No Jinja2") -from pandas.formats.style import Styler # noqa +from pandas.formats.style import Styler, _get_level_lengths # noqa class TestStyler(TestCase): @@ -148,19 +148,29 @@ def test_empty_index_name_doesnt_display(self): df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) result = df.style._translate() - expected = [[{'class': 'blank', 'type': 'th', 'value': ''}, + expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', + 'is_visible': True, 'display_value': ''}, {'class': 'col_heading level0 col0', 'display_value': 'A', 'type': 'th', - 'value': 'A'}, + 'value': 'A', + 'is_visible': True, + 'attributes': ["colspan=1"], + }, {'class': 'col_heading level0 col1', 'display_value': 'B', 'type': 'th', - 'value': 'B'}, + 'value': 'B', + 'is_visible': True, + 'attributes': ["colspan=1"], + }, {'class': 'col_heading level0 col2', 'display_value': 'C', 'type': 'th', - 'value': 'C'}]] + 'value': 'C', + 'is_visible': True, + 'attributes': ["colspan=1"], + }]] self.assertEqual(result['head'], expected) @@ -169,12 +179,15 @@ def test_index_name(self): df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) result = df.set_index('A').style._translate() - expected = [[{'class': 'blank', 'type': 'th', 'value': ''}, + expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', + 'display_value': '', 'is_visible': True}, {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'B', 'display_value': 'B'}, + 'value': 'B', 'display_value': 'B', + 'is_visible': True, 'attributes': ['colspan=1']}, {'class': 'col_heading level0 col1', 'type': 'th', - 'value': 'C', 'display_value': 'C'}], - [{'class': 'col_heading level2 col0', 'type': 'th', + 'value': 'C', 'display_value': 'C', + 'is_visible': True, 'attributes': ['colspan=1']}], + [{'class': 'index_name level0', 'type': 'th', 'value': 'A'}, {'class': 'blank', 'type': 'th', 'value': ''}, {'class': 'blank', 'type': 'th', 'value': ''}]] @@ -186,15 +199,20 @@ def test_multiindex_name(self): df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) result = df.set_index(['A', 'B']).style._translate() - expected = [[{'class': 'blank', 'type': 'th', 'value': ''}, - {'class': 'blank', 'type': 'th', 'value': ''}, - {'class': 'col_heading 
level0 col0', 'type': 'th', - 'value': 'C', 'display_value': 'C'}], - [{'class': 'col_heading level2 col0', 'type': 'th', - 'value': 'A'}, - {'class': 'col_heading level2 col1', 'type': 'th', - 'value': 'B'}, - {'class': 'blank', 'type': 'th', 'value': ''}]] + expected = [[ + {'class': 'blank', 'type': 'th', 'value': '', + 'display_value': '', 'is_visible': True}, + {'class': 'blank level0', 'type': 'th', 'value': '', + 'display_value': '', 'is_visible': True}, + {'class': 'col_heading level0 col0', 'type': 'th', + 'value': 'C', 'display_value': 'C', + 'is_visible': True, 'attributes': ['colspan=1'], + }], + [{'class': 'index_name level0', 'type': 'th', + 'value': 'A'}, + {'class': 'index_name level1', 'type': 'th', + 'value': 'B'}, + {'class': 'blank', 'type': 'th', 'value': ''}]] self.assertEqual(result['head'], expected) @@ -581,6 +599,136 @@ def f(x): with tm.assertRaises(ValueError): df.style._apply(f, axis=None) + def test_get_level_lengths(self): + index = pd.MultiIndex.from_product([['a', 'b'], [0, 1, 2]]) + expected = {(0, 0): 3, (0, 3): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, + (1, 3): 1, (1, 4): 1, (1, 5): 1} + result = _get_level_lengths(index) + tm.assert_dict_equal(result, expected) + + def test_get_level_lengths_un_sorted(self): + index = pd.MultiIndex.from_arrays([ + [1, 1, 2, 1], + ['a', 'b', 'b', 'd'] + ]) + expected = {(0, 0): 2, (0, 2): 1, (0, 3): 1, + (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1} + result = _get_level_lengths(index) + tm.assert_dict_equal(result, expected) + + def test_mi_sparse(self): + df = pd.DataFrame({'A': [1, 2]}, + index=pd.MultiIndex.from_arrays([['a', 'a'], + [0, 1]])) + result = df.style._translate() + body_0 = result['body'][0][0] + expected_0 = { + "value": "a", "display_value": "a", "is_visible": True, + "type": "th", "attributes": ["rowspan=2"], + "class": "row_heading level0 row0", + } + tm.assert_dict_equal(body_0, expected_0) + + body_1 = result['body'][0][1] + expected_1 = { + "value": 0, "display_value": 0, "is_visible": True, + "type": "th", "attributes": ["rowspan=1"], + "class": "row_heading level1 row0", + } + tm.assert_dict_equal(body_1, expected_1) + + body_10 = result['body'][1][0] + expected_10 = { + "value": 'a', "display_value": 'a', "is_visible": False, + "type": "th", "attributes": ["rowspan=1"], + "class": "row_heading level0 row1", + } + tm.assert_dict_equal(body_10, expected_10) + + head = result['head'][0] + expected = [ + {'type': 'th', 'class': 'blank', 'value': '', + 'is_visible': True, "display_value": ''}, + {'type': 'th', 'class': 'blank level0', 'value': '', + 'is_visible': True, 'display_value': ''}, + {'attributes': ['colspan=1'], 'class': 'col_heading level0 col0', + 'is_visible': True, 'type': 'th', 'value': 'A', + 'display_value': 'A'}] + self.assertEqual(head, expected) + + def test_mi_sparse_disabled(self): + with pd.option_context('display.multi_sparse', False): + df = pd.DataFrame({'A': [1, 2]}, + index=pd.MultiIndex.from_arrays([['a', 'a'], + [0, 1]])) + result = df.style._translate() + body = result['body'] + for row in body: + self.assertEqual(row[0]['attributes'], ['rowspan=1']) + + def test_mi_sparse_index_names(self): + df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( + [['a', 'a'], [0, 1]], + names=['idx_level_0', 'idx_level_1']) + ) + result = df.style._translate() + head = result['head'][1] + expected = [{ + 'class': 'index_name level0', 'value': 'idx_level_0', + 'type': 'th'}, + {'class': 'index_name level1', 'value': 'idx_level_1', + 'type': 'th'}, + {'class': 'blank', 'value': 
'', 'type': 'th'}] + + self.assertEqual(head, expected) + + def test_mi_sparse_column_names(self): + df = pd.DataFrame( + np.arange(16).reshape(4, 4), + index=pd.MultiIndex.from_arrays( + [['a', 'a', 'b', 'a'], [0, 1, 1, 2]], + names=['idx_level_0', 'idx_level_1']), + columns=pd.MultiIndex.from_arrays( + [['C1', 'C1', 'C2', 'C2'], [1, 0, 1, 0]], + names=['col_0', 'col_1'] + ) + ) + result = df.style._translate() + head = result['head'][1] + expected = [ + {'class': 'blank', 'value': '', 'display_value': '', + 'type': 'th', 'is_visible': True}, + {'class': 'index_name level1', 'value': 'col_1', + 'display_value': 'col_1', 'is_visible': True, 'type': 'th'}, + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col0', + 'display_value': 1, + 'is_visible': True, + 'type': 'th', + 'value': 1}, + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col1', + 'display_value': 0, + 'is_visible': True, + 'type': 'th', + 'value': 0}, + + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col2', + 'display_value': 1, + 'is_visible': True, + 'type': 'th', + 'value': 1}, + + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col3', + 'display_value': 0, + 'is_visible': True, + 'type': 'th', + 'value': 0}, + ] + self.assertEqual(head, expected) + @tm.mplskip class TestStylerMatplotlibDep(TestCase): From 0be0d679873234bfa5198e27a213afe8d43edad7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 4 Aug 2016 07:26:28 -0400 Subject: [PATCH 217/359] BLD: increase cloning depth on travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b909a1f980d6d..2716fa7628d61 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,7 @@ env: git: # for cloning - depth: 500 + depth: 1000 matrix: fast_finish: true From 9c1e738df7effbf89c98dea59b0482b057c8c8b8 Mon Sep 17 00:00:00 2001 From: John Zwinck Date: Thu, 4 Aug 2016 17:00:49 -0400 Subject: [PATCH 218/359] BUG: preserve DatetimeIndex.name in HDFStore/read_hdf() with tz (#13884) closes #13884 Author: John Zwinck Closes #13888 from jzwinck/fix-13884 and squashes the following commits: 789fa59 [John Zwinck] BUG: preserve DatetimeIndex.name in HDFStore/read_hdf() with tz (#13884) --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/io/pytables.py | 3 ++- pandas/io/tests/test_pytables.py | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f68fa957df133..cf5d715b0ddce 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -880,7 +880,7 @@ Bug Fixes - Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) - Bug in ``.merge`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) - +- Bug in ``HDFStore``/``read_hdf()`` discarded ``DatetimeIndex.name`` if ``tz`` was set (:issue:`13884`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b2da4218db99b..9c1ef077c3e74 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4339,9 +4339,10 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ if tz 
is not None: + name = getattr(values, 'name', None) values = values.ravel() tz = tslib.get_timezone(_ensure_decoded(tz)) - values = DatetimeIndex(values) + values = DatetimeIndex(values, name=name) if values.tz is None: values = values.tz_localize('UTC').tz_convert(tz) if preserve_UTC: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e9ba80c3a026a..f821714b54a76 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2851,7 +2851,7 @@ def test_store_hierarchical(self): with ensure_clean_store(self.path) as store: store['frame'] = frame recons = store['frame'] - assert(recons.index.names == ('foo', 'bar')) + tm.assert_frame_equal(recons, frame) def test_store_index_name(self): df = tm.makeDataFrame() @@ -2860,7 +2860,19 @@ def test_store_index_name(self): with ensure_clean_store(self.path) as store: store['frame'] = df recons = store['frame'] - assert(recons.index.name == 'foo') + tm.assert_frame_equal(recons, df) + + def test_store_index_name_with_tz(self): + # GH 13884 + df = pd.DataFrame({'A': [1, 2]}) + df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) + df.index = df.index.tz_localize('UTC') + df.index.name = 'foo' + + with ensure_clean_store(self.path) as store: + store.put('frame', df, format='table') + recons = store['frame'] + tm.assert_frame_equal(recons, df) def test_store_series_name(self): df = tm.makeDataFrame() @@ -2869,7 +2881,7 @@ def test_store_series_name(self): with ensure_clean_store(self.path) as store: store['series'] = series recons = store['series'] - assert(recons.name == 'A') + tm.assert_series_equal(recons, series) def test_store_mixed(self): From 2beab41aa288a98bb2f81e5aca0efa67792d3505 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 4 Aug 2016 17:07:38 -0400 Subject: [PATCH 219/359] ENH: sparse astype now supports int64 and bool split from #13849 Author: sinhrks Closes #13900 from sinhrks/sparse_astype and squashes the following commits: 1c669ad [sinhrks] ENH: sparse astype now supports int64 and bool --- doc/source/whatsnew/v0.19.0.txt | 19 ++++++++- pandas/core/internals.py | 10 ++++- pandas/sparse/array.py | 48 +++++++++++++++-------- pandas/sparse/frame.py | 21 ++++++---- pandas/sparse/tests/test_array.py | 63 +++++++++++++++++++++++++++++- pandas/sparse/tests/test_frame.py | 56 +++++++++++++++++++++++++- pandas/sparse/tests/test_series.py | 3 +- 7 files changed, 189 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cf5d715b0ddce..59a106291dad8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -323,7 +323,24 @@ These changes allow pandas to handle sparse data with more dtypes, and for work s + 1 +- Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) +.. ipython:: python + + s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s + s.astype(np.int64) + +``astype`` fails if data contains values which cannot be converted to specified ``dtype``. +Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. + +.. code-block:: ipython + + In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) + Out[7]: + ValueError: unable to coerce current fill_value nan to int64 dtype + +- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. 
(:issue:`13787`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) @@ -413,7 +430,7 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) -- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) + diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8e77486457546..83fba7a0ce8b5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2504,6 +2504,14 @@ def sp_index(self): def kind(self): return self.values.kind + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, + klass=None, mgr=None, **kwargs): + if values is None: + values = self.values + values = values.astype(dtype, copy=copy) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) + def __len__(self): try: return self.sp_index.length @@ -2521,7 +2529,7 @@ def make_block_same_class(self, values, placement, sparse_index=None, copy=False, fastpath=True, **kwargs): """ return a new block """ if dtype is None: - dtype = self.dtype + dtype = values.dtype if fill_value is None and not isinstance(values, SparseArray): fill_value = self.values.fill_value diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 8aebb19d5b93e..e22a62ee7f917 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -18,8 +18,9 @@ from pandas.types.common import (is_float, is_integer, is_integer_dtype, _ensure_platform_int, is_list_like, - is_scalar) -from pandas.types.cast import _possibly_convert_platform + is_scalar, is_dtype_equal) +from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, + _astype_nansafe) from pandas.types.missing import isnull, notnull from pandas._sparse import SparseIndex, BlockIndex, IntIndex @@ -236,7 +237,7 @@ def _simple_new(cls, data, sp_index, fill_value): raise ValueError('sp_index must be a SparseIndex') result.sp_index = sp_index - result.fill_value = fill_value + result._fill_value = fill_value return result @property @@ -285,7 +286,7 @@ def __array_finalize__(self, obj): to pass on the index. 
""" self.sp_index = getattr(obj, 'sp_index', None) - self.fill_value = getattr(obj, 'fill_value', None) + self._fill_value = getattr(obj, 'fill_value', None) def __reduce__(self): """Necessary for making this object picklable""" @@ -301,7 +302,7 @@ def __setstate__(self, state): fill_value, sp_index = own_state[:2] self.sp_index = sp_index - self.fill_value = fill_value + self._fill_value = fill_value def __len__(self): try: @@ -344,6 +345,22 @@ def sp_values(self): # caching not an option, leaks memory return self.view(np.ndarray) + @property + def fill_value(self): + return self._fill_value + + @fill_value.setter + def fill_value(self, value): + if not is_scalar(value): + raise ValueError('fill_value must be a scalar') + # if the specified value triggers type promotion, raise ValueError + new_dtype, fill_value = _maybe_promote(self.dtype, value) + if is_dtype_equal(self.dtype, new_dtype): + self._fill_value = fill_value + else: + msg = 'unable to set fill_value {0} to {1} dtype' + raise ValueError(msg.format(value, self.dtype)) + def get_values(self, fill=None): """ return a dense representation """ return self.to_dense(fill=fill) @@ -479,19 +496,16 @@ def __setslice__(self, i, j, value): raise TypeError("SparseArray does not support item assignment via " "slices") - def astype(self, dtype=None): - """ - - """ + def astype(self, dtype=None, copy=True): dtype = np.dtype(dtype) - if dtype is not None and dtype not in (np.float_, float): - raise TypeError('Can only support floating point data for now') - - if self.dtype == dtype: - return self.copy() - else: - return self._simple_new(self.sp_values.astype(dtype), - self.sp_index, float(self.fill_value)) + sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy) + try: + fill_value = dtype.type(self.fill_value) + except ValueError: + msg = 'unable to coerce current fill_value {0} to {1} dtype' + raise ValueError(msg.format(self.fill_value, dtype)) + return self._simple_new(sp_values, self.sp_index, + fill_value=fill_value) def copy(self, deep=True): """ diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index b6a1e1e48c5c4..f382a4b869a3e 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -235,8 +235,19 @@ def to_dense(self): data = dict((k, v.to_dense()) for k, v in compat.iteritems(self)) return DataFrame(data, index=self.index, columns=self.columns) + def _apply_columns(self, func): + """ get new SparseDataFrame applying func to each columns """ + + new_data = {} + for col, series in compat.iteritems(self): + new_data[col] = func(series) + + return self._constructor( + data=new_data, index=self.index, columns=self.columns, + default_fill_value=self.default_fill_value).__finalize__(self) + def astype(self, dtype): - raise NotImplementedError + return self._apply_columns(lambda x: x.astype(dtype)) def copy(self, deep=True): """ @@ -499,13 +510,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): default_fill_value=self.default_fill_value).__finalize__(self) def _combine_const(self, other, func): - new_data = {} - for col, series in compat.iteritems(self): - new_data[col] = func(series, other) - - return self._constructor( - data=new_data, index=self.index, columns=self.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + return self._apply_columns(lambda x: func(x, other)) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False): diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 
2f12b9fba1842..70cda5acc3f4c 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -324,7 +324,68 @@ def test_astype(self): res.sp_values[:3] = 27 self.assertFalse((self.arr.sp_values[:3] == 27).any()) - assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8') + msg = "unable to coerce current fill_value nan to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + self.arr.astype('i8') + + arr = SparseArray([0, np.nan, 0, 1]) + with tm.assertRaisesRegexp(ValueError, msg): + arr.astype('i8') + + arr = SparseArray([0, np.nan, 0, 1], fill_value=0) + msg = "Cannot convert NA to integer" + with tm.assertRaisesRegexp(ValueError, msg): + arr.astype('i8') + + def test_astype_all(self): + vals = np.array([1, 2, 3]) + arr = SparseArray(vals, fill_value=1) + + types = [np.float64, np.float32, np.int64, + np.int32, np.int16, np.int8] + for typ in types: + res = arr.astype(typ) + self.assertEqual(res.dtype, typ) + self.assertEqual(res.sp_values.dtype, typ) + + tm.assert_numpy_array_equal(res.values, vals.astype(typ)) + + def test_set_fill_value(self): + arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) + arr.fill_value = 2 + self.assertEqual(arr.fill_value, 2) + + arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) + arr.fill_value = 2 + self.assertEqual(arr.fill_value, 2) + + # coerces to int + msg = "unable to set fill_value 3\\.1 to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = 3.1 + + msg = "unable to set fill_value nan to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = np.nan + + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + arr.fill_value = True + self.assertTrue(arr.fill_value) + + # coerces to bool + msg = "unable to set fill_value 0 to bool dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = 0 + + msg = "unable to set fill_value nan to bool dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = np.nan + + # invalid + msg = "fill_value must be a scalar" + for val in [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]: + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = val def test_copy_shallow(self): arr2 = self.arr.copy(deep=False) diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 9514f9322f68e..67b108c5dc648 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -15,7 +15,7 @@ import pandas.sparse.frame as spf from pandas._sparse import BlockIndex, IntIndex -from pandas.sparse.api import SparseSeries, SparseDataFrame +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray from pandas.tests.frame.test_misc_api import SharedWithSparse @@ -588,7 +588,59 @@ def test_applymap(self): tm.assertIsInstance(result, SparseDataFrame) def test_astype(self): - self.assertRaises(Exception, self.frame.astype, np.int64) + sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], + dtype=np.int64), + 'B': SparseArray([4, 5, 6, 7], + dtype=np.int64)}) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(np.float64) + exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.]), + 'B': SparseArray([4., 5., 6., 7.])}, + default_fill_value=np.nan) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.float64) + self.assertEqual(res['B'].dtype, np.float64) + + sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], + 
dtype=np.int64), + 'B': SparseArray([0, 5, 0, 7], + dtype=np.int64)}, + default_fill_value=0) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(np.float64) + exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]), + 'B': SparseArray([0., 5., 0., 7.])}, + default_fill_value=0.) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.float64) + self.assertEqual(res['B'].dtype, np.float64) + + def test_astype_bool(self): + sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], + fill_value=0, + dtype=np.int64), + 'B': SparseArray([0, 5, 0, 7], + fill_value=0, + dtype=np.int64)}, + default_fill_value=0) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(bool) + exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True], + dtype=np.bool, + fill_value=False), + 'B': SparseArray([False, True, False, True], + dtype=np.bool, + fill_value=False)}, + default_fill_value=False) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.bool) + self.assertEqual(res['B'].dtype, np.bool) def test_fillna(self): df = self.zframe.reindex(lrange(5)) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 9c792b4171b49..95361a8899c46 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -797,7 +797,8 @@ def test_fill_value_corner(self): cop2 = self.zbseries.copy() cop2.fill_value = 1 result = cop2 / cop - self.assertEqual(result.fill_value, np.inf) + # 1 / 0 is inf + self.assertTrue(np.isinf(result.fill_value)) def test_fill_value_when_combine_const(self): # GH12723 From c8e7863cc3432725ccea2ad0256b4b36758623db Mon Sep 17 00:00:00 2001 From: Robert Gieseke Date: Fri, 5 Aug 2016 06:23:38 -0400 Subject: [PATCH 220/359] DOC: Add missing word in docstring Author: Robert Gieseke Closes #13915 from rgieseke/patch-2 and squashes the following commits: bbdcdea [Robert Gieseke] DOC: Add missing word --- pandas/io/pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9c1ef077c3e74..aa38958f6c92e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -285,8 +285,8 @@ def read_hdf(path_or_buf, key=None, **kwargs): .. versionadded:: 0.19.0 support for pathlib, py.path. - key : group identifier in the store. Can be omitted a HDF file contains - a single pandas object. + key : group identifier in the store. Can be omitted if the HDF file + contains a single pandas object. where : list of Term (or convertable) objects, optional start : optional, integer (defaults to None), row number to start selection From e5ee5d2e034c9a4be795596633c199d0ba23970b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 3 Aug 2016 03:35:01 -0400 Subject: [PATCH 221/359] BUG: Ignore the BOM in BOM UTF-8 CSV files closes #4793 closes #13855 --- doc/source/whatsnew/v0.19.0.txt | 101 ++++++++++++++++--------------- pandas/io/parsers.py | 76 ++++++++++++++++++++++- pandas/io/tests/parser/common.py | 51 ++++++++++++++++ pandas/src/parser/tokenizer.c | 9 +++ 4 files changed, 186 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 59a106291dad8..9b976c9a7e4da 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -43,8 +43,8 @@ The following are now part of this API: .. 
_whatsnew_0190.enhancements.asof_merge: -:func:`merge_asof` for asof-style time-series joining -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``merge_asof`` for asof-style time-series joining +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). Full documentation is @@ -192,8 +192,8 @@ default of the index) in a DataFrame. .. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: -:func:`read_csv` has improved support for duplicate column names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``read_csv`` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) @@ -307,48 +307,6 @@ Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). -.. _whatsnew_0190.sparse: - -Sparse changes -~~~~~~~~~~~~~~ - -These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. - -- Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) - -.. ipython:: python - - s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) - s.dtype - - s + 1 - -- Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) - -.. ipython:: python - - s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) - s - s.astype(np.int64) - -``astype`` fails if data contains values which cannot be converted to specified ``dtype``. -Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. - -.. code-block:: ipython - - In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) - Out[7]: - ValueError: unable to coerce current fill_value nan to int64 dtype - -- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) -- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) -- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) -- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) -- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) -- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - .. _whatsnew_0190.enhancements.other: Other enhancements @@ -684,8 +642,8 @@ New Behavior: .. 
_whatsnew_0190.api.autogenerated_chunksize_index: -:func:`read_csv` called with ``chunksize`` will progressively enumerate chunks -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``read_csv`` will progressively enumerate chunks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When :func:`read_csv` is called with ``chunksize='n'`` and without specifying an index, each chunk used to have an independently generated index from `0`` to ``n-1``. @@ -716,10 +674,52 @@ New behaviour: pd.concat(pd.read_csv(StringIO(data), chunksize=2)) +.. _whatsnew_0190.sparse: + +Sparse Changes +^^^^^^^^^^^^^^ + +These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. + +- Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) + +.. ipython:: python + + s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) + s.dtype + + s + 1 + +- Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) + +.. ipython:: python + + s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s + s.astype(np.int64) + +``astype`` fails if data contains values which cannot be converted to specified ``dtype``. +Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. + +.. code-block:: ipython + + In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) + Out[7]: + ValueError: unable to coerce current fill_value nan to int64 dtype + +- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) +- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) +- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) +- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) + .. _whatsnew_0190.deprecations: Deprecations -^^^^^^^^^^^^ +~~~~~~~~~~~~ - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) @@ -738,7 +738,7 @@ Deprecations .. 
_whatsnew_0190.prior_deprecations: Removal of prior version deprecations/changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The ``SparsePanel`` class has been removed (:issue:`13778`) - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of @@ -797,6 +797,7 @@ Bug Fixes - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8c615741679b5..7846ccd1a6660 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -11,7 +11,8 @@ import numpy as np from pandas import compat -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas.compat import (range, lrange, StringIO, lzip, + zip, string_types, map, u) from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, is_float, @@ -40,6 +41,12 @@ 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' ]) +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. +_BOM = u('\ufeff') + _parser_params = """Also supports optionally iterating or breaking of the file into chunks. @@ -2161,6 +2168,67 @@ def _buffered_line(self): else: return self._next_line() + def _check_for_bom(self, first_row): + """ + Checks whether the file begins with the BOM character. + If it does, remove it. In addition, if there is quoting + in the field subsequent to the BOM, remove it as well + because it technically takes place at the beginning of + the name, not the middle of it. + """ + # first_row will be a list, so we need to check + # that that list is not empty before proceeding. + if not first_row: + return first_row + + # The first element of this row is the one that could have the + # BOM that we want to remove. Check that the first element is a + # string before proceeding. + if not isinstance(first_row[0], compat.string_types): + return first_row + + # Check that the string is not empty, as that would + # obviously not have a BOM at the start of it. + if not first_row[0]: + return first_row + + # Since the string is non-empty, check that it does + # in fact begin with a BOM. 
+ first_elt = first_row[0][0] + + # This is to avoid warnings we get in Python 2.x if + # we find ourselves comparing with non-Unicode + if compat.PY2 and not isinstance(first_elt, unicode): # noqa + try: + first_elt = u(first_elt) + except UnicodeDecodeError: + return first_row + + if first_elt != _BOM: + return first_row + + first_row = first_row[0] + + if len(first_row) > 1 and first_row[1] == self.quotechar: + start = 2 + quote = first_row[1] + end = first_row[2:].index(quote) + 2 + + # Extract the data between the quotation marks + new_row = first_row[start:end] + + # Extract any remaining data after the second + # quotation mark. + if len(first_row) > end + 1: + new_row += first_row[end + 1:] + return [new_row] + elif len(first_row) > 1: + return [first_row[1:]] + else: + # First row is just the BOM, so we + # return an empty string. + return [""] + def _empty(self, line): return not line or all(not x for x in line) @@ -2212,6 +2280,12 @@ def _next_line(self): line = ret[0] break + # This was the first line of the file, + # which could contain the BOM at the + # beginning of it. + if self.pos == 1: + line = self._check_for_bom(line) + self.line_pos += 1 self.buf.append(line) return line diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 129e925e38d5b..7558e4bb63226 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1517,3 +1517,54 @@ def test_null_byte_char(self): msg = "NULL byte detected" with tm.assertRaisesRegexp(csv.Error, msg): self.read_csv(StringIO(data), names=cols) + + def test_utf8_bom(self): + # see gh-4793 + bom = u('\ufeff') + utf8 = 'utf-8' + + def _encode_data_with_bom(_data): + bom_data = (bom + _data).encode(utf8) + return BytesIO(bom_data) + + # basic test + data = 'a\n1' + expected = DataFrame({'a': [1]}) + + out = self.read_csv(_encode_data_with_bom(data), + encoding=utf8) + tm.assert_frame_equal(out, expected) + + # test with "regular" quoting + data = '"a"\n1' + expected = DataFrame({'a': [1]}) + + out = self.read_csv(_encode_data_with_bom(data), + encoding=utf8, quotechar='"') + tm.assert_frame_equal(out, expected) + + # test in a data row instead of header + data = 'b\n1' + expected = DataFrame({'a': ['b', '1']}) + + out = self.read_csv(_encode_data_with_bom(data), + encoding=utf8, names=['a']) + tm.assert_frame_equal(out, expected) + + # test in empty data row with skipping + data = '\n1' + expected = DataFrame({'a': [1]}) + + out = self.read_csv(_encode_data_with_bom(data), + encoding=utf8, names=['a'], + skip_blank_lines=True) + tm.assert_frame_equal(out, expected) + + # test in empty data row without skipping + data = '\n1' + expected = DataFrame({'a': [np.nan, 1.0]}) + + out = self.read_csv(_encode_data_with_bom(data), + encoding=utf8, names=['a'], + skip_blank_lines=False) + tm.assert_frame_equal(out, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index cc89fc51792dd..3c09933b3ec87 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -704,6 +704,11 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { self->datapos = i; \ TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); +#define CHECK_FOR_BOM() \ + if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ + buf += 3; \ + self->datapos += 3; \ + } int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { @@ -736,6 +741,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit) 
TRACE(("%s\n", buf)); + if (self->file_lines == 0) { + CHECK_FOR_BOM(); + } + for (i = self->datapos; i < self->datalen; ++i) { // next character in file From 3186fef545b55ca7b4a3c79c800f32b1d586545e Mon Sep 17 00:00:00 2001 From: conquistador1492 Date: Fri, 5 Aug 2016 23:52:04 +0400 Subject: [PATCH 222/359] BUG: pd.to_datetime doesn't raises AttributeError with specific inputs when errors='ignore'(#12424) (#13909) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tests/test_algos.py | 6 ++++++ pandas/tslib.pyx | 6 +++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9b976c9a7e4da..581daab5cea58 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -876,6 +876,7 @@ Bug Fixes - Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`) - Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`) - Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) +- Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 94c67ac7dd61a..490f4fe81ecbd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -568,6 +568,12 @@ def test_value_counts_datetime_outofbounds(self): exp = pd.Series([3, 2, 1], index=exp_index) tm.assert_series_equal(res, exp) + # GH 12424 + res = pd.to_datetime(pd.Series(['2362-01-01', np.nan]), + errors='ignore') + exp = pd.Series(['2362-01-01', np.nan], dtype=object) + tm.assert_series_equal(res, exp) + def test_categorical(self): s = Series(pd.Categorical(list('aaabbc'))) result = s.value_counts() diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 56a007bfa352c..32b2bf075991b 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2396,10 +2396,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', # set as nan except if its a NaT if _checknull_with_nat(val): - if val.view('i8') == NPY_NAT: - oresult[i] = NaT - else: + if PyFloat_Check(val): oresult[i] = np.nan + else: + oresult[i] = NaT elif util.is_datetime64_object(val): if get_datetime64_value(val) == NPY_NAT: oresult[i] = NaT From 55a0c2eaa585531445a19dca760348e14ba1229f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Aug 2016 17:44:17 -0400 Subject: [PATCH 223/359] BLD: split join.pyx from algos.pyx (#13925) closes #13921 --- ci/lint.sh | 2 +- pandas/algos.pyx | 3 - pandas/indexes/base.py | 9 +-- pandas/indexes/numeric.py | 17 +++--- pandas/src/join.pyx | 55 ++++++++++++++++--- ...{algos_join_helper.pxi => join_helper.pxi} | 0 ..._join_helper.pxi.in => join_helper.pxi.in} | 0 pandas/tests/test_algos.py | 33 +++++------ pandas/tests/test_expressions.py | 5 +- pandas/tools/merge.py | 20 +++---- pandas/tools/tests/test_join.py | 8 +-- pandas/tseries/index.py | 9 +-- pandas/tseries/tdi.py | 10 ++-- pandas/util/testing.py | 2 +- setup.py | 8 ++- 15 files changed, 113 insertions(+), 68 deletions(-) rename pandas/src/{algos_join_helper.pxi => join_helper.pxi} (100%) rename pandas/src/{algos_join_helper.pxi.in => join_helper.pxi.in} (100%) diff --git a/ci/lint.sh b/ci/lint.sh index 
3adfa8d1e3d33..61d74ae28377e 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -20,7 +20,7 @@ if [ "$LINT" ]; then echo "Linting *.py DONE" echo "Linting *.pyx" - for path in 'window.pyx' + for path in 'window.pyx' "src/join.pyx" do echo "linting -> pandas/$path" flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126 diff --git a/pandas/algos.pyx b/pandas/algos.pyx index cccc5377d0dec..44288ab9621f1 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1340,10 +1340,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n): return result -include "join.pyx" - # generated from template include "algos_common_helper.pxi" include "algos_groupby_helper.pxi" -include "algos_join_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 32bcb0bcc732f..de7780d25b1e5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -5,6 +5,7 @@ import numpy as np import pandas.tslib as tslib import pandas.lib as lib +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array @@ -110,10 +111,10 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): # Cython methods _groupby = _algos.groupby_object _arrmap = _algos.arrmap_object - _left_indexer_unique = _algos.left_join_indexer_unique_object - _left_indexer = _algos.left_join_indexer_object - _inner_indexer = _algos.inner_join_indexer_object - _outer_indexer = _algos.outer_join_indexer_object + _left_indexer_unique = _join.left_join_indexer_unique_object + _left_indexer = _join.left_join_indexer_object + _inner_indexer = _join.inner_join_indexer_object + _outer_indexer = _join.outer_join_indexer_object _box_scalars = False _typ = 'index' diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 86d22e141f781..82a6ec0b28ac9 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -1,5 +1,6 @@ import numpy as np import pandas.lib as lib +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index @@ -114,10 +115,10 @@ class Int64Index(NumericIndex): _typ = 'int64index' _groupby = _algos.groupby_int64 _arrmap = _algos.arrmap_int64 - _left_indexer_unique = _algos.left_join_indexer_unique_int64 - _left_indexer = _algos.left_join_indexer_int64 - _inner_indexer = _algos.inner_join_indexer_int64 - _outer_indexer = _algos.outer_join_indexer_int64 + _left_indexer_unique = _join.left_join_indexer_unique_int64 + _left_indexer = _join.left_join_indexer_int64 + _inner_indexer = _join.inner_join_indexer_int64 + _outer_indexer = _join.outer_join_indexer_int64 _can_hold_na = False @@ -211,10 +212,10 @@ class Float64Index(NumericIndex): _engine_type = _index.Float64Engine _groupby = _algos.groupby_float64 _arrmap = _algos.arrmap_float64 - _left_indexer_unique = _algos.left_join_indexer_unique_float64 - _left_indexer = _algos.left_join_indexer_float64 - _inner_indexer = _algos.inner_join_indexer_float64 - _outer_indexer = _algos.outer_join_indexer_float64 + _left_indexer_unique = _join.left_join_indexer_unique_float64 + _left_indexer = _join.left_join_indexer_float64 + _inner_indexer = _join.inner_join_indexer_float64 + _outer_indexer = _join.outer_join_indexer_float64 _default_dtype = np.float64 diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index fbbef8a31071f..f3c7577ef528a 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -1,3 +1,40 @@ +# cython: profile=False + +from numpy 
cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cimport util + +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef double NaN = np.NaN +cdef double nan = NaN + +from pandas.algos import groupsort_indexer + + def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: @@ -48,6 +85,7 @@ def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) + def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups, sort=True): cdef: @@ -117,14 +155,13 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, rev, _ = groupsort_indexer(left_indexer, len(left)) if rev.dtype != np.int_: - rev = rev.astype(np.int_) + rev = rev.astype(np.int_) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) return left_indexer, right_indexer - def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups, # ignored bint allow_exact_matches=1, @@ -140,7 +177,8 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, int64_t tolerance_ # if we are using tolerance, set our objects - if left_values is not None and right_values is not None and tolerance is not None: + if (left_values is not None and right_values is not None and + tolerance is not None): has_tolerance = 1 left_values_ = left_values right_values_ = right_values @@ -160,10 +198,12 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, # find last position in right whose value is less than left's value if allow_exact_matches: - while right_pos < right_size and right[right_pos] <= left[left_pos]: + while (right_pos < right_size and + right[right_pos] <= left[left_pos]): right_pos += 1 else: - while right_pos < right_size and right[right_pos] < left[left_pos]: + while (right_pos < right_size and + right[right_pos] < left[left_pos]): right_pos += 1 right_pos -= 1 @@ -243,7 +283,6 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, _get_result_indexer(right_sorter, right_indexer)) - def _get_result_indexer(sorter, indexer): if indexer.dtype != np.int_: indexer = indexer.astype(np.int_) @@ -258,7 +297,6 @@ def _get_result_indexer(sorter, indexer): return res - def ffill_indexer(ndarray[int64_t] indexer): cdef: Py_ssize_t i, n = len(indexer) @@ -301,3 +339,6 @@ def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, last_obs[gid] = val return result + + +include "join_helper.pxi" diff --git a/pandas/src/algos_join_helper.pxi b/pandas/src/join_helper.pxi similarity index 100% rename from pandas/src/algos_join_helper.pxi rename to pandas/src/join_helper.pxi diff --git a/pandas/src/algos_join_helper.pxi.in b/pandas/src/join_helper.pxi.in similarity index 100% rename from pandas/src/algos_join_helper.pxi.in rename to 
pandas/src/join_helper.pxi.in diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 490f4fe81ecbd..66fd1861f08f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -12,6 +12,7 @@ import pandas.algos as _algos from pandas.compat import lrange import pandas.core.algorithms as algos +import pandas._join as _join import pandas.util.testing as tm import pandas.hashtable as hashtable from pandas.compat.numpy import np_array_datetime64_compat @@ -303,11 +304,11 @@ class TestIndexer(tm.TestCase): _multiprocess_can_split_ = True def test_outer_join_indexer(self): - typemap = [('int32', algos.algos.outer_join_indexer_int32), - ('int64', algos.algos.outer_join_indexer_int64), - ('float32', algos.algos.outer_join_indexer_float32), - ('float64', algos.algos.outer_join_indexer_float64), - ('object', algos.algos.outer_join_indexer_object)] + typemap = [('int32', _join.outer_join_indexer_int32), + ('int64', _join.outer_join_indexer_int64), + ('float32', _join.outer_join_indexer_float32), + ('float64', _join.outer_join_indexer_float64), + ('object', _join.outer_join_indexer_object)] for dtype, indexer in typemap: left = np.arange(3, dtype=dtype) @@ -1070,7 +1071,7 @@ def test_left_join_indexer_unique(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - result = _algos.left_join_indexer_unique_int64(b, a) + result = _join.left_join_indexer_unique_int64(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) assert (np.array_equal(result, expected)) @@ -1086,7 +1087,7 @@ def test_left_outer_join_bug(): right = np.array([3, 1], dtype=np.int64) max_groups = 4 - lidx, ridx = _algos.left_outer_join(left, right, max_groups, sort=False) + lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) exp_lidx = np.arange(len(left)) exp_ridx = -np.ones(len(left)) @@ -1101,7 +1102,7 @@ def test_inner_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _algos.inner_join_indexer_int64(a, b) + index, ares, bres = _join.inner_join_indexer_int64(a, b) index_exp = np.array([3, 5], dtype=np.int64) assert_almost_equal(index, index_exp) @@ -1114,7 +1115,7 @@ def test_inner_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _algos.inner_join_indexer_int64(a, b) + index, ares, bres = _join.inner_join_indexer_int64(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -1124,7 +1125,7 @@ def test_outer_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _algos.outer_join_indexer_int64(a, b) + index, ares, bres = _join.outer_join_indexer_int64(a, b) index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) assert_almost_equal(index, index_exp) @@ -1137,7 +1138,7 @@ def test_outer_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _algos.outer_join_indexer_int64(a, b) + index, ares, bres = _join.outer_join_indexer_int64(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -1147,7 +1148,7 @@ def test_left_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b 
= np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _algos.left_join_indexer_int64(a, b) + index, ares, bres = _join.left_join_indexer_int64(a, b) assert_almost_equal(index, a) @@ -1159,7 +1160,7 @@ def test_left_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _algos.left_join_indexer_int64(a, b) + index, ares, bres = _join.left_join_indexer_int64(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -1169,7 +1170,7 @@ def test_left_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _algos.left_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -1185,7 +1186,7 @@ def test_outer_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _algos.outer_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -1201,7 +1202,7 @@ def test_inner_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _algos.inner_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5], dtype=np.int64) assert_almost_equal(res, exp_res) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index cc0972937b8a2..c037f02f20609 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -208,8 +208,9 @@ def test_float_panel(self): @slow def test_panel4d(self): - self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, - assert_func=assert_panel4d_equal, binary_comp=3) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, + assert_func=assert_panel4d_equal, binary_comp=3) def test_mixed_arithmetic_frame(self): # TODO: FIGURE OUT HOW TO GET IT TO WORK... 
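
The routines relocated into the new ``pandas._join`` extension module are the low-level indexer builders that ``merge`` drives; the remaining hunks in this patch simply repoint ``pandas.tools.merge`` and the index classes at the new module. A minimal sketch of how these internals are exercised, mirroring the tests in this patch and assuming a build in which the split-out ``pandas._join`` module has been compiled::

    import numpy as np
    import pandas._join as _join

    # group codes produced by factorizing the join keys (int64)
    left = np.array([0, 1, 2, 1, 3], dtype=np.int64)
    right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
    max_groups = 5

    # pairs row positions of `left` with matching positions in `right`;
    # left rows without a match are marked with -1 on the right side
    left_indexer, right_indexer = _join.left_outer_join(left, right, max_groups)

    # dtype-specialized variant that aligns two sorted int64 key arrays
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
    index, a_indexer, b_indexer = _join.inner_join_indexer_int64(a, b)
    # index -> array([3, 5]); a_indexer/b_indexer give the positions in a and b

Splitting these helpers out of ``algos.pyx`` keeps the join machinery in one compiled unit, which is the motivation for the ``setup.py`` and ``ci/lint.sh`` changes above.
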
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 9f8e27c4d8176..571df70e05c6d 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -39,7 +39,7 @@ import pandas.core.common as com import pandas.types.concat as _concat -import pandas.algos as _algos +import pandas._join as _join import pandas.hashtable as _hash @@ -918,8 +918,8 @@ def get_result(self): rdata.items, rsuf) if self.fill_method == 'ffill': - left_join_indexer = _algos.ffill_indexer(left_indexer) - right_join_indexer = _algos.ffill_indexer(right_indexer) + left_join_indexer = _join.ffill_indexer(left_indexer) + right_join_indexer = _join.ffill_indexer(right_indexer) else: left_join_indexer = left_indexer right_join_indexer = right_indexer @@ -1094,13 +1094,13 @@ def _get_multiindex_indexer(join_keys, index, sort): # factorize keys to a dense i8 space lkey, rkey, count = fkeys(lkey, rkey) - return _algos.left_outer_join(lkey, rkey, count, sort=sort) + return _join.left_outer_join(lkey, rkey, count, sort=sort) def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) - left_indexer, right_indexer = _algos.left_outer_join( + left_indexer, right_indexer = _join.left_outer_join( _ensure_int64(left_key), _ensure_int64(right_key), count, sort=sort) @@ -1135,15 +1135,15 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = _algos.left_outer_join(y, x, max_groups) + right_indexer, left_indexer = _join.left_outer_join(y, x, max_groups) return left_indexer, right_indexer _join_functions = { - 'inner': _algos.inner_join, - 'left': _algos.left_outer_join, + 'inner': _join.inner_join, + 'left': _join.left_outer_join, 'right': _right_outer_join, - 'outer': _algos.full_outer_join, - 'asof': _algos.left_outer_asof_join, + 'outer': _join.full_outer_join, + 'asof': _join.left_outer_asof_join, } diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py index cb84c1f06653b..f33d5f16cd439 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -12,7 +12,7 @@ from pandas.util.testing import assert_frame_equal from pandas import DataFrame, MultiIndex, Series -import pandas.algos as algos +import pandas._join as _join import pandas.util.testing as tm from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS @@ -51,7 +51,7 @@ def test_cython_left_outer_join(self): right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 - ls, rs = algos.left_outer_join(left, right, max_group) + ls, rs = _join.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') @@ -75,7 +75,7 @@ def test_cython_right_outer_join(self): right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 - rs, ls = algos.left_outer_join(right, left, max_group) + rs, ls = _join.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') @@ -101,7 +101,7 @@ def test_cython_inner_join(self): right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 - ls, rs = algos.inner_join(left, right, max_group) + ls, rs = _join.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 4a7ba0286aab1..aa50fbe316b94 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -46,6 +46,7 @@ 
import pandas.lib as lib import pandas.tslib as tslib import pandas._period as period +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index @@ -204,11 +205,11 @@ def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', **kwargs) - _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - _algos.left_join_indexer_unique_int64, with_indexers=False) + _join.left_join_indexer_unique_int64, with_indexers=False) _arrmap = None __eq__ = _dt_index_cmp('__eq__') diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 8aad5bdd35f65..921f60b23d187 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -32,7 +32,7 @@ import pandas.lib as lib import pandas.tslib as tslib -import pandas.algos as _algos +import pandas._join as _join import pandas.index as _index Timedelta = tslib.Timedelta @@ -122,11 +122,11 @@ def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper( joinf, dtype='m8[ns]', **kwargs) - _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - _algos.left_join_indexer_unique_int64, with_indexers=False) + _join.left_join_indexer_unique_int64, with_indexers=False) _arrmap = None _datetimelike_ops = ['days', 'seconds', 'microseconds', 'nanoseconds', 'freq', 'components'] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c6573934bff57..e95808ddc8225 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1040,7 +1040,7 @@ def assert_numpy_array_equal(left, right, strict_nan=False, Specify object name being compared, internally used to show appropriate assertion message check_same : None|'copy'|'same', default None - Ensure "left" and "right refer/do not refer to the same memory area + Ensure left and right refer/do not refer to the same memory area """ # instance validation diff --git a/setup.py b/setup.py index c985445a08155..5bf188d829d26 100755 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in', + 'join_helper.pxi.in', 'algos_take_helper.pxi.in', 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in', 'sparse_op_helper.pxi.in'] @@ -308,6 +308,7 @@ class CheckSDist(sdist_class): 'pandas/tslib.pyx', 'pandas/index.pyx', 'pandas/algos.pyx', + 'pandas/join.pyx', 'pandas/window.pyx', 'pandas/parser.pyx', 'pandas/src/period.pyx', @@ -464,8 +465,9 @@ def pxd(name): 'sources': ['pandas/src/datetime/np_datetime.c', 'pandas/src/datetime/np_datetime_strings.c']}, algos={'pyxfile': 'algos', - 'pxdfiles': ['src/util'], - 'depends': [srcpath('join', suffix='.pyx')]}, + 'pxdfiles': 
['src/util']}, + _join={'pyxfile': 'src/join', + 'pxdfiles': ['src/util']}, _window={'pyxfile': 'window', 'pxdfiles': ['src/skiplist', 'src/util'], 'depends': ['pandas/src/skiplist.pyx', From cffe6f224322ab859eb952db8e5f1c6c58117d0b Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 6 Aug 2016 18:41:02 -0400 Subject: [PATCH 224/359] BUG: DatetimeTz shift raises AmbiguousTimeError near DST xref #13650 ``` Author: sinhrks Closes #13926 from sinhrks/dttz_shift_dst and squashes the following commits: c079ee3 [sinhrks] BUG: DatetimeTz shift raises AmbiguousTimeError near DST --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/internals.py | 5 ++--- pandas/tests/series/test_timeseries.py | 29 +++++++++++++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 581daab5cea58..644ad1e94fe66 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -875,6 +875,7 @@ Bug Fixes - Clean some compile time warnings in datetime parsing (:issue:`13607`) - Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`) - Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`) +- Bug in ``.shift`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13926`) - Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) - Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83fba7a0ce8b5..18b67c41b4554 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2438,15 +2438,14 @@ def shift(self, periods, axis=0, mgr=None): else: indexer[:periods] = np.arange(-periods, N) - # move to UTC & take - new_values = self.values.tz_localize(None).asi8.take(indexer) + new_values = self.values.asi8.take(indexer) if periods > 0: new_values[:periods] = tslib.iNaT else: new_values[periods:] = tslib.iNaT - new_values = DatetimeIndex(new_values, tz=self.values.tz) + new_values = self.values._shallow_copy(new_values) return [self.make_block_same_class(new_values, placement=self.mgr_locs)] diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 19acf54c7a3cb..341d18f987abc 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -5,7 +5,7 @@ import numpy as np -from pandas import Index, Series, date_range +from pandas import Index, Series, date_range, NaT from pandas.tseries.index import DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex @@ -93,6 +93,33 @@ def test_shift(self): tz='CET'), name='foo') self.assertRaises(ValueError, lambda: s - s2) + def test_shift_dst(self): + # GH 13926 + dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern') + s = Series(dates) + + res = s.shift(0) + tm.assert_series_equal(res, s) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + + res = s.shift(1) + exp_vals = [NaT] + dates.asobject.values.tolist()[:9] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + + res = s.shift(-2) + exp_vals = dates.asobject.values.tolist()[2:] + [NaT, NaT] + exp = Series(exp_vals) + 
tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + + for ex in [10, -10, 20, -20]: + res = s.shift(ex) + exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]') + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + def test_tshift(self): # PeriodIndex ps = tm.makePeriodSeries() From a292c13a7f831145f3daac9881813aeb7ff08138 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 6 Aug 2016 18:50:17 -0400 Subject: [PATCH 225/359] ENH: parse categoricals in read_csv Closes #10153 Author: Chris Closes #13406 from chris-b1/categorical-parse and squashes the following commits: c78f39f [Chris] rebase fixup 75ed6ba [Chris] doc fixups 1f6093a [Chris] rebase 0f0dba6 [Chris] wip da5c5b5 [Chris] flake8 fix 1254768 [Chris] doc fixups; addl tests 2490949 [Chris] fix hash table ordering, null categories 4e0722d [Chris] undo type inference add docs and asv 849a112 [Chris] fix some dtype checking cfa0ce4 [Chris] clean up dtype checking, add function specialization 286d907 [Chris] ENH: parse categoricals in read_csv --- asv_bench/benchmarks/parser_vb.py | 21 +++ doc/source/io.rst | 37 ++++ doc/source/whatsnew/v0.19.0.txt | 49 +++++ pandas/io/tests/parser/c_parser_only.py | 94 +++++++++- pandas/parser.pyx | 240 ++++++++++++++++-------- pandas/tools/tests/test_concat.py | 3 + 6 files changed, 367 insertions(+), 77 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 04f25034638cd..6dc8bffd6dac9 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -114,6 +114,27 @@ def teardown(self): os.remove('test.csv') +class read_csv_categorical(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] + df = DataFrame({'a': np.random.choice(group1, N).astype('object'), + 'b': np.random.choice(group1, N).astype('object'), + 'c': np.random.choice(group1, N).astype('object')}) + df.to_csv('strings.csv', index=False) + + def time_read_csv_categorical_post(self): + read_csv('strings.csv').apply(pd.Categorical) + + def time_read_csv_categorical_direct(self): + read_csv('strings.csv', dtype='category') + + def teardown(self): + os.remove('strings.csv') + + class read_table_multiple_date(object): goal_time = 0.2 diff --git a/doc/source/io.rst b/doc/source/io.rst index 2866371cce61a..7917e6b4cdfce 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -500,6 +500,43 @@ worth trying. data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. +.. _io.categorical: + +Specifying Categorical dtype +'''''''''''''''''''''''''''' + +.. versionadded:: 0.19.0 + +``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` + +.. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a ``Categorical`` using a dict specification + +.. ipython:: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as strings (object dtype). + If the categories are numeric they can be converted using the + :func:`to_numeric` function, or as appropriate, another converter + such as :func:`to_datetime`. + + .. 
ipython:: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] Naming and Using Columns diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 644ad1e94fe66..385599b1c6b9e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -12,6 +12,7 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` +- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -195,6 +196,14 @@ default of the index) in a DataFrame. ``read_csv`` has improved support for duplicate column names ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: + + :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) @@ -222,6 +231,46 @@ New behaviour: In [2]: pd.read_csv(StringIO(data), names=names) + +.. _whatsnew_0190.enhancements.read_csv_categorical: + +:func:`read_csv` supports parsing ``Categorical`` directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`read_csv` function now supports parsing a ``Categorical`` column when +specified as a dtype (:issue:`10153`). Depending on the structure of the data, +this can result in a faster parse time and lower memory usage compared to +converting to ``Categorical`` after parsing. See the io :ref:`docs here ` + +.. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a ``Categorical`` using a dict specification + +.. ipython:: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as strings (object dtype). + If the categories are numeric they can be converted using the + :func:`to_numeric` function, or as appropriate, another converter + such as :func:`to_datetime`. + + .. ipython:: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] + .. 
_whatsnew_0190.enhancements.semi_month_offsets: Semi-Month Offsets diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 103c9fa2b7ce8..4cea9e1d6b595 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,9 +12,10 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex +from pandas import DataFrame, Series, Index, MultiIndex, Categorical from pandas import compat from pandas.compat import StringIO, range, lrange +from pandas.types.dtypes import CategoricalDtype class CParserTests(object): @@ -135,6 +136,11 @@ def test_passing_dtype(self): dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0) + # valid but unsupported - fixed width unicode string + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'U8'}, + index_col=0) + # see gh-12048: empty frame actual = self.read_csv(StringIO('A,B'), dtype=str) expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) @@ -184,6 +190,92 @@ def test_pass_dtype(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'object') + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', 'b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', np.nan, 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + pth = tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + expected = expected.apply(Categorical) + actual = self.read_table(pth, encoding=encoding, dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'])}), + 
pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'])}, + index=[2, 3])] + actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( diff --git a/pandas/parser.pyx b/pandas/parser.pyx index e72e2f90a5213..5af82be5b741b 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -25,6 +25,7 @@ cdef extern from "Python.h": cdef extern from "stdlib.h": void memcpy(void *dst, void *src, size_t n) +cimport cython cimport numpy as cnp from numpy cimport ndarray, uint8_t, uint64_t @@ -33,6 +34,15 @@ import numpy as np cimport util import pandas.lib as lib +from pandas.types.common import (is_categorical_dtype, CategoricalDtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_string_dtype, is_datetime64_dtype, + pandas_dtype) +from pandas.core.categorical import Categorical +from pandas.core.algorithms import take_1d +from pandas.types.concat import union_categoricals +from pandas import Index import time import os @@ -399,11 +409,12 @@ cdef class TextReader: self._set_quoting(quotechar, quoting) - # TODO: endianness just a placeholder? + + dtype_order = ['int64', 'float64', 'bool', 'object'] if quoting == QUOTE_NONNUMERIC: - self.dtype_cast_order = [' 1: @@ -472,15 +483,10 @@ cdef class TextReader: self.encoding = encoding if isinstance(dtype, dict): - conv = {} - for k in dtype: - v = dtype[k] - if isinstance(v, basestring): - v = np.dtype(v) - conv[k] = v - dtype = conv + dtype = {k: pandas_dtype(dtype[k]) + for k in dtype} elif dtype is not None: - dtype = np.dtype(dtype) + dtype = pandas_dtype(dtype) self.dtype = dtype @@ -689,6 +695,7 @@ cdef class TextReader: int status Py_ssize_t size char *errors = "strict" + cdef StringPath path = _string_path(self.c_encoding) header = [] @@ -718,20 +725,18 @@ cdef class TextReader: field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] - # TODO: Py3 vs. 
Py2 counts = {} unnamed_count = 0 for i in range(field_count): word = self.parser.words[start + i] - if self.c_encoding == NULL and not PY3: + if path == CSTRING: name = PyBytes_FromString(word) - else: - if self.c_encoding == NULL or self.c_encoding == b'utf-8': - name = PyUnicode_FromString(word) - else: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + elif path == UTF8: + name = PyUnicode_FromString(word) + elif path == ENCODED: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) if name == '': if self.has_mi_columns: @@ -1076,17 +1081,12 @@ cdef class TextReader: col_dtype = self.dtype[i] else: if self.dtype.names: - col_dtype = self.dtype.descr[i][1] + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) else: col_dtype = self.dtype if col_dtype is not None: - if not isinstance(col_dtype, basestring): - if isinstance(col_dtype, np.dtype): - col_dtype = col_dtype.str - else: - col_dtype = np.dtype(col_dtype).str - col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, na_filter, 1, na_hashset, na_flist) @@ -1104,7 +1104,7 @@ cdef class TextReader: dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - '|O8', i, start, end, na_filter, 0, na_hashset, na_flist) + np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist) if col_res is not None: break @@ -1136,90 +1136,88 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if dtype[1] == 'i' or dtype[1] == 'u': - result, na_count = _try_int64(self.parser, i, start, end, - na_filter, na_hashset) + if is_integer_dtype(dtype): + result, na_count = _try_int64(self.parser, i, start, end, na_filter, + na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + "column {column}".format(column=i)) - if result is not None and dtype[1:] != 'i8': + if result is not None and dtype != 'int64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'f': + elif is_float_dtype(dtype): result, na_count = _try_double(self.parser, i, start, end, na_filter, na_hashset, na_flist) - if result is not None and dtype[1:] != 'f8': + if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'b': + elif is_bool_dtype(dtype): result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) return result, na_count - elif dtype[1] == 'c': - raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) - - elif dtype[1] == 'S': + elif dtype.kind == 'S': # TODO: na handling - width = int(dtype[2:]) + width = dtype.itemsize if width > 0: result = _to_fw_string(self.parser, i, start, end, width) return result, 0 # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, - na_hashset) - elif dtype[1] == 'U': - width = int(dtype[2:]) + na_hashset) + elif dtype.kind == 'U': + width = dtype.itemsize if width > 0: - raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not supported for parsing" % dtype) # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - - - elif dtype[1] == 'O': + elif is_categorical_dtype(dtype): + codes, cats, na_count = _categorical_convert(self.parser, i, start, + end, na_filter, na_hashset, + 
self.c_encoding) + # sort categories and recode if necessary + cats = Index(cats) + if not cats.is_monotonic_increasing: + unsorted = cats.copy() + cats = cats.sort_values() + indexer = cats.get_indexer(unsorted) + codes = take_1d(indexer, codes, fill_value=-1) + + return Categorical(codes, categories=cats, ordered=False, + fastpath=True), na_count + elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) + elif is_datetime64_dtype(dtype): + raise TypeError("the dtype %s is not supported for parsing, " + "pass this column using parse_dates instead" % dtype) else: - if dtype[1] == 'M': - raise TypeError("the dtype %s is not supported for parsing, " - "pass this column using parse_dates instead" % dtype) raise TypeError("the dtype %s is not supported for parsing" % dtype) cdef _string_convert(self, Py_ssize_t i, int start, int end, bint na_filter, kh_str_t *na_hashset): - if PY3: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_factorize(self.parser, i, start, end, - na_filter, na_hashset) + + cdef StringPath path = _string_path(self.c_encoding) + + if path == UTF8: + return _string_box_utf8(self.parser, i, start, end, na_filter, + na_hashset) + elif path == ENCODED: + return _string_box_decode(self.parser, i, start, end, + na_filter, na_hashset, self.c_encoding) + elif path == CSTRING: + return _string_box_factorize(self.parser, i, start, end, + na_filter, na_hashset) + def _get_converter(self, i, name): if self.converters is None: @@ -1331,6 +1329,19 @@ def _maybe_upcast(arr): return arr +cdef enum StringPath: + CSTRING + UTF8 + ENCODED + +# factored out logic to pick string converter +cdef inline StringPath _string_path(char *encoding): + if encoding != NULL and encoding != b"utf-8": + return ENCODED + elif PY3 or encoding != NULL: + return UTF8 + else: + return CSTRING # ---------------------------------------------------------------------- # Type conversions / inference support code @@ -1500,6 +1511,77 @@ cdef _string_box_decode(parser_t *parser, int col, return result, na_count +@cython.boundscheck(False) +cdef _categorical_convert(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + char *encoding): + "Convert column data into codes, categories" + cdef: + int error, na_count = 0 + Py_ssize_t i, size + size_t lines + coliter_t it + const char *word = NULL + + int64_t NA = -1 + int64_t[:] codes + int64_t current_category = 0 + + char *errors = "strict" + cdef StringPath path = _string_path(encoding) + + int ret = 0 + kh_str_t *table + khiter_t k + + lines = line_end - line_start + codes = np.empty(lines, dtype=np.int64) + + # factorize parsed values, creating a hash table + # bytes -> category code + with nogil: + table = kh_init_str() + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + COLITER_NEXT(it, word) + + if na_filter: + k = kh_get_str(na_hashset, word) + # is in NA values + if k != na_hashset.n_buckets: 
+ na_count += 1 + codes[i] = NA + continue + + k = kh_get_str(table, word) + # not in the hash table + if k == table.n_buckets: + k = kh_put_str(table, word, &ret) + table.vals[k] = current_category + current_category += 1 + + codes[i] = table.vals[k] + + # parse and box categories to python strings + result = np.empty(table.n_occupied, dtype=np.object_) + if path == ENCODED: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + size = strlen(table.keys[k]) + result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + elif path == UTF8: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) + elif path == CSTRING: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[table.vals[k]] = PyBytes_FromString(table.keys[k]) + + kh_destroy_str(table) + return np.asarray(codes), result, na_count cdef _to_fw_string(parser_t *parser, int col, int line_start, int line_end, size_t width): @@ -1719,6 +1801,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l const char *word = NULL khiter_t k na_count[0] = 0 + coliter_setup(&it, parser, col, line_start) if na_filter: @@ -1836,6 +1919,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, return 0 + cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1924,7 +2008,11 @@ def _concatenate_chunks(list chunks): common_type = np.find_common_type(dtypes, []) if common_type == np.object: warning_columns.append(str(name)) - result[name] = np.concatenate(arrs) + + if is_categorical_dtype(dtypes.pop()): + result[name] = union_categoricals(arrs, sort_categories=True) + else: + result[name] = np.concatenate(arrs) if warning_columns: warning_names = ','.join(warning_columns) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 225ba533161b3..e3cc60e2856c2 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -850,6 +850,9 @@ def test_union_categorical(self): ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], + ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), + (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), From 63e8f689a13650bda01281ef257f9266e6394881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A2=9C=E5=8F=91=E6=89=8D=EF=BC=88Yan=20Facai=EF=BC=89?= Date: Sat, 6 Aug 2016 18:54:52 -0400 Subject: [PATCH 226/359] BUG: agg() function on groupby dataframe changes dtype of datetime64[ns] column to float64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #12821 closes #12941 Author: 颜发才(Yan Facai) Closes #12992 from ningchi/agg_time_dtype and squashes the following commits: 607a170 [颜发才(Yan Facai)] add whatsnew entry 8d17eed [颜发才(Yan Facai)] BUG: fix GH12941, Operations on NaT returning float instead of datetime64[ns] a949cee [颜发才(Yan Facai)] BUG: fix GH12821, agg() function on groupby dataframe changes dtype of datetime64[ns] column to float64 914ed71 [颜发才(Yan Facai)] add test function --- .gitignore | 1 + doc/source/whatsnew/v0.19.0.txt | 6 ++++++ pandas/core/nanops.py | 13 +++++++++---- pandas/tests/frame/test_timeseries.py | 24 ++++++++++++++++++++++++ pandas/tests/test_groupby.py | 26 ++++++++++++++++++++++++++ 
pandas/types/cast.py | 3 ++- 6 files changed, 68 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index d987bab6fd5d7..19f1cc804dca0 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ .vagrant .noseids .ipynb_checkpoints +.tags # Compiled source # ################### diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 385599b1c6b9e..f93e8f4240787 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -960,5 +960,11 @@ Bug Fixes - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) + +- Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) + +- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) + - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) + - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7b89373dda7ba..2199daf549824 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -17,6 +17,7 @@ is_integer, is_complex, is_float_dtype, is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, + is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_datetime_or_timedelta_dtype, is_int_or_datetime_dtype, is_any_int_dtype) @@ -638,11 +639,15 @@ def _maybe_null_out(result, axis, mask): if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 if np.any(null_mask): - if np.iscomplexobj(result): - result = result.astype('c16') + if is_numeric_dtype(result): + if np.iscomplexobj(result): + result = result.astype('c16') + else: + result = result.astype('f8') + result[null_mask] = np.nan else: - result = result.astype('f8') - result[null_mask] = np.nan + # GH12941, use None to auto cast null + result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() if null_mask == 0: diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index b9baae6cbeda7..4916d81b18c22 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -341,3 +341,27 @@ def test_first_last_valid(self): empty = DataFrame() self.assertIsNone(empty.last_valid_index()) self.assertIsNone(empty.first_valid_index()) + + def test_operation_on_NaT(self): + # Both NaT and Timestamp are in DataFrame. + df = pd.DataFrame({'foo': [pd.NaT, pd.NaT, + pd.Timestamp('2012-05-01')]}) + + res = df.min() + exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + tm.assert_series_equal(res, exp) + + # GH12941, only NaTs are in DataFrame. 
+ df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]}) + + res = df.min() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 268dcfc5744c1..cc588d891b398 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -720,6 +720,32 @@ def test_agg_period_index(self): grouped = df.groupby(df.index.month) list(grouped) + def test_agg_dict_parameter_cast_result_dtypes(self): + # GH 12821 + + df = DataFrame( + {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], + 'time': date_range('1/1/2011', periods=8, freq='H')}) + df.loc[[0, 1, 2, 5], 'time'] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.first(), exp) + assert_frame_equal(grouped.agg('first'), exp) + assert_frame_equal(grouped.agg({'time': 'first'}), exp) + assert_series_equal(grouped.time.first(), exp['time']) + assert_series_equal(grouped.time.agg('first'), exp['time']) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.last(), exp) + assert_frame_equal(grouped.agg('last'), exp) + assert_frame_equal(grouped.agg({'time': 'last'}), exp) + assert_series_equal(grouped.time.last(), exp['time']) + assert_series_equal(grouped.time.agg('last'), exp['time']) + def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index f4cb476672ec7..e37b418664ba3 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -122,7 +122,8 @@ def trans(x): # noqa return new_result # a datetimelike - elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: + # GH12821, iNaT is casted to float + elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']: try: result = result.astype(dtype) except: From cfbb24f2c0e33ee221c54fd4944458cb0c6f4e84 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Aug 2016 20:23:30 -0400 Subject: [PATCH 227/359] BLD: fix conda-build version --- appveyor.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index c424420dda666..f6b55bf7abf7c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,6 +16,9 @@ environment: CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd" matrix: + # https://github.com/conda/conda-build/issues/1001 + # disabling 3.4 as windows complains upon compiling byte + # code - PYTHON: "C:\\Python34_64" PYTHON_VERSION: "3.4" PYTHON_ARCH: "64" @@ -62,6 +65,9 @@ install: # install our build environment - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false - cmd: conda update -q conda + + # fix conda-build version + - cmd: conda install conda-build=1.21.7 - cmd: conda config --set ssl_verify false # add the pandas channel *before* defaults to have defaults take priority From 142c7968a1918fd56b7ad40ea7c013a48b0c25fc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Aug 2016 21:24:12 -0400 Subject: [PATCH 228/359] COMPAT/TST: windows sparse int comparisons (#13927) --- pandas/sparse/tests/test_arithmetics.py | 35 +++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index 
87efc362581cd..b5945151db678 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -281,28 +281,29 @@ def test_int_array(self): self._check_numeric_ops(a, b, values, rvalues) def test_int_array_comparison(self): - values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0]) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0]) - dtype = np.int64 + # int32 NI ATM + for dtype in ['int64']: + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ['integer', 'block']: - a = self._klass(values, dtype=dtype, kind=kind) - b = self._klass(rvalues, dtype=dtype, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - self._check_comparison_ops(a, b * 0, values, rvalues * 0) + for kind in ['integer', 'block']: + a = self._klass(values, dtype=dtype, kind=kind) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) - b = self._klass(rvalues, dtype=dtype, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) - b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) - b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): From 7e15923d908a42008fb7fecde58be626277fd82e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 7 Aug 2016 10:45:32 -0400 Subject: [PATCH 229/359] CI: disable py3.4 build appveyor --- appveyor.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index f6b55bf7abf7c..503e154e2b8f9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,14 +16,13 @@ environment: CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd" matrix: - # https://github.com/conda/conda-build/issues/1001 - # disabling 3.4 as windows complains upon compiling byte - # code - - PYTHON: "C:\\Python34_64" - PYTHON_VERSION: "3.4" - PYTHON_ARCH: "64" - CONDA_PY: "34" - CONDA_NPY: "19" + + # disable python 3.4 ATM + #- PYTHON: "C:\\Python34_64" + # PYTHON_VERSION: "3.4" + # PYTHON_ARCH: "64" + # CONDA_PY: "34" + # CONDA_NPY: "19" - PYTHON: "C:\\Python27_64" PYTHON_VERSION: "2.7" @@ -67,6 +66,9 @@ install: - cmd: conda update -q conda # fix conda-build version + # https://github.com/conda/conda-build/issues/1001 + # disabling 3.4 as windows complains upon compiling byte + # code - cmd: conda install conda-build=1.21.7 - cmd: conda config --set ssl_verify false From cff1f5595888becee0e58dedf014a12e8131b352 Mon Sep 17 00:00:00 2001 From: "Christopher C. 
Aycock" Date: Mon, 8 Aug 2016 08:20:21 -0400 Subject: [PATCH 230/359] ENH: Faster merge_asof() performs a single pass when joining tables (#13902) This version passes existing regression tests but is ultimately wrong because it requires the "by" column to be a single object. A proper version would handle int (and possily float) columns through type differentiation. Author: Christopher C. Aycock Closes #13903 from chrisaycock/master and squashes the following commits: f0d0165 [Christopher C. Aycock] ENH: Faster merge_asof() performs a single pass when joining tables (#13902) --- asv_bench/benchmarks/join_merge.py | 23 +- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/src/join.pyx | 60 +---- pandas/src/joins_func_helper.pxi | 373 ++++++++++++++++++++++++++ pandas/src/joins_func_helper.pxi.in | 160 +++++++++++ pandas/tools/merge.py | 176 +++++++----- pandas/tools/tests/test_merge_asof.py | 94 +++++++ setup.py | 2 +- 8 files changed, 755 insertions(+), 135 deletions(-) create mode 100644 pandas/src/joins_func_helper.pxi create mode 100644 pandas/src/joins_func_helper.pxi.in diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 86d5f84cb9b36..c98179c8950c5 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -310,7 +310,7 @@ def time_merge_asof_noby(self): merge_asof(self.df1, self.df2, on='time') -class merge_asof_by(object): +class merge_asof_by_object(object): def setup(self): import string @@ -326,7 +326,26 @@ def setup(self): self.df1 = self.df1.sort_values('time') self.df2 = self.df2.sort_values('time') - def time_merge_asof_by(self): + def time_merge_asof_by_object(self): + merge_asof(self.df1, self.df2, on='time', by='key') + + +class merge_asof_by_int(object): + + def setup(self): + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), + 'key': np.random.randint(0, 25, one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), + 'key': np.random.randint(0, 25, two_count), + 'value2': np.random.randn(two_count)}) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + def time_merge_asof_by_int(self): merge_asof(self.df1, self.df2, on='time', by='key') diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f93e8f4240787..843e6de70ce93 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -48,7 +48,7 @@ The following are now part of this API: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to -support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). Full documentation is +support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`, :issue:`13902`). 
Full documentation is :ref:`here ` The :func:`merge_asof` performs an asof merge, which is similar to a left-join diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index f3c7577ef528a..9281453c643ee 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -34,6 +34,8 @@ cdef double nan = NaN from pandas.algos import groupsort_indexer +include "joins_func_helper.pxi" + def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): @@ -162,64 +164,6 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, return left_indexer, right_indexer -def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, - Py_ssize_t max_groups, # ignored - bint allow_exact_matches=1, - left_values=None, - right_values=None, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - ndarray[int64_t] left_values_, right_values_ - int64_t tolerance_ - - # if we are using tolerance, set our objects - if (left_values is not None and right_values is not None and - tolerance is not None): - has_tolerance = 1 - left_values_ = left_values - right_values_ = right_values - tolerance_ = tolerance - - left_size = len(left) - right_size = len(right) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while (right_pos < right_size and - right[right_pos] <= left[left_pos]): - right_pos += 1 - else: - while (right_pos < right_size and - right[right_pos] < left[left_pos]): - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: diff --git a/pandas/src/joins_func_helper.pxi b/pandas/src/joins_func_helper.pxi new file mode 100644 index 0000000000000..7a59da37c5ced --- /dev/null +++ b/pandas/src/joins_func_helper.pxi @@ -0,0 +1,373 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# asof_join_by +#---------------------------------------------------------------------- + + +from hashtable cimport * + + +def asof_join_int64_t_by_object(ndarray[int64_t] left_values, + ndarray[int64_t] right_values, + ndarray[object] left_by_values, + ndarray[object] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + int64_t tolerance_ + PyObjectHashTable hash_table + object by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + 
right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = PyObjectHashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_double_by_object(ndarray[double] left_values, + ndarray[double] right_values, + ndarray[object] left_by_values, + ndarray[object] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + double tolerance_ + PyObjectHashTable hash_table + object by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = PyObjectHashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_int64_t_by_int64_t(ndarray[int64_t] left_values, + ndarray[int64_t] right_values, + ndarray[int64_t] left_by_values, + ndarray[int64_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + int64_t tolerance_ + Int64HashTable hash_table + int64_t by_value + + # if we are 
using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = Int64HashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_double_by_int64_t(ndarray[double] left_values, + ndarray[double] right_values, + ndarray[int64_t] left_by_values, + ndarray[int64_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + double tolerance_ + Int64HashTable hash_table + int64_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = Int64HashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +#---------------------------------------------------------------------- +# asof_join +#---------------------------------------------------------------------- + + +def asof_join_int64_t(ndarray[int64_t] left_values, + 
ndarray[int64_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + int64_t tolerance_ + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_double(ndarray[double] left_values, + ndarray[double] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + double tolerance_ + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in new file mode 100644 index 0000000000000..06c35cfb69e53 --- /dev/null +++ b/pandas/src/joins_func_helper.pxi.in @@ -0,0 +1,160 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# asof_join_by +#---------------------------------------------------------------------- + +{{py: + +# table_type, by_dtype +by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')] + +# on_dtype +on_dtypes = ['int64_t', 'double'] + +}} + + +from hashtable cimport * + +{{for table_type, 
by_dtype in by_dtypes}} +{{for on_dtype in on_dtypes}} + + +def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + ndarray[{{by_dtype}}] left_by_values, + ndarray[{{by_dtype}}] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + {{table_type}} hash_table + {{by_dtype}} by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = {{table_type}}(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + +{{endfor}} +{{endfor}} + + +#---------------------------------------------------------------------- +# asof_join +#---------------------------------------------------------------------- + +{{py: + +# on_dtype +dtypes = ['int64_t', 'double'] + +}} + +{{for on_dtype in dtypes}} + + +def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and 
right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + +{{endfor}} + diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 571df70e05c6d..1572363fc6136 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -17,13 +17,15 @@ is_datetime64_dtype, needs_i8_conversion, is_int64_dtype, + is_integer_dtype, + is_float_dtype, is_integer, is_int_or_datetime_dtype, is_dtype_equal, is_bool, is_list_like, _ensure_int64, - _ensure_platform_int, + _ensure_float64, _ensure_object) from pandas.types.missing import na_value_for_dtype @@ -275,20 +277,17 @@ def merge_asof(left, right, on=None, ---------- left : DataFrame right : DataFrame - on : label or list - Field names to join on. Must be found in both DataFrames. + on : label + Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, - typically a datetimelike or integer. On or left_on/right_on + such as datetimelike, integer, or float. On or left_on/right_on must be given. - left_on : label or list, or array-like - Field names to join on in left DataFrame. Can be a vector or list of - vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns - right_on : label or list, or array-like - Field names to join on in right DataFrame or vector/list of vectors per - left_on docs - by : column name or list of column names - Group both the left and right DataFrames by the group columns; perform + left_on : label + Field name to join on in left DataFrame. + right_on : label + Field name to join on in right DataFrame. + by : column name + Group both the left and right DataFrames by the group column; perform the merge operation on these pieces and recombine. suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right @@ -415,38 +414,12 @@ def merge_asof(left, right, on=None, merge_ordered """ - def _merger(x, y): - # perform the ordered merge operation - op = _AsOfMerge(x, y, - on=on, left_on=left_on, right_on=right_on, - by=by, suffixes=suffixes, - how='asof', tolerance=tolerance, - allow_exact_matches=allow_exact_matches) - return op.get_result() - - if by is not None: - result, groupby = _groupby_and_merge(by, on, left, right, - lambda x, y: _merger(x, y), - check_duplicates=False) - - # we want to preserve the original order - # we had grouped, so need to reverse this - # if we DO have duplicates, then - # we cannot guarantee order - - sorter = _ensure_platform_int( - np.concatenate([groupby.indices[g] for g, _ in groupby])) - if len(result) != len(sorter): - return result - - rev = np.empty(len(sorter), dtype=np.int_) - rev.put(sorter, np.arange(len(sorter))) - return result.take(rev).reset_index(drop=True) - - if right.duplicated(on).any(): - right = right.drop_duplicates(on, keep='last') - - return _merger(left, right) + op = _AsOfMerge(left, right, + on=on, left_on=left_on, right_on=right_on, + by=by, suffixes=suffixes, + how='asof', tolerance=tolerance, + allow_exact_matches=allow_exact_matches) + return op.get_result() # TODO: transformations?? 
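The core loop generated by the template above is the same for every dtype combination. A pure-Python sketch of that single pass (illustrative only; it omits the tolerance check and uses a plain dict in place of the typed hash table)::

    import numpy as np

    def asof_join_by(left_values, right_values, left_by, right_by,
                     allow_exact_matches=True):
        # Walk the sorted right values once, remembering for each "by" key
        # the last right position whose value is <= (or <) the current
        # left value, then record that position as the match.
        left_indexer = np.empty(len(left_values), dtype=np.int64)
        right_indexer = np.empty(len(left_values), dtype=np.int64)
        last_seen = {}
        right_pos = 0
        for left_pos in range(len(left_values)):
            while right_pos < len(right_values) and (
                    right_values[right_pos] <= left_values[left_pos]
                    if allow_exact_matches
                    else right_values[right_pos] < left_values[left_pos]):
                last_seen[right_by[right_pos]] = right_pos
                right_pos += 1
            left_indexer[left_pos] = left_pos
            right_indexer[left_pos] = last_seen.get(left_by[left_pos], -1)
        return left_indexer, right_indexer

    # tiny check: left times [2, 4] against right times [1, 3]
    print(asof_join_by([2, 4], [1, 3], ['a', 'b'], ['a', 'b']))
    # -> (array([0, 1]), array([0, 1]))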
@@ -942,6 +915,35 @@ def get_result(self): return result +_asof_functions = { + 'int64_t': _join.asof_join_int64_t, + 'double': _join.asof_join_double, +} + +_asof_by_functions = { + ('int64_t', 'int64_t'): _join.asof_join_int64_t_by_int64_t, + ('double', 'int64_t'): _join.asof_join_double_by_int64_t, + ('int64_t', 'object'): _join.asof_join_int64_t_by_object, + ('double', 'object'): _join.asof_join_double_by_object, +} + +_type_casters = { + 'int64_t': _ensure_int64, + 'double': _ensure_float64, + 'object': _ensure_object, +} + + +def _get_cython_type(dtype): + """ Given a dtype, return 'int64_t', 'double', or 'object' """ + if is_integer_dtype(dtype): + return 'int64_t' + elif is_float_dtype(dtype): + return 'double' + else: + return 'object' + + class _AsOfMerge(_OrderedMerge): _merge_type = 'asof_merge' @@ -977,6 +979,9 @@ def _validate_specification(self): if not is_list_like(self.by): self.by = [self.by] + if len(self.by) != 1: + raise MergeError("can only asof by a single key") + self.left_on = self.by + list(self.left_on) self.right_on = self.by + list(self.right_on) @@ -1030,36 +1035,62 @@ def _get_merge_keys(self): def _get_join_indexers(self): """ return the join indexers """ + # values to compare + left_values = self.left_join_keys[-1] + right_values = self.right_join_keys[-1] + tolerance = self.tolerance + # we required sortedness in the join keys msg = " keys must be sorted" - for lk in self.left_join_keys: - if not Index(lk).is_monotonic: - raise ValueError('left' + msg) - for rk in self.right_join_keys: - if not Index(rk).is_monotonic: - raise ValueError('right' + msg) - - kwargs = {} - - # tolerance - t = self.tolerance - if t is not None: - lt = self.left_join_keys[self.left_on.index(self._asof_key)] - rt = self.right_join_keys[self.right_on.index(self._asof_key)] - if needs_i8_conversion(lt): - lt = lt.view('i8') - t = t.value - rt = rt.view('i8') - kwargs['left_values'] = lt - kwargs['right_values'] = rt - kwargs['tolerance'] = t + if not Index(left_values).is_monotonic: + raise ValueError('left' + msg) + if not Index(right_values).is_monotonic: + raise ValueError('right' + msg) + + # initial type conversion as needed + if needs_i8_conversion(left_values): + left_values = left_values.view('i8') + right_values = right_values.view('i8') + if tolerance is not None: + tolerance = tolerance.value + + # a "by" parameter requires special handling + if self.by is not None: + left_by_values = self.left_join_keys[0] + right_by_values = self.right_join_keys[0] + + # choose appropriate function by type + on_type = _get_cython_type(left_values.dtype) + by_type = _get_cython_type(left_by_values.dtype) + + on_type_caster = _type_casters[on_type] + by_type_caster = _type_casters[by_type] + func = _asof_by_functions[(on_type, by_type)] + + left_values = on_type_caster(left_values) + right_values = on_type_caster(right_values) + left_by_values = by_type_caster(left_by_values) + right_by_values = by_type_caster(right_by_values) + + return func(left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance) + else: + # choose appropriate function by type + on_type = _get_cython_type(left_values.dtype) + type_caster = _type_casters[on_type] + func = _asof_functions[on_type] - return _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, - how=self.how, - allow_exact_matches=self.allow_exact_matches, - **kwargs) + left_values = type_caster(left_values) + right_values = type_caster(right_values) + + return func(left_values, 
+ right_values, + self.allow_exact_matches, + tolerance) def _get_multiindex_indexer(join_keys, index, sort): @@ -1143,7 +1174,6 @@ def _right_outer_join(x, y, max_groups): 'left': _join.left_outer_join, 'right': _right_outer_join, 'outer': _join.full_outer_join, - 'asof': _join.left_outer_asof_join, } diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index e0c50cf3baaf7..f413618624592 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -364,6 +364,100 @@ def test_allow_exact_matches_and_tolerance3(self): 'version': [np.nan, np.nan]}) assert_frame_equal(result, expected) + def test_by_int(self): + # we specialize by type, so test that this is correct + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.020', + '20160525 13:30:00.030', + '20160525 13:30:00.040', + '20160525 13:30:00.050', + '20160525 13:30:00.060']), + 'key': [1, 2, 1, 3, 2], + 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]}, + columns=['time', 'key', 'value1']) + + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.015', + '20160525 13:30:00.020', + '20160525 13:30:00.025', + '20160525 13:30:00.035', + '20160525 13:30:00.040', + '20160525 13:30:00.055', + '20160525 13:30:00.060', + '20160525 13:30:00.065']), + 'key': [2, 1, 1, 3, 2, 1, 2, 3], + 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]}, + columns=['time', 'key', 'value2']) + + result = pd.merge_asof(df1, df2, on='time', by='key') + + expected = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.020', + '20160525 13:30:00.030', + '20160525 13:30:00.040', + '20160525 13:30:00.050', + '20160525 13:30:00.060']), + 'key': [1, 2, 1, 3, 2], + 'value1': [1.1, 1.2, 1.3, 1.4, 1.5], + 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]}, + columns=['time', 'key', 'value1', 'value2']) + + assert_frame_equal(result, expected) + + def test_on_float(self): + # mimics how to determine the minimum-price variation + df1 = pd.DataFrame({ + 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], + 'symbol': list("ABCDEFG")}, + columns=['symbol', 'price']) + + df2 = pd.DataFrame({ + 'price': [0.0, 1.0, 100.0], + 'mpv': [0.0001, 0.01, 0.05]}, + columns=['price', 'mpv']) + + df1 = df1.sort_values('price').reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on='price') + + expected = pd.DataFrame({ + 'symbol': list("BGACEDF"), + 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], + 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]}, + columns=['symbol', 'price', 'mpv']) + + assert_frame_equal(result, expected) + + def test_on_float_by_int(self): + # type specialize both "by" and "on" parameters + df1 = pd.DataFrame({ + 'symbol': list("AAABBBCCC"), + 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59, + 12.5, 378.15, 378.2, 378.25]}, + columns=['symbol', 'exch', 'price']) + + df2 = pd.DataFrame({ + 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], + 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]}, + columns=['exch', 'price', 'mpv']) + + df1 = df1.sort_values('price').reset_index(drop=True) + df2 = df2.sort_values('price').reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on='price', by='exch') + + expected = pd.DataFrame({ + 'symbol': list("AAABBBCCC"), + 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3], + 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58, + 12.59, 378.15, 378.2, 378.25], + 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]}, + 
columns=['symbol', 'exch', 'price', 'mpv']) + + assert_frame_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/setup.py b/setup.py index 5bf188d829d26..1c12ff4aca372 100755 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def is_platform_mac(): _pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', 'join_helper.pxi.in', 'algos_take_helper.pxi.in', 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in', - 'sparse_op_helper.pxi.in'] + 'sparse_op_helper.pxi.in', 'joins_func_helper.pxi.in'] class build_ext(_build_ext): From 72be37bcc855ef1fb01ebda78f6aa4ef3bcc6315 Mon Sep 17 00:00:00 2001 From: agraboso Date: Mon, 8 Aug 2016 10:05:20 -0400 Subject: [PATCH 231/359] BUG: allow describe() for DataFrames with only boolean columns closes #13891 Author: agraboso Closes #13898 from agraboso/fix-13891 and squashes the following commits: 26201aa [agraboso] BUG: allow describe() for DataFrames with only boolean columns --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/generic.py | 7 +++--- pandas/tests/frame/test_analytics.py | 33 +++++++++++++++++++++++++++ pandas/tests/series/test_analytics.py | 21 +++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 843e6de70ce93..081e4bf984e47 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -885,6 +885,7 @@ Bug Fixes - Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) - Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) +- Bug in ``DataFrame.describe()`` raising ``ValueError`` with only boolean columns (:issue:`13898`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) - Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) - Bug in ``MultiIndex.from_arrays`` which didn't check for input array lengths matching (:issue:`13599`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f57b94fe0a326..17cc76e703631 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5138,10 +5138,9 @@ def describe_1d(data): if self.ndim == 1: return describe_1d(self) elif (include is None) and (exclude is None): - if len(self._get_numeric_data()._info_axis) > 0: - # when some numerics are found, keep only numerics - data = self.select_dtypes(include=[np.number]) - else: + # when some numerics are found, keep only numerics + data = self.select_dtypes(include=[np.number]) + if len(data.columns) == 0: data = self elif include == 'all': if exclude is not None: diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 370f3b5ee5b8b..390d796ced006 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -249,6 +249,39 @@ def test_bool_describe_in_mixed_frame(self): index=['count', 'unique', 'top', 'freq']) tm.assert_frame_equal(result, expected) + def test_describe_bool_frame(self): + # GH 13891 + df = pd.DataFrame({ + 'bool_data_1': [False, False, True, True], + 'bool_data_2': [False, True, True, True] + }) + result = df.describe() + expected = DataFrame({'bool_data_1': [4, 2, True, 2], + 'bool_data_2': [4, 2, True, 3]}, + index=['count', 
'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({ + 'bool_data': [False, False, True, True, False], + 'int_data': [0, 1, 2, 3, 4] + }) + result = df.describe() + expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1, + 2, 3, 4]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({ + 'bool_data': [False, False, True, True], + 'str_data': ['a', 'b', 'c', 'a'] + }) + result = df.describe() + expected = DataFrame({'bool_data': [4, 2, True, 2], + 'str_data': [4, 3, 'a', 2]}, + index=['count', 'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + def test_describe_categorical_columns(self): # GH 11558 columns = pd.CategoricalIndex(['int1', 'int2', 'obj'], diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 34cfb2f0c1529..6575c106f006f 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -260,6 +260,27 @@ def test_kurt(self): self.assertEqual(0, s.kurt()) self.assertTrue((df.kurt() == 0).all()) + def test_describe(self): + s = Series([0, 1, 2, 3, 4], name='int_data') + result = s.describe() + expected = Series([5, 2, s.std(), 0, 1, 2, 3, 4], + name='int_data', + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + self.assert_series_equal(result, expected) + + s = Series([True, True, False, False, False], name='bool_data') + result = s.describe() + expected = Series([5, 2, False, 3], name='bool_data', + index=['count', 'unique', 'top', 'freq']) + self.assert_series_equal(result, expected) + + s = Series(['a', 'a', 'b', 'c', 'd'], name='str_data') + result = s.describe() + expected = Series([5, 4, 'a', 2], name='str_data', + index=['count', 'unique', 'top', 'freq']) + self.assert_series_equal(result, expected) + def test_argsort(self): self._check_accum_op('argsort', check_dtype=False) argsorted = self.ts.argsort() From 81819b7aa2537469448fbaeb4cd9e3d500f4e2a1 Mon Sep 17 00:00:00 2001 From: agraboso Date: Mon, 8 Aug 2016 10:08:14 -0400 Subject: [PATCH 232/359] BUG: Fix Period and PeriodIndex support of combined offsets aliases - [x] closes #13730 - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` - [x] whatsnew entry Essentially, makes sure any `freq` string passed to a `Period` or `PeriodIndex` goes through [`_Period._maybe_convert_freq()`](https://g ithub.com/pydata/pandas/blob/master/pandas/src/period.pyx#L682-L697), which calls [`to_offset()`](https://github.com/pydata/pandas/blob/mast er/pandas/tseries/frequencies.py#L389-L451), which is where the logic for combining aliases is. All the examples in #13730 result in the correct output, and all existing tests pass. I have not written any new ones yet — I first wanted to get the opinion of the maintainers. This PR builds on #13868 (without it, some existing tests fail). 
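A quick illustration of the behavior being fixed, distilled from the tests added in this patch (expected output noted per those tests; the snippet itself is not part of the commit)::

    import pandas as pd

    # Combined offset aliases such as '1D1H' (i.e. 25 hours) should be
    # honored when constructing Periods and PeriodIndexes.
    p = pd.Period('2011-01', freq='1D1H')
    print(p.freqstr)   # '25H'
    print(p + 1)       # advances by 25 hours

    pidx = pd.period_range(start='2016-01-01', periods=2, freq='1D1H')
    print(pidx)        # ['2016-01-01 00:00', '2016-01-02 01:00'], freq='25H'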
Author: agraboso Closes #13874 from agraboso/fix-13730 and squashes the following commits: 49a3783 [agraboso] BUG: Fix Period and PeriodIndex support of combined alias offsets --- doc/source/whatsnew/v0.19.0.txt | 3 + pandas/src/period.pyx | 8 +- pandas/tseries/frequencies.py | 85 ++++++++---- pandas/tseries/period.py | 13 +- pandas/tseries/tests/test_offsets.py | 27 ++-- pandas/tseries/tests/test_period.py | 169 ++++++++++++++++++++++++ pandas/tseries/tests/test_timeseries.py | 3 +- 7 files changed, 268 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 081e4bf984e47..a041e175d5f1a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -783,6 +783,8 @@ Deprecations - ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) - ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. Datetime-likes now have a ``.is_leap_year`` property. (:issue:`13727`) - ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. (:issue:`13564`) +- ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`) +- ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`) .. _whatsnew_0190.prior_deprecations: @@ -969,3 +971,4 @@ Bug Fixes - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) +- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 0435b01920504..bb0108fcb141c 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -739,7 +739,7 @@ cdef class _Period(object): msg = 'Input cannot be converted to Period(freq={0})' raise IncompatibleFrequency(msg.format(self.freqstr)) elif isinstance(other, offsets.DateOffset): - freqstr = frequencies.get_standard_freq(other) + freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: ordinal = self.ordinal + other.n @@ -806,6 +806,7 @@ cdef class _Period(object): ------- resampled : Period """ + freq = self._maybe_convert_freq(freq) how = _validate_end_alias(how) base1, mult1 = frequencies.get_freq_code(self.freq) base2, mult2 = frequencies.get_freq_code(freq) @@ -849,6 +850,8 @@ cdef class _Period(object): ------- Timestamp """ + if freq is not None: + freq = self._maybe_convert_freq(freq) how = _validate_end_alias(how) if freq is None: @@ -1122,6 +1125,9 @@ class Period(_Period): cdef _Period self + if freq is not None: + freq = cls._maybe_convert_freq(freq) + if ordinal is not None and value is not None: raise ValueError(("Only value or ordinal but not both should be " "given but not both")) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index eaf826230e772..3011e8dc0ae3d 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -15,7 +15,7 @@ import 
pandas.core.algorithms as algos from pandas.core.algorithms import unique from pandas.tseries.offsets import DateOffset -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.tseries.offsets as offsets import pandas.lib as lib import pandas.tslib as tslib @@ -386,37 +386,71 @@ def get_period_alias(offset_str): _INVALID_FREQ_ERROR = "Invalid frequency: {0}" -def to_offset(freqstr): +@deprecate_kwarg(old_arg_name='freqstr', new_arg_name='freq') +def to_offset(freq): """ - Return DateOffset object from string representation or - Timedelta object + Return DateOffset object from string or tuple representation + or datetime.timedelta object + + Parameters + ---------- + freq : str, tuple, datetime.timedelta, DateOffset or None + + Returns + ------- + delta : DateOffset + None if freq is None + + Raises + ------ + ValueError + If freq is an invalid frequency + + See Also + -------- + pandas.DateOffset Examples -------- - >>> to_offset('5Min') - Minute(5) + >>> to_offset('5min') + <5 * Minutes> + + >>> to_offset('1D1H') + <25 * Hours> + + >>> to_offset(('W', 2)) + <2 * Weeks: weekday=6> + + >>> to_offset((2, 'B')) + <2 * BusinessDays> + + >>> to_offset(datetime.timedelta(days=1)) + + + >>> to_offset(Hour()) + """ - if freqstr is None: + if freq is None: return None - if isinstance(freqstr, DateOffset): - return freqstr + if isinstance(freq, DateOffset): + return freq - if isinstance(freqstr, tuple): - name = freqstr[0] - stride = freqstr[1] + if isinstance(freq, tuple): + name = freq[0] + stride = freq[1] if isinstance(stride, compat.string_types): name, stride = stride, name name, _ = _base_and_stride(name) delta = get_offset(name) * stride - elif isinstance(freqstr, timedelta): + elif isinstance(freq, timedelta): delta = None - freqstr = Timedelta(freqstr) + freq = Timedelta(freq) try: - for name in freqstr.components._fields: + for name in freq.components._fields: offset = _name_to_offset_map[name] - stride = getattr(freqstr.components, name) + stride = getattr(freq.components, name) if stride != 0: offset = stride * offset if delta is None: @@ -424,13 +458,13 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) + raise ValueError(_INVALID_FREQ_ERROR.format(freq)) else: delta = None stride_sign = None try: - for stride, name, _ in opattern.findall(freqstr): + for stride, name, _ in opattern.findall(freq): offset = get_offset(name) if stride_sign is None: stride_sign = -1 if stride.startswith('-') else 1 @@ -443,10 +477,10 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) + raise ValueError(_INVALID_FREQ_ERROR.format(freq)) if delta is None: - raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) + raise ValueError(_INVALID_FREQ_ERROR.format(freq)) return delta @@ -542,14 +576,11 @@ def get_standard_freq(freq): """ Return the standardized frequency string """ - if freq is None: - return None - if isinstance(freq, DateOffset): - return freq.rule_code - - code, stride = get_freq_code(freq) - return _get_freq_str(code, stride) + msg = ("get_standard_freq is deprecated. 
Use to_offset(freq).rule_code " + "instead.") + warnings.warn(msg, FutureWarning, stacklevel=2) + return to_offset(freq).rule_code # --------------------------------------------------------------------- # Period codes diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 810c89b3f969b..da8868bb2bd84 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -57,6 +57,7 @@ def dt64arr_to_periodarr(data, freq, tz): if data.dtype != np.dtype('M8[ns]'): raise ValueError('Wrong dtype: %s' % data.dtype) + freq = Period._maybe_convert_freq(freq) base, mult = _gfc(freq) return period.dt64arr_to_periodarr(data.view('i8'), base, tz) @@ -206,6 +207,9 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, @classmethod def _generate_range(cls, start, end, periods, freq, fields): + if freq is not None: + freq = Period._maybe_convert_freq(freq) + field_count = len(fields) if com._count_not_none(start, end) > 0: if field_count > 0: @@ -222,6 +226,9 @@ def _generate_range(cls, start, end, periods, freq, fields): @classmethod def _from_arraylike(cls, data, freq, tz): + if freq is not None: + freq = Period._maybe_convert_freq(freq) + if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): if is_scalar(data) or isinstance(data, Period): @@ -478,7 +485,7 @@ def asfreq(self, freq=None, how='E'): """ how = _validate_end_alias(how) - freq = frequencies.get_standard_freq(freq) + freq = Period._maybe_convert_freq(freq) base1, mult1 = _gfc(self.freq) base2, mult2 = _gfc(freq) @@ -579,6 +586,8 @@ def to_timestamp(self, freq=None, how='start'): if freq is None: base, mult = _gfc(self.freq) freq = frequencies.get_to_timestamp_base(base) + else: + freq = Period._maybe_convert_freq(freq) base, mult = _gfc(freq) new_data = self.asfreq(freq, how) @@ -596,7 +605,7 @@ def _maybe_convert_timedelta(self, other): if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): - freqstr = frequencies.get_standard_freq(other) + freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index b31e4d54c551f..3ec07c27ef854 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -4591,21 +4591,30 @@ def test_parse_time_quarter_w_dash(self): def test_get_standard_freq(): - fstr = get_standard_freq('W') - assert fstr == get_standard_freq('w') - assert fstr == get_standard_freq('1w') - assert fstr == get_standard_freq(('W', 1)) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + fstr = get_standard_freq('W') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq('w') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq('1w') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq(('W', 1)) with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): - get_standard_freq('WeEk') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + get_standard_freq('WeEk') - fstr = get_standard_freq('5Q') - assert fstr == get_standard_freq('5q') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + fstr = get_standard_freq('5Q') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == 
get_standard_freq('5q') with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): - get_standard_freq('5QuarTer') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + get_standard_freq('5QuarTer') - assert fstr == get_standard_freq(('q', 5)) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq(('q', 5)) def test_quarterly_dont_normalize(): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 290c11bd8d79c..17e6e36d52acd 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -102,6 +102,9 @@ def test_period_cons_nat(self): p = Period(tslib.iNaT, freq='3D') self.assertIs(p, pd.NaT) + p = Period(tslib.iNaT, freq='1D1H') + self.assertIs(p, pd.NaT) + p = Period('NaT') self.assertIs(p, pd.NaT) @@ -152,6 +155,73 @@ def test_period_cons_mult(self): with tm.assertRaisesRegexp(ValueError, msg): Period('2011-01', freq='0M') + def test_period_cons_combined(self): + p = [(Period('2011-01', freq='1D1H'), + Period('2011-01', freq='1H1D'), + Period('2011-01', freq='H')), + (Period(ordinal=1, freq='1D1H'), + Period(ordinal=1, freq='1H1D'), + Period(ordinal=1, freq='H'))] + + for p1, p2, p3 in p: + self.assertEqual(p1.ordinal, p3.ordinal) + self.assertEqual(p2.ordinal, p3.ordinal) + + self.assertEqual(p1.freq, offsets.Hour(25)) + self.assertEqual(p1.freqstr, '25H') + + self.assertEqual(p2.freq, offsets.Hour(25)) + self.assertEqual(p2.freqstr, '25H') + + self.assertEqual(p3.freq, offsets.Hour()) + self.assertEqual(p3.freqstr, 'H') + + result = p1 + 1 + self.assertEqual(result.ordinal, (p3 + 25).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '25H') + + result = p2 + 1 + self.assertEqual(result.ordinal, (p3 + 25).ordinal) + self.assertEqual(result.freq, p2.freq) + self.assertEqual(result.freqstr, '25H') + + result = p1 - 1 + self.assertEqual(result.ordinal, (p3 - 25).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '25H') + + result = p2 - 1 + self.assertEqual(result.ordinal, (p3 - 25).ordinal) + self.assertEqual(result.freq, p2.freq) + self.assertEqual(result.freqstr, '25H') + + msg = ('Frequency must be positive, because it' + ' represents span: -25H') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-1D1H') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-1H1D') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='-1D1H') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='-1H1D') + + msg = ('Frequency must be positive, because it' + ' represents span: 0D') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='0D0H') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='0D0H') + + # You can only combine together day and intraday offsets + msg = ('Invalid frequency: 1W1D') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='1W1D') + msg = ('Invalid frequency: 1D1W') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='1D1W') + def test_timestamp_tz_arg(self): tm._skip_if_no_pytz() import pytz @@ -624,6 +694,14 @@ def _ex(*args): xp = _ex(2012, 1, 16) self.assertEqual(xp, p.end_time) + p = Period('2012', freq='1D1H') + xp = _ex(2012, 1, 2, 1) + self.assertEqual(xp, p.end_time) + + p = Period('2012', freq='1H1D') + xp = _ex(2012, 1, 2, 1) + self.assertEqual(xp, p.end_time) + def test_anchor_week_end_time(self): 
def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) @@ -1518,6 +1596,44 @@ def test_asfreq_mult(self): self.assertEqual(result.ordinal, expected.ordinal) self.assertEqual(result.freq, expected.freq) + def test_asfreq_combined(self): + # normal freq to combined freq + p = Period('2007', freq='H') + + # ordinal will not change + expected = Period('2007', freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): + result = p.asfreq(freq, how=how) + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) + + # combined freq to normal freq + p1 = Period(freq='1D1H', year=2007) + p2 = Period(freq='1H1D', year=2007) + + # ordinal will change because how=E is the default + result1 = p1.asfreq('H') + result2 = p2.asfreq('H') + expected = Period('2007-01-02', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + # ordinal will not change + result1 = p1.asfreq('H', how='S') + result2 = p2.asfreq('H', how='S') + expected = Period('2007-01-01', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + def test_is_leap_year(self): # GH 13727 for freq in ['A', 'M', 'D', 'H']: @@ -1861,6 +1977,17 @@ def test_constructor_freq_mult_dti_compat(self): periods=10).to_period(freqstr) tm.assert_index_equal(pidx, expected) + def test_constructor_freq_combined(self): + for freq in ['1D1H', '1H1D']: + pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq) + expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'], + freq='25H') + for freq, func in zip(['1D1H', '1H1D'], [PeriodIndex, period_range]): + pidx = func(start='2016-01-01', periods=2, freq=freq) + expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'], + freq='25H') + tm.assert_index_equal(pidx, expected) + def test_is_(self): create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -2130,6 +2257,21 @@ def test_to_timestamp_pi_mult(self): ['2011-02-28', 'NaT', '2011-03-31'], name='idx') self.assert_index_equal(result, expected) + def test_to_timestamp_pi_combined(self): + idx = PeriodIndex(start='2011', periods=2, freq='1D1H', name='idx') + result = idx.to_timestamp() + expected = DatetimeIndex( + ['2011-01-01 00:00', '2011-01-02 01:00'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E') + expected = DatetimeIndex( + ['2011-01-02 00:59:59', '2011-01-03 01:59:59'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E', freq='H') + expected = DatetimeIndex( + ['2011-01-02 00:00', '2011-01-03 01:00'], name='idx') + self.assert_index_equal(result, expected) + def test_start_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') @@ -2541,6 +2683,33 @@ def test_asfreq_mult_pi(self): self.assert_index_equal(result, exp) self.assertEqual(result.freq, exp.freq) + def test_asfreq_combined_pi(self): + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + 
freq='H') + exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']): + result = pi.asfreq(freq, how=how) + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + for freq in ['1D1H', '1H1D']: + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', + 'NaT'], freq=freq) + result = pi.asfreq('H') + exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'], + freq='H') + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', + 'NaT'], freq=freq) + result = pi.asfreq('H', how='S') + exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='H') + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') self.assertEqual(len(pi), 9) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 09fb4beb74f28..a3abfd0321677 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4700,7 +4700,8 @@ def test_frequency_misc(self): self.assertRaises(ValueError, frequencies.to_offset, ('', '')) - result = frequencies.get_standard_freq(offsets.Hour()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frequencies.get_standard_freq(offsets.Hour()) self.assertEqual(result, 'H') def test_hash_equivalent(self): From b7abef4949fb1ba7fd1004feba4f47ace7004282 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Mon, 8 Aug 2016 12:10:22 -0400 Subject: [PATCH 233/359] DOC: Added example of using OrderedDict for agg. closes #12879 Author: Ben Kandel Closes #13938 from bkandel/ordered_dict_example and squashes the following commits: 8097de8 [Ben Kandel] Changed to note. 0676294 [Ben Kandel] DOC: Added example of using OrderedDict for agg. --- doc/source/groupby.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index c9095c3ae1a60..c5a77770085d6 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -13,6 +13,7 @@ matplotlib.style.use('ggplot') import matplotlib.pyplot as plt plt.close('all') + from collections import OrderedDict ***************************** Group By: split-apply-combine @@ -487,6 +488,17 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'C' : 'sum', 'D' : 'std'}) +.. note:: + + If you pass a dict to ``aggregate``, the ordering of the output colums is + non-deterministic. If you want to be sure the output columns will be in a specific + order, you can use an ``OrderedDict``. Compare the output of the following two commands: + +.. ipython:: python + + grouped.agg({'D': 'std', 'C': 'mean'}) + grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) + .. 
_groupby.aggregate.cython: Cython-optimized aggregation functions From ae26ec75290f622bac8b36c8f2dff3dd33a72907 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 9 Aug 2016 06:29:57 +0900 Subject: [PATCH 234/359] BLD: Fix sparse warnings closes #13942 xref #13849 --- pandas/src/sparse.pyx | 12 +++---- pandas/src/sparse_op_helper.pxi | 56 +++++++++++++++--------------- pandas/src/sparse_op_helper.pxi.in | 2 +- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 9908aef592ad3..646f9126b984c 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -147,13 +147,13 @@ cdef class IntIndex(SparseIndex): return IntIndex(self.length, new_indices) @cython.wraparound(False) - cpdef int lookup(self, Py_ssize_t index): + cpdef int32_t lookup(self, Py_ssize_t index): """ Return the internal location if value exists on given index. Return -1 otherwise. """ cdef: - Py_ssize_t res + int32_t res ndarray[int32_t, ndim=1] inds inds = self.indices @@ -290,7 +290,7 @@ cdef class BlockIndex(SparseIndex): ---------- """ cdef readonly: - Py_ssize_t nblocks, npoints, length + int32_t nblocks, npoints, length ndarray blocs, blengths cdef: @@ -308,7 +308,7 @@ cdef class BlockIndex(SparseIndex): self.lenbuf = self.blengths.data self.length = length - self.nblocks = len(self.blocs) + self.nblocks = np.int32(len(self.blocs)) self.npoints = self.blengths.sum() # self.block_start = blocs @@ -381,7 +381,7 @@ cdef class BlockIndex(SparseIndex): def to_int_index(self): cdef: - Py_ssize_t i = 0, j, b + int32_t i = 0, j, b int32_t offset ndarray[int32_t, ndim=1] indices @@ -498,7 +498,7 @@ cdef class BlockIndex(SparseIndex): """ return BlockUnion(self, y.to_block_index()).result - cpdef int lookup(self, Py_ssize_t index): + cpdef Py_ssize_t lookup(self, Py_ssize_t index): """ Return the internal location if value exists on given index. Return -1 otherwise. 
diff --git a/pandas/src/sparse_op_helper.pxi b/pandas/src/sparse_op_helper.pxi index a49036d02896c..5ff96469195e3 100644 --- a/pandas/src/sparse_op_helper.pxi +++ b/pandas/src/sparse_op_helper.pxi @@ -87,7 +87,7 @@ cdef inline tuple block_op_add_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -282,7 +282,7 @@ cdef inline tuple block_op_add_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -477,7 +477,7 @@ cdef inline tuple block_op_sub_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -672,7 +672,7 @@ cdef inline tuple block_op_sub_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -867,7 +867,7 @@ cdef inline tuple block_op_mul_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -1062,7 +1062,7 @@ cdef inline tuple block_op_mul_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -1257,7 +1257,7 @@ cdef inline tuple block_op_div_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -1452,7 +1452,7 @@ cdef inline tuple block_op_div_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -1647,7 +1647,7 @@ cdef inline tuple block_op_mod_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -1842,7 +1842,7 @@ cdef inline tuple block_op_mod_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -2037,7 +2037,7 @@ cdef inline tuple block_op_truediv_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp 
= 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -2232,7 +2232,7 @@ cdef inline tuple block_op_truediv_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -2427,7 +2427,7 @@ cdef inline tuple block_op_floordiv_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -2622,7 +2622,7 @@ cdef inline tuple block_op_floordiv_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -2817,7 +2817,7 @@ cdef inline tuple block_op_pow_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -3012,7 +3012,7 @@ cdef inline tuple block_op_pow_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -3207,7 +3207,7 @@ cdef inline tuple block_op_eq_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -3402,7 +3402,7 @@ cdef inline tuple block_op_eq_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -3597,7 +3597,7 @@ cdef inline tuple block_op_ne_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -3792,7 +3792,7 @@ cdef inline tuple block_op_ne_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -3987,7 +3987,7 @@ cdef inline tuple block_op_lt_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -4182,7 +4182,7 @@ cdef inline tuple block_op_lt_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 
0 # block numbers @@ -4377,7 +4377,7 @@ cdef inline tuple block_op_gt_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -4572,7 +4572,7 @@ cdef inline tuple block_op_gt_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -4767,7 +4767,7 @@ cdef inline tuple block_op_le_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -4962,7 +4962,7 @@ cdef inline tuple block_op_le_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -5157,7 +5157,7 @@ cdef inline tuple block_op_ge_float64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers @@ -5352,7 +5352,7 @@ cdef inline tuple block_op_ge_int64(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers diff --git a/pandas/src/sparse_op_helper.pxi.in b/pandas/src/sparse_op_helper.pxi.in index 73fd5e46f46a6..1a0e1aa0250f6 100644 --- a/pandas/src/sparse_op_helper.pxi.in +++ b/pandas/src/sparse_op_helper.pxi.in @@ -155,7 +155,7 @@ cdef inline tuple block_op_{{opname}}_{{dtype}}(ndarray x_, cdef: BlockIndex out_index Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xbp = 0, ybp = 0 # block positions int32_t xloc, yloc Py_ssize_t xblock = 0, yblock = 0 # block numbers From 49f99ac59f0bf185df48dd23735a919a330a6b0d Mon Sep 17 00:00:00 2001 From: wcwagner Date: Tue, 9 Aug 2016 17:44:52 -0400 Subject: [PATCH 235/359] BUG: Fixed float parsing with unit when using pd.to_datetime (GH13834) closes #13834 Author: wcwagner Closes #13847 from wcwagner/bug/13834 and squashes the following commits: 54a26ee [wcwagner] BUG: Fixed float parsing with unit when using pd.to_datetime (GH13834) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/tests/test_timeseries.py | 8 ++++++++ pandas/tslib.pyx | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a041e175d5f1a..4f81eafa3adaf 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -972,3 +972,4 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) +- Bug in ``pd.to_datetime()`` did not 
cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index a3abfd0321677..e5bbb923935e0 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -745,6 +745,14 @@ def test_to_datetime_unit(self): seconds=t) for t in range(20)] + [NaT]) assert_series_equal(result, expected) + # GH13834 + s = Series([epoch + t for t in np.arange(0, 2, .25)] + + [iNaT]).astype(float) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in np.arange(0, 2, .25)] + [NaT]) + assert_series_equal(result, expected) + s = concat([Series([epoch + t for t in range(20)] ).astype(float), Series([np.nan])], ignore_index=True) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 32b2bf075991b..3c07cfd2446ed 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2095,7 +2095,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): # if we have nulls that are not type-compat # then need to iterate try: - iresult = values.astype('i8') + iresult = values.astype('i8', casting='same_kind', copy=False) mask = iresult == iNaT iresult[mask] = 0 fvalues = iresult.astype('f8') * m From e89a0a078c813f5d16617552d2b23e20e540dd0e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 9 Aug 2016 17:52:29 -0400 Subject: [PATCH 236/359] TST: Removed regex warning for read_csv Follow-up to #13481 and partially fixes #13932. Regex mistakenly matches to the empty string, which will cause Python 3.x to issue a warning. In addition, not sure why this test was placed in `python_parsers_only.py`... Author: gfyoung Closes #13943 from gfyoung/regex-empty-match and squashes the following commits: b93325e [gfyoung] TST: Removed regex warning for read_csv --- pandas/io/tests/parser/common.py | 14 ++++++++++++++ pandas/io/tests/parser/python_parser_only.py | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 7558e4bb63226..619ac7b4c77ef 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1568,3 +1568,17 @@ def _encode_data_with_bom(_data): encoding=utf8, names=['a'], skip_blank_lines=False) tm.assert_frame_equal(out, expected) + + def test_temporary_file(self): + # see gh-13398 + data1 = "0 0" + + from tempfile import TemporaryFile + new_file = TemporaryFile("w+") + new_file.write(data1) + new_file.flush() + new_file.seek(0) + + result = self.read_csv(new_file, sep='\s+', header=None) + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 619b6b63568f3..a7389fd174e1d 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -172,20 +172,6 @@ def test_read_table_buglet_4x_multiindex(self): actual = self.read_table(StringIO(data), sep='\s+') tm.assert_frame_equal(actual, expected) - def test_temporary_file(self): - # GH13398 - data1 = "0 0" - - from tempfile import TemporaryFile - new_file = TemporaryFile("w+") - new_file.write(data1) - new_file.flush() - new_file.seek(0) - - result = self.read_csv(new_file, sep=r"\s*", header=None) - expected = DataFrame([[0, 0]]) - tm.assert_frame_equal(result, expected) - def test_skipfooter_with_decimal(self): # see gh-6971 
data = '1#2\n3#4' From cce79936ca1790fe9870e8a5b6bccb3dba41b8ae Mon Sep 17 00:00:00 2001 From: Kamil Sindi Date: Wed, 10 Aug 2016 06:15:35 -0400 Subject: [PATCH 237/359] ENH: raise ImporError if conn is string and sqlalchemy not installed (#11920) --- pandas/io/sql.py | 2 ++ pandas/io/tests/test_sql.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 49f277f6ba7bc..47642c2e2bc28 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -528,6 +528,8 @@ def pandasSQL_builder(con, flavor=None, schema=None, meta=None, con = _engine_builder(con) if _is_sqlalchemy_connectable(con): return SQLDatabase(con, schema=schema, meta=meta) + elif isinstance(con, string_types): + raise ImportError("Using URI string without sqlalchemy installed.") else: return SQLiteDatabase(con, is_cursor=is_cursor) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 21c3ea416e091..ffe7b9d6b460a 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -1051,6 +1051,14 @@ def test_sql_open_close(self): tm.assert_frame_equal(self.test_frame3, result) + def test_con_string_import_error(self): + if not SQLALCHEMY_INSTALLED: + conn = 'mysql://root@localhost/pandas_nosetest' + self.assertRaises(ImportError, sql.read_sql, "SELECT * FROM iris", + conn) + else: + raise nose.SkipTest('SQLAlchemy is installed') + def test_read_sql_delegate(self): iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) From 0e7ae89115e60419b807c38b9e4b8a19d4c8f830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sa=C5=A1o=20Stanovnik?= Date: Wed, 10 Aug 2016 06:36:34 -0400 Subject: [PATCH 238/359] BUG: multi-type SparseDataFrame fixes and improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Sašo Stanovnik Closes #13917 from sstanovnik/fix-multitype-series-slice and squashes the following commits: 8c7d1ea [Sašo Stanovnik] Colon to comma. 057d56b [Sašo Stanovnik] Wording and code organization fixes. 926ca1e [Sašo Stanovnik] Fix a derp. 442b8c1 [Sašo Stanovnik] Whatsnew, issue tag, test reordering. 8d675ad [Sašo Stanovnik] Add tests for common dtypes, raises check for pandas ones. eebcb23 [Sašo Stanovnik] Moved multitype tests to sparse/tests/test_multitype.py ac790d7 [Sašo Stanovnik] Modify .values docs to process issue #10364. 2104948 [Sašo Stanovnik] Factor the common type discovery to an internal function. 6782bc7 [Sašo Stanovnik] Revert default argument change. 93d2de6 [Sašo Stanovnik] Modified the whatsnew message. 33973a5 [Sašo Stanovnik] Additional multitype tests. 114217e [Sašo Stanovnik] Infer dtype instead of forcing float in SparseArray. c7fb0f2 [Sašo Stanovnik] Use numpy to determine common dtypes. fb6237c [Sašo Stanovnik] Add a whatsnew note. 
2e833fa [Sašo Stanovnik] BUG: multi-type sparse slicing fixes and improvements --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/generic.py | 8 ++- pandas/core/internals.py | 18 ++--- pandas/core/ops.py | 4 +- pandas/sparse/tests/test_indexing.py | 78 ++++++++++++++++++++++ pandas/tests/frame/test_block_internals.py | 12 +++- pandas/tests/types/test_cast.py | 45 ++++++++++++- pandas/types/cast.py | 10 +++ 8 files changed, 155 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 4f81eafa3adaf..30a0d918b46ec 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -437,6 +437,7 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) @@ -764,6 +765,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) +- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) .. _whatsnew_0190.deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 17cc76e703631..d0295afe990c8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2887,7 +2887,8 @@ def as_matrix(self, columns=None): e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. + int32. By numpy.find_common_type convention, mixing int64 and uint64 + will result in a flot64 dtype. This method is provided for backwards compatibility. Generally, it is recommended to use '.values'. @@ -2913,8 +2914,9 @@ def values(self): with care if you are not dealing with the blocks. e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By numpy.find_common_type convention, mixing int64 and uint64 + will result in a flot64 dtype. 
""" return self.as_matrix() diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 18b67c41b4554..e9b45e444d8d8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -35,7 +35,8 @@ _infer_dtype_from_scalar, _soft_convert_objects, _possibly_convert_objects, - _astype_nansafe) + _astype_nansafe, + _find_common_type) from pandas.types.missing import (isnull, array_equivalent, _is_na_compat, is_null_datelike_scalar) @@ -4435,14 +4436,6 @@ def _interleaved_dtype(blocks): for x in blocks: counts[type(x)].append(x) - def _lcd_dtype(l): - """ find the lowest dtype that can accomodate the given types """ - m = l[0].dtype - for x in l[1:]: - if x.dtype.itemsize > m.itemsize: - m = x.dtype - return m - have_int = len(counts[IntBlock]) > 0 have_bool = len(counts[BoolBlock]) > 0 have_object = len(counts[ObjectBlock]) > 0 @@ -4455,7 +4448,6 @@ def _lcd_dtype(l): # TODO: have_sparse is not used have_sparse = len(counts[SparseBlock]) > 0 # noqa have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat if (have_object or @@ -4467,10 +4459,9 @@ def _lcd_dtype(l): elif have_bool: return np.dtype(bool) elif have_int and not have_float and not have_complex: - # if we are mixing unsigned and signed, then return # the next biggest int type (if we can) - lcd = _lcd_dtype(counts[IntBlock]) + lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) kinds = set([i.dtype.kind for i in counts[IntBlock]]) if len(kinds) == 1: return lcd @@ -4486,7 +4477,8 @@ def _lcd_dtype(l): elif have_complex: return np.dtype('c16') else: - return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) + introspection_blks = counts[FloatBlock] + counts[SparseBlock] + return _find_common_type([b.dtype for b in introspection_blks]) def _consolidate(blocks): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 44e3be32c23df..66d9391d2facf 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -30,7 +30,7 @@ is_bool_dtype, is_datetimetz, is_list_like, _ensure_object) -from pandas.types.cast import _maybe_upcast_putmask +from pandas.types.cast import _maybe_upcast_putmask, _find_common_type from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex # ----------------------------------------------------------------------------- @@ -616,7 +616,7 @@ def na_op(x, y): raise_on_error=True, **eval_kwargs) except TypeError: if isinstance(y, (np.ndarray, ABCSeries, pd.Index)): - dtype = np.find_common_type([x.dtype, y.dtype], []) + dtype = _find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) result[mask] = op(x[mask], _values_from_object(y[mask])) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 1f88d22bd8f93..74c3785b06d77 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -829,3 +829,81 @@ def test_reindex_fill_value(self): res = sparse.reindex(['A', 'C', 'B']) exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) + + +class TestMultitype(tm.TestCase): + def setUp(self): + self.cols = ['string', 'int', 'float', 'object'] + + self.string_series = pd.SparseSeries(['a', 'b', 'c']) + self.int_series = pd.SparseSeries([1, 2, 3]) + self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) + self.object_series = pd.SparseSeries([[], {}, set()]) + self.sdf = pd.SparseDataFrame({ + 'string': self.string_series, + 'int': self.int_series, + 'float': 
self.float_series, + 'object': self.object_series, + }) + self.sdf = self.sdf[self.cols] + self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols) + + def test_frame_basic_dtypes(self): + for _, row in self.sdf.iterrows(): + self.assertEqual(row.dtype, object) + tm.assert_sp_series_equal(self.sdf['string'], self.string_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['int'], self.int_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['float'], self.float_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['object'], self.object_series, + check_names=False) + + def test_frame_indexing_single(self): + tm.assert_sp_series_equal(self.sdf.iloc[0], + pd.SparseSeries(['a', 1, 1.1, []], + index=self.cols), + check_names=False) + tm.assert_sp_series_equal(self.sdf.iloc[1], + pd.SparseSeries(['b', 2, 1.2, {}], + index=self.cols), + check_names=False) + tm.assert_sp_series_equal(self.sdf.iloc[2], + pd.SparseSeries(['c', 3, 1.3, set()], + index=self.cols), + check_names=False) + + def test_frame_indexing_multiple(self): + tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) + tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) + tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]], + pd.SparseDataFrame({ + 'string': self.string_series.iloc[[1, 2]], + 'int': self.int_series.iloc[[1, 2]], + 'float': self.float_series.iloc[[1, 2]], + 'object': self.object_series.iloc[[1, 2]] + }, index=[1, 2])[self.cols]) + tm.assert_sp_frame_equal(self.sdf[['int', 'string']], + pd.SparseDataFrame({ + 'int': self.int_series, + 'string': self.string_series, + })) + + def test_series_indexing_single(self): + for i, idx in enumerate(self.cols): + self.assertEqual(self.ss.iloc[i], self.ss[idx]) + self.assertEqual(type(self.ss.iloc[i]), + type(self.ss[idx])) + self.assertEqual(self.ss['string'], 'a') + self.assertEqual(self.ss['int'], 1) + self.assertEqual(self.ss['float'], 1.1) + self.assertEqual(self.ss['object'], []) + + def test_series_indexing_multiple(self): + tm.assert_sp_series_equal(self.ss.loc[['string', 'int']], + pd.SparseSeries(['a', 1], + index=['string', 'int'])) + tm.assert_sp_series_equal(self.ss.loc[['string', 'object']], + pd.SparseSeries(['a', []], + index=['string', 'object'])) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 38163d89355e9..e51cc0f5a6ec7 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -104,15 +104,21 @@ def test_as_matrix_lcd(self): values = self.mixed_float.as_matrix(['C']) self.assertEqual(values.dtype, np.float16) + # GH 10364 + # B uint64 forces float because there are other signed int types values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) - self.assertEqual(values.dtype, np.int64) + self.assertEqual(values.dtype, np.float64) values = self.mixed_int.as_matrix(['A', 'D']) self.assertEqual(values.dtype, np.int64) - # guess all ints are cast to uints.... 
+ # B uint64 forces float because there are other signed int types values = self.mixed_int.as_matrix(['A', 'B', 'C']) - self.assertEqual(values.dtype, np.int64) + self.assertEqual(values.dtype, np.float64) + + # as B and C are both unsigned, no forcing to float is needed + values = self.mixed_int.as_matrix(['B', 'C']) + self.assertEqual(values.dtype, np.uint64) values = self.mixed_int.as_matrix(['A', 'C']) self.assertEqual(values.dtype, np.int32) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index dd3f07ea8157f..3394974d833fb 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -15,7 +15,10 @@ _possibly_convert_objects, _infer_dtype_from_scalar, _maybe_convert_string_to_object, - _maybe_convert_scalar) + _maybe_convert_scalar, + _find_common_type) +from pandas.types.dtypes import (CategoricalDtype, + DatetimeTZDtype) from pandas.util import testing as tm _multiprocess_can_split_ = True @@ -188,6 +191,46 @@ def test_possibly_convert_objects_copy(self): self.assertTrue(values is not out) +class TestCommonTypes(tm.TestCase): + def test_numpy_dtypes(self): + # (source_types, destination_type) + testcases = ( + # identity + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + + # into ints + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + + # into floats + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + + # into others + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + ) + for src, common in testcases: + self.assertEqual(_find_common_type(src), common) + + def test_pandas_dtypes(self): + # TODO: not implemented yet + with self.assertRaises(TypeError): + self.assertEqual(_find_common_type([CategoricalDtype()]), + CategoricalDtype) + with self.assertRaises(TypeError): + self.assertEqual(_find_common_type([DatetimeTZDtype()]), + DatetimeTZDtype) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index e37b418664ba3..93be926fe1eeb 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -19,6 +19,7 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) +from .dtypes import ExtensionDtype from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .missing import isnull, notnull from .inference import is_list_like @@ -861,3 +862,12 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): value = _possibly_infer_to_datetimelike(value) return value + + +def _find_common_type(types): + """Find a common data type among the given dtypes.""" + # TODO: enable using pandas-specific types + if any(isinstance(t, ExtensionDtype) for t in types): + raise TypeError("Common type discovery is currently only " + "supported for pure numpy dtypes.") + return np.find_common_type(types, []) From 576f3190ad81d9e20d7ef0a7ce41c1eb52ff5a5c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Aug 2016 06:25:18 -0400 Subject: [PATCH 239/359] COMPAT: py3 compat for scripts/merge-pr.py --- scripts/merge-py.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) 
diff --git a/scripts/merge-py.py b/scripts/merge-py.py index 9d611213ba517..ff89651a256a1 100755 --- a/scripts/merge-py.py +++ b/scripts/merge-py.py @@ -34,6 +34,8 @@ import sys import textwrap +from six.moves import input + PANDAS_HOME = '.' PROJECT_NAME = 'pandas' print("PANDAS_HOME = " + PANDAS_HOME) @@ -96,11 +98,11 @@ def run_cmd(cmd): if cmd is None: cmd = popenargs[0] raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output + return six.text_type(output) def continue_maybe(prompt): - result = raw_input("\n%s (y/n): " % prompt) + result = input("\n%s (y/n): " % prompt) if result.lower() != "y": fail("Okay, exiting") @@ -114,7 +116,7 @@ def clean_up(): branches = run_cmd("git branch").replace(" ", "").split("\n") - for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): + for branch in [b for b in branches if x.startswith(BRANCH_PREFIX)]: print("Deleting local branch %s" % branch) run_cmd("git branch -D %s" % branch) @@ -199,7 +201,7 @@ def merge_pr(pr_num, target_ref): def cherry_pick(pr_num, merge_hash, default_branch): - pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + pick_ref = input("Enter a branch name [%s]: " % default_branch) if pick_ref == "": pick_ref = default_branch @@ -245,7 +247,7 @@ def fix_version_from_branch(branch, versions): # Assumes branch names can be sorted lexicographically # latest_branch = sorted(branch_names, reverse=True)[0] -pr_num = raw_input("Which pull request would you like to merge? (e.g. 34): ") +pr_num = input("Which pull request would you like to merge? (e.g. 34): ") pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) url = pr["url"] From 4df08a9d9fcb679410743b97851899d60776b3b1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Aug 2016 06:57:06 -0400 Subject: [PATCH 240/359] BLD: fix up merge scripts --- scripts/merge-py.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/merge-py.py b/scripts/merge-py.py index ff89651a256a1..936330049c252 100755 --- a/scripts/merge-py.py +++ b/scripts/merge-py.py @@ -116,7 +116,7 @@ def clean_up(): branches = run_cmd("git branch").replace(" ", "").split("\n") - for branch in [b for b in branches if x.startswith(BRANCH_PREFIX)]: + for branch in [b for b in branches if b.startswith(BRANCH_PREFIX)]: print("Deleting local branch %s" % branch) run_cmd("git branch -D %s" % branch) From 5bfb220d2eb0107e2c116f6a4dc38e25385a3813 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Aug 2016 07:21:48 -0400 Subject: [PATCH 241/359] BLD: py3 compat in scripts/merge-pr.py --- scripts/merge-py.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/merge-py.py b/scripts/merge-py.py index 936330049c252..65cc9d5a2bffe 100755 --- a/scripts/merge-py.py +++ b/scripts/merge-py.py @@ -98,7 +98,10 @@ def run_cmd(cmd): if cmd is None: cmd = popenargs[0] raise subprocess.CalledProcessError(retcode, cmd, output=output) - return six.text_type(output) + + if isinstance(output, six.binary_type): + output = output.decode('utf-8') + return output def continue_maybe(prompt): @@ -123,6 +126,7 @@ def clean_up(): # merge the requested PR and return the merge hash def merge_pr(pr_num, target_ref): + pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) From 1c7d19466caf36c7cde09ff6cf3b7726025c65c4 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Wed, 10 Aug 2016 21:25:05 +0900 Subject: [PATCH 242/359] COMPAT: use mpl area legend if 
available (#13680) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tools/plotting.py | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 30a0d918b46ec..b98cbcd35845d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -853,6 +853,7 @@ Bug Fixes - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) +- Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (issue:`9161`, :issue:`13544`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 4cf3364a03056..a61a21d259e57 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1839,10 +1839,16 @@ def __init__(self, data, **kwargs): @classmethod def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, is_errorbar=False, **kwds): + if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) - lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds) + + # need to remove label, because subplots uses mpl legend as it is + line_kwds = kwds.copy() + if cls.mpl_ge_1_5_0(): + line_kwds.pop('label') + lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds) # get data from the line to get coordinates for fill_between xdata, y_values = lines[0].get_data(orig=False) @@ -1860,18 +1866,21 @@ def _plot(cls, ax, x, y, style=None, column_num=None, if 'color' not in kwds: kwds['color'] = lines[0].get_color() - if cls.mpl_ge_1_5_0(): # mpl 1.5 added real support for poly legends - kwds.pop('label') - ax.fill_between(xdata, start, y_values, **kwds) + rect = ax.fill_between(xdata, start, y_values, **kwds) cls._update_stacker(ax, stacking_id, y) - return lines + + # LinePlot expects list of artists + res = [rect] if cls.mpl_ge_1_5_0() else lines + return res def _add_legend_handle(self, handle, label, index=None): - from matplotlib.patches import Rectangle - # Because fill_between isn't supported in legend, - # specifically add Rectangle handle here - alpha = self.kwds.get('alpha', None) - handle = Rectangle((0, 0), 1, 1, fc=handle.get_color(), alpha=alpha) + if not self.mpl_ge_1_5_0(): + from matplotlib.patches import Rectangle + # Because fill_between isn't supported in legend, + # specifically add Rectangle handle here + alpha = self.kwds.get('alpha', None) + handle = Rectangle((0, 0), 1, 1, fc=handle.get_color(), + alpha=alpha) LinePlot._add_legend_handle(self, handle, label, index=index) def _post_plot_logic(self, ax, data): From 5df9123e09168e130af0e73ba335d82f5c1a8d46 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Aug 2016 14:27:13 +0200 Subject: [PATCH 243/359] DOC: add whatsnew 
 for #11920 (#13953)
---
 doc/source/whatsnew/v0.19.0.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index b98cbcd35845d..0edf52c7301ee 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -409,6 +409,8 @@ Other enhancements
   df.sort_values(by='row2', axis=1)

 - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`)
+- Raise ``ImportError`` in the sql functions when sqlalchemy is not installed and a connection string is used (:issue:`11920`).
+

.. _whatsnew_0190.api:

From 7bfc7c4bda4f92f934eeef1560d7f108067c7820 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 10 Aug 2016 16:16:10 +0200
Subject: [PATCH 244/359] DOC: remove rplot seaborn equivalent example figures
 (#13954)

* DOC: remove rplot seaborn equivalent examples (removed from docs in #13855)

* DOC: fix some doc build errors
---
 doc/source/_static/rplot-seaborn-example1.png  | Bin 13244 -> 0 bytes
 doc/source/_static/rplot-seaborn-example2.png  | Bin 31360 -> 0 bytes
 doc/source/_static/rplot-seaborn-example3.png  | Bin 19494 -> 0 bytes
 doc/source/_static/rplot-seaborn-example3b.png | Bin 31101 -> 0 bytes
 doc/source/_static/rplot-seaborn-example4.png  | Bin 55627 -> 0 bytes
 doc/source/_static/rplot-seaborn-example6.png  | Bin 26604 -> 0 bytes
 doc/source/dsintro.rst                         |   4 ++--
 doc/source/io.rst                              |  12 ++++++------
 doc/source/whatsnew/v0.19.0.txt                |   3 ---
 pandas/tseries/tdi.py                          |   8 ++++----
 10 files changed, 12 insertions(+), 15 deletions(-)
 delete mode 100644 doc/source/_static/rplot-seaborn-example1.png
 delete mode 100644 doc/source/_static/rplot-seaborn-example2.png
 delete mode 100644 doc/source/_static/rplot-seaborn-example3.png
 delete mode 100644 doc/source/_static/rplot-seaborn-example3b.png
 delete mode 100644 doc/source/_static/rplot-seaborn-example4.png
 delete mode 100644 doc/source/_static/rplot-seaborn-example6.png

diff --git a/doc/source/_static/rplot-seaborn-example1.png b/doc/source/_static/rplot-seaborn-example1.png
deleted file mode 100644
index d19a3a018bfbfc1e862d8a0fc95a03b22594bea5..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded image data for the deleted rplot-seaborn-example*.png figures omitted]
z5c$Xu19mw33F`{yvBN$I{96PBkByA(RZO{%Q-hZ^a04uHE$!`lKX0<3^o@;~UC^$| zKYok?yVnq+vKOR?t`w2PqZwan6uyeBOFA6PX7{s48F`)PS`Dd7OC~5FU52g}(s%~0 z-mm6lDJdxdA|go^lPyokWsz1J(&JCGrl@9I7E zb#OIK*qsv@3Bg@lU`l%arzrF~FI zY%0UpcBVdAn#%oMqD}P&u*On=9UKBr^^dK0*hFq1e(2*4g)Iz1#-H%P^Jg4x+Xf&y zLAp)B(9TRB^#Qb$I}p!Mk!GcugBHP~8V6TbQ^?Why;tYqJjMTEzfo!Cg1$eKZA8f)g+R^hLoi|7vaMJEZ*f z(@Uh?S7@`+ACePTBG3Jh=E(s$w%OU)CZw7IZ_#=5=!lZb1Sf=q4e7T-IbSywGlqmU z39=Kjgf`#HWnWH@{S_N(TbfAZo(m6&iHM98*VCgaHSNu;s)`2>8}fAo&=?g=LO|X= zJRYlil2xCY#s==*|FNd{Ke{RN+o(l6qG8b(a+7UbnYXx8d9HInbm^|NpHHF;Ch&%( zUC^CL_MhZW;7W*qascqan~{Pn*gIB}mABZ~*p_-zd%@pX2`>jg{-GchneCHRjt|@q zN-p_FB(Hl$z|D;B29+9Pw>9o(mB^ z{_8Iist$HVq!d;vy;ws1C6Z0n4uNJeW%=;P$lTTz{$pon9`OGT^f1F~F>Ig#4Y_Q6 ztQh@GS63IFzqqlHQAclYR!&ZEW+pvU80Jep$Xc+Frvv+qtEs)*%*``PV zfM|WPDhm9DU&=1r{`Wpg8t_|7*tYhpI0=GPP0%+zR9swMxd8n)vY)C^a9?1J40ZIp zAOj%ulzJi2f3aG6HC7FN2C${YFe9vPVPV173~-f})#`C?3n1N}&a(fnF9P{DQ1Acz z{*dt>_e%d==S~P)eFva*5YI8|RDK#SHJvz8IrsH+rJ{Q6EqZw=A~zD>DeCu0gxOH~ zt@7>X{%`xth`7NXI~5g#w=dPMh|<@&fGF@bFjp}Sc!bYr2HaNPn zB==3cJpH~$-G*tXl>e}WWWE~cvaMQg!g~_>0U$%JhPOyqO;yLhivp0_QS@S8rCkIa z9&>Y~?lt+sQHYqBB1!zkwt9YTidl?7~TG5V;Ks z6zMtPx7#wx$QR(a(J>-lfFm4v8(9QW$=*PklEdk)Azy$a9f^i~0nYt#9(;NJ>8cWX zQ4x)&a-IyO4AORpg@^)0Svc_RZFZ7hEP3;;TU1%4zdd%AJ>>I=mu3hz=DOh6i_Qm6 zNz6H2w;Nm+QEprv`K!3C_G@CS)r}W#F)x-akx%Pir+qKPXuw)>p>N7w;atM(jx*R! zMoK!@>&qh3lJl_kXbVj}3tXBuN;l$e-d5+`bGCV4e;;=;=JxBe3kBomyOgAj{aidR zM=C1WC1~H-`6@T+{7vr?N7-x;K`lES_;vm$CfI$j=H?%g_TRb@32@Iij0XOBIrRbe zvvNO#v$G3ait!yJMVPhpO3N*MaL7|@y+(S2_MPb`G~SNO0|mw#>{m-Gxtk$N8ySXD zX%UzzSXpvwSYl6L5($(WF(WccL%N6-G1h9U+`W!@Z27$9@1g`Ms^O?vfKl3hJps$4 zObc0m3wWP$gD1Kc0xk&B$HKP2PRF8M{$<$09jZdjaw}34z+Pe=byRP_kAxHmVToSr zwcnS=W!Xq(PN4EvbKKa%BbpcQvlyZf)@-^LPXy0L{q?`SQC+8;Nd%Me-hkX!K-3dI z%02Duqs8Gu5MsKgUAy_+n1NzaZbp4ovVniE!8e$4s@4LzAn7m)!_CPm6!}EnP!JeR z6u*6o29OIJbI2Af>|Z=e!KYv_%g@jMmXT3x(oF^kere3~FYQ|5M5lnh+|O(jsen}* zw=?YGCX@vdIWbsbI-$DPx?XXPvtC0p<0i{f%iV|rbrL1GeF5amFE2~OdiOnUrXKrY zP6YK~#gr-k$?2A8rYK8xuy^5n?UQavoww!9iqg)q;5t3Uu9n%l~wy~ zONbySh=4SxC@qahD{X-<3J6lt(%q$$GzbVNog&>G(v5Vtbc2-CUHd!djC1Zd=iKqT ze=x>3Ucddmdq4Y$wdR_0F3W|d<1UWfW8VG^#|mf@Lg&q1CzihXKdHJie>*!6^|y@K zY25ZCTZM;%wF0T-{$Dj`=czVwosz6YD6oLrT(q z;Sh0T%Q#J+wPF5^q06iH)Ts0M6P%wtEj1g%%~Ey*zaMH0{SFv_y&(Asx>Z3#x+5E{ zy3<0S5$BN;_nnd8y@jqfB+8^q?A1PeZ9tcgwFLf8qE9GgQWrwxnZdp z%h$zxFUb$Y8>FXi^q$!=nqFw2*&#%CSCt|b`@{lYo;1H?h#n;%P&CbXL&}CU&uZbH z31SHsE+36jXli1y(yInbWzWULfCZQYJAB^QH!7))5++PMC-XS{OWpD;^R|Gh+!F=v zShWVTD{Qwt%Wg7h%6`_EcqE-}^GIJqSWB8>d0n0{lXLp0Sqz8sxeU71=XosQGmSG! 
z7yI(%?M7QHbosaqepCP;Ic^G{9{k?JOkHm-_&Y#1R z4&}K?0sq|SQ@Ej-*A|#)^X!`|yf0lvRO&rW88f&NTm7iSaM0nW{9fIg#A4l<)2a%( zPMy;-V3_!GU^XubwwbmRcmAIlbB?DjMI|*X(q#h#i!m;qFzWqqS>9OlgCCS*WTe!d zD%qBACgLy@RhiCU-$$3PquPA^L>SE%fAwOOP+07;B^><=Fc09cfh^%OxPbw!nu(2V z)5dwa#s!37A@C7^660%cAl0{DGQgDsLqlUhCJ-2l)}FIx{AQ9R;hR~SIcHy< zx+ci1DyOT4w?VCjA9&yMdR%J`hVAcG!hclGMeqAt+uHJ}Sr{2_f$tCqw<04+x%E14 z0r=#ualRV~Vrh)z-sQT1Nx4_guobRl2l!sKHn>m3JPvofVV`G=;}~rEZyn zekzmH^_m=i1VAlt1qVlSsXG=)_z{mF5Mu#+2q>Y&%fFkc`W<0$FBqw_^(BMw29cLv zpJVbd?DJ;?>Ht;79YMiwjg1oH`H+-Bs%cWr&k2mbbR`(S zujXu@;BK~?X;5k%%E26xTFP*pdv{d>UoAGbve8D**v^>2mLw0b%&_R_C!iNb13MnQ z!_CQ}qM|&2QsF^>OW$Bv47_`Aqq46h%FMAn^;Aplohd3)PUNW3}W% zZj?o_bA!j}L!~hp1qC8zg^RzVG5fHlOBkAMN)rFlC=L~G?>e<)d^dWWiV>*NHTwDT z0Tt_9P5BOM&HbF=>6*+vo5-^6fPdTk(&RJXPA5Nxx)eas?BVM8VSN<6vT<5snH_pYZoX<(YIYa!jYugwoO;|QDj%6E>A z4ft(KyldC_yRhTq(@Qb}dGT5fPw-;M1a$5cdc;&dbf^3I zDBlkh?rvHX*;wxkrKfxUnUJ?|ft?{Csrz|ls(%z=kebM<;ea7Ya=mr<;MbLkhrX&f zazmyob78suJA1tx7hje|CiS}qaOhinLab_nTzsCz#3Y(4A>Xk*cWE*1oJrrGYjtH^ z{`>c^4u<+Y>h|o$0mFCd^27ofKp&A#mEY08Bih*++CHgW_@!xRXZfX`8g_K|xO*u{ zR8{U<#ealkGyZs7AxLM$JqZd=l7GDWN2YR<^5$|&fuOXuY4LVswSG)^qPaf&W}RZ) z<&CS)ha`x_#otMCOGb1}@X!Z@xUU_$sCC8m(~;9scFvwM%ibK~zb!u74Lj$fcMe33 zqU<04OnL0umxO+T73tF7R*aWPDJ1L7R8D?v54X_3p?5qtC%rPtD}ggFG!4~q_H^79EnM6?w@i$ zyKH@x0m8=bMa=<8SJh!VVoVMiOM9R)CV;(f;bk5kwzS4ktP;il?h?g*oroWVu{0jF zq_lc*={Xr)kI8ZE7_f}p9ogouDPAZMj6VvZC=Xy1>QAiNjNs9GLBT)s=DbFoRgOm7 zr>XtGt<61XRt$FpZC|z7pXLuY#|k#wU5DX<{o`vtFVp(mueF6pKEN0E_jZP%W%sba z-86GExVdGq2$*;fwq4%0U^g*^!=ipKrux^X^6ap0&7S;OTW?@U4A09p(W#xfHCMvX zsl{VqL|zhm!SM{2ZSVA5UDXc8-ILCiN8Pdd>51JgGG5Jc@JF~lXP30g5x8kV_%>tO-(NE;-j|Wcmftyu>*q?){sejQ|DI*f zKw$?eRjuvqa{!$#B3^uGPNacFeQM(H2VC@A z*F9>jBGvuzhq=w9?kenOJ3Gq2d3BotG9IOS>+aHLmYv)P5}ac z43?rnxm){6aSteBt(3b_7UKc}8779>mSR!j^Xe;77W2|rmOskk>TbDJZIHEZWjRCY z4F};jh8iC)Qg*v(f&1-=!4}nTf+zUArmN1bjUU7;HrW)SGEYRdI*Be8g*cbaVR#Wy zGn4=k$1Gdh2T)8Z(FO-9ViJ8D)jIqxPs>S5Re!JXaCn|~Ql3YZJi#NOlWu5UBwa3Snqp;QCfY2ph#;Vs%p|WO z7lP~njz_DKKex7AY=fR$NtD3fZRfA&|KAb^flctZ+j2duL{C|cb6j+n^sFdxZ&G8` zlqBgsmSXTsj$vG;=@t=rVDO7p5Kh6bUh@vROIM~wVmr7q^{*Ndl62qR(iDvtwmm!y zw6;K}BuNv`M7?6fdR|sOum1x*>iXKzAm86R?_)y8>4Q_gCq)O>GFv{uug_0$-+v{k zhBu{RQ(N-b?ryf!3+u8VJO%pbwL24IJB7SQCkDP2y;V=#eYR+x$+#2H>fW?G5KHdZ zy&iNMD~mSEPYkZ66R#j0uqhr1MYmQa8Krz)mi^URk$Fu;Cg4he5phED@rmP3>zpqm zW7WYs{4_=X4ogBz2Dp0l+t^Cp8#UMCJn0EtOny&uX@{nLv;eza*edQdUn;bg6|#A; zak?`vqNUin^* ztn{BS;yl+f9*kaL?>Q+78v|ZueoP|MCn(}wztt-PPL*BTHpMDU$%nFcU_6~J*b)++ z|7{Qog^RwMhu!`oTtvZ(k`hiGq!*~vx#eTd4rdZilt7d~^K#x~B=zl=OI^v&=(UQO zt4h{V{`ffDV$wYO#qH_p`At>q(x+gYgu-WuW%7_F970lrR!7WW8sDb@kVKiCtWfa_ zlhb#0Y>F!cdyMjRe-o>6yndl+IIq8P`Zd}tVUsK9a3JR4&rO(ATIGMH(La&rj?P+s zKgieOCl$;0g?^6$!)k_5488QB16hg})TZ1R`7I-@n;5oIizPUGS_;Iw{;O9W99^HK zKhsc>&r}eq(7Ey0(ahG>Hs}?p^mE)p$RA}l3og$^fIj0&pOaNu5y~42uN0+X(hR|d zm~r0QeK$2G7?z6EFm(HkhA^J5f~qm0%XmzhwyP@_mXYXG!mpW0w39y?aM``QcS3W8 zZ`QP7DOX?B`Z~c>^rPeqk6}@vCy@#@I28a{*{+g{iT1tl{-v=gFB%0|=w@R2g^_c> z2`PuW7q@oV`{!g1lKaLNOH4}Xbf%ARqhcpaaLyK*C2ewdU8G|Ikkv71$pc3 z2{DUQUpVYa--xu+Kp09>X~soS{B>BgGZi8QxYP(P^)R9}m9j4k$cTvCao=K(4agX*H*ynX!n;()REZlkXs8RtG;lvE0JB~{f2W@4 z5wM74Vq$`ikO~|IqMw6m72=Tx_`Nn{hd~{#A%i?cy}Ak$de<5=pHz(uhrDGJ|KMe$ zwt7KvdX1HOcD%HS(@N~2{xiyBj=iePenO>p{}sI|bUkx~6VkW^4o)zQL=2maf#O9i zX#4W*+YYD|Nyy0HnAM&=Fss_RjL6!k&+lG-<`8>w;I<$2^*U~yua)J7dYP<#WJEtH z1sO>i-OH`{RqMZ!sq*=Hv-b5EF2_ficVr`FrVA?npTP8g9+_I87)A=bj}FOWq~=93 zl}0*ZvCaQ{qDf?UwDh98Tt)e&bXg~WqXMjo^!BaG4Mq?V@3wySMLRuDR)wHWEhe-q z<{z{xad^!{)aXCpF7x;43Siw}WyKApYN&5Krza9Uoh3N#4R-yVFo3PmL&af7B{mJQ zC+9s8&$OF(O%VkG)#icDd}9|z3C&Vi`A0(!4Zn@qI|X0OwqsCnGR;Czu&$DLI4=!t 
z?5^>UU9AoRo)rdW<VS;-HOS~3XS6{RWVGB*esoc7pRF);)Rg1?wR+wJC8;b>L5e>7joL~85y%q(H$HiTX6!+?8_<#I6T? zb8o4q1$+`AMqHug z!R>58kN$*X*mfM5qv0fPWM!FHbSDW+i>DQHC4tqa8ScSiCyIhcS z7b`1RXya-p+3%KR6#8$H!G43?s%Itj+|lE##ELwC8lZ{NR2% z+&}e|;)*#5lctLfrPvm_i>5H7B)&$5o2W|(C~#gPlP#>lZaww9yQIZ|AOKJptK*fbqm^3$%xBI zPipC#2{te;Z~v;|1clSi%5YP`Qy+gB^fiw!5Ad#~;g?R{I8(8%QJ>%viTUwY5s+>8 zRxPqtVGW=7VDBTG!*ATb4Qi%TYHIE8e4h!NEH5hQV5atXf%}GbVFWy@mtMZfz1p6eMnc?*`M4`kVbi9<|`B zf95YAhi;t4jzC{r9p~rQ`IpunzGbe2D6LS1WEXc0=fRQvfQ*{7FW0KW%t0?t@f-WK z{#RTEh9Uo4Pr?1c(F{f@M>chO^V>JHI*r4)FZawn8)8|^>L-TUeji;goyd0PV2vVw zp=Ett8fI7Ob2V~e`8v~t?Bp;=TZ5~eN3`L90@$olv#=gC7zHV$c-BZN* zj;*`V2Jn&h{tPo~*|QtM=>VPB*gO@FBhI5NdpS9IL83b_J@aML%wdq^?OwpBhp z+f;>s>!$`3m29ci6j?}rbV+!x^cs41KjRhVjEN?y=T{nPTxw}ojSbq?G@*|DO*%-V zJ8*udaRI-6e5DL4NVy^lN3yUp6lm9Dsom2;4 zRj?VrTkJ18x0rKH5(_&7?*6UK`SYtP|H#qEerpZWm5*E(eh1_KeQe=zN0Mf^B4`;mnE%3VXX# z=TOI^6B^(n6%9|lSX(`HE;^m5X#-H|bar0o2c?cdHQlIMT@V$2JrE(#qaK)0@bmMR zXk%evLA6YigKT?uSF1UITt`LO2TN|%9Ey9u-}@I%V`elT-RCHCVT~BZrB19- ze6f-AhQ21)BLR*6%)wkYxKft=-CQ{H=H0%&skSd_{MK(ORAc)>-r1pp^Y8M0q{uH` ziff9QgSMTJ$Cmt)$8NG=09)}BkE4x?)YgqA+cFA;3~Nyo<4YK%<-}KC)#0 z_`V6XT4xNIh_TZ2awh*Px+v~@9|3urI^tMrw!7}4WryDXk1B7xM=_IRzbTvO>Sgg< zZN0+iy1GZ$Df~yol8I#n(c7A$!DEem>e_cO%T0D={8(O4g`iW{<74kJqaO`<62da# z&Qj$F0<1``hBf@>D}1d(2CILmBCS$;gPS6@e`{d4EW+k~)VM%&9NAR*NtwX(heoxN z8J?iHj9HJKoP+c*$6J{JJB_2Od)4rh| zz|A#q;eD9TkRyDVd-L=7oW1^q^3>7HLwIxQ^KobX8#r-f!=o{we1A50+k{X-sr~H% z7GN^`_W^2{Xj)vlWgnw&c%t4a*R2W9O{i6Fcp`wa)uNw3Xr*}B7jk9N|FJ%v>o^TW zJbd`(ZX(G zby^_r?)IsL@ktBQ`SfA%UYG*6w>6&^_qEv3a(vs8EW&Il$@S}0Vs5-DfnOoj^iQ%T zSl&+QP0}(SfJ7)3Q=fE~ zfeES}{Gj~!KzAv9P>3rf_V)hJ?m$6?gy-MAo~#=pN2gW9oPsjwm3~!8jy#5tV|llr z1+h#D3c`$zbJLG(OO@#D5XHC2b#XilU?kP&vL-vNKGqt*mAJUEPD5@@XIP zSeBw98~HcA63!BJd6O|c&oXE8Pv~biQWxdof1{L#=T^f#DH*d2=3YO+44V`G^cnR7 zpXA0&$3-ERFX8QT=Up34w@D`{#pV@Ec%W8(4gaLNl%wk~a`J?U0_SUS5-Q`BNBalK z1vnWq^X&d;W%_zW{@mnWKVB0_5fzGU<)C(|Ew9U@jR|pG27uqzOPiE$DQvKKwxo?v z$lM8|_{B!m_Ru>BIN+G8m-J))iok+bk};x)`KoFs_+KSV`TS9M^8C}aMF2IqrbGZ_ z`Igyl`BuJ;4dmLERS939{vM+CVM^WK%wK!|qXj(h?zmmhPG&95!nE|Wj2~v9YkwMA zvE%K`TJ5JN?3K%g+u#g8H8|!mjlcOArOVxOwsYMlmHqDzZ+(Uy-ruCWnx<sJr7s35~Lv2t@T>U!Ud;I=M z06zW5%zT3W?Y}Hbs`p#)49RJU?&@oJNMxc4A*;F}g|!=5ojzK?;xqb6Y8TFz<+Cc) z*tl^RoM?8dieb?h-Y1Kts!Rf^J^d_cuat+yOy5yEG+hgS{lMx+*@x4)&FV%75uVI9 zl)JRc&^P(~IerFUum=Hm&tB7xrGNl?woHQj&n#a$Wf8MZciiM<(P!xqwd=wjPqA1w zX&5xtFNZ!&ojZ7+Oa@Bgq1xEHP+)249{Oy#WU%<$_fhtjLx8{~;ZUXGR}189jpS_5 zIcqp4Bb@8`blvT9ScMO&)W%v~#Uw4EC^XlKg+FHaj9sL@Zf<-IlK2r`(LVx}S?ec@ z7;YV`DaF-cd5<(qzX^Rty%Rtf4nnA(N(?NmEI;PUrA_bsYAHMjqr^QwXmVg)RVia( zdOFl%9s21dbx%%tw40072MBMMV5TCgV)Gl3oO}}DKF7H{tn(!HpvZpH6|IACvg0rB zu{F%@C(~h)QUL|OH8lIh8k{lv!Z-qDrCdTYCIojh$m=UZ}_&76#LmCrP(5R;}g zdoak*uBfZ<+Q|l6+X&E@Siin5TIPlekdnxG#V3#Qh|?`K{$=N43H6E5ij;NGrkegCa%EVeA#>rJ*+>ovgoKs^2qnar8SpAT>WJLQS|0zwv%W5Pho9S6zyNhqs&H z@pDHvU1Y3~q}WIBp=bIn$lWq@4_c>}o?eI3Y~uw=I&TC*l$lN;)*v+&o;X zgKW7lkmJ8`mPCEuwGZ_DG@EWqYnynys-{g&DttP0$;J+1BL|%}3HeQ`F8$He(-Q-x zcpB!#e(@$Kpy+gmzOm-iTcnGRHT|{_3RPGJS+!&j#0p5Q=;*!j#)=e(;xCtMKk z!IAm&#HAW5BPt&`h0q9%u6*u6BoWj&h@l7gQ#nkEgOlx2w%&5rSBfF5=P2Tmvw7mc zMf}ZglNSnWd(?!!IE`8RUnL~VNL7!vHk1Rf>p*Zd~P8T06t~Qw^aUS zfvE{uzT)K(IEfGK-4n^_gAGvdq{n>b4gb3jHA1cz;kO+eg7#*ZVWLGbv>2*dS&0t1HM~AC=H1nBp z0VeVqB%u;QNiF)~0ufKEeqU&3hPd2%`GXjS^YKnYKj3ac7Y|?8e~bTJxaPliH;|i) ziOo|cIp`PK^gGc}$YcZrvU#G$GK`Fq0l?i`cmcCaM%U#KqYF-o!!F;YxMOU489 z1jS4M!s$8bqayH{QLlSDcRp^CD0YYQm{+9yw0)}`!VU+j)TI-M>EeH((Q1uXEU)Hi zZXeoas;zD?Yl2rs^?9F;)c?fmbe=qU_}0Zru=aEf^<(X=lgV)Q-k{~h#ue5W^1w(% zimdcL!vP`l6NAkvrq2NR&%wTM@plme{yuIzsF|WLC8zDSs}=s~6^)+*2Spd80!77k 
zy{0QkYs}-8NA~ZdLJfYuC6eNRX)>kWhwqJ7{8Z9e0uA^?tg%FL&%SwC>Ar}$Z;}Eh z!e?e8IP*SavQK5BGGSTl4E@;YGlY`+I2)f_z_rEMJjln#;bHRgh~noeW*<#RMJ8yVX3?9D*$m6pj@HT?FOZ4Ip?a*(i=c#;()>)d`G=n zS&uo=+NT~LVW#{iMRYv-b^+*V5Q?qq-nVJANjg#uASv=tTm!28BDBoPV~OMbQXl|p ztoD3A7pZXqD-KMbp}t8jXv+%do@(uv86Y>G{$>zO1pm*VAbf18tnwa<)F$oO^)$+B6 z2k`IV@h}*ENsR5#b0z{l<-ePg#Q0QN+GiI$Dp|ljCRqOE%bC34r0nb$<dnlg3QV_l!>o5BxF#!{-F8ykM`cmuhK1S%!gUPaJCkrGARW5{4? zbA=P!GNEA@&B9!rOjw(Qv@|w!RY8NUfj;cDr+qn5!*9=jh(b3V2k72s%)hv}XbC+p z(U4wHH8hwdkgq^EIMT zvi~MXz1&6Kkh~xkb+aKO6oKA=@B&4x z14M<-_4F3q=Yi=8&Ux37j&UFWLJ>E;%k_flF0d+*mPybFuLfjwhXPa6Bj7U{9vN9^ zv!)2k^G&H~Y=fouW(S*ahJC5K@hskQ&eWeXlGbXIixFCabA7A@CxpT(!CyR|hTEzKLo1zr~COc_Y#B!Dlo z5UKGwK3)zOpU~bDzl{$%WkBa*3W zS43W3UY#-gL}W+PO`0X4nN6X#b5k3Q$|jeDGGogI?=`Zl>^nl=FA$tPMiB$g6zS}- z;WxXUnuRnx1gRd{L-QH;$>aV1K<$BB&O;!y?cSiZHCF5ons^LoZRW3%tq10Wuoj^W zWi&93k(P$d!L&W>t~+=zyz?Q$T0oBjUIf}p;Kz;{>w%IIGe{T+#UdA{R&OUtN*;jZ zZXqP5u%IA{*XYZJzVw;e#3#Bj92vo#qW>7z2~*;vr>U#FxJb`4gkqSJ!-l*(=wZ6y z$M>H^*$VcChs<*B@VZ_9_Q!*<6q&Fi50t~#m8yo-odQAJhHQ(-A6lHlA~}RFP3TSP zW#+g4UGLW2EsJ~(vH_~kdke3bBO25Z0zP!hs%wv6`LKD&%E)L?m%mlDCkuK8nI}(v z8W!&Y_m_~I92Ny09-dR+nAL!h1#gfk-K~wr{xRReF&ukE!O+x{f411t8c#;}4^{vF z2M_-VI-)-bO>uW{hxT~msR&il7mH_LI+lu zFggw1*U%7GT(~z%9rnhV5L>R(vPlWOop=gLN~iloR=n5*G>wMEQyhwJrAqc^`@`7Q zZntAzjJ0jk>1)!=K(oCB5cV*{*?$CzdOUo89yndX(<4nKb8F89puI?zTICmLD~pId zX&jetL%-Q}=nmR2+u(C@`UdbWe-% z_u?-QVVNf^z}%``++dNvj35lFf=w6JPM&rPKD4H3pFTg}gLY7a zh!|W%R8d#=9&}V)0D;@vOys6=otvcQ9SciE$G&Ce4It0?LDCpOV2BPM(;3KgzmiG(|!NeQ{TE!|( zYF~lx)ZN{!UJ7M5B)SEQPfpOd0U|q@ZgeipL-8W_DDr~UlL3a?Sq6O%O(pva%ul`U zn6?_16JT{-nt?)7zR3%a0IaO9r{v{b50q*cD>gbFDy?Y(4YHn6LnjUI@%Xh(glWt%)n3aY}1*`NRPv)D1BrPdP;ivn@u?;+PPZoG z>G3bso{Tq({f_X+q)+la9rG=RM~Z>yCZ5 zj)~30UHETqQ;3zp2O%YmY~Q<+jKRprXu|CUf3frY4_;j~wX~>k2&sG`ndGQWp8xvP znm11?WczJpIXXIeB#y#}O+B~%dn}t+6giTSm1Q(sVZ)O8Dq>sERDS6N)DPPrwm-L|-~C|6-7kpidTYk}c5tyw)X0pO&7E$)=GnqoYG}ayWK6 z`FVG=A(UFcda8+#U8jOX?8Vlztv%&R_14x_N@4qJJIlY%mTHBXq`H%@)@%}S>Qs1n z9!=3#?2fwLQ&;yJDlvIiQ9*ffa^gIjXkxuP;zS*m*E7Gi)uZxGx7sm*#|VdrM$n6l zP5o_tK2cX!m-GHiJouelrReo3*c+Y6`u8b(ZKNlWYhH`U7Slb&g?Yb&M;; zZR1W_pP8?E4q2!8VeVkr^EcPn*)P(HI9`OoAQw))OF$#2%C28KY)uqQ&T$?NgwRXq z=~O!^!#8=JY|`_%Y#2+w7#bSl54Wwp_^KA6vHOc8VS zA(7Ilb$bN%9q{oZYh+v;OYPx`2wVw$BthpjgXnI@D~)CK2RqBb^F3Ki0s<7rhr4oS zW~_L4c&eJ30oTYs=Blm#_%KIFJUBG85e`2s z$}6>uZty3b>djVj9)EcjLF$fosMFuQ>-y~K^vQCB3MS$V^2xLZ$NgXn0~xk$;}CqY zb?pHwnR>oXAXnYdd2DQK^hmE+scy5=j@JD4;s7!7v+TL{(FC!lS86JKdV0F*y?Yo) zOGii3S6_S#golUcuVtPhii_-xdlReb>V}HB?jRplC)qeTWxhYYq z>>G|7(|3%GjT7AZ({JNj3|DkL){a&1tvg<}+B@3pTt7KnWx9FuB4T}f;Bb+gobk#P zY~*LDxrUFg??hIrhm@4mE8PM44o!*}4GJ(xzf(v4Ylrvg+|RpZd+Y)&J(^A zBo{7thfADLO1ekae)@D#(p^LD)-8-z-ro1%vi*75Z)mK>>t9j|TEEpb#b+B~5l$wW zY7TGq!M%VWux8hGcbiMi2S`Xr42tvW>gH3I54IO6#a`T$4W;tc`}`u!s5y*$1g9*_ zir9Q$VZ><+8;Pz-^t`7jqQP+HlUCkDh37#}>e@yd*Hoy$sMp;r>Bo;5M_(**Alv%v z2VBa?IXt{#L63oaT|&=-rgIE$?`Ib}uRjjCbOXnzDFj2+^Lg%!T2{}HgHEN9uBx6c zvViscciD60jZGNWcTd{l5U;j)E+$^y^I!@xwzj-GDh*l=z93yT2%{rgMTzG>*Y zlfgL+Es;z-PAfXov$N^ok3W3)P+w%!j3BTu8sYm8UmneLUywWA(J*S zrT%4LSzpSY|3;^;x3ACe*C*3GaC=L_qx+k4Y3e!X`rcX@p+4MQE#KRC{(U`9m(6t> z4$yn-F(7XX3rU*8>6??CMsXR=2sEgyY>4?}VrMk697#^=3ayw=TuH>j6rX=`g}AeJD2 zz~ym#U^v|p34yCvNa4<@OLG`)SGL;KVQ@9i`aY835YvTN)$TVLi)c5O`}vza5OmBWYM-tgPSP$M)|DFI1o3 zj^ohw3ke~p*`Iy$tUvb>EN|0Y-OAx2&T9pn33ERLJz)U+NiYj)Ht@7QJ$spz@&XT> zooY_sIU=Gah|#(!#3tE3rTn7T)xk(=k2VyuOKvd>^<%Hj6>)xxzN$h+C!ZqjP6e)C zUQw~(Ti^woneSvOPrRunLN|>AGU}_wk43vNT@RPb*ITcqzP%kxK3L;&qr1DC+jYmn zerb?YyUgO&ty?b*@6$qNBnF?Y-tN~fvYkwZE?~?NQ0ololLP(tVQf zGljRjaS{pGLPJ8%AT$ylG}Ho?HwjFxFfta$8#m;l$qAo%zrjeAgCmDxdTN8TOfIpE 
zmkfpl*z(?P-DzlhyAn8&q0(@L_fn>&@88%xW##4Voib(R(7lqJBd)6vkuHv~Gu6J% z*ID$I!%9$i|LSfQI>6R#9OVXJ$cPuLU_&sXly*E)($=JAOig?HoBURz1aCc*xfvPL zGgX)F-L(k6*Oy8`IXQBH^mt+>USIZbXZihiA~2;8V~#@av&89{nK+P@V5B%t;?|rTEZ%?O8xUYD zUyP7nyy)}&yTUmJ)klv4y}Xd@VksFJUgk#>;OiL8%*}a>TS#K!k$e6;@4*lQBV%G@F1EUwnm0Lz7J_J%S>T4sVkO@l7#EL>OiO8b zJ^jy;V7oqFb$wpvy@E#A_N^;EB=?z*77OYggJ-I6T&7xBSm5?JavpXXb0=I;)6$wx z8}17ZA=q~QdzwG03_Ogn9f{GDjj<1XBUMbU*%rr9WY));9d#A%U`pmGBV*JR99TC0 zU%$4Fo|Tnh=2K24KdVB9g+x`!Fx2aF*o3u^!h+0@e=W=rXC)MP!*>SpV#-onTS~|$ zc+C5iQ-<+=hIdvF;TL?Q<6<1OmJu`YzZYH1IofJE>4mXMKQt zB<4|{v!jv*(PRXVKb9sW;h2`e5qoX=lZk4KrYhq$D1@0B~p1e11 z+|$(j(`yqDAfHHM_A*cJGZhqd&w4Tmk#ShfXb?QuU0t>Qs_%KscmDkOwY{nEcOO6E zan-E7YG}U9$T(wr&CA!v=Q1yE_nIAiBPj{V_=B$+1$wt_YxBrW^Z+*?n^d^;zj11~~5HB!mAAkR;oPt`+ zaZO}#^s~rcMdqQ~g z=BWgHr7N89NTRDxhD!~>CbulmX;PTh4;2Y>wy6+Opg#wSNUc;fKe%_=p_Nu6td&DSO-#Kg#G=B0~a zadNaT=zefhK|vwn%^UMQ9W?v~DHed6or6oqu|Ikh6okjFS%^S6;7gM3VFLA?<;btR zOUw|in(`5HDypELKUH%dl{Ag)!-_1%*LVlZ;?bnp7M{tG!ggfqI?=N0u^&#N$R>iA zN(-mEMA+pNlaLB|V<>%T*U$Et941qu?;Fb3P|&GM`?8rsoR+DMj|gt}JY;vR;qs;= zbs?xGaZk#;8~nLBxNz5*^rnMW^jKou{o_MBjU|uwFJ?PLrPnaMUTblT9_pKiv{2uC zuru?WqBHyYAm*mwEJyA?i^I~@RDZN{;B9u##(OVjA(eP{m9<7YGuE>6CSj& z=(zk_^cwPRHLNktU83?vu#60U5%z3DyS64S&jdLK`!QSJS!P)#?)swqfN46OP`Xrk zyqy^L%Ju*FZnkX27{$Z3(C~u31>~W-fPYR8gKbQ8TkS=+^`zH9812pT+L|k(9{1y~ ziR-WwOsy1iVe5-#Wg1X0@?E0mybyVjvL2RF9A*87);;S)$XZMcdX3#Gly-RaT%%`v z;1VG^uZZKbd35f)(h1cKz%!{S)5`NR&xt$~6dsh*=XauE9k22T3EfHzYdc zX6N=bYiBaFS0vvQQ;9{sqq(%ykbTLR_>Q)LVmxOfEVm-ZWv%Na6Aq5_pAAYker<*X z61TR-wzox>9AfX~2o3t{+$~1pVq<+FBWJjN9oKvyA1e8exqAe1^ifwG0>4J~T=83` zeH5DFmRh>;=}#bY)=p2nwCmZ!gm~qKdWv3bn}OA8ePnkD zjY(Nvn~B6H($Lc$;Vi4=ehoLdFK#|#y4!{SXDDq#c;T7Qr=|)EKOdYlZHqJ7=Ek>* zkut*uoQBh$?(Vg2MJ_(78b&7Rkz|en`3nfSSftO092fF|_*rP)Mg5$v$+;-j;7(~N zZz?f=|92jqoM{>6><_E*Cs&UAeQ<#w%aHmTmF9NL*eVuZ#5k`&c?< zuD`Olg`T`?b(R%9&vPFdk3M-9PjVs5vWV0%)%)Tc+YDbEtFDHP(@Axs>}3X);f$-7 z-TGR^!d(NZ;uhiFD{dqOTWFgFv4>_eAH8r(`#CEICwj_bL#hcA_iY_30to96Y`BSINZcty{79kMN-D8oVOMO z43sFlNq=}_zIGdi*mCc_p3%n857I9*MYp4@9jWAu4j)l>bduBYc)57NN_|hCR9_if zhlzCP=YJ)Z&}}SkKCEI6NL0LI(T1l?H@BqaAG(~CI#_V(?zr;g9AXm8N~7TKMm!+3 z<@HzSQY^?+4{lF(Q5G*N?IT>NEO{Ost~h9$-we>@NVgc=NCz7yUXbvVQqSX@=9*)I2pKQ8SS4rv=RbfPm43)g_ zE43AM-=Za4F%d745EpLGz>mzazea(TaQ4hz&idCu5khZ4dxTTx#J#&njE$_~u>%&P zQ*o9+y_Qf%$LHa3kCzy6&3*K3>cVA%H(F%s2sMy(4*~kTCt{rM%h)xh;wIo|uj{j5~jKLuN)Ek%!D z$G!z0D*=C5!lujEz88jHD@454^JILaJgugR{K|lt>U}-$M~k($l%F4q)a8EG?AF>P zK%{?8jT3ID&*r@*>N4Cj|4xsA$ZaYVnX0&3`7kw1?EkuqDHt;PzC$mJqazx z;ZBWePh`Ah)Yc%TIp$_{eZ?Pc-+(LtExV?m0`>n@~T;BVg9FI$uragIbbP7+88f9!BRl`YB-9Q`H$NH0` zVEOs34z;Y?)3$_Sb=zBZx-i^-$|KEu9gFF)5(r_$9cfr4AI=c^`(a7!$_q5m>aG6% zeOY9;xbbwm`9>}ElDjK)&BhnpVD+KObi9nM!gl(z4iP#xdpB&Jo)>QupXc#v6Q$pXx^ z^=p%H#n}&O0&!so+WUWpV@NK~4UCtC5?&x$(=cjc8nIq#W)qBcx@6PUudOe0zj*t| zsj+{RDw+9cVV0)hhdvBzSjzQA0&1+-7Jd_rlU+U>M3d8pg<& zL<%mxaXf9ASf6&tja`J57aF$>E0N;`X9xL1>nko-1rJ9i%?B|{q{f<4)U1EZUM4d7 zEFwGEb!g?(I@0+?t#R*Un5?0bhE7Gsm{!|>AJ!1hCo6UwX4yAqSj_WoxUY4CtAX&n zL&x=Xo0_V(zyFK=@u7FoxFp|;vC*b?bGN3=`WNb%uCG@JF%C}OFpu_``cc&=BI9p! 
zChdaYL{*A02Y0BfH&oIS-R~7Y8bgd7?;#7|HF<)?Ta6?`xr%b zZN2ldGdS-F+N=bnk!AM2x-;7L>tY0%cHNC!b~`&~?#%Goh6Uz{IlB4Ts9Q7&&5;S) z%=iYW9tYR8O3i$s-a|2XCb_Vf+Xz-}xEJqgG(WLk7>(xk6w7rQ8?5oG}o9*UUmhxiZ zy69&6jr&g;rKCq*Hk}R->tcH`xm*dQ8~A>chq1uKd?BFrXt(w=;YrCVjV0x_u-j3e zN=3Ee#n)%CorS9jC)e!880RmC0pJd3H61+MH}~C7N`R-_ZOk#Vh{;qK}Nzu`Y{ydfOrqgn|mR9a_O89-i#d%Wa^A!}WU~^$TC5&D7 z^B7>iR{zAszRxCiraVnJJEQn|^TFjuh<3?SLRL05wBE|qD#b$zJ&s@8H1BVp6;IK; z^kQ|L@nXELz<#@ao>a0rf;?cA>^-Hp_0mk|vlez~D*ee|+50Wpw&reTUR|ysD!uHNhWpUTSvFGQA@MK2!a|B=OKXs@BA1WUb{%v0| zv%x#r#h^WC>i2wA(ACWX{f4!4pC|IU?I<;woE`QHp#vK22D+i|^Ncd^^ZGIw71!a3;h>F2Bm1k9gvJNyZv z8${8aLcOtWjpZia2{_{Q%@?dy%Eu^Pqu#-K^*49L?(_3A9Ui&SwbJWsHk1V^Y1uuD z_4=W7xRETslQ=%scPn2dxWG44e^e9j`jt0*3-$FX?5F1Pd0Nje4RFOgc;rTYiO#2c zgBqzZZzLchV`03aYLApSXe9Pt!K)6tn4U507|aH>WRKjKp~L->&%=Ca#YMY`P*qyp zVzJ8jgFIC{wiS3@WV#!!?rBwXP*Za2Wgc_X1-Y@GlGC_B9ym{H(>EcL@avmgo$n_{ zlkdT1=k5JW>$w})`45y=3>FCqW$xdz$7hGzo`1JmueeGpeaD4RA-~T6JRJcC!*P0* zRvwXRU!Bb(BBvCc?AHM?l3f@C2s&`0i@Y9s(;c@RMvj$UO8d4(S4eW+xfO}gUQul7 zkn)J{w-&YuTi|e7k-_AGLBm`nGg%`x_vZJ024RBrEFc4fd`>a9k2iIkb{>;FdIz7p=zUetz+3cvQSoekaSPl9Hr$2e>+el? zcmH+4va?zBg)W=GaPnX&#<54TvH`{?9fe&8`2W;O^(qd6;?rZEr=K4@f{N}^6uZ~MhYuN; znZK=-K+(`4y*dj_@P72tB~3uhKjr{*wrDTBx6enT!t_sGtY$jTItbxtDF9A$4{R=;q2-bJ;p|HWhX5_ zjsoA8j)1aWBpE(*_(=#)d z#4kvptnXlj%{#e`@#_QzdKnNu_6`;c2qjJd)V{{S0noRZbzPIkSnu-8Gyqyq#778I z_sZnrcLaxpHL0nH`Jw<|o%$gVtt2!wA-%m10`2CsqCWrZ);pGgRRJOABCT zW;Q-I=MRL%Td0o&9TqO4#X3OAfR12p&UMN;I}7>|(}#fpOy*awd%!X+>fu%aS^FPJ)Y9<%_i zG>)B~I0aW%0cUCetmKEB9BIIf^*r{U0N^3;Vw5Trr{w7m-i&P^H_3>_wn(7%>Voj) z259Yv+}v+~5)?lF%@0Gy06f;|UbDp8p83LOU5svAE1OVo$C@$^wjC1{^?gR z)go~BN9fZ_c#3~-Ya<2t5QZroP#hEnba4?vscs5AXAB@4&mzF5hR})qnsx?Ewczs( zm1XrZe{?6RJvcgYvly$T-5v9+YZd3?<4d?N&U^y2J5lCbkz3|>Z`G^>YJ zz%ohDdfbbW+rSG3^n%p6bs!&ufT#+znQ8(CR*~3h{~Z;48?|Ld5s}~1C2&hTrrpdy)dJae;cq7RWIfz7z_ce5F24uQFB48D_AQo8 zL+0VbSa~I-ivWQ?Ztx>~3{-(sR!<=4Af#(cN=h0lY^J{X(|cUIucmfeSsB3g03M^J zfZ<^sxVTAY3K<~3{2A|61}*`Z$0#PI>jp}LbgQx2^xRw}pg@B#uyD@7H_l0%UMFYQ zybL3=(3dmG<+&dW6qB8>n0|BB<*_O_E^&HPg3{m7EXwEMS}<_@&wnwKbLpX@Sw7}* z0cGw1mn_}WuG{PoQC9e^Hi&tn2S`z+RB_tM%E~`ubrNuwDA$Q1fr-40g}M$7e8^js zRPjF})dG61%X~}2l|7Tr`flskfHOSVcm$9idN}y`8vy>m_ND+MZlXU~JNxb1x66Wp zgKLMt+k>6}S?te~24DeJAaXQHXuHsR9pnqtD>9{8dAd$X2njuX_^Vf#N=bsGAYdQ{ zKWixH2&@Bw+`D%Va7HZ1FR>ATRHbj<#x5=?;W6vI_Td5R8EkCB!9qirYHlEVBU4ku zQQm9kmpL${QgH9JrzfuMNkTmlj==Z9S^(S49o;r9)e>E+$08pQ1A~9vs;Js{TBRuL zv=VZb-i-t_7A*6|CMHH*>9-NDfPiyHGhCiuK!>x(Z%wU9F|AF3yoKOEdE;pag9?Jm zEMRF)fNAk$IPEMw3a;CR8DFYWzi))xhz0~6q!0uE^*4mkil8DDV43wKm6Qm?Pxg!u zl;Sltz3z3u^#dOM%-OT=-o8chw9EO0M#21ftj8s*d+1T^&GIi_m`oIKWvO#r$j%}G zoe3BQdH}E?|JKz~Ee?nj>;2kR7>LK=vOq(;qS4oeh9dVvM;c-Kb3nK@f4KKKK492doIlFKKW~1Ow+0(1Z}`a{{?g zB3P@H*97z~mDt%S1+nM9Zla^O2fmuRJxe|3`BsHOBWjCp2S73)BPaJ}dJcV91qD20 z9BlSkS2{Mp!%x@CJ;215@5-^#d^4Qw)6j2(OLBvpqdi%K0?hE)SgjZ1kXCmm7?ZX3ABIGDWS&>!|Gb27*sAP}QOj9-0!wUd?G z-h@B0z_tQfke~2pe8zI{Ulz0#QjjR_iISkYy0jlZevAOJ4ro=aqdd>OiL>#u4Z5&0 zqv7COiYpPl`xfcZ(Gkk~p25MH1Z6-y@WJZ<%rk&g0+NwiAay`h0v01xWFQs0b^reP zI8NPP8TYJ#xc&oBI!HV)v9K_XMvD`~-GxsM2I|rQ`v$Lkc5-r3^}z$lydsZ|6Yjav2etbsO#1QmfO6Oz`=&ea3?b`N z%|@^~5U;B#&S(T~E;MX__{Qz9@W6hdw{?EYjdt;s+v2M{Bhif(CY1a~wG>rV!Vt5l zKEK$Of)K29(hp}^+5Sp-dw!PTm-5{d(LfD77i=K@P?#RPCIUo0jNRk;?)Me*nQufR z2{E|=*$<)O!-H!?3A4#WK^PMVw`TzU<~JJWZc-5<2H*h_M4UK*c_EY*Z7p%LIl+&- z0TbB9#8KEOaY}=Wi;Jp83ZJx*$Hc`kg02E7w3(8PiHR9cG)b{>Lg`3<=L~HAXFo4T zlZqUSI~?uVfarzv0*lg`%wO-y^9O(^gjN$OHZXO_H_v4k-vV~;${hq48tM}uK$vda zxBwC^$7LoT1AV@((?W3)7eTM=2doWRhULSy_>lYHAbJ*K_;jH{3db(|^XHkxXw45OAC#0LnT*{xB7J+g7$wRuA)P&-fQ*=uypU^?7WjQo%~wrW#SRYk?2UG~YmuBAoQ 
z=}Bu&_umAz#{F&wweCuVsFfk!;BYA^@0!{$`SbO{4hud2D4=cy(ivo>YLT;ghmB*~ zLXqQt6(`*L?x6Pg(z<=sb+O+QZA^9LU!m~s;^64Y4P{=5O zkfY=^4!BwZ4=9U?)r*pp(&dJL?E=I^J*4;o6ayexM0rJIyzIB7^wX!ASy3#E>9QyU zb(u$N!9vjI|9$zu{3y`7`c!i*^XElHGTFQ-BZ&b4tDR}s;P20)JH zw)W+$^JL)FHK;ojW66MzW@yFKR8=Jb{Di?r^2w7cn3$OV6kW!;0fB*mD25inAg!vW z7Y2$G`G3yQG5|rP9PsgIs0Ps$gP0g?X=&-dOGr(~KR|1Pfjk>3z5>q0URc-rP)648 z1*3UXR`w_RSo{3^jEu$147*rXU7`N7u_RLRk!uBYCqbYxp8Qp64ry+~-ujfynENhY zg$gKG{B1S<=05(dVENlf?sfB7;qU^Skt;s&AEyj6#nQ zrf_^fSR^0cL@slCc?$%z2rWqgCHe^7`7d85ugTaZWZpHW#7XVF^4&cCWt5_LV2coe z!8FNcDM5G==NI^v7rNFF=wTPQmVZLQ6X(prVYUI@n7@u$yd;gxEy~fX9K>Wo-%KF6 zuo7#2>HC$`-kS&clYmD>?z@PCA^=yJc8n+MqQ%Qd>GlEMt$q=`%IWCWP3js&9@)Gb z&*3Y%r%DMvC+ck6k?8EGyf+ofF}e_d3pJIs4>Gp7+Gor zSJQ=0sJ{b5)yAHJ)P>An%K!*?S#@=Cs6?=k3g>mkOr^wEC&z~(No##M8c<$^fG8my z#C9mJ4^=$h`hjn)p_pIvU+kjRGjNFc6Sk ze;8?A`0v>L|IiDBWoYMgF-|Qvpa3~iA84n+eOgpaLTMlpuq^n%fHc7tFMlHLgLSk$ zkcR9#gt&yZ`+Vy==1X}o_9HR&`S;(mH6M(tB0bM;qPlYk5YRw?fpLzQxEZoV?D`_4 zA4JzQVlOTdi|<{w7p4UPHK-}6hO<6>>W*CEfAm-Qw7)q5i-iC*A+-`t*z}%>V+L&5 zA20~u!|3}ANY}9a!L98 zr6r-X{!FhEhx)@&?)~%QBu_S;(mczs_;1w-)q=kjc!`vb&R?#alw^8R!shiORe~O* z*Rpr+;DVHkFx9+NOXjxC5P3FVY3w&H48|y?YRo}Y%6RYIOHe!*tPEF@NcFBRt%gG~ zQqAWrBU2{W<9oNZ;Xi8@aMu*Qrh~9TV?qnm7#iXw?|$J6zC|N58U}}gF+z}DPh5g! zYPMD#lBl=>V(H!OO*LK|Zr>aE!%WdrS2_q?UKfN*Zo^Sw9B!@$IobLcbbH5PU9?V- zv$2QYD%lcvmmH1f^c+ovcV-)7=bEc65BCnvJ#b2*Z;RVCKsI?_3-^=~q8?zY%Lzp^ zbv!6c^(yONnz%kY5nP$!<5bS#$*)zM)fK43Z;NxX`QvUd?E%sHP408>5Hz5y$Q`|1 zM2!>F%QeDu|K(T7_byv8YMNI?c9!@h$Dw*Z4!2W zT>a)FGAuxU=~9ql;|$$r?LK`Y72?T=#YLu&^@sG`8H<4!4?B9tG9%304Jgtn@CXW% z5{3qil{G@kqRj4p$myAxR=1^ais8_wCtTnpFfb zGef`nsZ+3=rRiE+pMld$wIiJiVkH^iC)gbxD*MuU@NmGqXlH8mFU5>jFPZrv-vcL2 zI(PpJ3=2s7DMcLFLFB~tgkSV~jrqHb8XT_UMM}Mt!M1cpA>QJjzseA1PojDgQe!0u zh&gZUBJ>}iN_RGta70N2^|1<&!5IEpJZpU{ON)&6s%R7~t=IZp?pNI}R!6@n7%~J@ zk_DpPsHi9e;nnK;J_C?@MTX~S&F{{C-8_xxLyL2IF8~R*sI_6RGf9B}E2*4E4mPP2 zW|HIfXc5Q6Ws3un6lh2QuY@WuQTPv13e>YhsrGzj>bt}A(#*n52(U)G{}4K&%r{$@ z^C0o;HB%ZRbc9ge?zzNy24~Qz93_ypOZ+-o`{vA5=rZ=I6%zAvTGp!`v4Oz9U4i0- z{IiI@&NtZDlpEvEE;-q|3h5aMy&rl205>5uKjlekT$~^D{A7O_$8NFX(KkO{E@^gX zKH3dS|8#_dyalePs!^7*yF06T*EzDa`pZB5(rVXd5_A}7mf%8UkQ655)GlGlPCQzs znsc?=flNrR9Nq=e{)VZqLjY03hYZc>V>iMp0}EqvSzK2=gK!&)YR*fuu?wn(`X4r5 z`s?0|UH>L}I&`XvFThHH|4?#1jc7|G3IsRJS

    wudzj9#P>o)c6^0>So(=Om-lo3n4_n&rP8Gh8(DlmIJO;$tER{E#A$ zOaJj3`DLsGYQElAyeFdYv`aPjf$E=x+`0XYJq73;yh~*BKGAzr1-;hFGaOr4&*NZi zKe<2=*X$i%+ov}n^Nxhu zDc)2dJcI(KSzBA>^%*Y2Z~XljOO5Z$pWKmVE-Dt^^R2#FDbZD>zK@@iN1QdmagoMX z#Un6qe6}$1qkR8~pqgeF+@70jW_w{lt@Wj|+{n*gJXcrBw7;%3bPlLK+&rYYbnM2f z&-H2Ab@(``cZB0&=`UM_ywc8=QfJ>~gVkv{b8{N4-@7Brm446L^^Ga$Y5j6esgs|< zJU=h~v3_(vdi=IbS*0sxNiSDMjIeulA@(d|*~(fjkH_dXC^ol8LWC%6cBF)0ZH)s% z7`^OjAM(wIfi^PP^%C;7Z=uk%Gn~Bnp+edf7u!sH6b-L*rHT|af!B*-IyK9SzBpZG@l_9e z7u6!d*XwJNlk}oZWF>dT=-`hCfk4cGsG^vz%)-|DdgQ5p(~2eJ*$v!#emi{hg=tQZ zKmQnil9&D{C#3&X{CT-6kxpiRa5~k69A@f$GC!&2&P=`cq`je3^K>MN220A7E+9_rp=3TyocEEPjBuR;|?W>@%fo^v+C6 z`~71CE)M37T%6Ufn6i^cH;Y)W$)&8^a=xd}%)tu}bS8%_R0=tx)#gI?B82KV{Ok*07)R%v152N0o6E`H*jv znZ>VTrdlyIKVTLM52|prg&&a5JfO=Km6;ji*+;q55cFp0zd6!f$=k=*ZePwar<`_VW(r1fDmW#tBrV(=a zrqzC8XUR?R+S)^&=c=OJIE_<{xME59+OzXiS@X|mua?!nW`{K~SsCZ?wYbtRB{ut^ zq+9hl8WE%YkoHRow6QW9kMojVOvRVE>@qFJIT_&R;%?b}W^C~EY*ux%r(Buu?_Ve@ zuMsxE8l#Wp3BSmuO8%n{u$oVgQ|~5mooFE?GqYrJa!l2D>z%; zT*f;zM)nzZ)|9fd#NK4Y)`3<-I5wgyC*m`-c+R>H3CNvcx>A()<9(7NeYKZzwep;$ zum7314IGA)5}%@5*}CoU^3&2Mg)c9dY*I^%JIv{(c8^LkH}BG4itcd$ZJM%PTK8a- zNz)T|V_VVh-&u9_cBrYpP2}@-4_*#SdSy;W|BYebpq^9aYU1vbc%7V=TFWLS^~oi? zO0{>{*l}F{R-xqfeeL~ujn&a8Lg0p{Z;;sU#kLm)Uaqs6^V+VEM^UFVyQCBG*>e*1 zc@{$!;Ik-jb8lh#N^obLunVqMUdO`7%=3F` z9PxM!-KkSuWus5xzGQ3uW9am=;gF^@o7&>o%xxK&9V4W~yEcbmbJHz{qrh*cr{M0` z;SN7JQ&GsWxJTDdyqrGGd^4-CO`z@l9OO^-2-=CtSm-e&Ro_yTf3mbJ4wCi%!@X7e zIrR4FL9%BJ?mmCNXm4E`NKG&Dj*5q@cIxshnV5H}b=+piZgsg#I;ZeG)fhwRU6(by&^&jbHIrE(HRV04~6Xww|yWctOMg8&}@+Wra_`QnvsPiKi zoB8F(Oy+v_ma|=i+9cjetxPepGoO`;3aFvO=@J0pC^TL`Xwa|jDNyiXhk!IJm`K{C zw8=UNG+h%euWJv#_DdUUaC9vG&}U)h+KIWddAWR1ocJuhAgK*8( zC{Ke_bw`}lamHeL@nS%2w#&Qd{5!&sf6V@PgM|P_zSgUzG6Asay5_H0{pq2~cm|~5 z<3`zN=FxH@FNCx5gieRF47}#fI1FyxzF!H*+Qdn!^};ZjC@9vXeJLX-pWFE^ylL>} z0sW#MZv1SXqA5;%fXeyajakm$QIXLDK=9`_?^Z zsziA@$K|Py4$}ZIE{)2jW{MpSE`qd{MoileB)lY)Z!OmbA)RNLdrtE}8joG0Okiwu zuw~jgNzkd0TRy{XSeX*^+4WvytL(a!>ZzYIo(HvkJ)^vXM8(-QOCS6gyrNq zp7deU|L$T3CG#hPM<%WdMh_Ne$!C7Yy4YU5($|m~!S0p!s00UCQO!~_G9U}F z5ukg;Py)z~UL%jVpMxJ0wBCL}RP?k@yL|^iJEBpb+g#lK&QIKW)7^q+T7 z&zXs)ZLMm$XVtd7vE8g;G;Fy%PI)nw!@kc%C)FWcX&N!Hx!i0|tLlEKf7Oa|Y0F>r z!}0Okoty#+|5~=D%3!j9!!b7httX{LycUaAGcA(ras@AAyai*_iQh;ST}aIPaS9e6 z3h)K{DBU51$s&n4#``Zm-54?{3^k%Iuy$7|F9X>p5dz%-k++eOIC?p}3v)#gY z1t3e7Xe=!|XD|zr)Br0ALpwwsT+Vc|OGlicwnIhs7TcWS_WQd#Y8J?ir*2m@w6jX8 zex$8>@C@_u+cyvXRCUm1!rg`Z=>fTbeMg!N<9^7%t& z#_NMUw6v6O(VTu9@+axmQmUA^vHG~r5*Qy+GjdbCv2WH?9j;7K2-O9>Rs!i#e-b%z zEpP7!$iVR0A}enc+>ss#UA6rC9;YJxWKzakn0H}4UaM3$TCcQOC*?j9Iixu~$IhpS zl752NQ0v?THl3CDOGO!7I$D5eGX*4vUgJj1^F-k_-6b zWDyw=Ls_5ub1cx7Mf_ev%#Q3CqM8jr7NBcpKfH zG?!9|e}0Hmiw{{{a4mni-}_&qQ{(jFe-fWV`zx3ERR-H%J4Z)yki8Ry6}B`d?-ECbNZks=wINx zi2(xx!-xEQ{-fNU`SG~{Ytc|>TyiR_VN`zqJ}1M@;mQ?V0gzojm`6Jg`Y{!il)k;h zg8q7FM+QJP;_u7DrXLZEh74yQUirhuAP54=ixB7qx&BYL9*S|B;oUW6`A<;y#)1H^ z>x;^4;D0A$HS}&heahl-vTp({z73t7#I(ZplCrWmusugxPw%sv34G?{oVry$AeBKo zwSeZaRMyVvdknl7%8dfiJKLEWUSBU6p2UUd0Ng?>x`vFpu8Kj^p$qfKy?3&gW@($g zeKXjYZnac#TK-i!Snb3G0|`W$KS(xMvgQ{Zq=F6*dF`t)_vsOVUfhXBpfs+4C<)q0 zrl+R^U@MC%jy&Lx17T?OVf1A|Su@=l-3Ss1*zf@DZh@BXT5H8BbQ1-1p9iUP_w@8Q zS1RY|c%FQ}Se|*@NTDA9iewPdz(xY(YMDwluCHGJTQUjMI|Z=JFRQfI!_VC>lo=5T)5wRmLVhOf%$t|zVKa&UjLBaa z7#vvixwwrHk$?BvdvRnX`+^>Ay=did@4tFx@!t($|9B%H_vFcY?e6EW&j;Fs4W=o& z-K&Of9iiOOrE8kgXNGR<0%{COev1UT4goec{5-8vDIoW~4$PMV?Y+Idd0_LD8j$j! 
zLN8P?v*GHL;kv!>(Czei7nLdSc%HbS`+;x@X`mz3ceZ}hLF&#nhP6ua+JcEe2fM%w zYtuZmz@weImWnyB-APZTvNY^Ju~K*Hfl4KqOt~HvpMh3-=Raxws@mF%J2y-IGbRNJ zDLdDy@)1XB%g@{D4W|ta4K@c`rsbK`q@*t^Dg>b&5~PXbAc~N_bLUQU&jYFCh>zk( zO5O#9kHlWC3qBg<^%**rAAyYkA>la$c38jyZGyjc_w|1y(x_Ae4B1Pr`viI#(M~8t z5-2(lMf|L^GbvA9g{>eW5)%WV#~W=fM>mCtjER}vv4?vlC|Im?+vf$vIet_-|O_`MM_WUzpnRl;c|2<(NnAC#SmupWB%u(3^vJn5qR^a$t!v99F zYIpx_jLw=8fFR>K;ZiK=+jK zGi>lfwTL3Ags53Vxq@Fry_4V~7@IR+Q zWtdk}-OhuG7Fk&3pYE7uCllY{?fxeIK$^sjUAYPuQhhu+wvce7Dfc6DdGntiA zfjg9i*5V@1QxAVu3lP_Jq=?}od$1Qx7l`@>Qc3PzKxYMps}y#B0Ue#@r)Q)rDk&kL zk(_{_a2xH9@Hj91q?%cjBlP_iP>i+?~8BvP> diff --git a/doc/source/_static/rplot-seaborn-example3b.png b/doc/source/_static/rplot-seaborn-example3b.png deleted file mode 100644 index 4bfbac574ef29bb9f0dc3cc6df9714f673d49498..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 31101 zcmb@u1z41Czb-m7(#_CaQUfU6jUu4}(%miL&>*ESv?3uW-7O&m>V2n2$!rmFZD0zs)n{=&ut|I?=d zy9@q6_mEe6iVc4FV_Urf|Gwj#MhDd&^Lp{r3hGJUIhmItorf8 z!2kAgQN`ZgUjAZ)K?|~lB^vt>o_=|{*W2i|mG}co-r&JcKewq;eW-{F?afhvj1v82 z7sE~PXOA_DqxI1{TPls?Y)xpnwH|BVmP4@UgBSO(h7X@LdUCOAynjU{WV7_cv7$Sc zPJ$5|b+_#{@alXxsAprm;NWucmR{!OAWqxRFq-XQ^2g$09Ptc-w(A{N_emW+rA$L@ zva8=BZ5@^7UHk25giA@9(jAAxGK6Z0FsnYI(q~nu-@kuHE(G~KV`Go6c3gu(yOtpk z|I3|v_|F6tgJTgPA=04hEmK3O9(UT^hE;;*vz6!+=>YKve42=4Hj#m^U-$c1;yz|& zp-@C)JWS^4I6GLq+^>J|iJk6SsP#ay`N{V8hMS8mZ5^HP;^MoKm)kXkWo6yf)&p8P zIvAO2Rc_1j>KWqXP2M}5Pc)@4Vprqez6~$<5J>Ia)Up@!-JnPHfe$#7?WVjZkB zCFSKddZq#Q58;wd3(Y>;?ICF34O3-?z0zRAOls#DMzDFhjoiD*8-VS(pY5Bge~_jF z_7dao`s0&5B)B6Ki$z9;L2}q~sXfHb&JF_EUTC2y(Je!T;L(Ybk#cB!xw*bXfw(pw zU^biZ*E!E(%4V#T6c=|k`y4_b)jTW`5)xxmVRuNgFY1DB19=SVN+&I;sHkQt%@ro9 zXnU96wOz4EpDm%48rBo&mXJf14$rl;wL`$MP(xtuD~eXu*2_QplQ_A!Hac7pxjCL< zgg4iVw{)&8#}ws_n|H}5DbXO_yK}Frs|6)b=R6XGx$oYES@z(~1)R<|Tx^yclwK#5e9l(<6=~j3E7XaI?XqdOL%X$H|@1XzgY0Ws%SZ(hR9Y-tHhF`Gi}=Hr)RiB zE#s}l{xndr=<_Na-Z`*9BMfKosLpCE>d+-#QES|=DQ=pjpy_bhmUlUaJP~h>S z=G{9qhzWcdL-KgyiF=S#R|J07OAOroHJMwgVBZR}^^vTz{VqZj$mPk7g0nNPax5*? zY29n@yjh=-v0v{@>8Z--z!ht3ha(pEdU@;TP%EaR&kxruZ?4YgeTS~eW7;1N0y{>! 
zUo$ZAB_H}Y=?l988Pad%1|ag#`}`zs1_r()N&t*V-o?e`=cR88@;#gs{3ZzCb}LoA z^j)rjhctcFh!8Zap5rghsR!hl5AggpiwY{8zW+UB7z!IGDR^%mXJ5rMJR(f-kP@j zJ(vpr^;VB$k{n#jp3xko^y5cJiC#sl6}Hsjpb+%F0~tg%krf)#qv#b*B(F-X`V5jT z=3!IvMz3J}4rHe31;4P^Bbs#WJR(T{(<@s6Lh07*lS@fPXw> zI{y2w69ZqCWfa7!C}-5mUV1;gP)z(zsna<$NQ$5}&Th-;g5rxvkj<3Y?gish38K@8 zxS)TW;s%*b-Z$!j8nHa~C@YQ`awA@&LcuLQ%7!RoGbtu<2LBJgL_*zuvbPxLbNCwt zqLCpUF6Qyu3>*mjXjG*G`p`-bnPiPRQD?X-nqTHUd?!z)1ZiK5lZn0DYh$1hc7j;q z&DJ?Te2RQuZb%qL<;$0-r6w)nf@9+TXE45|Z9z!;C;RbA5Y_%)XTgpx_sW0%`PUJP zs*goQT>l(H@|TW7%Cr;Tmdfu={hK&o;rL^0CCj5ZDwe~UQpl~7KL1GyQcLal2Y2zZ zJL~!uKh)j{lT2zfej?U9I(^iF{JVHd@`5THvMNddsRa2R>99mf#9fhuy`}m!;@98P z(po>*c#4+=EP#(C6nc;4#kffv$OkP>IIw8%>P zzQhY`V?btt-PTJNWJdni{yi{&2&JmqIXIwN;yrrwXk_d7pBH|Rt>mgD5#fZx9H)31 z=3PpSn}qd|{*W!FMibHkqIb$Ydy<8q+W@EU)fgaIDsz%O(0zfl2=y^#Gw^jFl|Ufr zLzvoRS)30dWmY{6NJneKJ|*!vvleG~+OuS?@1^_x{kt!3ONT(uGeK$>C#Nr=gwn4r zE__uJ!$9y_ZQLr~+P8IZNV{k@)hMVPp8fZMx^S3)3mE*pfFEy5!YAUCA)BtW`;Li| zlHfLDa>56}`X7GAa2fmE832Z|H!e{0{|H)gO`{Ye}ut>1=+RTzn0iN_*CMf@9!T7!cr zLszBE)xe95g38&QfA2*meC5v7D#NYpqeu9lU@-C9xzA(TD!D28V1*HUPNDmy_G)ao@F>Gjst^`%dRchQ8~)ZaRFH^b#bcF0m`b!VeDQX* zBOE0qW<*}#rrNntZvr5LR^-49*Pq9UBka0>SEGL|)qM4501i$oJNBUi#BU?-U1h+f z_p9tzhORB-GUxr8iQnnuFTy&Z)RIhRsLX#bXA5?_|58@%GmIfQ&%#NvlMt8>zy zu=Myn_}Z7p9tBynp8ZaDt#n^iKJ%Au2JV7%tN?j!D0MT_Stv6+6KmMLf0snfoUEad z-yUTfROu>eYIg#UOH4af50+lET^d(6%BvcwRU_Cdu!B?|qv;_65ZFvD?>-3){Cd8; z)Da5b{k&G`fBq+BR*5v5Et@r`RUmlt0`f9ZJ|sF`6r11bW<4h^yA^z3(4~GARKj5z z1q#~Q#0X^;BR;+AMg`S43?-`a9>}+*$#%B=dpfGB%^(J1LMlNwRJh0HjB_ydkiQ$W zG;W|f$Z2e(2G(y=)sy^Q!%_Ys`4uuWD%K`NIio8GZlJ>FpVvW z{70@k0=W)s$i>md*`EP^_wh#}a@u5|*u*p95c%x*zdLquEis0UT;Qkc#Ls*6Sa6+>=&F~<-a!X=;)u=ts3eeugj>y}{ixX*t z6Na%uAk#zbzi?GKrG9mWB=Nm3b!wN;RHkHQQ+#NpB*)0AkW9s@6jBNcs~8d`&`M*V z{J|=4e6Ogeun-%fWn@HqECnLN7sh_UG=CCQIbz!37h88nX^6B9OybZuw%k^nSag;c z)7T#f`pZ#e7seKn^|X5!MXo8%E=|$|0oOx;3`-uWPn?20J|?f4xP0&$JIrcd601C# zBfFP>MJp9I^6Nx`?yNt4CR!AhHaS7!kBK6g`>O7~48_LMeP6@bm}q@udJ!+%8AJ8=kBB=L%FxO_-NUT{9!%XGf=p;-ds4Y zdy1=PN>b_T>j#YTj}6z(E$v|?MVMWLg}D2aI=!^~2|N5Spf)Uo)@ zm0Q63nRi80M#T8iHPam?ZRJpruzSZ>cl8R8w2` z%Mge#Q&rYVBu*q#9f}S!v_Z{(UK#bPdVtUIxVGD(;|TlLH^Pm?NRdSlXl7XPL^1y(q#_vz*%c7-W0P)m&@Q73kF+THdwtdeEI z2M(y1Xw4`^lkUnb!J`=72;S#QU^hm#J{`-+lCWHv5VpxQUYP9h^+4sb)3~^~aOAV` zv>&%0P)>2=8|0grZt6m4glzD38@KqbwryoX8L`hm_w%~BS+emC^oWpJkqej9(q?Yc*;(N>nLhAtbXsoa*H>A)F0%AtCR) zQ5OuDHko<-Y?bq@FeKQdLdDx#So-?t352X&)v;y%ZVPv>V9yfn18nqPsyLDmH{P)& z@*XYiosN!baoW2c9XofFq~hd;6?zY0R2b^isnO9Y#X|VT_$E==3zj1r@b&c2x8FZ* zJr3v^*ZmZl7@JDjdVD@`zV*&TfBR%U2vI&Kt&693TBXgSmB7eldubrb7z6XM+ z%LXAsD7en~D3|>_W=(aji?PV$4p0Y>fK!B>sUb_0ZD&e9Q}{3{CH9(0`P;4pZ3a__ zkELQ)nOOwoEne#_;c$QmT%DFNVH2g5Nd8`QHdw)$k5*cT!NV6OF&OzQNRqI{Bv@{^%W1-^DTQAx=v<77h>UIX zz^xreZ8Lzo=UK_AOAxJ&*UxTB(=S*I{=op+p|5DW5x>jL?f$cmc+O`nqbzpoF~ks5 zQjlZ%8Bk|JeclU1MAG|m#?G_n*2c4TO)ja_4!p>c<&0E)tgM>o-Yk!AkXNutO!ljA zt*+K$+%H1t0k}c_{ogIfG&v+;RzC(o!@5tPD%oEIO+De<3mnRx@BT2v@P3vQnY>h_?ZNtb9gKQ2tn#vki zuyNx&_Y#L0aS~ButBLylhzSMzRpO@bCc6TTH8X@f&KYK`KXl0&GB7KBlwOo`F$(oY zPOQte613q%P~<4eNl(I9>2NEWR^|eqPWnuesF&rVahZ8V@(jSBc-nEblX-s)35nt% zgCJ<0X}?da7K}5Ah=>r(vt1j*SUV`6%Ak|=Fz00K>y+>_W* zbIr!g$zx>G>4)HZPxBF6*d`p0-Yi^SYqr__2e#3Db6D`%LOsj;_&mn`>ugv)CXaRq zXC1vc6;CyFW#YVA48Yc?UD~d_K|@Fl5F3!hJ3&iN5BZDbug9KVq_H3D+rJy|$7Ynr74H*3y8Z|XlyhQPMsHUdKBB8D z;>0<6ndW3JT&K}U^h2F6|3{=v5NQR*{y}j8m447TF_NQ=k0~tSX;F8<3C& z-;0qV+raJbqkbD~EjN#hjp$XOy%ogS()fZ*1%ghYNd zp7Gi@8uWaTiH3vq16JM_A`nh6BZ#Nn`^kwA(eaVgOc1*=<@gOe3q{HpvZyP$Y43~1 ze1+mACy7Ns%g~gF`?^BN_M6$CpZ$du6}`hjHy*Gax8H+QcvNHJETTmy>SRiifBINp z=^GC96>N^;%DQB5|IKS2?PM0c650xl8u0doow)5h9i6r4bII;drv|tppEBi-1oJ54 
z>pbgrK`1&DonARqRhd2ScWc-VFV14@&y$zK zi4!NP)y+2L&z?5FBhUQm^e3E%%sMP>uOnQs*35Ol;jlVeRu@VZ9wKmzTCtPqMN3IA zV)XM?T&|2ZDdU&i2*XPg4%v!T^O2IZRJn?8c1OALkMwGd)4YJQgDRGNvmqCw@=>~Q z$FkcJsC~aYd`AM1$Vyh9DC=^M2<+JZjUpVu7yQE9CYJO;Ii7r~; z3*IZuv=O!EBuX^UXOJgA zsQsHsN(7PK_g=O2@ zd^gEDV(vX&3ZA%?!4Wih`qZ(Ct~Pl&z0U6 z9kkmCbVAqG^vE!Q%TMw#d~5O-y*;LyXt*vcI?XBXAXM8uw|l(`iB^jP$3>AO-Y7sb zNgUT-7~20awCHnKsy0LBZ&K#)91OV(6k@nZ#$Bh@J=)eYOFO>ke4{hX$0_0%TwI$b z7TIIXRPunq!ZVNA$Mru@$5ChX4=^IjNb8a~+w`XSI@W>x3u|-*641ZQl6LMY zOp0nxVrR6=Ernd<#~a*Ld2?_Z0F|0xFcVSPBp?|4Tv<7@(;EMYi5t&XF2lAy1${jQ z!o?}?DwNl~$U3Ld{H3W4HTZRxhry$RUu#xh$}*k}Gw?N$RS1KE?C{~{bruB+ZzKyE zpI$FR8eM`Rdhw>O%(rO(6`bVMVaa_hq>O)SA3T&DtLT(WG45m|B{=A7mF5YneNtZ9 zTfrvzh{THpFSxzv7De_jrHNywFACLE_GP`;N1rQZNonGkR-!cj&AysZ?(7bl?fug! zU_wPt3RaR0T{?zD7Uwn(-e8+8?_{YhZtQv@vSIhml|T>3svpa0=3m`La@krETGCOG zPwHW0rMFraAu0sRuUrl9uFh@;5nC9$^2j3g{TCHea?{;`HNrMMg)HIJa;fGIzX%@` z79VC8e~T;@XZSYho$WxRxQtBDh*@$>ZlVk>=nK9naW=5QNWSjtNZYB zz%Y29U}Hy9sukcuWq+7KQ8GF*LgZz1al?L}?|soPho^gJZ}DnQebP54OR$4HMn~(u zhc0HgTTU~1p{c5&HCJrth7%LOOTV}{`E!3t^2DeqW3Ve zepJkFAv8XhvHo1FDQ%Ztx`GExaq_f4Ko`7S%)cwZb!7J!DgGCd*PIznhR zY>r1Lf>M+bQ{FSKj+C~y!^uhJYWsUl!Q?a;f*_%D03JQV!LfDFY4RsZN9*s)(tid& zWOH{du^u7G!)4(OS!3SVjT|r%eQLsR4s+lZ83?l{#j0&mj($9US?6$KjvRZX`M>yMBE_*IUsHnC(R`}KYKJ&1-Tx-`$$w(@+ZHk{ZsTTPO zvx0bT7){9&vOnq_ChyXUJx31@53hX2v$;zuAT7ihoR{L3pDe?vl>Zx^*>w_ux6>^I`nsQ<~1_`$=6Oap=oftSv69>Y?|fzts%*BT}P zKzA>{AXw$^zhYPh=NHuQ$k4YrQ1PM01D zTF|~1Rv-H*qnvCWB(9BQDV(+4Udl8djXgX&Ta7!|t{GlwyV+_3TiBS43yZ--$<7T% zgFUySXqA`^u8_VVke$xA62~!U4GzmG*m5s4D%+)Zd`QwX`3hOH^L@%;98PN=&iIE5 zK&D zwF`X~k22Rrm+q|W7x~iiy~rx97Wmy?P6|)KLv?wtivYV-_B(^h(hN~VK97$oI3)ga z%_f^&=`LFQ#MmT4L0@-d=}rg3n~yNedX872*2f8_CUD;om-{RJEeyTG_mCfGR@XXHYmYR!2?FSHsS?B(ociZ9;c}__ z$I|!iVUCGon;-{^U=$a)AIh*az~A;|@%H8gj#PgomGs}M}U6GTW9FeRgqJ*AJBely5{87Q!%i$_X_yc9+aTN5>@e6*@oTiBwbh%)t;9 zt=LWkYmEs;$J5h`tqSCT`fSVe7DtX~e^iCRe6m;v3Ee)7M}ibPTSEI7tj~Bc)!VFy zK=%oKTXf&r)8wZpWai{JwNJI=*BkYG{m8ZePrN8p1aI{!7Ae-M>E-ZByBZ}xdgcuX z^qvrT3E=#~k&Gk`t%2qDg=lwGo7X}OjOk*zD!a_@cN8|c$&~c$M9Q!-jmw|bsSX3w zQ@U-YcE;Wc`SX#)#2&9)jfPWahC$bOgkd8kwXWR`QK|lN8V4;1m;GPQYFTrs97*=m8>0p*`yjwsvKglHc+Fv z*`&JzLpCxHgxET!Rc?$6)_#TbE`#;LW4mmzz9~fo)RpbKOyD9hA5gZv!5fMgbu-87 zgLdhVVlA%_e;(Pwz;m~$Yy9x`mBR^Z0M}5o?sW*Re%SJZJ%IMMNo+%HpY?4jwLHc0KKs3qNt+ed(si9i7AwizX!Lh2HQG33&cUc zxT%lTQ&?g>`1%>`S#T|i2N53QZcFGrPF+HNjn?vzERqS4d=jrMz~cK7_aB8aY5K`l4J zGP;*5xLZvnhYWb$lU|b)>cM+k?&m3zZ}4hVtQ|lyUsZWW&_NkllEsO`N1ssQ@_C6c zYm_7V=uM+^N#g>g&KRDSrf;amkcpK#?^hqtS5vnzSy@wOt%2KfXwNQ{BhifNT*5E> z)hnOv&lIAHjm3+>@lANe4B)^6M*H2IR}LcwDKJT_9Gq>afGBQ+@ZM-0)ywaxn4D3l za!Xl`^lIHWk2RVw-jg-`)MxhMV0;daWye{&%HrE3rQC0yi3#8Mw^@-gg(hGA*w;Ng zvVLbAu`^r03aUy7z!Ai~?&d#Ar|_c1s1yd(&hqoirekGCu?NI#7$E$p#60fQGX?gf zO!iwbDy6RFE-4$;B7+Q((zz}tPAJ||(zTAL(Qap|_gzs!y7<>fK#=RXY_CV5=wjHT z4~0(<2!S&+X8G)}5CXsYJyKR*54mjKvVpk%1Y-3wG#z_-`vAXeg2*3t3^bakFnm&> zVyrZl(X>14clx7VZcq|-7>w z%g=Uc!eLsBz)OO}X|^naZVom>)e>7$a?#k%JxZaSsO`Rhrv2FL{uT=hZ&r8gEcu$$ zOh5y*3iq7AX*0E%2tc$A)_n%CIX1TpHAANFpDmxDFIT3)+Ra0XR2h@uqTBp+v2L@YfwfvvoKt5WVL+|?v`Cld>K2vqG)Y7DCLFlMWMtW z>PeB8co1Q&a@Sx02aQm+g3|2XK8(f_`o%p(${AhyhXnNA1~4{M6!_ZwB?xu9w+(rn2&?Z8ALRaXV<>A%S=SJ|)^u_j97!;Pv=8@&tHHeQP{>TjY(OIX5yEM~fe2rSu! z(-f3QX%DmPJ4_yYxcS(iL#Np@u`MA5kV&GYN^0|a*oDs6W&p}Xm&XBX<7e!CpU7cs z>9nv-(m*tb%EVQzX;7T@f55(gr`arv?O$_jLOg>A=FEkb{|NYKd#v-S7`*kOrxp9v zb5Ii_>OMs$bce;ibJ0zQSebH$#Zej0*DF8`PMufsK-2WBG7C?h1lJOeNe#qN**Auo zQmNKc%@oDQ2XflFFMi_pi>^6HUE2z8wtH;e9*f_%UPyJ0<%x1lJ^y-0ORI0wC-8V! 
z&EFW?Mj8Dj$(5W}fpwmkq0n2&v!&3PW*;F>aXOoC?_V!>MOp%)A2N|UE;km1ryO5f zG#D)q$o!mbTBgZV(1R-Es~KMz>V6L%8+QR`(R+Te?m$h8_4E5sp(~DjGWk8%qZm0c z3ebu_L!ct;n^EZ1n)j7-Dt#73lyrlCcQ-<^QFWgMJb{=plz@Q;tazzLV+o9yVjn7t zCnQ;S>gBDu18YSHAb3Aneq=$Tc(S%<4Px=TxX0>BI)I=t=Z(Wh>(K^Fq3D|pMPzax;&mxC*mKP?W@ER zg%hhgvD#-Z68pYfC5hkHl`&*3Z3_?e{x0O_@B7@mZ65%tFprGhFYLH#qK@do1wgC) zG8pjV!Ztw7Bf@B5zUOwEqGx) zqy-M+!*OO=V2{~a^q-;tvH^KWX1oyn_|E2Y|ylC}BGJmrJ~#RK3awaQVQNYvIa2GphZ;0g(sO%*qXMD)#G&-YB6(njU&g#lyL zT$oX077@>y)6dP!`GAYEFOj01hvI(B6qwxudfS#;rVw#wYPfhvXesB}iq?H)g8r90 zLfhg`@q3`Gsmn>OB}91r0uLA6AQ8WT($pkPy8+ZV*4%5uVD$ zZefnB2E4vjDU^Qd`Nx$Aovxww}s- zV#{rTPBxSSJ#q*=?{jJDeFdxx#*aNadXDF{qW8ZJ`>fO@!No@`?1e)$WHpjM?jm+b z!Tiq5Y(DVAad@r`)>9pwb|eKZ_I?i?Z@jAj%^`JZdUE7NuyiS(BVbBY9!rfC43EH zi}>+&-ui z*YBztKuQ2zIq0f4$n1NybpxP=G#v@?mgsqgcS04_B3O&Yxf2H&xp1sf<#Z>!*q1L+ zK`)m>F^K;FR3oJnQkn4Uk`$WXRGyM#`7T*z0(YPSeNh{vf0*_qi;HOh>C~Gva%~CF zVPG_q8`>?gy+27f{un-8>8q#8A>l<=+Bk7^9@zMFav}CO56P1PXj_9()x&5nMRPnA z&8CC%kh_4H;B+jNB-ftDrs40ZPEU|o8tpW%9si+-&mx)3NKlY6bemmOj@YYX>a+*F z@N(LD-V3KA9uJKDL3jVhxrZRAe&ghHr;rZil|LUuN@DTJuiAA;m^>0(OB}m?>aVX0 zD^0&6U3C#p<~Uuh^yCRCV6%tbcbtMiOahK|pfU3U&6Wr`mxUvZL~>vaJlvFZToB4b z%Gv7ep#vfO8LvX>koZzeG5y#Fq~0?Kr~oH!cc^pUw|i!X1=fILirEqj3hxF73nIS< zB{{$AW;L*S((Zm##2uTK&qjc1+-xnxqtMaGIFo1CrDwwZ9LzOba^Ws;y_HfPAyNIF z)I)l!lw+vjn{jiA<{2h8?kog1-V1eCoyo6P!!>*O zj;|XqMw|Tob40M~IiPpcqn$TgTEgIsC*R?$Uqo>1hFJdO#H+(pU!gFmi8yO zSqhI9Bxo;0l_-aPh+7G%OF0#)7Q*iv=x>*iIVZSmeL{Km?t|mzy$!~BA?#JA`9PWF zZ6!v@7EX36sZ<-?K4G}Lb`EpwDkHY>C7aHqueNtNF(?y7b(dYd1nUDeyrN*^rM;zG z3IcVUSReu%y2zXaOgBD#!^xFk-{Cp`uAt!Y{YMRLfSh&kywWnzdA>o zpcH9vSYZSs=J5?wpC0J8F3X^Dz|X3y_jtv><$VXOMp%Zk6hcJ+suFeCA~1wi8PB=e zRD;jSThW_<^Wy1bFwD|Jz$nzDz1i_We1?6n90y}pY4rQb+dw5^J{l2h$Auq&XrPiu z(M#=BJ2}lEf`Lua1MNx}cL52X=kB-qa&%MCkthar+OX&dn-{*nN;vV9(FmI+X%ErZ zGZG0@SKMhMx797Jd=j$1xZca}hx|T4cmL@r75o1As+BbwtC(+qZiE^@`Zjf0b-d{P zeR44Y2-E{5cssou!5wX>>%|}=De_pyq+(Uzxt*?+@xxMWO8b~ie)gUPmbHU8P0j4$ zaxb>m!jpQB2j;>ne9;`FS^zRoX zmbf4qkq=g#49p|VTy^X3Gg7BbK#Lrj_@(|#$-L6wsL}v0gJJGbk$+HfE&)h_nO$RB zMT%g3uo~^iL_p!uG3!-Qs4;V>X&X6dAs2mSLP$@`N3-)~dyf)L)hWWv)$5l6~;qM#u~zWJj1chYyCtMu$@01qfrw2d7h#nW6SsBzhG0&V~pH|hu7eF z)C}JFAeei~qeeD_4ht7p$DmJ5?MeAh@3oWId`yNx#g0}jcA~LHV3-rvrRdawmA;Un*6@YFEv&AG%r$pBrXj%HVNtn{IIPvtUFs)KH z0x35>T)31B@|^*0Bx4k_u*=*lEP6NuNu3pJQh4Rvdi;27 z`QZNjM*#~Q1>--~c5)vA0wfRr@dt{jhF{5s5J;TN z4IC^52m%xmsoOqD+T^97fh^U@c{J$1+M{m#Wb!q zSruU28~Gn}_iqv)dD@@}xEwIOdC6VgfZ7JQ7TUVH5#hL$7UTJk{-1X=?bOYh!JrT@ z^X#y@$6oSR1;WXMvRzic&LER4ObqVB%=4|}KO;}=LqLpU|An`}A@djUS>a}$O zeSv`82qPr;U-a@(q_Bkg%>lQ_+km&*>HLc-?;QgqO&T!9`?rzW3*^Dg0@V|evg}gPQ_Fd_YF%vkqDR{pxnZ@t9oHz3z3@)3Fdl|zqH`Dx3j58IA{if4QB?>~oX}0bkNf8M}I}Q#G zl4sqgy0yRD)v#SX0D%CL@A{S@LHcj$k`wq6B6OxdUISbT(cLfe^Yj1F$rzAe4V&8i z*gEdCsbTvMcTjy-8Uc(vM#}HR?CTq@GeC2Nd2LQ;0OjZ>_K&u&?f)%Bk{tcdhcE zLI{{DCC!n_k7yiS=2(+=^NsTKwE#tRbt z6)qx@Wfzi;cEFPVP^+LCq{sprt6(kulGF(`iV19>4uxR3ku~4rPRQ{Zle&k>!Wv7~ zGxC#s8TZu^oIp}@`jvq)M(Puq4C7qWjS><~aIA2J!t%oohQ#)Rdn;q&cl+L7*r6+5 zWWbZ*QHX|@%n+?5X4Vkof3exteb(!EXpZNiNwl`vy!k46bzNF7E$iiA5q}X&sMM+5sDf)N^M`QjrdCuGseMHzoSC+JE z*x<(W+{0tpyUBg8<+RgDcom4oUx_0@!2hHhG4FVZ0kqx9;1Xl1rn!=rsLX(>WOo$7 z$(9+9zVF+TA`>L-y*(2GM$5D>=G?KaZq^UQfpPP_!`0EzfdkzOQs3LO+lULk#RLdM z8kuMrR|0f?%&x_}`0ByPnmx)bpG+m1x$91x$SFCTQ(aB6*LEvi-g069_#%}F7IXh_ z0XTkp28^Mm%)9i0=$19Uq4o4Bdd?{Jy8(>Iu7py)1U@?&*1zPr%=)Gd4&ASZZ$^Ej za3VtDU$Ix9YwNZj8dV74qA}e;PFdXRP)L>6UfiIC9eDsnFb9&9iJ%g~oHp{nM@lt; z@=%%X-6x1QSb)WRqgMk^v1$Y5ZrdVZ)BM8nuxQ3q*by)Bj$KuZhnLB=j5ZdS0+Rd) z$|e9O+qY*8C|@t8PNNe(BoT|{*Gw}X`u|+(qK(B9HI!7b6)a`+%OmTDv{(E?mGMWxS8)A<;!g8skhV2kQ7nLf=Nydtv7tfLo`I@_h_`9 
ztZq!&4KTdX@-3#v(wtg8OU3Mau=SCAk4jaXfn-`)ub{qm-{5gk#tG_AP)xcl2Anc+ zbK{belS9QlId#j8Uf&^Q|0K@$Z}CiSdgqpf0p{zg_Xv}=eoHCWS;DAt|6os%U*7>^ z@%VK-l|pR)NY2va+mAYc)o{=)27NMefQP2`=&8otgMYMWNgSFz_#$(`wT{y(Hq6sIi)#2)7Y&0FW=% zl75Q9`VRZc<)!VrSMr2GJ?pyt6MN>VuG%z4s2q@ui2d;j43tIYi9TU7RrNnO z+Ia;lnT)Dm@Wqw6-q1vWdhp2jw9SpoL=N=!8Fn&u z;RRTRv7b~g*1sMJZ_tMe03c-lOumoCELsfh4`Amnfr&yV(6ufd`yPPjIG6*G`d_IU z*HEf+4>>7jZJPY{Hk#raE4V+&6_dc@oXV1PHB5ODWK-YXFHS478vB(#_v7TPmRQjl zHWr_$Ec)Tn!vfh~j$}ZENYZOg2!O?Zz5E`>*EwVU9U^`MIlxjnLlu0Pjh?yI zWK;lyCLa7+_Lw<#W{cJFr2W)G_QSmPh_73E^ul4;Cul5bk?wjy2gJm4MfkPxB8%&{ z_hJno#q39$UZDc@KayIgKqwUkY8b#-nkO&=4B<1t<^awN^Z8?ilscbsVQ*AZ*E0wX zzc^!BI*#5y>ju1eJ&kw(ll0kp`R>~(1vJSy8dafL;?kwz8koP4dq;cZr{{CcoaWCA zeOA9K)&_j_S*yPoQb{886V> z`!*hrzP0@Trc*)I#enKd7l&ja24Djo^rjKZl^m76E(^T;Fp7#vyJr+fgy4N)*XX#* ztLIzkeYC+ZLv+>cGU+EAMsRZQ8h@&GFZyKWCYt7r76oC*dU{`pf0bT`On z6aXPVEbwx6(c-Vz(2F~`Wxo@520R+%&!36&B_MFL01=n0{qZwfy_ZG4c{2-AHsZj?Yb&QC>h=%c9i8oi9AU@M10-2)Qw%h3J&EQIsxKv-Ga%j8nd zr6wN|QWC<~lD&rIW!ppaKHR#c?Y18RB5X5#FhN#G*yxhEJs$=FhJUGae9S}Z!4n{Q zdG_QUDQ!v%pNGZ9Ita8spvW!@Wvs=B7%&oA%g3PEN=t% zUW@m#gp@=&a?gS)D_z7j;(O`IA<)M5(6}}edl}WE&VU5uHf{W3*;eL~3SF)p+_mP< zU%!^;Uo2ukjKGsHS~l-7nSXNH2ky9rb)d z002rB$T=~k1fYs;dohrQqafr8RjNGBc^F3ZP9`fCGL|WAKuhZRwqSFslX(~qZ zr8XYr-=af4r)tOX8`kk`RsqerqSRCwgguIR+gk$p7!1hB?d?v6pQqtu#-7X?Z3}OI z^;J~NDyz3@9}TBfXNjc215W(BPvgmxu$^onbm)UcXY6ndag?!Mhe$41{J`KZVSS+t zjegW0sn7Y+ z;OwE|z-^oE81o_D7Ncd%oN${VyV>~M@`WT+sU)irVg2YWk8^Qur;zvPm=_(M7LVZ6 z#P(Mgok{~`Z>&rQn?IE6o;Kohr>|4+^H24Zvp1=hsY89PCSPbiKl5a`-H$2h_2b(3W2zV*N6Y$EVVdV1_WJGH77qk2z`$GowfB_u3tajV^Q`D@DQ zz4;|oL;jFfbg6)Y=4>ww9jb@vS3HizgDK=Hq2*gZ!#$ZOi1t-bxCEh;dSk&x^z(7) zcQeT>l2}m0IoJd^@^oSlt;bZY*%4aD9_;+QyFLzN!4kmn`_u31C*}?!)L~Lo0xwUX zrwzGM=;qsX$ErMPK~LF=>=kzB$Gk$joSD2CHf{D2APFgH1q1|qqV;KDJDw)TX?z;| z{x)Qi*|Mfj)Cos7TCmsy8+|;r3g?|UR5APRwLAI!w+s~{*KfO5fMVz;IE`OurXA*H4N z>-^q3#vS+G_w)UrV*m&C-fOS*to6+2nRDWv3cPH#-1w>Y{X!6RhbVgcbk;1+`S2I? z6u?ulN*y5Y9RMmWu-#qbl{UW&D6djsWu-%M8e2MAk^Me+J5Q?_@tMp7rV_YJsTtm~A$43xdTVK?Tm+vF%86=lgU*Q$TWXM5jhaSU7e8Uj_OYJ5#y?KeMM^wRy{o) zdW{W)9-i4o{|Rd9HO((8v(+b{Oec=4sW}N%*si^)rO)+(A>77YR6lAvP3UTWm`0V= zFhz4MqLascr&+>*5v;a!f1n(psbb`-DySjKps=l3T~V6Y^iE6W6Jl#Hgr#Vbhb#)b zKE~OPT*}QB^PiB>E!lcKY%?*YIMEAFxcz==TXyBP5#GjLR$o~MU@KplY?|X`@w;uyFAbXk9}_8IT_`8l=_{b9_kOA$&phxYuYRq9F#*K^%S+fy7=nSyj>r6h5oU3b<5 zDQ3zB2P(o)aXnA5+c@OeM{(?uCX_2ZokZ)htJ+UT|6aBzdvUBB9XU9(U0ddYlLY_| z_@~pjH!=odVwx%o_b28cDqo7jetnGQ~)&oC?W)uXzLRr9RGBHnTk;SHDCP;+7gKQz3f&VQ+5YaJR(JuJ78ZBM6> z16GD^+%uc|uMAbNl>)i!g;le`H)1=ynyW6_6F-4=iAB=P7XT=9?vO!K;*!j{2A^WT ziJ!B7O_&QKtgD_mHq}YW%Rv$1#L%AE>^J(nedB#&x~;gc%5U!_V`hQ%Tp^2DXGVQ+ zQE0r{Qgn$U9-~?Z|3H%{tvixc;g6!+C=z!9Y&MJ@cZnmsPXiY%#(F*aRv4FYYW52Y zUhLWO3FR@_bC9$O&2&02&SaH$2HHof5m&#aMKma>E2~(|535ZzSq6Me2{g_j%W1A=c?dkxge3! 
zjG2?zPC?&nk1up*in7sO7t2rVOdGA6Vrc`H%2ulf1wN+Q?D#tW_G40}&*@|gTqgWn zW!7^mJmvat92$3ev+RMurt?9sI-)9mCu{!?&xiKR0lq?*&^>30wggS7t(D7qj9KwF z7;^vot$@>;fxwN9}0M_Qmtm&3^0lnJK(IzmE#>if8QBo$Quy|MMX}s zzkL2>j0{3R(`wXO474o6l>lx>E;vX{6K`dpC`4!B52S^fZ*1(TyhmPY66(dY)MM%Q1~yRJ%wOEO4$1tSk65@GnfH+%M+gys9CAiaBl;5S5id0Zfm@%7iQ z6rjCx1~Ay?RedIMWkLzoTxVC>2M1=bHH#sYgA(Gp9 z?jQ(-nwOuW&+_f8*x4MsPG_N?V#G_m6-}Sz0iIM2kN)xI}xJV@7vZ(46Qi zz8iu1&M=6mduR(2ix&{G$yfo(R+{kU>}W>A>FL~vwhDO`hWcp8lI8azUx#ng)8v&c z)qy)kA|YiY1Bp^Rm6;}~cNXf{Ymu2%(FS2n+|Nfr$gaDoEpD-(Pb6M@Kmyy)oNcdt z{Ti~VbIKA*^CusFVIOSI!maPA(g`~A`h(I@wmLKV(vGxakKP{R0cI_RD9bJk9_2wx zWvbghW34IjBCWV^Gg+;UEHc+}WjQC^&19}nl*mb&hHVp2mR27B9Y4J@*+FXx;#^c_ z!8hbywm5RmWZiI{LoGoLa41Q+c4Z0k(=_vI>t3qLAJf_L%!eVLguMFzdC#ks?q(nh z;do748{J!pbELA)O5zv(Qyf|VBNdre*hdlhNsj3x@Qa^?Kt8C^-7u9$qqe9tQP6$4 z;G_KeFz0nRG!k}y05r`4ZC1Ys5DN+4(zitK*^Y;#CF#jf)fLdih(0XXNJFVoRne{( zbi52PBP(}Wmwhi#kqU!f={*gUv+Vi4Zy7%ZUu_xm5*${ak6WPP`!$rZ^WG?QLCJIx z*O*(Zj+N#oi|C|$bvn{roOr7(G88^oC~t$#vRF!;besn$_l0cr8SqrNNYzdh(Z!!= z@@Qdw@{kmXJ8+2<*V9UGQ^hOkmdG$$|3IuX-KxcLZ1o$B^XkC^xez{Z9r{{_`zou| zEET%?!n=YaWtUFV&n{YO#`T{Ldn&SlfonY84`q^(*O2AMcs0hhvlP6oJ2&O^865en z9gjz)S_lVkIA;EcHJLayHRnm%Q(@I#xSXpSEmvR~R|@)DzsHQ3cG%8Si)CT4G~pOe zO|AKABNX%N`nDUc2oXW7tzbRF3)TBnF9mDQ+FE=i%t;cN}^5I+Sy9nx)>>E>Q32F zP=~lxM2@(#>^SvCdVhRn(U$=&X0{Jy4J^wdQN5*>&-Qqp%e5apUy;*R|3$z#x%sex zRCC6#Ip9mo^%X(PmJa?YQ8x!z>=8wwQzUc~1d@=^UG^r2^yRv2v|hKlT5k~5pPDPe z5~9T}8palLrSEjxCM2jmsj%1NhICz|jgZAZGv#Uo4=GQJ```0| zTnnyK2ILVEL4_`0n5~)Bb@3+-N9TZfRGf6enYSS3&Rp{6V{qQ!ddzlF*C=co>V~}) zL&iDaw%^N_3t9?mvJu1`I2al4D;v;vGvw7WaKa*oc1XoHnQZsh>Wru=SupW?8ta!T z77ME}4Z$Y8dGeaqbi2^IshDBpRx3he=f=Ffjj630I3a0$;bSz1-4^NT%&jB+Oqp?XC zaFCDVe3VV-vH1k1@jKHpF%PY6tYj5@EKP*7GSL}^vsg0J_SZUUD35bcryyB94r;1+ z5vj=}kL!E3*)NSBaH4b0Wu`-HW%Nm-}gd6u8B9h){K}6RD z`F*2{I)<5hEj%+#vTn(z4Ihc(y7GTFD!vlCTf>G=P9Wy<=A`E((C^Cz!e@zbfZP9+ z+6DCqOMq#kf|aF{Bbxn}ggKyAEs~1K$hjqnk$;7eQAv*bj5gQU7VthsRDdi_<8VtD;sG%UC9*Y10D@(%cM_6Osn~{vI{QP;5fdyxQ+KtE{rrK$p2y=!k{uSSs2vyKwAi3dH zFv$q7V}aJMZ%s{^JGnksIEZ{U=HsjI4X!+uAUD*qXwGOJ<66@W1x;gP(bd-txiH)M zx$~+ys!oDDt$U(w*;Dfjv{p72d~)H0^vG%Ix{AT^u0Hgpp!eX!LrIg*h?~O$(+WPA?@QCa-|$H zd;OWgvEJjWZ69TtCqjvmN#{oj0`{`e9*P4j=_e8M=nNiNh@v|PB!V5&1i9VdUsB_8 z0?vEpVs;zO;dG+gb<6XY<=_;8=Ad&6rfPsVqCDOuY3(!3bT zp0d`0^OCg1+n5zh@~Q?$tC9E@+5Qgrk*rxSIO41I@G|#W!SHf&*sqbN+3Rz@lunME z5>KS1m6m{vc2uodBhuj$_3Bv4@<*e2jvr&lfRmW8Epyz0+##Lb0b9D!;4}eWyj-TT zOsKig>!S8*3&GZU+tu*4S!MjWo+M&2k^5|{be$NLXVggEW<5#X%ZeB_AU0M`NlX4B z$?I3nS&?>?ybicM1}78@Dr4}7o)mk;P~t`Ffwn5M)_xVSdmcCx)(BXPUO=!G-jbeo zJz@m}IXeS3;1N@z-B&nx@`*EadDb&iBg$^hpB1&^+wgvvJLcHEu#aR(8SJCiXNt&= zeUy5Q-%|}xngyNS2yC)>*Fz+}EX0;|$EHF;3ehU{1jD$(M7XEM_-}H`M3jR{=?)uk zwA_Q%1Pqp5RMbXK+sgDi%|?nz`grZg&p*8fr%YCbPx{x;9gOZ(bFC2m%im^l$e~Xh zkB*H$2V=KG18K8#+FY%u2-qstR7#ab6m~T54xfvZ?t!)Ab}nUe3@c&CUX}8p0BDN% z$y?VNMqKw4i$5AosIp1mBieX^NenElD6TeIpA?~Ke7`Qv`)2(F-s(QRB1LoRY@Alt z5>R7fDsI?E&hWEqa_FO-cIQ@1p_PBO1s!fX1`ve*i2iTFLy_g*_9X4MOUM6!s_+{zz7EDnyM#(xlLxCKYOn)xgCx=bW~qB6Dw@t zMqrWe-3>}9a$uURUX;a8eUw-PA6hBc%K z;i&BOBnkQ5qtVhrSXB6+a`jhEO&AEQkQW5czQo@A2Z>756ob-) zzXp={e-lBrnIyim%YvM%8UI?b?sC_xq`|5Fpw{cCs5n897$n%mB-WYN(Nj;cn0Pmo zRy;Kz*Q?1H1QP`Vk~3>=o*_+dyza`%%0L&&8Yl~I16AL}&$ownn+#$^ZrM21)QP_b zZ(Cy(WlT9pj;(3hpOPM)7Z?}qa-1q(VZOR{m#nnM9Sqsl>>MCkM_N` z32HHld`VEHb&1yW&QZnM9CDtefss2*N!~g%!tD_;5nGj#KXUaVkYS^DWKYXz3vGpP zkzs1Whp!M@PsFia)l#d$T(M5mS$d>WMCn&@8M--k_f-J&_Hs97a*3pnCHaOZ8gu_i zddxJn+M(+Qy*2aH73m)<(gxq2*F$uFww2fjH5JpaSn5lc_wroL|Ja6JTUBO`r|e+s zm$&_E$>hYQx6tWg(;L|NSqAfFwvURXRv;v7Or7U+o|Go20i8{t>NM!+f2$WuA*M`; 
zL-^llB-jPr4ulaS_ZYDJ?A&b<2gur^_Xeu)%JI7l%$SyaU61k+Sy7%WfM0#ee#Hu3 zbH$W4->X)&^}_2{_N5Zm7U#A?3T?M<(48rnkR=+%$WMDuxG`$^pxcj@XSZY()|HzV zt%p79%vjA|o{#+~uucEXo!zf5n)a7DFDa?S8!dDLsoVLD@^gThSvLhrC;YqS_L19! znMs-`plK+5;XbST-6efSD%QEWYx->4@c8;{l=m!^MuadF;IW^}ShBV8iK1B5j1I}2 zk1ZV9v%Ys0Q4c2zj);ej+wUay{%VXu9xu=V zDzPJQ4ea#8Js^Iw21v3uUpHtALvWpypvG+glZX>r;9C9F?4awto!z*v>ov3+M>m>t zk@I5N)_&*5J((Bu=bfSOnWYyTa%yR0PRY^EAF&~-hH)9l{4SZ1V;eGaHp#6F^e8bS zg2D40D~hGSbtDb4*c#+mG6f=ldE`4~yVU!sM?Ex|Oi-weHpJhfdDEbQ69l-t-@QC_ z4Q*?Jd1u{soI+*1oCMK3yT&J`&-k{8R?IAFA zymQ6OQ%78nax>@tJqn!O!$#bgpNPiAuvZ#}(`+5-YiaNmcxoRhmQR?~z3^#7N(x-r zA3)XmFb#{Ni|A1Ycbf41nwxa=W+ktP{es&F^`}I&t~v7?k2`Xo;=6ZMIq#~T=JktG zyvT z_3)5V<=gva2TA`?N8I7}tZUgYa{BJW&vp?Ye^zR7=~WNjTP&PWO}jor9%DKp#U`zW(ci+a_g-|r39x-8WGdL8^*^k9Co{m9kd z^Kq3AiinBvRz_mX#jQrif>d)jU}uIX`t&IO?Y8b%s67J>W?Nm}uxk7BCnQf{A}n)r zron4Sc2x%PMB)z{+|4L0kNqMg>Do^2L?*Jr=%r4Qv(7lD%&g2CM5@Th4Ya)LaI0PzEN$;DUUk-S7XDG{ z7p2+4Mv}`#Vwe_l%0;rwrAMDVC6y_0k*qzK7$TkEG@lYJTb$i=95m?+y3-)j)4i1j z&8m1`wuaL-u5MAt7?3{Vnaj=$$j(DJ{PexCzdSiV@4fCfIwr?^JzoG{khRs5{ofQU z*F1Ec1uNXH`o5nwGHSU?tO-OE^-V(jXcLLI9JmP`KTOLiM&g3b60nMhzzVGAxx+K##k#T5KqSn>kOVg zKmBo;Mipmr$@n`5V-u9jEJ0&Pw*SNR!D3@lQyxSWfdmu~CO!xGu*vq_fN=*sKJXcIz^!lZQvjTRW! zWpa0&A73WdC1iLo+^>K!Q8n1gt^HavsT(3#4Q887$M8nN&(cwvar9U<(VZh1g0M6U*>o%3L_i9ZWu_- zJ<~w${mz%`U3MfPoiqK9Xd)d^x-KTFR9dcLjjUe0H5kr__BrQEB}lrkr^B{mTz_yG zmU6YQ1>$Aa${lk1$~Z^|%>b7W^m(YgfA*2FPKR^ECKN{elT~rx>$jA7}`k+eV z2FLzT+L~_q*P?3)ZK}~rVZBmU+ngdv{_!fK+dlsZv>xPz3aNjva8iqJPN`Z8r!FAo zUh49D=rMhqFSg}O*0bB(F3(@?#bCl_L_d_pFY@hP3GHG@PhtkAc0ai!$28V3-yS`6 zg4GIvh>e)9e%In6YYB1PGon9FpcVIY#FI}f{o#}gTs3NYSfcir`G*)q@MVXb$mN#O z20zy4bbgrr-e)}tAU)dE1-ux+18y@CF1Yz zQnjqB@&2QcDy+S{X|;1}0BR#3^Uzwh-xx;`Ive{*i&haCGJa@0y-qCZCGEQ1>h@Hf zgT#m@x2TT6(wjy8Z~J4pdjOPrlmGQ5_u;pF0`z{hlg*C}IjL0q9C4`;f93EDW7YcQ z0yT}U&(J`9<2^6U%V7teP96Bym!w!l5-NfH_Wc{xI1%N$O^u%Ai}oc|Vx6(Ahb=2J zQRDU`6NXV1#1yZ)bHt!e1)HmsX<>HgR}7PGIbaLyyUd^fa>_uiFGG$tXnd#e+qW-3 zFr}TsZ7~@weRvCzk`(J~y6Q9UG24C(#~(%Nd!M8I>$V996On1ec~ekd>7I)LVUHZ#j*(A>XgOBQ=~rI1Ta?z?_7t_F6mV9&@>dDK@{zWe9eq*1NKR5 zto)jm%ZdTEoj{8gHi}m@Zp7{1ldvwgZy-4~(O1K%^+`~fjK%_0dAx$Pl(!ljL8Q!> z!;~*D|0%Y8ClIaAF*b_!qc}$Z=xr#iynsdyP07s&Wo@FAzhbYYMtBjD0>o`ujJ#lI zr^l8^i27ctK5K1k1#A+1ZGq$iRmb*ue?796+Pf!dLrp-9$m(Mr6Fd73=r%z%2Apz9 za{h11q$SEZ>p%&A`?=9SL_jy~4(}&8wJ04trNpLdUs)7NJHFT(;=<^T<|d3P&pyIR z;-IC?!p|yzcvnZ&U;cu)e7B27p-QK+aFLwQW_1BvfNAQDtF-(l02Q~Y1nen??a@k( zjov3P4q=$nn^)3f`bJb&G|kI4XYxYI^LLFYlA#|5w-xx=%-riip6zqzbmd+_&G$#} zY`RhV1#74o*AOE;>{d)y$mhmJ^$ zW-Z4Rl! 
[base85-encoded GIT binary patch data omitted]
diff --git a/doc/source/_static/rplot-seaborn-example4.png b/doc/source/_static/rplot-seaborn-example4.png
deleted file mode 100644
index 8e08c7e86178aaf41db1917153f929faf14fcfa9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 55627
[base85-encoded binary patch data omitted]
diff --git a/doc/source/_static/rplot-seaborn-example6.png b/doc/source/_static/rplot-seaborn-example6.png
deleted file mode 100644
index 0fa56f4a018e75fd586f2a7f8fb96a54be7f20e1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 26604
[base85-encoded binary patch data omitted]
z>ot#fObq_ocy!~K5LXp-30sMFa@sH7U5efT!I#4glqvI*PGV-8_{@Hz8)%mr!Iu;$ zXeLl!Vwi&%7J#osf@Kw$n4=iJK(yK&(y$Wr?qXw|kC79H-m_SB`L2Sy66cNU1|9&S z2=w3*{z1)|Z7F$*v4k8yX|b(Hw7~I@ZJBzM(mdY_tX;}>ar+-2cODNrvreblFI>fe zIb%K7JNF%4LFXB(>WYRPg7=|i>X)?MSQvR1cJ=cBPo217llS`x$b!rwj{VMF*KAR8C`IzQ2 z?ZuYWxbgF;u(RL5OK)07R3{goUQJJUKZf!MU~ytXV=?pE2Z#Qc$tzoNGGBIs`q<8N zE@^LHIx^5QoV3Bl#MCr<`gz)}5{k6o&DLcuZ^o-qV}|{DG@(mXNyiw)LM?JCncHT| zYKxjhG@3*#KUXWQY0Us-S6c{HH{yL*1?6tR(bzb)bir;mDA)Aabm+W{AeG4mh4nXd z+=HzT49J7nEl#}zmcNVFXyS4Y4dpDefRa*5+-Eci7UuRa_pZC9o*5EYZ2CSCqy@q{ z)F?hEfE`S5;z=8$KtSnlbTES>%aa zx3MZ~5T7c$|4czap$_neAYd`20G26>3s3~%!<&oW9Aba{PfEELm)v>d)h&Kr3qRMh zxaF6hqH+sTc(c)aY?y2cgG#gZ2M))5HR!>H$^r=_u$l1#S$~NxBSzq)z?bF^}cVezE>Z39mohKV~;~J4XL< z^Eo}U{JiG2sJx6cB=Q=$1UuJ$JNgyZ5O<<#>g&%~5=Wkpi5FsqVW#l*P1|iYg^i8v zD&1?Vp&>U+8(Whr&hS!@MNsP5oM0HMaY)h^fB6S-_tx}nx8nB+{54wgyllkZR;Oco z@mop$uL)&ynu9_QzrE|0hd^-bQF{bLhIePhLs@y^z_l3)ZEx}LYR|RHLNO6u)yev8 zTW2{7m3-kYJfJ$uN^`taU#p}Wv&w=O6~6oRz5wurNJp?f1D+C6E><){ zT;_01X1sCWPYK6Y3IS+vW*tTJdH^fWmqvY38u!iz2%x7+m%Ycd{XafhUTqKwQUyRD z3oH>Y`DuK>!kkvztFlx>1)SN1dpa(!^J1GGBz8e+ehpxccxqsmpGq~9P)Eh5u@FG~ z?bb^q4B@vLq6CQW-8DhLL}y;o@I-6P&wm5xv~bI8^{5f+<5gPVz9Bs*ZS@}q=(fjD zsMy^_2Wb7mK@Ce!Zy5GveUXXXEH_cZZFaqQ^CF|YjxdTqXALyN5C+MBG$0-$$xQ_1 zS_`5Bu~m)Sa6vgOpmSu5h=_mymq!lp@{!~MsvGW_=_i75RJ~}fQ8t11%>Pu{r~j<9 z8lrJ%W*l%tBwBs@h_Vg=!^ZwcuMuki7ds{dL82W1#(xRW@J@D3a%yTsfZ+@j@>7nE zFu48)0R&qCsAiZxJSk``Cn%porqG%2W6)Yv zyBFqud2$}0-K|Hn=Rkhx*NSD2^6aCx1CHbp@Xi7-X*mTA1O)*?911M?D84>wKL?B-P9QBlGBWbeu>Ngps~E7m zKF-@;4mchGUM}&w;Rs#zHR)pz! zr*PscFkZj`On(&2Ahet6FTk!1wLpps95YH%QfNSxrlYS<;?vd}MZ(U*L)2la0T{EO zd|6L}gSr+2f!qZU7OjB>0_7?L6Q0SXr6;TVjV?D2{|RCKYHaKTZXxi!fdeKwtz>{t zjVbqakHj^4r1i?-4C?VfpZRmK*?M{kp_G`YZ@Bjl-%_Im%yW?YQn$8#03gOdeD){m z>D8OhpFg8Hdjotq$_X6;)HM@RQ=1=AKl{K26)0lb@kfW20z$SL@WBfV+4I1gRT4JHntM z+ME9O2u$SD23j}pJxAnTZL*^T%n%@w4+H9&b~rpSXiZY4+}X#!^HA@@t~RQVV6Mqt zNA;fAg8$FXbV3SzTS4>Z01{`EGcQMyky%ITGQvd|Z@d5_X;3yONU~5q&H|uJ|ND@V z&V4Z-M1B;oSy~({wQO!}0doQrsCWfv2*I0)@$r8(4Ip)R68K5FIqrmD&FN7n6(hht ze+2si90BWBqc5zs_5;cqc(v~wywNag>TaZk&OG((-eV_|a>mWZu0+PkRjs;Y= zqAsf;IHT&gWAB5Qm(V&=X4pbi1GejdD9NeIR!f_Q{s&*}lw|+_ diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 6334167b2c746..8f89cd6789f4f 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -943,7 +943,7 @@ Panel4D (Experimental) .. warning:: - In 0.19.0 ``Panel4D` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. + In 0.19.0 ``Panel4D`` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. ``Panel4D`` is a 4-Dimensional named container very much like a ``Panel``, but having 4 named dimensions. It is intended as a test bed for more N-Dimensional named @@ -1032,7 +1032,7 @@ PanelND (Experimental) .. warning:: - In 0.19.0 ``PanelND` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. + In 0.19.0 ``PanelND`` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. 
PanelND is a module with a set of factory functions to enable a user to construct N-dimensional named containers like Panel4D, with a custom set of axis labels. Thus a domain-specific container can easily be diff --git a/doc/source/io.rst b/doc/source/io.rst index 7917e6b4cdfce..35d6639d21269 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -487,13 +487,13 @@ worth trying. you can end up with column(s) with mixed dtypes. For example, .. ipython:: python - :okwarning: + :okwarning: - df = pd.DataFrame({'col_1':range(500000) + ['a', 'b'] + range(500000)}) - df.to_csv('foo') - mixed_df = pd.read_csv('foo') - mixed_df['col_1'].apply(type).value_counts() - mixed_df['col_1'].dtype + df = pd.DataFrame({'col_1':range(500000) + ['a', 'b'] + range(500000)}) + df.to_csv('foo') + mixed_df = pd.read_csv('foo') + mixed_df['col_1'].apply(type).value_counts() + mixed_df['col_1'].dtype will result with `mixed_df` containing an ``int`` dtype for certain chunks of the column, and ``str`` for others due to the mixed dtypes from the diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0edf52c7301ee..1d8d586f4f595 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -201,9 +201,6 @@ default of the index) in a DataFrame. from pandas.compat import StringIO -.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: - - :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 921f60b23d187..271fee6341324 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -1010,14 +1010,14 @@ def timedelta_range(start=None, end=None, periods=None, freq='D', Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) - Notes - ----- - 2 of start, end, or periods must be specified - Returns ------- rng : TimedeltaIndex + Notes + ----- + 2 of start, end, or periods must be specified. + To learn more about the frequency strings, please see `this link `__. """ From 0b2f1f4d7c80ced692c5007c04451b1c2cffcbdc Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 10 Aug 2016 18:12:11 -0400 Subject: [PATCH 245/359] CLN: Period cleanup related to array like meth removed unused / duplicated internal method returning array-likes. Also added some tests to guarantee existing API before fixing its dtype (see #13941). 
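For readers following along, here is a minimal sketch (not part of the patch itself) of the array-like behaviour the new tests pin down for ``PeriodIndex`` at this point in the series; the exact ordinals are illustrative and this reflects the pre-0.19 API, where ``.values`` still returns int64 ordinals and ``.asobject`` exists:

    import numpy as np
    import pandas as pd

    idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M')

    # .values and ._values still return the int64 ordinals (NaT -> iNaT)
    int_ordinals = idx.values
    assert int_ordinals.dtype == np.dtype('int64')

    # .asobject.values returns an object array of Period scalars; this is
    # also what _mpl_repr() now hands to matplotlib after this cleanup
    obj_values = idx.asobject.values
    assert obj_values.dtype == np.dtype(object)
    assert obj_values[0] == pd.Period('2011-01', freq='M')
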
Author: sinhrks Closes #13955 from sinhrks/object_array_cln and squashes the following commits: a75a718 [sinhrks] CLN: Period cleanup related to array like meth --- pandas/tseries/period.py | 10 +--- pandas/tseries/tests/test_period.py | 78 ++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 27 deletions(-) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index da8868bb2bd84..d92ebe18b5697 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -402,9 +402,6 @@ def asof_locs(self, where, mask): return result - def _array_values(self): - return self.asobject - @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): dtype = np.dtype(dtype) @@ -541,14 +538,9 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') - def _get_object_array(self): - freq = self.freq - return np.array([Period._from_ordinal(ordinal=x, freq=freq) - for x in self.values], copy=False) - def _mpl_repr(self): # how to represent ourselves to matplotlib - return self._get_object_array() + return self.asobject.values def equals(self, other): """ diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 17e6e36d52acd..d7f1a52612819 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1783,24 +1783,6 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') - def test_view(self): - idx = pd.PeriodIndex([], freq='M') - - exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) - tm.assert_numpy_array_equal(idx.asi8, exp) - - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') - - exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) - tm.assert_numpy_array_equal(idx.asi8, exp) - - exp = np.array([14975, -9223372036854775808], dtype=np.int64) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.view('i8'), exp) - tm.assert_numpy_array_equal(idx.asi8, exp) - def test_constructor_empty(self): idx = pd.PeriodIndex([], freq='M') tm.assertIsInstance(idx, PeriodIndex) @@ -1988,6 +1970,66 @@ def test_constructor_freq_combined(self): freq='25H') tm.assert_index_equal(pidx, expected) + def test_view_asi8(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + def test_values(self): + # ToDo: .values and .get_values() should return Period as object + # dtype array. 
._values shouldn't be changed + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + tm.assert_numpy_array_equal(idx._values, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + tm.assert_numpy_array_equal(idx._values, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + tm.assert_numpy_array_equal(idx._values, exp) + + def test_asobject_like(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + def test_is_(self): create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') From 257ac884d61a74990d1cb4d72c48b1c9003298d5 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 10 Aug 2016 18:36:49 -0400 Subject: [PATCH 246/359] ENH: Improve error message for repeated Stata categories closes #13923 Author: Kevin Sheppard Closes #13949 from bashtage/categorical-error-message and squashes the following commits: 0880d02 [Kevin Sheppard] ENH: Improve error message for repeated Stata categories --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/stata.py | 10 +++++++++- pandas/io/tests/data/stata15.dta | Bin 0 -> 3183 bytes pandas/io/tests/test_stata.py | 7 +++++++ 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 pandas/io/tests/data/stata15.dta diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 1d8d586f4f595..13dacf28b5988 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -394,6 +394,7 @@ Other enhancements - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) +- ``read_stata()`` and ``StataReader`` raise with a more explicit error message when reading Stata files with repeated value labels when ``convert_categoricals=True`` (:issue:`13923`) - ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`) - ``DataFrame.style`` will now show column level names (e.g. 
``DataFrame.columns.names``) (:issue:`13775`) - ``DataFrame`` has gained support to re-order the columns based on the values diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 59bc24acac6f8..a67fb6651edd0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1649,7 +1649,15 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, categories.append(value_label_dict[label][category]) else: categories.append(category) # Partially labeled - cat_data.categories = categories + try: + cat_data.categories = categories + except ValueError: + vc = Series(categories).value_counts() + repeats = list(vc.index[vc > 1]) + repeats = '\n' + '-' * 80 + '\n'.join(repeats) + msg = 'Value labels for column {0} are not unique. The ' \ + 'repeated labels are:\n{1}'.format(col, repeats) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) diff --git a/pandas/io/tests/data/stata15.dta b/pandas/io/tests/data/stata15.dta new file mode 100644 index 0000000000000000000000000000000000000000..d13e2fa337db39f73c2fc2a252126a1a73396180 GIT binary patch literal 3183 zcmc(iO^g&p6vy8JA;cJh9~VWS6GS1-FzgzUk4~#>R0w1f116e1tkpd=GsVxfv2M#2b3h;8*+tF2X?%qM{<|`mgTl*{^ujNq+shzFxh0 z?^RWgA15qfL)C-@euJ|rH-T@s!7bi8?~%0b^{we$>-9dcp|97kOt$)!!ju+k zLqYyK?&KXk`HS!5c_+Vj^1(TI`$Z@F=jG*^lW#luo|DnddAoMfuh?1rVP*siB_B#U zZ5{{LC;ZBcD%(XdEGghgLy0I>xuXB!nc0@x;>X$~Qp8E-IKR>@`ZaA@EQx~^>sGI+ z6Rqg&%jNn^Ott#bLwV^^{|0YIqub(F3KdwoO!-h2*P(pgM|W%<*wVj!pxgVuez887 zY1CvCGgFI5iKJ8VVy<|dNoVGduPn0p<2|ePP8=Y*_v!Bz+VbtiDprpldlU7iPEX)R zbfo=!*5BnE(b&1;wtdm@V^7%e{ZHlNd)&CO6M6l^u6^LO3z`0Rm)@G`DaO-`{^Z7e zboHC8|N6onCyV(Ddy07q`)?}xJ3ar_znK5ynfAMJgzxp=W0VXwhIEphC~G!l5o<$c;O* z^kfzwa`ko}>~kl~0=(Y_7K7#BKF|j?f=yr>coGbP5ZE1O&yob}1p0v!60B@9J~OIg45tDa1Go53+BLHungP-N?;?{ z0zyy+2J8l7;3e=fI14U=&2zB^upb-+C%{|aL+~Y-55Ft~OThiW2lkX61RSJR!uA@# zL6!MKK<)%hVE1ebSRZN72D^X(F|dA1!7vyByFmwxf<3_cbRXCcuvcjejDv&V5O@)s z1D}9T!Drxe@CEo5TmTorC2$2?1wVnG!7tz%u)hBd{0^>zKfn#}C-@8e4JH7=-#uUs zm<#3sn@IMSyAv!1cY!5fIk+3F0A;WmRKQwr7+isGLnf3?n~+t3ZK;|Zuac?6%2tRf z$T-I3Mj@|?s7aAlg4I(&k>Of$!nkH58;Cclt%j=5l2I*%9zi2o>KaL=Lr#*1I;9qq z!ggyh6|uU?UaF{Jwq_@*Ysl1y1f!XvHqGPPGdkA zR?W;wbQfzGj0%ntw=y(rbd(JXDfud@vk=YsR8#as92%J{^e|b+ zNrxYJ9_IL9BQdP8;z(n?SVJp>adiY3&r~SR%2+FFnody+1=1s(UaDAiUarnY zdrCk_2`o5&LUSy*G0c-(BM%ad*iSX9 Date: Thu, 11 Aug 2016 06:10:56 -0400 Subject: [PATCH 247/359] BLD: add tempita files to depends This change makes it so that if a `.pxi.in` file is modified, the appropriate module will be rebuilt. 
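For context, a rough sketch (not part of the patch) of the mechanism relied on here: distutils/setuptools treats an extension as out of date whenever any path listed in its ``depends`` argument is newer than the built module, so adding the generated-from ``.pxi.in`` templates to ``depends`` is enough to force a rebuild when a template changes. The module and file names below mirror one entry from the change but are only illustrative:

    from setuptools import Extension

    # Listing the tempita templates in `depends` makes the build system
    # rebuild the extension whenever a template's mtime changes.
    hashtable_ext = Extension(
        'pandas.hashtable',
        sources=['pandas/hashtable.pyx'],
        depends=['pandas/src/klib/khash_python.h',
                 'pandas/src/hashtable_class_helper.pxi.in',
                 'pandas/src/hashtable_func_helper.pxi.in'],
    )
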
Author: Chris Closes #13960 from chris-b1/pxi-dep and squashes the following commits: d3ccf13 [Chris] BLD: add tempita files to depends --- setup.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 1c12ff4aca372..d1b97e1e3d82c 100755 --- a/setup.py +++ b/setup.py @@ -106,19 +106,28 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') -_pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'join_helper.pxi.in', 'algos_take_helper.pxi.in', - 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in', - 'sparse_op_helper.pxi.in', 'joins_func_helper.pxi.in'] +_pxi_dep_template = { + 'algos': ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', + 'algos_take_helper.pxi.in'], + '_join': ['join_helper.pxi.in', 'joins_func_helper.pxi.in'], + 'hashtable': ['hashtable_class_helper.pxi.in', + 'hashtable_func_helper.pxi.in'], + '_sparse': ['sparse_op_helper.pxi.in'] +} +_pxifiles = [] +_pxi_dep = {} +for module, files in _pxi_dep_template.items(): + pxi_files = [pjoin(_pxipath, x) for x in files] + _pxifiles.extend(pxi_files) + _pxi_dep[module] = pxi_files class build_ext(_build_ext): def build_extensions(self): - for _pxifile in _pxifiles: + for pxifile in _pxifiles: # build pxifiles first, template extention must be .pxi.in - assert _pxifile.endswith('.pxi.in') - pxifile = pjoin(_pxipath, _pxifile) + assert pxifile.endswith('.pxi.in') outfile = pxifile[:-3] if (os.path.exists(outfile) and @@ -450,7 +459,8 @@ def pxd(name): 'depends': lib_depends}, hashtable={'pyxfile': 'hashtable', 'pxdfiles': ['hashtable'], - 'depends': ['pandas/src/klib/khash_python.h']}, + 'depends': (['pandas/src/klib/khash_python.h'] + + _pxi_dep['hashtable'])}, tslib={'pyxfile': 'tslib', 'depends': tseries_depends, 'sources': ['pandas/src/datetime/np_datetime.c', @@ -465,9 +475,11 @@ def pxd(name): 'sources': ['pandas/src/datetime/np_datetime.c', 'pandas/src/datetime/np_datetime_strings.c']}, algos={'pyxfile': 'algos', - 'pxdfiles': ['src/util']}, + 'pxdfiles': ['src/util'], + 'depends': _pxi_dep['algos']}, _join={'pyxfile': 'src/join', - 'pxdfiles': ['src/util']}, + 'pxdfiles': ['src/util'], + 'depends': _pxi_dep['_join']}, _window={'pyxfile': 'window', 'pxdfiles': ['src/skiplist', 'src/util'], 'depends': ['pandas/src/skiplist.pyx', @@ -479,7 +491,8 @@ def pxd(name): 'sources': ['pandas/src/parser/tokenizer.c', 'pandas/src/parser/io.c']}, _sparse={'pyxfile': 'src/sparse', - 'depends': [srcpath('sparse', suffix='.pyx')]}, + 'depends': ([srcpath('sparse', suffix='.pyx')] + + _pxi_dep['_sparse'])}, _testing={'pyxfile': 'src/testing', 'depends': [srcpath('testing', suffix='.pyx')]}, ) From 4a805216d99b37955c97625d304980eff10cab56 Mon Sep 17 00:00:00 2001 From: agraboso Date: Thu, 11 Aug 2016 06:32:43 -0400 Subject: [PATCH 248/359] BUG: properly close files opened by parsers closes #13932 Author: agraboso Closes #13940 from agraboso/fix-13932 and squashes the following commits: 3fa7d25 [agraboso] Close open files in TextFileReader upon StopIteration 6592c73 [agraboso] Do not acquire list as file handler to close 7aa5184 [agraboso] Properly close opened files in XportReader and SAS7BDATReader 240383c [agraboso] Properly close opened files in two tests 52d1073 [agraboso] Fix linting error 39dcd99 [agraboso] Fix rebase 75fc34d [agraboso] Make try/except blocks in StataReader.read as small as possible 812e6ec [agraboso] Fix long line c7e9c9c [agraboso] On close, CParserWrapper must call self._reader.close() 
99e16dd [agraboso] Fix whatsnew entries 30b61e6 [agraboso] Properly close opened files in StataWriter 3b0f25f [agraboso] Properly close opened files in StataReader 1e39a5e [agraboso] Properly close opened files in three tests d759156 [agraboso] BUG: properly close files opened by parsers --- doc/source/whatsnew/v0.19.0.txt | 3 + pandas/io/common.py | 4 +- pandas/io/parsers.py | 38 +++++++++--- pandas/io/sas/sas7bdat.py | 17 ++++++ pandas/io/sas/sas_xport.py | 9 +++ pandas/io/sas/sasreader.py | 4 +- pandas/io/stata.py | 62 +++++++++++--------- pandas/io/tests/parser/common.py | 1 + pandas/io/tests/parser/python_parser_only.py | 3 +- pandas/io/tests/parser/test_textreader.py | 3 +- pandas/io/tests/sas/test_sas7bdat.py | 7 ++- pandas/io/tests/sas/test_xport.py | 4 ++ pandas/io/tests/test_common.py | 11 ++-- pandas/tests/series/test_io.py | 3 +- 14 files changed, 121 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 13dacf28b5988..5cbdbe6168bba 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -905,6 +905,9 @@ Bug Fixes - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) - Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) +- Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`) +- Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. 
(:issue:`13940`) + - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 6f9bddd0fdf9b..b7ac183b7ab41 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -327,7 +327,9 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False): if memory_map and hasattr(f, 'fileno'): try: - f = MMapWrapper(f) + g = MMapWrapper(f) + f.close() + f = g except Exception: # we catch any errors that may have occurred # because that is consistent with the lower-level diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7846ccd1a6660..5372203318d69 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -393,11 +393,15 @@ def _read(filepath_or_buffer, kwds): raise NotImplementedError("'nrows' and 'chunksize' cannot be used" " together yet.") elif nrows is not None: - return parser.read(nrows) + data = parser.read(nrows) + parser.close() + return data elif chunksize or iterator: return parser - return parser.read() + data = parser.read() + parser.close() + return data _parser_defaults = { 'delimiter': None, @@ -727,10 +731,7 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - try: - self._engine._reader.close() - except: - pass + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -898,7 +899,11 @@ def _clean_options(self, options, engine): return result, engine def __next__(self): - return self.get_chunk() + try: + return self.get_chunk() + except StopIteration: + self.close() + raise def _make_engine(self, engine='c'): if engine == 'c': @@ -1057,8 +1062,13 @@ def __init__(self, kwds): self._first_chunk = True + # GH 13932 + # keep references to file handles opened by the parser itself + self.handles = [] + def close(self): - self._reader.close() + for f in self.handles: + f.close() @property def _has_complex_date_col(self): @@ -1356,6 +1366,7 @@ def __init__(self, src, **kwds): if 'utf-16' in (kwds.get('encoding') or ''): if isinstance(src, compat.string_types): src = open(src, 'rb') + self.handles.append(src) src = UTF8Recoder(src, kwds['encoding']) kwds['encoding'] = 'utf-8' @@ -1429,6 +1440,14 @@ def __init__(self, src, **kwds): self._implicit_index = self._reader.leading_cols > 0 + def close(self): + for f in self.handles: + f.close() + try: + self._reader.close() + except: + pass + def _set_noconvert_columns(self): names = self.orig_names usecols = self.usecols @@ -1751,13 +1770,16 @@ def __init__(self, f, **kwds): f = _get_handle(f, 'r', encoding=self.encoding, compression=self.compression, memory_map=self.memory_map) + self.handles.append(f) elif self.compression: f = _wrap_compressed(f, self.compression, self.encoding) + self.handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding elif compat.PY3 and isinstance(f, compat.BytesIO): from io import TextIOWrapper f = TextIOWrapper(f, encoding=self.encoding) + self.handles.append(f) # Set self.data to something that can read lines. 
if hasattr(f, 'readline'): diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index b75f05cf9ed7e..2a82fd7a53222 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -92,16 +92,24 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, compat.string_types): self._path_or_buf = open(self._path_or_buf, 'rb') + self.handle = self._path_or_buf self._get_properties() self._parse_metadata() + def close(self): + try: + self.handle.close() + except AttributeError: + pass + def _get_properties(self): # Check magic number self._path_or_buf.seek(0) self._cached_page = self._path_or_buf.read(288) if self._cached_page[0:len(const.magic)] != const.magic: + self.close() raise ValueError("magic number mismatch (not a SAS file?)") # Get alignment information @@ -175,6 +183,7 @@ def _get_properties(self): buf = self._path_or_buf.read(self.header_length - 288) self._cached_page += buf if len(self._cached_page) != self.header_length: + self.close() raise ValueError("The SAS7BDAT file appears to be truncated.") self._page_length = self._read_int(const.page_size_offset + align1, @@ -219,6 +228,7 @@ def _get_properties(self): # Read a single float of the given width (4 or 8). def _read_float(self, offset, width): if width not in (4, 8): + self.close() raise ValueError("invalid float width") buf = self._read_bytes(offset, width) fd = "f" if width == 4 else "d" @@ -227,6 +237,7 @@ def _read_float(self, offset, width): # Read a single signed integer of the given width (1, 2, 4 or 8). def _read_int(self, offset, width): if width not in (1, 2, 4, 8): + self.close() raise ValueError("invalid int width") buf = self._read_bytes(offset, width) it = {1: "b", 2: "h", 4: "l", 8: "q"}[width] @@ -238,11 +249,13 @@ def _read_bytes(self, offset, length): self._path_or_buf.seek(offset) buf = self._path_or_buf.read(length) if len(buf) < length: + self.close() msg = "Unable to read {:d} bytes from file position {:d}." 
raise ValueError(msg.format(length, offset)) return buf else: if offset + length > len(self._cached_page): + self.close() raise ValueError("The cached page is too small.") return self._cached_page[offset:offset + length] @@ -253,6 +266,7 @@ def _parse_metadata(self): if len(self._cached_page) <= 0: break if len(self._cached_page) != self._page_length: + self.close() raise ValueError( "Failed to read a meta data page from the SAS file.") done = self._process_page_meta() @@ -302,6 +316,7 @@ def _get_subheader_index(self, signature, compression, ptype): if (self.compression != "") and f1 and f2: index = const.index.dataSubheaderIndex else: + self.close() raise ValueError("Unknown subheader signature") return index @@ -598,6 +613,7 @@ def _read_next_page(self): if len(self._cached_page) <= 0: return True elif len(self._cached_page) != self._page_length: + self.close() msg = ("failed to read complete page from file " "(read {:d} of {:d} bytes)") raise ValueError(msg.format(len(self._cached_page), @@ -643,6 +659,7 @@ def _chunk_to_dataframe(self): rslt.loc[ii, name] = np.nan js += 1 else: + self.close() raise ValueError("unknown column type %s" % self.column_types[j]) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e4ca99fdcb109..76fc55154bc49 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -253,6 +253,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._read_header() + def close(self): + self.filepath_or_buffer.close() + def _get_row(self): return self.filepath_or_buffer.read(80).decode() @@ -262,6 +265,7 @@ def _read_header(self): # read file header line1 = self._get_row() if line1 != _correct_line1: + self.close() raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() @@ -269,6 +273,7 @@ def _read_header(self): ['_', 24], ['created', 16]] file_info = _split_line(line2, fif) if file_info['prefix'] != "SAS SAS SASLIB": + self.close() raise ValueError("Header record has invalid prefix.") file_info['created'] = _parse_date(file_info['created']) self.file_info = file_info @@ -282,6 +287,7 @@ def _read_header(self): headflag1 = header1.startswith(_correct_header1) headflag2 = (header2 == _correct_header2) if not (headflag1 and headflag2): + self.close() raise ValueError("Member header not found") # usually 140, could be 135 fieldnamelength = int(header1[-5:-2]) @@ -321,6 +327,7 @@ def _read_header(self): field['ntype'] = types[field['ntype']] fl = field['field_length'] if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): + self.close() msg = "Floating field width {0} is not between 2 and 8." 
raise TypeError(msg.format(fl)) @@ -335,6 +342,7 @@ def _read_header(self): header = self._get_row() if not header == _correct_obs_header: + self.close() raise ValueError("Observation header not found.") self.fields = fields @@ -425,6 +433,7 @@ def read(self, nrows=None): read_lines = min(nrows, self.nobs - self._lines_read) read_len = read_lines * self.record_length if read_len <= 0: + self.close() raise StopIteration raw = self.filepath_or_buffer.read(read_len) data = np.frombuffer(raw, dtype=self._dtype, count=read_lines) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 9a60200c78893..081d780f71cb3 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -58,4 +58,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, if iterator or chunksize: return reader - return reader.read() + data = reader.read() + reader.close() + return data diff --git a/pandas/io/stata.py b/pandas/io/stata.py index a67fb6651edd0..25f13048a73fd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -167,15 +167,11 @@ def read_stata(filepath_or_buffer, convert_dates=True, chunksize=chunksize, encoding=encoding) if iterator or chunksize: - try: - return reader - except StopIteration: - reader.close() - - try: - return reader.read() - finally: + data = reader + else: + data = reader.read() reader.close() + return data _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -1411,13 +1407,13 @@ def read(self, nrows=None, convert_dates=None, convert_categoricals=None, index=None, convert_missing=None, preserve_dtypes=None, columns=None, order_categoricals=None): - # Handle empty file or chunk. If reading incrementally raise # StopIteration. If reading the whole thing return an empty # data frame. 
if (self.nobs == 0) and (nrows is None): self._can_read_value_labels = True self._data_read = True + self.close() return DataFrame(columns=self.varlist) # Handle options @@ -1463,6 +1459,7 @@ def read(self, nrows=None, convert_dates=None, # we are reading the file incrementally if convert_categoricals: self._read_value_labels() + self.close() raise StopIteration offset = self._lines_read * dtype.itemsize self.path_or_buf.seek(self.data_location + offset) @@ -1494,7 +1491,11 @@ def read(self, nrows=None, convert_dates=None, data = data.set_index(ix) if columns is not None: - data = self._do_select_columns(data, columns) + try: + data = self._do_select_columns(data, columns) + except ValueError: + self.close() + raise # Decode strings for col, typ in zip(data, self.typlist): @@ -1514,7 +1515,7 @@ def read(self, nrows=None, convert_dates=None, if self.dtyplist[i] is not None: col = data.columns[i] dtype = data[col].dtype - if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]): + if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( (col, Series(data[col], index, self.dtyplist[i]))) @@ -1531,9 +1532,13 @@ def read(self, nrows=None, convert_dates=None, self.fmtlist))[0] for i in cols: col = data.columns[i] - data[col] = _stata_elapsed_date_to_datetime_vec( - data[col], - self.fmtlist[i]) + try: + data[col] = _stata_elapsed_date_to_datetime_vec( + data[col], + self.fmtlist[i]) + except ValueError: + self.close() + raise if convert_categoricals and self.format_version > 108: data = self._do_convert_categoricals(data, @@ -1889,9 +1894,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - self._file = _open_file_binary_write( - fname, self._encoding or self._default_encoding - ) + self._fname = fname self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): @@ -2086,16 +2089,21 @@ def _prepare_pandas(self, data): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._write_header(time_stamp=self._time_stamp, - data_label=self._data_label) - self._write_descriptors() - self._write_variable_labels() - # write 5 zeros for expansion fields - self._write(_pad_bytes("", 5)) - self._prepare_data() - self._write_data() - self._write_value_labels() - self._file.close() + self._file = _open_file_binary_write( + self._fname, self._encoding or self._default_encoding + ) + try: + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) + self._write_descriptors() + self._write_variable_labels() + # write 5 zeros for expansion fields + self._write(_pad_bytes("", 5)) + self._prepare_data() + self._write_data() + self._write_value_labels() + finally: + self._file.close() def _write_value_labels(self): for vl in self._value_labels: diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 619ac7b4c77ef..96eb0ec6fd7a2 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1580,5 +1580,6 @@ def test_temporary_file(self): new_file.seek(0) result = self.read_csv(new_file, sep='\s+', header=None) + new_file.close() expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index a7389fd174e1d..3214aa39358e8 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ 
b/pandas/io/tests/parser/python_parser_only.py @@ -130,7 +130,8 @@ def test_decompression_regex_sep(self): except ImportError: raise nose.SkipTest('need gzip and bz2 to run') - data = open(self.csv1, 'rb').read() + with open(self.csv1, 'rb') as f: + data = f.read() data = data.replace(b',', b'::') expected = self.read_csv(self.csv1) diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py index fd2f49cef656a..7dda9eb9d0af4 100644 --- a/pandas/io/tests/parser/test_textreader.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -54,7 +54,8 @@ def test_file_handle_mmap(self): f.close() def test_StringIO(self): - text = open(self.csv1, 'rb').read() + with open(self.csv1, 'rb') as f: + text = f.read() src = BytesIO(text) reader = TextReader(src, header=None) reader.read() diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 6661d9fee5df0..06eb9774679b1 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -44,7 +44,8 @@ def test_from_buffer(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - byts = open(fname, 'rb').read() + with open(fname, 'rb') as f: + byts = f.read() buf = io.BytesIO(byts) df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8') tm.assert_frame_equal(df, df0, check_exact=False) @@ -54,7 +55,8 @@ def test_from_iterator(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - byts = open(fname, 'rb').read() + with open(fname, 'rb') as f: + byts = f.read() buf = io.BytesIO(byts) rdr = pd.read_sas(buf, format="sas7bdat", iterator=True, encoding='utf-8') @@ -79,6 +81,7 @@ def test_encoding_options(): from pandas.io.sas.sas7bdat import SAS7BDATReader rdr = SAS7BDATReader(fname, convert_header_text=False) df3 = rdr.read() + rdr.close() for x, y in zip(df1.columns, df3.columns): assert(x == y.decode()) diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py index ae378c41cd24b..d0627a80f9604 100644 --- a/pandas/io/tests/sas/test_xport.py +++ b/pandas/io/tests/sas/test_xport.py @@ -39,11 +39,13 @@ def test1_basic(self): # Test incremental read with `read` method. reader = read_sas(self.file01, format="xport", iterator=True) data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test incremental read with `get_chunk` method. 
reader = read_sas(self.file01, format="xport", chunksize=10) data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Read full file with `read_sas` method @@ -66,6 +68,7 @@ def test1_index(self): reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) @@ -73,6 +76,7 @@ def test1_index(self): reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index a443df5dac586..c08d235b07c9e 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -116,8 +116,8 @@ def test_constructor_bad_file(self): tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target) def test_get_attr(self): - target = open(self.mmap_file, 'r') - wrapper = common.MMapWrapper(target) + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) attrs = [attr for attr in attrs @@ -130,10 +130,9 @@ def test_get_attr(self): self.assertFalse(hasattr(wrapper, 'foo')) def test_next(self): - target = open(self.mmap_file, 'r') - wrapper = common.MMapWrapper(target) - - lines = target.readlines() + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) + lines = target.readlines() for line in lines: next_line = next(wrapper) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f89501d39f014..48528dc54adbd 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -64,7 +64,8 @@ def test_to_csv(self): with ensure_clean() as path: self.ts.to_csv(path) - lines = io.open(path, newline=None).readlines() + with io.open(path, newline=None) as f: + lines = f.readlines() assert (lines[1] != '\n') self.ts.to_csv(path, index=False) From 70f361ba78fd9385da469e5a13cca6e9685c8b1a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 12 Aug 2016 08:59:25 -0400 Subject: [PATCH 249/359] CLN: PeriodIndex to use nan related cache Author: sinhrks Closes #13975 from sinhrks/period_mask and squashes the following commits: 60b3351 [sinhrks] CLN: PeriodIndex to use nan related cache --- pandas/tseries/period.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index d92ebe18b5697..831d73207bbdf 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -16,8 +16,6 @@ _ensure_object) from pandas.types.generic import ABCSeries -from pandas.types.missing import isnull - import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc @@ -40,7 +38,6 @@ from pandas.util.decorators import Appender, cache_readonly, Substitution from pandas.lib import Timedelta import pandas.tslib as tslib -import pandas.core.missing as missing from pandas.compat import zip, u @@ -87,8 +84,7 @@ def wrapper(self, other): result = getattr(self.values, opname)(other.values) - mask = (missing.mask_missing(self.values, tslib.iNaT) | - missing.mask_missing(other.values, tslib.iNaT)) + mask = self._isnan | other._isnan if mask.any(): result[mask] = nat_result @@ -101,9 +97,8 @@ def wrapper(self, other): func = getattr(self.values, opname) result = func(other.ordinal) - mask = self.values == 
tslib.iNaT - if mask.any(): - result[mask] = nat_result + if self.hasnans: + result[self._isnan] = nat_result return result return wrapper @@ -498,8 +493,7 @@ def asfreq(self, freq=None, how='E'): new_data = period.period_asfreq_arr(ordinal, base1, base2, end) if self.hasnans: - mask = asi8 == tslib.iNaT - new_data[mask] = tslib.iNaT + new_data[self._isnan] = tslib.iNaT return self._simple_new(new_data, self.name, freq=freq) @@ -637,9 +631,8 @@ def _sub_period(self, other): new_data = asi8 - other.ordinal if self.hasnans: - mask = asi8 == tslib.iNaT new_data = new_data.astype(np.float64) - new_data[mask] = np.nan + new_data[self._isnan] = np.nan # result must be Int64Index or Float64Index return Index(new_data, name=self.name) @@ -892,16 +885,21 @@ def __getitem__(self, key): def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): - values = np.array(list(self), dtype=object) - mask = isnull(self.values) - values[mask] = na_rep - imask = ~mask + values = self.asobject.values if date_format: formatter = lambda dt: dt.strftime(date_format) else: formatter = lambda dt: u('%s') % dt - values[imask] = np.array([formatter(dt) for dt in values[imask]]) + + if self.hasnans: + mask = self._isnan + values[mask] = na_rep + imask = ~mask + values[imask] = np.array([formatter(dt) for dt + in values[imask]]) + else: + values = np.array([formatter(dt) for dt in values]) return values def append(self, other): From 29d9e24f4c778b0c9ebe9288bfc217808d2c6edb Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 12 Aug 2016 11:43:05 -0400 Subject: [PATCH 250/359] BUG/DEPR: combine dtype fixes closes #7630 closes #10567 closes #13947 xref #7509 Author: sinhrks Closes #13970 from sinhrks/combine_bug and squashes the following commits: 2046cb5 [sinhrks] BUG/DEPR: combine dtype fixes --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/frame.py | 26 +- pandas/tests/frame/test_combine_concat.py | 461 ++++++++++++++-------- pandas/tests/frame/test_operators.py | 58 +-- pandas/tests/types/test_cast.py | 40 +- pandas/types/cast.py | 21 +- pandas/types/common.py | 29 +- 7 files changed, 410 insertions(+), 227 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5cbdbe6168bba..411b2b0abaf5a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -788,6 +788,7 @@ Deprecations - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`) + .. 
_whatsnew_0190.prior_deprecations: Removal of prior version deprecations/changes @@ -939,6 +940,7 @@ Bug Fixes - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) +- Bug in ``.combine_first`` may return incorrect ``dtype`` (:issue:`7630`, :issue:`10567`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) - Bug in ``.to_html``, ``.to_latex`` and ``.to_string`` silently ignore custom datetime formatter passed through the ``formatters`` key word (:issue:`10690`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4416213817ab4..ea83200465582 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -31,20 +31,22 @@ _possibly_downcast_to_dtype, _invalidate_string_dtypes, _coerce_to_dtypes, - _maybe_upcast_putmask) + _maybe_upcast_putmask, + _find_common_type) from pandas.types.common import (is_categorical_dtype, is_object_dtype, is_extension_type, is_datetimetz, is_datetime64_dtype, + is_datetime64tz_dtype, is_bool_dtype, is_integer_dtype, is_float_dtype, is_integer, is_scalar, + is_dtype_equal, needs_i8_conversion, _get_dtype_from_object, - _lcd_dtypes, _ensure_float, _ensure_float64, _ensure_int64, @@ -3700,17 +3702,20 @@ def combine(self, other, func, fill_value=None, overwrite=True): otherSeries[other_mask] = fill_value # if we have different dtypes, possibily promote - new_dtype = this_dtype - if this_dtype != other_dtype: - new_dtype = _lcd_dtypes(this_dtype, other_dtype) - series = series.astype(new_dtype) + if notnull(series).all(): + new_dtype = this_dtype otherSeries = otherSeries.astype(new_dtype) + else: + new_dtype = _find_common_type([this_dtype, other_dtype]) + if not is_dtype_equal(this_dtype, new_dtype): + series = series.astype(new_dtype) + if not is_dtype_equal(other_dtype, new_dtype): + otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype needs_i8_conversion_i = needs_i8_conversion(new_dtype) if needs_i8_conversion_i: - this_dtype = new_dtype arr = func(series, otherSeries, True) else: arr = func(series, otherSeries) @@ -3721,7 +3726,12 @@ def combine(self, other, func, fill_value=None, overwrite=True): # try to downcast back to the original dtype if needs_i8_conversion_i: - arr = _possibly_cast_to_datetime(arr, this_dtype) + # ToDo: This conversion should be handled in + # _possibly_cast_to_datetime but the change affects lot... 
+ if is_datetime64tz_dtype(new_dtype): + arr = DatetimeIndex._simple_new(arr, tz=new_dtype.tz) + else: + arr = _possibly_cast_to_datetime(arr, new_dtype) else: arr = _possibly_downcast_to_dtype(arr, this_dtype) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 7202915f13258..e5aaba26135e7 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -20,23 +20,11 @@ from pandas.tests.frame.common import TestData -class TestDataFrameCombineConcat(tm.TestCase, TestData): +class TestDataFrameConcatCommon(tm.TestCase, TestData): _multiprocess_can_split_ = True - def test_combine_first_mixed(self): - a = Series(['a', 'b'], index=lrange(2)) - b = Series(lrange(2), index=lrange(2)) - f = DataFrame({'A': a, 'B': b}) - - a = Series(['a', 'b'], index=lrange(5, 7)) - b = Series(lrange(2), index=lrange(5, 7)) - g = DataFrame({'A': a, 'B': b}) - - # TODO(wesm): no verification? - combined = f.combine_first(g) # noqa - - def test_combine_multiple_frames_dtypes(self): + def test_concat_multiple_frames_dtypes(self): # GH 2759 A = DataFrame(data=np.ones((10, 2)), columns=[ @@ -46,7 +34,7 @@ def test_combine_multiple_frames_dtypes(self): expected = Series(dict(float64=2, float32=2)) assert_series_equal(results, expected) - def test_combine_multiple_tzs(self): + def test_concat_multiple_tzs(self): # GH 12467 # combining datetime tz-aware and naive DataFrames ts1 = Timestamp('2015-01-01', tz=None) @@ -194,147 +182,6 @@ def test_append_dtypes(self): expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])}) assert_frame_equal(result, expected) - def test_combine_first(self): - # disjoint - head, tail = self.frame[:5], self.frame[5:] - - combined = head.combine_first(tail) - reordered_frame = self.frame.reindex(combined.index) - assert_frame_equal(combined, reordered_frame) - self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) - assert_series_equal(combined['A'], reordered_frame['A']) - - # same index - fcopy = self.frame.copy() - fcopy['A'] = 1 - del fcopy['C'] - - fcopy2 = self.frame.copy() - fcopy2['B'] = 0 - del fcopy2['D'] - - combined = fcopy.combine_first(fcopy2) - - self.assertTrue((combined['A'] == 1).all()) - assert_series_equal(combined['B'], fcopy['B']) - assert_series_equal(combined['C'], fcopy2['C']) - assert_series_equal(combined['D'], fcopy['D']) - - # overlap - head, tail = reordered_frame[:10].copy(), reordered_frame - head['A'] = 1 - - combined = head.combine_first(tail) - self.assertTrue((combined['A'][:10] == 1).all()) - - # reverse overlap - tail['A'][:10] = 0 - combined = tail.combine_first(head) - self.assertTrue((combined['A'][:10] == 0).all()) - - # no overlap - f = self.frame[:10] - g = self.frame[10:] - combined = f.combine_first(g) - assert_series_equal(combined['A'].reindex(f.index), f['A']) - assert_series_equal(combined['A'].reindex(g.index), g['A']) - - # corner cases - comb = self.frame.combine_first(self.empty) - assert_frame_equal(comb, self.frame) - - comb = self.empty.combine_first(self.frame) - assert_frame_equal(comb, self.frame) - - comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) - self.assertTrue("faz" in comb.index) - - # #2525 - df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) - df2 = DataFrame({}, columns=['b']) - result = df.combine_first(df2) - self.assertTrue('b' in result) - - def test_combine_first_mixed_bug(self): - idx = Index(['a', 'b', 'c', 'e']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = 
Series(['a', 'b', 'c', 'e'], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame1 = DataFrame({"col0": ser1, - "col2": ser2, - "col3": ser3}) - - idx = Index(['a', 'b', 'c', 'f']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'f'], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame2 = DataFrame({"col1": ser1, - "col2": ser2, - "col5": ser3}) - - combined = frame1.combine_first(frame2) - self.assertEqual(len(combined.columns), 5) - - # gh 3016 (same as in update) - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) - - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) - result = df.combine_first(other) - assert_frame_equal(result, df) - - df.ix[0, 'A'] = np.nan - result = df.combine_first(other) - df.ix[0, 'A'] = 45 - assert_frame_equal(result, df) - - # doc example - df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) - - df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) - - result = df1.combine_first(df2) - expected = DataFrame( - {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) - assert_frame_equal(result, expected) - - # GH3552, return object dtype with bools - df1 = DataFrame( - [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) - df2 = DataFrame( - [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) - - result = df1.combine_first(df2)[2] - expected = Series([True, True, False], name=2) - assert_series_equal(result, expected) - - # GH 3593, converting datetime64[ns] incorrecly - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) - df1 = DataFrame({"a": [None, None, None]}) - df2 = df1.combine_first(df0) - assert_frame_equal(df2, df0) - - df2 = df0.combine_first(df1) - assert_frame_equal(df2, df0) - - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) - df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) - df2 = df1.combine_first(df0) - result = df0.copy() - result.iloc[0, :] = df1.iloc[0, :] - assert_frame_equal(df2, result) - - df2 = df0.combine_first(df1) - assert_frame_equal(df2, df0) - def test_update(self): df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], @@ -476,3 +323,305 @@ def test_join_multiindex_leftright(self): assert_frame_equal(df1.join(df2, how='right'), exp) assert_frame_equal(df2.join(df1, how='left'), exp[['value2', 'value1']]) + + +class TestDataFrameCombineFirst(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_combine_first_mixed(self): + a = Series(['a', 'b'], index=lrange(2)) + b = Series(lrange(2), index=lrange(2)) + f = DataFrame({'A': a, 'B': b}) + + a = Series(['a', 'b'], index=lrange(5, 7)) + b = Series(lrange(2), index=lrange(5, 7)) + g = DataFrame({'A': a, 'B': b}) + + exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]}, + index=[0, 1, 5, 6]) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + def test_combine_first(self): + # disjoint + head, tail = self.frame[:5], self.frame[5:] + + combined = head.combine_first(tail) + reordered_frame = self.frame.reindex(combined.index) + assert_frame_equal(combined, reordered_frame) + self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) + assert_series_equal(combined['A'], reordered_frame['A']) + + # same index + fcopy = self.frame.copy() + fcopy['A'] = 1 + del fcopy['C'] + + fcopy2 = self.frame.copy() + fcopy2['B'] = 0 
+ del fcopy2['D'] + + combined = fcopy.combine_first(fcopy2) + + self.assertTrue((combined['A'] == 1).all()) + assert_series_equal(combined['B'], fcopy['B']) + assert_series_equal(combined['C'], fcopy2['C']) + assert_series_equal(combined['D'], fcopy['D']) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head['A'] = 1 + + combined = head.combine_first(tail) + self.assertTrue((combined['A'][:10] == 1).all()) + + # reverse overlap + tail['A'][:10] = 0 + combined = tail.combine_first(head) + self.assertTrue((combined['A'][:10] == 0).all()) + + # no overlap + f = self.frame[:10] + g = self.frame[10:] + combined = f.combine_first(g) + assert_series_equal(combined['A'].reindex(f.index), f['A']) + assert_series_equal(combined['A'].reindex(g.index), g['A']) + + # corner cases + comb = self.frame.combine_first(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combine_first(self.frame) + assert_frame_equal(comb, self.frame) + + comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) + self.assertTrue("faz" in comb.index) + + # #2525 + df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame({}, columns=['b']) + result = df.combine_first(df2) + self.assertTrue('b' in result) + + def test_combine_first_mixed_bug(self): + idx = Index(['a', 'b', 'c', 'e']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, + "col2": ser2, + "col3": ser3}) + + idx = Index(['a', 'b', 'c', 'f']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'f'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, + "col2": ser2, + "col5": ser3}) + + combined = frame1.combine_first(frame2) + self.assertEqual(len(combined.columns), 5) + + # gh 3016 (same as in update) + df = DataFrame([[1., 2., False, True], [4., 5., True, False]], + columns=['A', 'B', 'bool1', 'bool2']) + + other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + result = df.combine_first(other) + assert_frame_equal(result, df) + + df.ix[0, 'A'] = np.nan + result = df.combine_first(other) + df.ix[0, 'A'] = 45 + assert_frame_equal(result, df) + + # doc example + df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], + 'B': [np.nan, 2., 3., np.nan, 6.]}) + + df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], + 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + + result = df1.combine_first(df2) + expected = DataFrame( + {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) + assert_frame_equal(result, expected) + + # GH3552, return object dtype with bools + df1 = DataFrame( + [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) + df2 = DataFrame( + [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) + + result = df1.combine_first(df2)[2] + expected = Series([True, True, False], name=2) + assert_series_equal(result, expected) + + # GH 3593, converting datetime64[ns] incorrecly + df0 = DataFrame({"a": [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)]}) + df1 = DataFrame({"a": [None, None, None]}) + df2 = df1.combine_first(df0) + assert_frame_equal(df2, df0) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2, df0) + + df0 = DataFrame({"a": [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)]}) + df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0, :] = df1.iloc[0, 
:] + assert_frame_equal(df2, result) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2, df0) + + def test_combine_first_align_nan(self): + # GH 7509 (not fixed) + dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]], + columns=['a', 'b']) + dfb = pd.DataFrame([[4], [5]], columns=['b']) + self.assertEqual(dfa['a'].dtype, 'datetime64[ns]') + self.assertEqual(dfa['b'].dtype, 'int64') + + res = dfa.combine_first(dfb) + exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT], + 'b': [2., 5.]}, columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['a'].dtype, 'datetime64[ns]') + # ToDo: this must be int64 + self.assertEqual(res['b'].dtype, 'float64') + + res = dfa.iloc[:0].combine_first(dfb) + exp = pd.DataFrame({'a': [np.nan, np.nan], + 'b': [4, 5]}, columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + # ToDo: this must be datetime64 + self.assertEqual(res['a'].dtype, 'float64') + # ToDo: this must be int64 + self.assertEqual(res['b'].dtype, 'int64') + + def test_combine_first_timezone(self): + # GH 7630 + data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC') + df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'], + data=data1, + index=pd.date_range('20140627', periods=1)) + data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC') + df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'], + data=data2, + index=pd.date_range('20140628', periods=1)) + res = df2[['UTCdatetime']].combine_first(df1) + exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01', + tz='UTC'), + pd.Timestamp('2012-12-12 12:12', + tz='UTC')], + 'abc': [pd.Timestamp('2010-01-01 01:01:00', + tz='UTC'), pd.NaT]}, + columns=['UTCdatetime', 'abc'], + index=pd.date_range('20140627', periods=2, + freq='D')) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['UTCdatetime'].dtype, 'datetime64[ns, UTC]') + self.assertEqual(res['abc'].dtype, 'datetime64[ns, UTC]') + + # GH 10567 + dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC') + df2 = pd.DataFrame({'DATE': dts2}) + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['DATE'].dtype, 'datetime64[ns, UTC]') + + dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04'], tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', + '2012-01-03'], tz='US/Eastern') + df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT', + '2012-01-02', '2011-01-03', '2011-01-04'], + tz='US/Eastern') + exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + + # different tz + dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-03', '2015-01-05') + df2 = pd.DataFrame({'DATE': dts2}) + + # if df1 doesn't have NaN, keep its dtype + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['DATE'].dtype, 'datetime64[ns, US/Eastern]') + + dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-01', '2015-01-03') + df2 = pd.DataFrame({'DATE': dts2}) + + res = df1.combine_first(df2) + exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'), + pd.Timestamp('2015-01-02', tz='US/Eastern'), + 
pd.Timestamp('2015-01-03')] + exp = pd.DataFrame({'DATE': exp_dts}) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['DATE'].dtype, 'object') + + def test_combine_first_timedelta(self): + data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day']) + df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day']) + df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT', + '11 day', '3 day', '4 day']) + exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['TD'].dtype, 'timedelta64[ns]') + + def test_combine_first_period(self): + data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M') + df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(['2012-01-01', '2012-02', + '2012-03'], freq='M') + df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT', + '2012-02', '2011-03', '2011-04'], + freq='M') + exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['P'].dtype, 'object') + + # different freq + dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02', + '2012-01-03'], freq='D') + df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = [pd.Period('2011-01', freq='M'), + pd.Period('2012-01-01', freq='D'), + pd.NaT, + pd.Period('2012-01-02', freq='D'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')] + exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['P'].dtype, 'object') diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index c91585a28d867..ce7af25eb0460 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1013,44 +1013,52 @@ def test_combineAdd(self): with tm.assert_produces_warning(FutureWarning): # trivial comb = self.frame.combineAdd(self.frame) - assert_frame_equal(comb, self.frame * 2) + assert_frame_equal(comb, self.frame * 2) - # more rigorous - a = DataFrame([[1., nan, nan, 2., nan]], - columns=np.arange(5)) - b = DataFrame([[2., 3., nan, 2., 6., nan]], - columns=np.arange(6)) - expected = DataFrame([[3., 3., nan, 4., 6., nan]], - columns=np.arange(6)) + # more rigorous + a = DataFrame([[1., nan, nan, 2., nan]], + columns=np.arange(5)) + b = DataFrame([[2., 3., nan, 2., 6., nan]], + columns=np.arange(6)) + expected = DataFrame([[3., 3., nan, 4., 6., nan]], + columns=np.arange(6)) + with tm.assert_produces_warning(FutureWarning): result = a.combineAdd(b) - assert_frame_equal(result, expected) + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): result2 = a.T.combineAdd(b.T) - assert_frame_equal(result2, expected.T) + assert_frame_equal(result2, expected.T) - expected2 = a.combine(b, operator.add, fill_value=0.) - assert_frame_equal(expected, expected2) + expected2 = a.combine(b, operator.add, fill_value=0.) 
+ assert_frame_equal(expected, expected2) - # corner cases + # corner cases + with tm.assert_produces_warning(FutureWarning): comb = self.frame.combineAdd(self.empty) - assert_frame_equal(comb, self.frame) + assert_frame_equal(comb, self.frame) + with tm.assert_produces_warning(FutureWarning): comb = self.empty.combineAdd(self.frame) - assert_frame_equal(comb, self.frame) + assert_frame_equal(comb, self.frame) - # integer corner case - df1 = DataFrame({'x': [5]}) - df2 = DataFrame({'x': [1]}) - df3 = DataFrame({'x': [6]}) + # integer corner case + df1 = DataFrame({'x': [5]}) + df2 = DataFrame({'x': [1]}) + df3 = DataFrame({'x': [6]}) + + with tm.assert_produces_warning(FutureWarning): comb = df1.combineAdd(df2) - assert_frame_equal(comb, df3) + assert_frame_equal(comb, df3) - # mixed type GH2191 - df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) - df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + # mixed type GH2191 + df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + with tm.assert_produces_warning(FutureWarning): rs = df1.combineAdd(df2) - xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) - assert_frame_equal(xp, rs) + xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) + assert_frame_equal(xp, rs) # TODO: test integer fill corner? diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 3394974d833fb..46f37bf0ef8c2 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -192,6 +192,7 @@ def test_possibly_convert_objects_copy(self): class TestCommonTypes(tm.TestCase): + def test_numpy_dtypes(self): # (source_types, destination_type) testcases = ( @@ -218,18 +219,43 @@ def test_numpy_dtypes(self): ((np.complex128, np.int32), np.complex128), ((np.object, np.float32), np.object), ((np.object, np.int16), np.object), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ms]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ms]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('timedelta64[ns]')), + np.object), + ((np.dtype('datetime64[ns]'), np.int64), np.object) ) for src, common in testcases: self.assertEqual(_find_common_type(src), common) + with tm.assertRaises(ValueError): + # empty + _find_common_type([]) + def test_pandas_dtypes(self): - # TODO: not implemented yet - with self.assertRaises(TypeError): - self.assertEqual(_find_common_type([CategoricalDtype()]), - CategoricalDtype) - with self.assertRaises(TypeError): - self.assertEqual(_find_common_type([DatetimeTZDtype()]), - DatetimeTZDtype) + dtype = CategoricalDtype() + self.assertEqual(_find_common_type([dtype]), 'category') + self.assertEqual(_find_common_type([dtype, dtype]), 'category') + self.assertEqual(_find_common_type([np.object, dtype]), np.object) + + dtype = DatetimeTZDtype(unit='ns', tz='US/Eastern') + self.assertEqual(_find_common_type([dtype, dtype]), + 'datetime64[ns, US/Eastern]') + + for dtype2 in [DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), + np.dtype('datetime64[ns]'), np.object, np.int64]: + self.assertEqual(_find_common_type([dtype, dtype2]), np.object) + self.assertEqual(_find_common_type([dtype2, dtype]), np.object) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/types/cast.py 
b/pandas/types/cast.py index 93be926fe1eeb..59c939126d2a4 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -866,8 +866,23 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): def _find_common_type(types): """Find a common data type among the given dtypes.""" - # TODO: enable using pandas-specific types + + if len(types) == 0: + raise ValueError('no types given') + + first = types[0] + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) + # => object + if all(is_dtype_equal(first, t) for t in types[1:]): + return first + if any(isinstance(t, ExtensionDtype) for t in types): - raise TypeError("Common type discovery is currently only " - "supported for pure numpy dtypes.") + return np.object + + # take lowest unit + if all(is_datetime64_dtype(t) for t in types): + return np.dtype('datetime64[ns]') + if all(is_timedelta64_dtype(t) for t in types): + return np.dtype('timedelta64[ns]') + return np.find_common_type(types, []) diff --git a/pandas/types/common.py b/pandas/types/common.py index bffff0357f329..39db0be3e416e 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -9,7 +9,7 @@ from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries) -from .inference import is_integer, is_string_like +from .inference import is_string_like from .inference import * # noqa @@ -386,33 +386,6 @@ def _validate_date_like_dtype(dtype): (dtype.name, dtype.type.__name__)) -def _lcd_dtypes(a_dtype, b_dtype): - """ return the lcd dtype to hold these types """ - - if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): - return _NS_DTYPE - elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): - return _TD_DTYPE - elif is_complex_dtype(a_dtype): - if is_complex_dtype(b_dtype): - return a_dtype - return np.float64 - elif is_integer_dtype(a_dtype): - if is_integer_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - return np.int64 - return np.float64 - elif is_float_dtype(a_dtype): - if is_float_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - else: - return np.float64 - elif is_integer(b_dtype): - return np.float64 - return np.object - _string_dtypes = frozenset(map(_get_dtype_from_object, (binary_type, text_type))) From 6f8841648d068cc04c7bfe0625f131b7c7d6469e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 13 Aug 2016 22:14:08 +0200 Subject: [PATCH 251/359] DOC: suppress warnings for Panel4D deprecation (#13989) --- doc/source/dsintro.rst | 1 + doc/source/io.rst | 3 +++ doc/source/whatsnew/v0.10.0.txt | 1 + 3 files changed, 5 insertions(+) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 8f89cd6789f4f..b5ad681426b15 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -1042,6 +1042,7 @@ The following creates a Panel5D. A new panel type object must be sliceable into Here we slice to a Panel4D. .. ipython:: python + :okwarning: from pandas.core import panelnd Panel5D = panelnd.create_nd_panel_factory( diff --git a/doc/source/io.rst b/doc/source/io.rst index 35d6639d21269..2e62a6cf8d855 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4012,6 +4012,7 @@ number of options, please see the docstring. legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5') .. ipython:: python + :okwarning: # a legacy store legacy_store = HDFStore(legacy_file_path,'r') @@ -4059,6 +4060,7 @@ Experimental HDFStore supports ``Panel4D`` storage. .. 
ipython:: python + :okwarning: p4d = Panel4D({ 'l1' : wp }) p4d @@ -4073,6 +4075,7 @@ store your data. Pass the ``axes`` keyword with a list of dimensions object). This cannot be changed after table creation. .. ipython:: python + :okwarning: store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis']) store diff --git a/doc/source/whatsnew/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.txt index ce20de654ffd8..fed3ba3ce3a84 100644 --- a/doc/source/whatsnew/v0.10.0.txt +++ b/doc/source/whatsnew/v0.10.0.txt @@ -383,6 +383,7 @@ Adding experimental support for Panel4D and factory functions to create n-dimens :ref:`Docs ` for NDim. Here is a taste of what to expect. .. ipython:: python + :okwarning: p4d = Panel4D(randn(2, 2, 5, 4), labels=['Label1','Label2'], From e0c329199e6f64d2461acd2b89dc398dc4df6057 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 18:06:53 -0400 Subject: [PATCH 252/359] TST: Cleanup test_indexing Author: sinhrks Closes #13982 from sinhrks/test_indexing and squashes the following commits: 3947468 [sinhrks] TST: Cleanup test_indexing --- pandas/tests/indexing/test_indexing.py | 1606 +++++++++---------- pandas/tests/indexing/test_indexing_slow.py | 97 ++ 2 files changed, 829 insertions(+), 874 deletions(-) create mode 100644 pandas/tests/indexing/test_indexing_slow.py diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a96e4acfad89b..b051b92e15540 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -21,9 +21,6 @@ from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, MultiIndex, Timestamp, Timedelta) -from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assert_panel_equal, - assert_attr_equal, slow) from pandas.formats.printing import pprint_thing from pandas import concat from pandas.core.common import PerformanceWarning @@ -111,32 +108,34 @@ def setUp(self): warnings.filterwarnings(action='ignore', category=FutureWarning) self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2)) - self.frame_ints = DataFrame( - np.random.randn( - 4, 4), index=lrange(0, 8, 2), columns=lrange(0, 12, 3)) - self.panel_ints = Panel( - np.random.rand(4, 4, 4), items=lrange(0, 8, 2), - major_axis=lrange(0, 12, 3), minor_axis=lrange(0, 16, 4)) + self.frame_ints = DataFrame(np.random.randn(4, 4), + index=lrange(0, 8, 2), + columns=lrange(0, 12, 3)) + self.panel_ints = Panel(np.random.rand(4, 4, 4), + items=lrange(0, 8, 2), + major_axis=lrange(0, 12, 3), + minor_axis=lrange(0, 16, 4)) self.series_labels = Series(np.random.randn(4), index=list('abcd')) - self.frame_labels = DataFrame( - np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) - self.panel_labels = Panel( - np.random.randn(4, 4, 4), items=list('abcd'), - major_axis=list('ABCD'), minor_axis=list('ZYXW')) + self.frame_labels = DataFrame(np.random.randn(4, 4), + index=list('abcd'), columns=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4, 4, 4), + items=list('abcd'), + major_axis=list('ABCD'), + minor_axis=list('ZYXW')) self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) - self.frame_mixed = DataFrame( - np.random.randn(4, 4), index=[2, 4, 'null', 8]) - self.panel_mixed = Panel( - np.random.randn(4, 4, 4), items=[2, 4, 'null', 8]) - - self.series_ts = Series( - np.random.randn(4), index=date_range('20130101', periods=4)) - self.frame_ts = 
DataFrame( - np.random.randn(4, 4), index=date_range('20130101', periods=4)) - self.panel_ts = Panel( - np.random.randn(4, 4, 4), items=date_range('20130101', periods=4)) + self.frame_mixed = DataFrame(np.random.randn(4, 4), + index=[2, 4, 'null', 8]) + self.panel_mixed = Panel(np.random.randn(4, 4, 4), + items=[2, 4, 'null', 8]) + + self.series_ts = Series(np.random.randn(4), + index=date_range('20130101', periods=4)) + self.frame_ts = DataFrame(np.random.randn(4, 4), + index=date_range('20130101', periods=4)) + self.panel_ts = Panel(np.random.randn(4, 4, 4), + items=date_range('20130101', periods=4)) self.frame_empty = DataFrame({}) self.series_empty = Series({}) @@ -169,7 +168,7 @@ def check_values(self, f, func, values=False): for a in reversed(i): expected = expected.__getitem__(a) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) def check_result(self, name, method1, key1, method2, key2, typs=None, objs=None, axes=None, fails=None): @@ -206,11 +205,11 @@ def _print(result, error=None): if is_scalar(rs) and is_scalar(xp): self.assertEqual(rs, xp) elif xp.ndim == 1: - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) elif xp.ndim == 2: - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) elif xp.ndim == 3: - assert_panel_equal(rs, xp) + tm.assert_panel_equal(rs, xp) result = 'ok' except (AssertionError): result = 'fail' @@ -284,7 +283,7 @@ def test_indexer_caching(self): expected = Series(np.ones(n), index=index) s = Series(np.zeros(n), index=index) s[s == 0] = 1 - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) def test_at_and_iat_get(self): def _check(f, func, values=False): @@ -294,7 +293,7 @@ def _check(f, func, values=False): for i in indicies: result = getattr(f, func)[i] expected = _get_value(f, i, values) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) for o in self._objs: @@ -320,7 +319,7 @@ def _check(f, func, values=False): for i in indicies: getattr(f, func)[i] = 1 expected = _get_value(f, i, values) - assert_almost_equal(expected, 1) + tm.assert_almost_equal(expected, 1) for t in self._objs: @@ -383,12 +382,12 @@ def test_imethods_with_dups(self): result = s.iloc[[2, 3]] expected = Series([2, 3], [2, 2], dtype='int64') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = s.to_frame() result = df.iloc[2] expected = Series(2, index=[0], name=2) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.iat[2, 0] expected = 2 @@ -402,7 +401,7 @@ def test_repeated_getitem_dups(self): index=['ABCDE' [x % 5] for x in range(20)]) expected = df.loc['A', 0] result = df.loc[:, 0].loc['A'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_exceeds_bounds(self): @@ -424,70 +423,69 @@ def test_iloc_exceeds_bounds(self): self.assertRaises(IndexError, lambda: s.iloc[[-100]]) # still raise on a single indexer - with tm.assertRaisesRegexp( - IndexError, 'single positional indexer is out-of-bounds'): + msg = 'single positional indexer is out-of-bounds' + with tm.assertRaisesRegexp(IndexError, msg): df.iloc[30] self.assertRaises(IndexError, lambda: df.iloc[-30]) # GH10779 # single positive/negative indexer exceeding Series bounds should raise # an IndexError - with tm.assertRaisesRegexp( - IndexError, 'single positional indexer is out-of-bounds'): + with tm.assertRaisesRegexp(IndexError, msg): s.iloc[30] self.assertRaises(IndexError, lambda: s.iloc[-30]) # slices are 
ok result = df.iloc[:, 4:10] # 0 < start < len < stop expected = df.iloc[:, 4:] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, -4:-10] # stop < 0 < start < len expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 10:4:-1] # 0 < stop < len < start (down) expected = df.iloc[:, :4:-1] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 4:-10:-1] # stop < 0 < start < len (down) expected = df.iloc[:, 4::-1] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, -10:4] # start < 0 < stop < len expected = df.iloc[:, :4] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 10:4] # 0 < stop < len < start expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, -10:-11:-1] # stop < start < 0 < len (down) expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 10:11] # 0 < len < start < stop expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # slice bounds exceeding is ok result = s.iloc[18:30] expected = s.iloc[18:] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.iloc[30:] expected = s.iloc[:0] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.iloc[30::-1] expected = s.iloc[::-1] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # doc example def check(result, expected): str(result) result.dtypes - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) dfl = DataFrame(np.random.randn(5, 2), columns=list('AB')) check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) @@ -500,9 +498,8 @@ def check(result, expected): def test_iloc_getitem_int(self): # integer - self.check_result('integer', 'iloc', 2, 'ix', {0: 4, - 1: 6, - 2: 8}, typs=['ints']) + self.check_result('integer', 'iloc', 2, 'ix', + {0: 4, 1: 6, 2: 8}, typs=['ints']) self.check_result('integer', 'iloc', 2, 'indexer', 2, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -510,9 +507,8 @@ def test_iloc_getitem_int(self): def test_iloc_getitem_neg_int(self): # neg integer - self.check_result('neg int', 'iloc', -1, 'ix', {0: 6, - 1: 9, - 2: 12}, typs=['ints']) + self.check_result('neg int', 'iloc', -1, 'ix', + {0: 6, 1: 9, 2: 12}, typs=['ints']) self.check_result('neg int', 'iloc', -1, 'indexer', -1, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -520,14 +516,11 @@ def test_iloc_getitem_neg_int(self): def test_iloc_getitem_list_int(self): # list of ints - self.check_result('list int', 'iloc', [0, 1, 2], 'ix', {0: [0, 2, 4], - 1: [0, 3, 6], - 2: [0, 4, 8]}, - typs=['ints']) - self.check_result('list int', 'iloc', [2], 'ix', {0: [4], - 1: [6], - 2: [8]}, + self.check_result('list int', 'iloc', [0, 1, 2], 'ix', + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, typs=['ints']) + self.check_result('list int', 'iloc', [2], 'ix', + {0: [4], 1: [6], 2: [8]}, typs=['ints']) self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2], typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -538,10 +531,8 @@ def test_iloc_getitem_list_int(self): {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, 
typs=['ints']) - self.check_result('array int', 'iloc', np.array([2]), 'ix', {0: [4], - 1: [6], - 2: [8]}, - typs=['ints']) + self.check_result('array int', 'iloc', np.array([2]), 'ix', + {0: [4], 1: [6], 2: [8]}, typs=['ints']) self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', [0, 1, 2], typs=['labels', 'mixed', 'ts', 'floats', 'empty'], @@ -555,11 +546,11 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): expected = df.iloc[0] result = df.iloc[-3] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = df.iloc[[0]] result = df.iloc[[-3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = s.iloc[0] result = s.iloc[-3] @@ -567,20 +558,19 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): expected = s.iloc[[0]] result = s.iloc[[-3]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # check the length 1 Series case highlighted in GH10547 expected = pd.Series(['a'], index=['A']) result = expected.iloc[[-1]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_getitem_dups(self): # no dups in panel (bug?) self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', - {0: [0, 2, 2, 6], - 1: [0, 3, 3, 9 - ]}, objs=['series', 'frame'], typs=['ints']) + {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, + objs=['series', 'frame'], typs=['ints']) # GH 6766 df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) @@ -594,15 +584,14 @@ def test_iloc_getitem_dups(self): result = df.iloc[0, :] expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'], name=0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_getitem_array(self): # array like s = Series(index=lrange(1, 4)) - self.check_result('array like', 'iloc', s.index, 'ix', {0: [2, 4, 6], - 1: [3, 6, 9], - 2: [4, 8, 12]}, + self.check_result('array like', 'iloc', s.index, 'ix', + {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, typs=['ints']) def test_iloc_getitem_bool(self): @@ -617,39 +606,38 @@ def test_iloc_getitem_bool(self): def test_iloc_getitem_slice(self): # slices - self.check_result('slice', 'iloc', slice(1, 3), 'ix', {0: [2, 4], - 1: [3, 6], - 2: [4, 8]}, + self.check_result('slice', 'iloc', slice(1, 3), 'ix', + {0: [2, 4], 1: [3, 6], 2: [4, 8]}, typs=['ints']) - self.check_result('slice', 'iloc', slice(1, 3), 'indexer', slice( - 1, 3), typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result('slice', 'iloc', slice(1, 3), 'indexer', + slice(1, 3), + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) def test_iloc_getitem_slice_dups(self): df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df2 = DataFrame( - np.random.randint(0, 10, size=20).reshape(10, - 2), columns=['A', 'C']) + df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), + columns=['A', 'C']) # axis=1 df = concat([df1, df2], axis=1) - assert_frame_equal(df.iloc[:, :4], df1) - assert_frame_equal(df.iloc[:, 4:], df2) + tm.assert_frame_equal(df.iloc[:, :4], df1) + tm.assert_frame_equal(df.iloc[:, 4:], df2) df = concat([df2, df1], axis=1) - assert_frame_equal(df.iloc[:, :2], df2) - assert_frame_equal(df.iloc[:, 2:], df1) + tm.assert_frame_equal(df.iloc[:, :2], df2) + tm.assert_frame_equal(df.iloc[:, 2:], df1) - assert_frame_equal(df.iloc[:, 0:3], concat( - [df2, df1.iloc[:, [0]]], axis=1)) + exp = concat([df2, df1.iloc[:, [0]]], axis=1) + 
tm.assert_frame_equal(df.iloc[:, 0:3], exp) # axis=0 df = concat([df, df], axis=0) - assert_frame_equal(df.iloc[0:10, :2], df2) - assert_frame_equal(df.iloc[0:10, 2:], df1) - assert_frame_equal(df.iloc[10:, :2], df2) - assert_frame_equal(df.iloc[10:, 2:], df1) + tm.assert_frame_equal(df.iloc[0:10, :2], df2) + tm.assert_frame_equal(df.iloc[0:10, 2:], df1) + tm.assert_frame_equal(df.iloc[10:, :2], df2) + tm.assert_frame_equal(df.iloc[10:, 2:], df1) def test_iloc_getitem_multiindex2(self): # TODO(wesm): fix this @@ -662,11 +650,11 @@ def test_iloc_getitem_multiindex2(self): rs = df.iloc[2] xp = Series(arr[2], index=df.columns) - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) rs = df.iloc[:, 2] xp = Series(arr[:, 2], index=df.index) - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) rs = df.iloc[2, 2] xp = df.values[2, 2] @@ -676,14 +664,14 @@ def test_iloc_getitem_multiindex2(self): # GH 5528 rs = df.iloc[[0, 1]] xp = df.xs(4, drop_level=False) - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) index = MultiIndex.from_tuples(tup) df = DataFrame(np.random.randn(4, 4), index=index) rs = df.iloc[[2, 3]] xp = df.xs('b', drop_level=False) - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) def test_iloc_setitem(self): df = self.frame_ints @@ -695,13 +683,13 @@ def test_iloc_setitem(self): df.iloc[:, 2:3] = 0 expected = df.iloc[:, 2:3] result = df.iloc[:, 2:3] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH5771 s = Series(0, index=[4, 5, 6]) s.iloc[1:2] += 1 expected = Series([0, 1, 0], index=[4, 5, 6]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) def test_loc_setitem_slice(self): # GH10503 @@ -714,7 +702,7 @@ def test_loc_setitem_slice(self): df1.loc[ix, 'b'] = newb1 expected = DataFrame({'a': [0, 1, 1], 'b': Series([100, 201, 301], dtype='uint32')}) - assert_frame_equal(df1, expected) + tm.assert_frame_equal(df1, expected) # assigning a new type should get the inferred type df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, @@ -724,7 +712,7 @@ def test_loc_setitem_slice(self): df1.loc[ix, 'b'] = newb2 expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, dtype='uint64') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) def test_ix_loc_setitem_consistency(self): @@ -733,7 +721,7 @@ def test_ix_loc_setitem_consistency(self): s = Series(0, index=[4, 5, 6]) s.loc[4:5] += 1 expected = Series([1, 1, 0], index=[4, 5, 6]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) # GH 5928 # chained indexing assignment @@ -742,12 +730,12 @@ def test_ix_loc_setitem_consistency(self): expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype('float64') + 0.5 expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 8607 # ix setitem consistency @@ -761,15 +749,15 @@ def test_ix_loc_setitem_consistency(self): df2 = df.copy() df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2 = df.copy() df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - assert_frame_equal(df2, expected) + 
tm.assert_frame_equal(df2, expected) df2 = df.copy() df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) def test_ix_loc_consistency(self): @@ -796,9 +784,8 @@ def compare(result, expected): self.assertRaises(TypeError, lambda: df.loc[key]) - df = pd.DataFrame( - np.random.randn(5, 4), columns=list('ABCD'), - index=pd.date_range('2012-01-01', periods=5)) + df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), + index=pd.date_range('2012-01-01', periods=5)) for key in ['2012-01-03', '2012-01-31', @@ -834,19 +821,19 @@ def compare(result, expected): result1 = s['a':'c'] result2 = s.ix['a':'c'] result3 = s.loc['a':'c'] - assert_series_equal(result1, result2) - assert_series_equal(result1, result3) + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) # now work rather than raising KeyError s = Series(range(5), [-2, -1, 1, 2, 3]) result1 = s.ix[-10:3] result2 = s.loc[-10:3] - assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result2) result1 = s.ix[0:3] result2 = s.loc[0:3] - assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result2) def test_setitem_multiindex(self): for index_fn in ('ix', 'loc'): @@ -859,31 +846,32 @@ def check(target, indexers, value, compare_fn, expected=None): expected = value compare_fn(result, expected) # GH7190 - index = pd.MultiIndex.from_product( - [np.arange(0, 100), np.arange(0, 80)], names=['time', 'firm']) + index = pd.MultiIndex.from_product([np.arange(0, 100), + np.arange(0, 80)], + names=['time', 'firm']) t, n = 0, 2 - df = DataFrame( - np.nan, columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=0, compare_fn=self.assertEqual) - df = DataFrame( - -999, columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=1, compare_fn=self.assertEqual) - df = DataFrame( - columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=2, compare_fn=self.assertEqual) # GH 7218, assinging with 0-dim arrays - df = DataFrame( - -999, columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=np.array(3), @@ -891,54 +879,55 @@ def check(target, indexers, value, compare_fn, expected=None): expected=3, ) # GH5206 - df = pd.DataFrame( - np.arange(25).reshape(5, 5), columns='A,B,C,D,E'.split(','), - dtype=float - ) + df = pd.DataFrame(np.arange(25).reshape(5, 5), + columns='A,B,C,D,E'.split(','), dtype=float) df['F'] = 99 row_selection = df['A'] % 2 == 0 col_selection = ['B', 'C'] df.ix[row_selection, col_selection] = df['F'] output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) - assert_frame_equal(df.ix[row_selection, col_selection], output) + tm.assert_frame_equal(df.ix[row_selection, col_selection], output) check(target=df, indexers=(row_selection, col_selection), value=df['F'], - compare_fn=assert_frame_equal, + compare_fn=tm.assert_frame_equal, expected=output, ) # GH11372 idx = 
pd.MultiIndex.from_product([ - ['A', 'B', 'C'], pd.date_range( - '2015-01-01', '2015-04-01', freq='MS') - ]) + ['A', 'B', 'C'], + pd.date_range('2015-01-01', '2015-04-01', freq='MS')]) cols = pd.MultiIndex.from_product([ - ['foo', 'bar'], pd.date_range( - '2016-01-01', '2016-02-01', freq='MS') - ]) - df = pd.DataFrame( - np.random.random((12, 4)), index=idx, columns=cols) - subidx = pd.MultiIndex.from_tuples([('A', pd.Timestamp( - '2015-01-01')), ('A', pd.Timestamp('2015-02-01'))]) - subcols = pd.MultiIndex.from_tuples([('foo', pd.Timestamp( - '2016-01-01')), ('foo', pd.Timestamp('2016-02-01'))]) - vals = pd.DataFrame( - np.random.random((2, 2)), index=subidx, columns=subcols) + ['foo', 'bar'], + pd.date_range('2016-01-01', '2016-02-01', freq='MS')]) + + df = pd.DataFrame(np.random.random((12, 4)), + index=idx, columns=cols) + + subidx = pd.MultiIndex.from_tuples( + [('A', pd.Timestamp('2015-01-01')), + ('A', pd.Timestamp('2015-02-01'))]) + subcols = pd.MultiIndex.from_tuples( + [('foo', pd.Timestamp('2016-01-01')), + ('foo', pd.Timestamp('2016-02-01'))]) + + vals = pd.DataFrame(np.random.random((2, 2)), + index=subidx, columns=subcols) check(target=df, indexers=(subidx, subcols), value=vals, - compare_fn=assert_frame_equal, ) + compare_fn=tm.assert_frame_equal, ) # set all columns vals = pd.DataFrame( np.random.random((2, 4)), index=subidx, columns=cols) check(target=df, indexers=(subidx, slice(None, None, None)), value=vals, - compare_fn=assert_frame_equal, ) + compare_fn=tm.assert_frame_equal, ) # identity copy = df.copy() check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=assert_frame_equal, expected=copy) + compare_fn=tm.assert_frame_equal, expected=copy) def test_indexing_with_datetime_tz(self): @@ -958,12 +947,12 @@ def test_indexing_with_datetime_tz(self): expected = Series([Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan], index=list('ABC'), dtype='object', name=1) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.loc[1] expected = Series([Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan], index=list('ABC'), dtype='object', name=1) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # indexing - fast_xs df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) @@ -977,7 +966,7 @@ def test_indexing_with_datetime_tz(self): # indexing - boolean result = df[df.a > df.a[3]] expected = df.iloc[4:] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # indexing - setting an element df = DataFrame(data=pd.to_datetime( @@ -995,7 +984,7 @@ def f(): v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') df.loc[df.new_col == 'new', 'time'] = v - assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) + tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) def test_indexing_with_datetimeindex_tz(self): @@ -1010,22 +999,22 @@ def test_indexing_with_datetimeindex_tz(self): for sel in (index, list(index)): # getitem - assert_series_equal(ser[sel], ser) + tm.assert_series_equal(ser[sel], ser) # setitem result = ser.copy() result[sel] = 1 expected = pd.Series(1, index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # .loc getitem - assert_series_equal(ser.loc[sel], ser) + tm.assert_series_equal(ser.loc[sel], ser) # .loc setitem result = ser.copy() result.loc[sel] = 1 expected = pd.Series(1, index=index) - assert_series_equal(result, expected) + 
tm.assert_series_equal(result, expected) # single element indexing @@ -1036,7 +1025,7 @@ def test_indexing_with_datetimeindex_tz(self): result = ser.copy() result[index[1]] = 5 expected = pd.Series([0, 5], index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # .loc getitem self.assertEqual(ser.loc[index[1]], 1) @@ -1045,7 +1034,7 @@ def test_indexing_with_datetimeindex_tz(self): result = ser.copy() result.loc[index[1]] = 5 expected = pd.Series([0, 5], index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_loc_setitem_dups(self): @@ -1059,7 +1048,7 @@ def test_loc_setitem_dups(self): indexer = tuple(['r', ['bar', 'bar2']]) df = df_orig.copy() df.loc[indexer] *= 2.0 - assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) + tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) indexer = tuple(['r', 'bar']) df = df_orig.copy() @@ -1069,7 +1058,7 @@ def test_loc_setitem_dups(self): indexer = tuple(['t', ['bar', 'bar2']]) df = df_orig.copy() df.loc[indexer] *= 2.0 - assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) + tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) def test_iloc_setitem_dups(self): @@ -1084,24 +1073,24 @@ def test_iloc_setitem_dups(self): inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # del a dup column across blocks expected = DataFrame({0: [1, 2], 1: [3, 4]}) expected.columns = ['B', 'B'] del df['A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # assign back to self df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # reversed x 2 df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( drop=True) df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( drop=True) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_chained_getitem_with_lists(self): @@ -1196,12 +1185,14 @@ def test_loc_getitem_label_list(self): self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], typs=['ints'], axes=2, fails=KeyError) + def test_loc_getitem_label_list_fails(self): # fails self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], typs=['ints'], axes=1, fails=KeyError) self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], typs=['ints'], axes=2, fails=KeyError) + def test_loc_getitem_label_array_like(self): # array like self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, 'ix', [0, 2, 4], typs=['ints'], axes=0) @@ -1211,7 +1202,6 @@ def test_loc_getitem_label_list(self): 'ix', [4, 8, 12], typs=['ints'], axes=2) def test_loc_getitem_bool(self): - # boolean indexers b = [True, False, True, False] self.check_result('bool', 'loc', b, 'ix', b, @@ -1238,7 +1228,7 @@ def test_loc_getitem_int_slice(self): df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df.ix[6:8, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) index = MultiIndex.from_tuples([t for t in product( @@ -1246,17 +1236,17 @@ def test_loc_getitem_int_slice(self): df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.ix[20:30, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # doc examples result = df.loc[10, :] expected = df.ix[10, :] - 
assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[:, 10] # expected = df.ix[:,10] (this fails) expected = df[10] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_loc_to_fail(self): @@ -1284,7 +1274,7 @@ def test_loc_to_fail(self): s.loc[-1] = 3 result = s.loc[[-1, -2]] expected = Series([3, np.nan], index=[-1, -2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s['a'] = 2 self.assertRaises(KeyError, lambda: s.loc[[-2]]) @@ -1310,6 +1300,7 @@ def f(): self.assertRaises(KeyError, f) + def test_at_to_fail(self): # at should not fallback # GH 7814 s = Series([1, 2, 3], index=list('abc')) @@ -1344,27 +1335,28 @@ def f(): def test_loc_getitem_label_slice(self): # label slices (with ints) - self.check_result('lab slice', 'loc', slice(1, 3), 'ix', slice( - 1, 3), typs=['labels', 'mixed', 'empty', 'ts', 'floats'], - fails=TypeError) + self.check_result('lab slice', 'loc', slice(1, 3), + 'ix', slice(1, 3), + typs=['labels', 'mixed', 'empty', 'ts', 'floats'], + fails=TypeError) # real label slices - self.check_result('lab slice', 'loc', slice('a', 'c'), 'ix', slice( - 'a', 'c'), typs=['labels'], axes=0) - self.check_result('lab slice', 'loc', slice('A', 'C'), 'ix', slice( - 'A', 'C'), typs=['labels'], axes=1) - self.check_result('lab slice', 'loc', slice('W', 'Z'), 'ix', slice( - 'W', 'Z'), typs=['labels'], axes=2) - - self.check_result('ts slice', 'loc', slice( - '20130102', '20130104'), 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=0) - self.check_result('ts slice', 'loc', slice( - '20130102', '20130104'), 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=1, fails=TypeError) - self.check_result('ts slice', 'loc', slice( - '20130102', '20130104'), 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=2, fails=TypeError) + self.check_result('lab slice', 'loc', slice('a', 'c'), + 'ix', slice('a', 'c'), typs=['labels'], axes=0) + self.check_result('lab slice', 'loc', slice('A', 'C'), + 'ix', slice('A', 'C'), typs=['labels'], axes=1) + self.check_result('lab slice', 'loc', slice('W', 'Z'), + 'ix', slice('W', 'Z'), typs=['labels'], axes=2) + + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=0) + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=1, fails=TypeError) + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=2, fails=TypeError) self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), typs=['mixed'], axes=0, fails=TypeError) @@ -1390,7 +1382,7 @@ def test_loc_general(self): # mixed type result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0] expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) self.assertEqual(result.dtype, object) def test_loc_setitem_consistency(self): @@ -1404,49 +1396,45 @@ def test_loc_setitem_consistency(self): 'val': Series( range(5), dtype=np.int64)}) df.loc[:, 'date'] = 0 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = np.array(0, dtype=np.int64) - assert_frame_equal(df, expected) 
+ tm.assert_frame_equal(df, expected) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = DataFrame({'date': Series('foo', index=range(5)), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = 'foo' - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = DataFrame({'date': Series(1.0, index=range(5)), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = 1.0 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_empty(self): # empty (essentially noops) expected = DataFrame(columns=['x', 'y']) expected['x'] = expected['x'].astype(np.int64) df = DataFrame(columns=['x', 'y']) df.loc[:, 'x'] = 1 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(columns=['x', 'y']) df['x'] = 1 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_slice_column_len(self): # .loc[:,column] setting with slice == len of the column # GH10408 data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat @@ -1469,7 +1457,7 @@ def test_loc_setitem_consistency(self): 'Respondent', 'Duration')].astype('timedelta64[s]') expected = Series([1380, 720, 840, 2160.], index=df.index, name=('Respondent', 'Duration')) - assert_series_equal(df[('Respondent', 'Duration')], expected) + tm.assert_series_equal(df[('Respondent', 'Duration')], expected) def test_loc_setitem_frame(self): df = self.frame_labels @@ -1486,7 +1474,7 @@ def test_loc_setitem_frame(self): df.loc[:, 'B':'D'] = 0 expected = df.loc[:, 'B':'D'] result = df.ix[:, 1:] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 6254 # setting issue @@ -1494,7 +1482,7 @@ def test_loc_setitem_frame(self): df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64') expected = DataFrame(dict(A=Series( [1, 2, 3], index=[4, 3, 5]))).reindex(index=[3, 5, 4]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 6252 # setting with an empty frame @@ -1514,14 +1502,14 @@ def test_loc_setitem_frame(self): expected = DataFrame(dict(A=Series(val1, index=keys1), B=Series( val2, index=keys2))).reindex(index=index) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 8669 # invalid coercion of nan -> int df = DataFrame({'A': [1, 2, 3], 'B': np.nan}) df.loc[df.B > df.A, 'B'] = df.A expected = DataFrame({'A': [1, 2, 3], 'B': np.nan}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 6546 # setting with mixed labels @@ -1529,11 +1517,11 @@ def test_loc_setitem_frame(self): result = df.loc[0, [1, 2]] expected = Series([1, 3], index=[1, 2], dtype=object, name=0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']}) df.loc[0, [1, 2]] = [5, 6] - 
assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_loc_setitem_frame_multiples(self): # multiple setting @@ -1546,7 +1534,7 @@ def test_loc_setitem_frame_multiples(self): expected = DataFrame({'A': ['bar', 'baz', 'baz'], 'B': Series( [1, 2, 2], dtype=np.int64)}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # multiple setting with frame on rhs (with M8) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), @@ -1560,16 +1548,15 @@ def test_loc_setitem_frame_multiples(self): rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_iloc_getitem_frame(self): - df = DataFrame( - np.random.randn( - 10, 4), index=lrange(0, 20, 2), columns=lrange(0, 8, 2)) + df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), + columns=lrange(0, 8, 2)) result = df.iloc[2] exp = df.ix[4] - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) result = df.iloc[2, 2] exp = df.ix[4, 4] @@ -1578,41 +1565,41 @@ def test_iloc_getitem_frame(self): # slice result = df.iloc[4:8] expected = df.ix[8:14] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 2:3] expected = df.ix[:, 4:5] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # list of integers result = df.iloc[[0, 1, 3]] expected = df.ix[[0, 2, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[[0, 1, 3], [0, 1]] expected = df.ix[[0, 2, 6], [0, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # neg indicies result = df.iloc[[-1, 1, 3], [-1, 1]] expected = df.ix[[18, 2, 6], [6, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # dups indicies result = df.iloc[[-1, -1, 1, 3], [-1, 1]] expected = df.ix[[18, 18, 2, 6], [6, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # with index-like s = Series(index=lrange(1, 5)) result = df.iloc[s.index] expected = df.ix[[2, 4, 6, 8]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + def test_iloc_getitem_labelled_frame(self): # try with labelled frame - df = DataFrame( - np.random.randn(10, - 4), index=list('abcdefghij'), columns=list('ABCD')) + df = DataFrame(np.random.randn(10, 4), + index=list('abcdefghij'), columns=list('ABCD')) result = df.iloc[1, 1] exp = df.ix['b', 'B'] @@ -1620,7 +1607,7 @@ def test_iloc_getitem_frame(self): result = df.iloc[:, 2:3] expected = df.ix[:, ['C']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # negative indexing result = df.iloc[-1, -1] @@ -1643,11 +1630,11 @@ def test_iloc_getitem_panel(self): result = p.iloc[1] expected = p.loc['B'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = p.iloc[1, 1] expected = p.loc['B', 'b'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = p.iloc[1, 1, 1] expected = p.loc['B', 'b', 'two'] @@ -1656,31 +1643,31 @@ def test_iloc_getitem_panel(self): # slice result = p.iloc[1:3] expected = p.loc[['B', 'C']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) result = p.iloc[:, 0:2] expected = p.loc[:, ['a', 'b']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # list of integers result = p.iloc[[0, 2]] expected = p.loc[['A', 'C']] - 
assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # neg indicies result = p.iloc[[-1, 1], [-1, 1]] expected = p.loc[['D', 'B'], ['c', 'b']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # dups indicies result = p.iloc[[-1, -1, 1], [-1, 1]] expected = p.loc[['D', 'D', 'B'], ['c', 'b']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # combined result = p.iloc[0, [True, True], [0, 1]] expected = p.loc['A', ['a', 'b'], ['one', 'two']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # out-of-bounds exception self.assertRaises(IndexError, p.iloc.__getitem__, tuple([10, 5])) @@ -1700,13 +1687,13 @@ def f(): expected = p['A'] result = p.iloc[0, :, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = p.iloc[0, [True, True, True], :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = p.iloc[0, [True, True, True], [0, 1]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def f(): p.iloc[0, [True, True, True], [0, 1, 2]] @@ -1718,6 +1705,7 @@ def f(): self.assertRaises(IndexError, f) + def test_iloc_getitem_panel_multiindex(self): # GH 7199 # Panel with multi-index multi_index = pd.MultiIndex.from_tuples([('ONE', 'one'), @@ -1734,33 +1722,33 @@ def f(): expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG - assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result1, expected1) expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] - assert_frame_equal(result2, expected2) + tm.assert_frame_equal(result2, expected2) expected1 = DataFrame(index=['a'], columns=multi_index, dtype='float64') result1 = wd1.iloc[0, [0], [0, 1, 2]] - assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result1, expected1) expected2 = DataFrame(index=['a'], columns=simple_index, dtype='float64') result2 = wd2.iloc[0, [0], [0, 1, 2]] - assert_frame_equal(result2, expected2) + tm.assert_frame_equal(result2, expected2) # GH 7516 mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) - p = Panel( - np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), - items=['a', 'b', 'c'], major_axis=mi, minor_axis=['u', 'v', 'w']) + p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), + items=['a', 'b', 'c'], major_axis=mi, + minor_axis=['u', 'v', 'w']) result = p.iloc[:, 1, 0] expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = p.loc[:, (1, 'y'), 'u'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_getitem_doc_issue(self): @@ -1781,7 +1769,7 @@ def test_iloc_getitem_doc_issue(self): expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=columns[0:2]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # for dups df.columns = list('aaaa') @@ -1791,7 +1779,7 @@ def test_iloc_getitem_doc_issue(self): expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=list('aa')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # related arr = np.random.randn(6, 4) @@ -1805,7 +1793,7 @@ def test_iloc_getitem_doc_issue(self): result.dtypes expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) - 
assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_setitem_ndarray_1d(self): # GH5508 @@ -1827,7 +1815,7 @@ def f(): result = df.ix[2:5, 'bar'] expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[2, 3, 4, 5], name='bar') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # dtype getting changed? df = DataFrame(index=Index(lrange(1, 11))) @@ -1840,9 +1828,8 @@ def f(): self.assertRaises(ValueError, f) def test_iloc_setitem_series(self): - df = DataFrame( - np.random.randn(10, - 4), index=list('abcdefghij'), columns=list('ABCD')) + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), + columns=list('ABCD')) df.iloc[1, 1] = 1 result = df.iloc[1, 1] @@ -1851,7 +1838,7 @@ def test_iloc_setitem_series(self): df.iloc[:, 2:3] = 0 expected = df.iloc[:, 2:3] result = df.iloc[:, 2:3] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) s = Series(np.random.randn(10), index=lrange(0, 20, 2)) @@ -1862,35 +1849,35 @@ def test_iloc_setitem_series(self): s.iloc[:4] = 0 expected = s.iloc[:4] result = s.iloc[:4] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s = Series([-1] * 6) s.iloc[0::2] = [0, 2, 4] s.iloc[1::2] = [1, 3, 5] result = s expected = Series([0, 1, 2, 3, 4, 5]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_setitem_list_of_lists(self): # GH 7551 # list-of-list is set incorrectly in mixed vs. single dtyped frames - df = DataFrame(dict(A=np.arange(5, dtype='int64'), B=np.arange( - 5, 10, dtype='int64'))) + df = DataFrame(dict(A=np.arange(5, dtype='int64'), + B=np.arange(5, 10, dtype='int64'))) df.iloc[2:4] = [[10, 11], [12, 13]] expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame( dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) df.iloc[2:4] = [['x', 11], ['y', 13]] - expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], B=[5, 6, 11, 13, - 9])) - assert_frame_equal(df, expected) + expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], + B=[5, 6, 11, 13, 9])) + tm.assert_frame_equal(df, expected) def test_iloc_getitem_multiindex(self): - mi_labels = DataFrame(np.random.randn(4, 3), columns=[['i', 'i', 'j'], - ['A', 'A', 'B']], + mi_labels = DataFrame(np.random.randn(4, 3), + columns=[['i', 'i', 'j'], ['A', 'A', 'B']], index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y', 'Y']]) @@ -1901,14 +1888,14 @@ def test_iloc_getitem_multiindex(self): # the first row rs = mi_int.iloc[0] xp = mi_int.ix[4].ix[8] - assert_series_equal(rs, xp, check_names=False) + tm.assert_series_equal(rs, xp, check_names=False) self.assertEqual(rs.name, (4, 8)) self.assertEqual(xp.name, 8) # 2nd (last) columns rs = mi_int.iloc[:, 2] xp = mi_int.ix[:, 2] - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) # corner column rs = mi_int.iloc[2, 2] @@ -1922,8 +1909,8 @@ def test_iloc_getitem_multiindex(self): def test_loc_multiindex(self): - mi_labels = DataFrame(np.random.randn(3, 3), columns=[['i', 'i', 'j'], - ['A', 'A', 'B']], + mi_labels = DataFrame(np.random.randn(3, 3), + columns=[['i', 'i', 'j'], ['A', 'A', 'B']], index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) mi_int = DataFrame(np.random.randn(3, 3), @@ -1933,26 +1920,28 @@ def test_loc_multiindex(self): # the first row rs = mi_labels.loc['i'] xp = mi_labels.ix['i'] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) # 2nd (last) columns rs = 
mi_labels.loc[:, 'j'] xp = mi_labels.ix[:, 'j'] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) # corner column rs = mi_labels.loc['j'].loc[:, 'j'] xp = mi_labels.ix['j'].ix[:, 'j'] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) # with a tuple rs = mi_labels.loc[('i', 'X')] xp = mi_labels.ix[('i', 'X')] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) rs = mi_int.loc[4] xp = mi_int.ix[4] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) + + def test_loc_multiindex_indexer_none(self): # GH6788 # multi-index indexer is None (meaning take all) @@ -1963,45 +1952,45 @@ def test_loc_multiindex(self): df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 df = DataFrame(df, columns=index) result = df[attributes] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # GH 7349 # loc with a multi-index seems to be doing fallback - df = DataFrame( - np.arange(12).reshape(-1, 1), - index=pd.MultiIndex.from_product([[1, 2, 3, 4], [1, 2, 3]])) + df = DataFrame(np.arange(12).reshape(-1, 1), + index=pd.MultiIndex.from_product([[1, 2, 3, 4], + [1, 2, 3]])) expected = df.loc[([1, 2], ), :] result = df.loc[[1, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_incomplete(self): # GH 7399 # incomplete indexers - s = pd.Series( - np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + s = pd.Series(np.arange(15, dtype='int64'), + MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.loc[:, 'a':'c'] result = s.loc[0:4, 'a':'c'] - assert_series_equal(result, expected) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[:4, 'a':'c'] - assert_series_equal(result, expected) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[0:, 'a':'c'] - assert_series_equal(result, expected) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 7400 # multiindexer gettitem with list of indexers skips wrong element - s = pd.Series( - np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + s = pd.Series(np.arange(15, dtype='int64'), + MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.iloc[[6, 7, 8, 12, 13, 14]] result = s.loc[2:4:2, 'a':'c'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_multiindex_perf_warn(self): @@ -2020,76 +2009,6 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): df.loc[(0, )] - @slow - def test_multiindex_get_loc(self): # GH7724, GH2646 - - with warnings.catch_warnings(record=True): - - # test indexing into a multi-index before & past the lexsort depth - from numpy.random import randint, choice, randn - cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] - - def validate(mi, df, key): - mask = np.ones(len(df)).astype('bool') - - # test for all partials of this key - for i, k in enumerate(key): - mask &= df.iloc[:, i] == k - - if not mask.any(): - self.assertNotIn(key[:i + 1], mi.index) - continue - - self.assertIn(key[:i + 1], mi.index) - right = df[mask].copy() - - if i + 1 != len(key): # partial key - right.drop(cols[:i + 1], axis=1, inplace=True) - right.set_index(cols[i + 1:-1], inplace=True) - assert_frame_equal(mi.loc[key[:i + 1]], right) - - else: # full key - 
right.set_index(cols[:-1], inplace=True) - if len(right) == 1: # single hit - right = Series(right['jolia'].values, - name=right.index[0], - index=['jolia']) - assert_series_equal(mi.loc[key[:i + 1]], right) - else: # multi hit - assert_frame_equal(mi.loc[key[:i + 1]], right) - - def loop(mi, df, keys): - for key in keys: - validate(mi, df, key) - - n, m = 1000, 50 - - vals = [randint(0, 10, n), choice( - list('abcdefghij'), n), choice( - pd.date_range('20141009', periods=10).tolist(), n), choice( - list('ZYXWVUTSRQ'), n), randn(n)] - vals = list(map(tuple, zip(*vals))) - - # bunch of keys for testing - keys = [randint(0, 11, m), choice( - list('abcdefghijk'), m), choice( - pd.date_range('20141009', periods=11).tolist(), m), choice( - list('ZYXWVUTSRQP'), m)] - keys = list(map(tuple, zip(*keys))) - keys += list(map(lambda t: t[:-1], vals[::n // m])) - - # covers both unique index and non-unique index - df = pd.DataFrame(vals, columns=cols) - a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) - - for frame in a, b: - for i in range(5): # lexsort depth - df = frame.copy() if i == 0 else frame.sort_values( - by=cols[:i]) - mi = df.set_index(cols[:-1]) - assert not mi.index.lexsort_depth < i - loop(mi, df, keys) - def test_series_getitem_multiindex(self): # GH 6018 @@ -2100,27 +2019,29 @@ def test_series_getitem_multiindex(self): result = s[:, 0] expected = Series([1], index=[0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.ix[:, 1] expected = Series([2, 3], index=[1, 2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # xs result = s.xs(0, level=0) expected = Series([1], index=[0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.xs(1, level=1) expected = Series([2, 3], index=[1, 2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH6258 - s = Series([1, 3, 4, 1, 3, 4], index=MultiIndex.from_product([list( - 'AB'), list(date_range('20130903', periods=3))])) + dt = list(date_range('20130903', periods=3)) + idx = MultiIndex.from_product([list('AB'), dt]) + s = Series([1, 3, 4, 1, 3, 4], index=idx) + result = s.xs('20130903', level=1) expected = Series([1, 1], index=list('AB')) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH5684 idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), @@ -2130,28 +2051,16 @@ def test_series_getitem_multiindex(self): result = s.xs('one', level='L2') expected = Series([1, 3], index=['a', 'b']) expected.index.set_names(['L1'], inplace=True) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_ix_general(self): # ix general issues # GH 2817 - data = {'amount': {0: 700, - 1: 600, - 2: 222, - 3: 333, - 4: 444}, - 'col': {0: 3.5, - 1: 3.5, - 2: 4.0, - 3: 4.0, - 4: 4.0}, - 'year': {0: 2012, - 1: 2011, - 2: 2012, - 3: 2012, - 4: 2012}} + data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} df = DataFrame(data).set_index(keys=['col', 'year']) key = 4.0, 2012 @@ -2184,7 +2093,7 @@ def test_ix_weird_slicing(self): 2: -3, 3: 4, 4: 5}}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_xs_multiindex(self): @@ -2196,12 +2105,12 @@ def test_xs_multiindex(self): df.sortlevel(axis=1, inplace=True) result = df.xs('a', level='lvl0', axis=1) expected = df.iloc[:, 
0:2].loc[:, 'a'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.xs('foo', level='lvl1', axis=1) expected = df.iloc[:, 1:2].copy() expected.columns = expected.columns.droplevel('lvl1') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_per_axis_per_level_getitem(self): @@ -2216,14 +2125,14 @@ def test_per_axis_per_level_getitem(self): for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C2' or c == 'C3')]] result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # test multi-index slicing with per axis and per index controls index = MultiIndex.from_tuples([('A', 1), ('A', 2), @@ -2240,40 +2149,40 @@ def test_per_axis_per_level_getitem(self): # identity result = df.loc[(slice(None), slice(None)), :] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) result = df.loc[:, (slice(None), slice(None))] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # index result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), 1), :] expected = df.iloc[[0, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # columns result = df.loc[:, (slice(None), ['foo'])] expected = df.iloc[:, [1, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # both result = df.loc[(slice(None), 1), (slice(None), ['foo'])] expected = df.iloc[[0, 3], [1, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc['A', 'a'] expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), index=Index([1, 2, 3], name='two'), columns=Index(['bar', 'foo'], name='lvl1')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1, 2]), :] expected = df.iloc[[0, 1, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi-level series s = Series(np.arange(len(ix.get_values())), index=ix) @@ -2282,12 +2191,12 @@ def test_per_axis_per_level_getitem(self): for a, b, c, d in s.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # boolean indexers result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] expected = df.iloc[[2, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def f(): df.loc[(slice(None), np.array([True, False])), :] @@ -2301,7 +2210,7 @@ def f(): result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # not lexsorted self.assertEqual(df.index.lexsort_depth, 2) @@ -2327,11 +2236,11 @@ def test_multiindex_slicers_non_unique(self): C=[1, 1], D=[1, 3])) .set_index(['A', 'B', 'C']).sortlevel()) result = df.loc[(slice(None), slice(None), 1), :] - 
assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # this is equivalent of an xs expression result = df.xs(1, level=2, drop_level=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], B=['a', 'a', 'a', 'a'], @@ -2344,7 +2253,7 @@ def test_multiindex_slicers_non_unique(self): .set_index(['A', 'B', 'C']).sortlevel()) result = df.loc[(slice(None), slice(None), 1), :] self.assertFalse(result.index.is_unique) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH12896 # numpy-implementation dependent bug @@ -2356,7 +2265,7 @@ def test_multiindex_slicers_non_unique(self): result = result.sort_index() result = result.loc[(slice(None), slice(100000))] expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_multiindex_slicers_datetimelike(self): @@ -2376,28 +2285,28 @@ def test_multiindex_slicers_datetimelike(self): # multi-axis slicing idx = pd.IndexSlice expected = df.iloc[[0, 2, 4], [0, 1]] - result = df.loc[(slice( - Timestamp('2012-01-01 12:12:12'), Timestamp( - '2012-01-03 12:12:12')), slice(1, 1)), slice('A', 'B')] - assert_frame_equal(result, expected) + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), + slice(1, 1)), slice('A', 'B')] + tm.assert_frame_equal(result, expected) result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) - result = df.loc[(slice( - Timestamp('2012-01-01 12:12:12'), Timestamp( - '2012-01-03 12:12:12')), 1), slice('A', 'B')] - assert_frame_equal(result, expected) + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), 1), + slice('A', 'B')] + tm.assert_frame_equal(result, expected) # with strings result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), slice(1, 1)), slice('A', 'B')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), idx['A', 'B']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_multiindex_slicers_edges(self): # GH 8132 @@ -2418,47 +2327,47 @@ def test_multiindex_slicers_edges(self): # A1 - Get all values under "A0" and "A1" result = df1.loc[(slice('A1')), :] expected = df1.iloc[0:10] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # A2 - Get all values from the start to "A2" result = df1.loc[(slice('A2')), :] expected = df1 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # A3 - Get all values under "B1" or "B2" result = df1.loc[(slice(None), slice('B1', 'B2')), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # A4 - Get all values between 2013-07-02 and 2013-07-09 - result = df1.loc[(slice(None), slice(None), slice('20130702', - '20130709')), :] + result = df1.loc[(slice(None), slice(None), + slice('20130702', '20130709')), :] expected = df1.iloc[[1, 2, 6, 7, 12]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B1 - Get all values in B0 that are also under A0, A1 and A2 result = df1.loc[(slice('A2'), slice('B0')), :] expected = df1.iloc[[0, 1, 5, 6, 
10, 11]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for # the As) result = df1.loc[(slice(None), slice('B2')), :] expected = df1 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B3 - Get all values from B1 to B2 and up to 2013-08-06 - result = df1.loc[(slice(None), slice('B1', 'B2'), slice('2013-08-06') - ), :] + result = df1.loc[(slice(None), slice('B1', 'B2'), + slice('2013-08-06')), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B4 - Same as A4 but the start of the date slice is not a key. # shows indexing on a partial selection slice - result = df1.loc[(slice(None), slice(None), slice('20130701', - '20130709')), :] + result = df1.loc[(slice(None), slice(None), + slice('20130701', '20130709')), :] expected = df1.iloc[[1, 2, 6, 7, 12]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_per_axis_per_level_doc_examples(self): @@ -2473,24 +2382,23 @@ def test_per_axis_per_level_doc_examples(self): names=['lvl0', 'lvl1']) df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') .reshape((len(index), len(columns))), - index=index, - columns=columns) + index=index, columns=columns) result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[idx[:, :, ['C1', 'C3']], :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # not sorted def f(): @@ -2524,22 +2432,22 @@ def test_loc_axis_arguments(self): for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc(axis='index')[:, :, ['C1', 'C3']] expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # axis 1 result = df.loc(axis=1)[:, 'foo'] expected = df.loc[:, (slice(None), 'foo')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc(axis='columns')[:, 'foo'] expected = df.loc[:, (slice(None), 'foo')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # invalid axis def f(): @@ -2565,10 +2473,10 @@ def test_loc_coerceion(self): expected = df.dtypes result = df.iloc[[0]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) result = df.iloc[[1]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) # 12045 import datetime @@ -2577,20 +2485,20 @@ def test_loc_coerceion(self): expected = df.dtypes result = df.iloc[[0]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, 
expected) result = df.iloc[[1]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) # 11594 df = DataFrame({'text': ['some words'] + [None] * 9}) expected = df.dtypes result = df.iloc[0:2] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) result = df.iloc[3:] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) def test_per_axis_per_level_setitem(self): @@ -2615,70 +2523,70 @@ def test_per_axis_per_level_setitem(self): df.loc[(slice(None), slice(None)), :] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc(axis=0)[:, :] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[:, (slice(None), slice(None))] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # index df = df_orig.copy() df.loc[(slice(None), [1]), :] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), 1), :] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc(axis=0)[:, 1] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # columns df = df_orig.copy() df.loc[:, (slice(None), ['foo'])] = 100 expected = df_orig.copy() expected.iloc[:, [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # both df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[idx[:, 1], idx[:, ['foo']]] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc['A', 'a'] = 100 expected = df_orig.copy() expected.iloc[0:3, 0:2] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # setting with a list-like df = df_orig.copy() @@ -2686,7 +2594,7 @@ def test_per_axis_per_level_setitem(self): [[100, 100], [100, 100]], dtype='int64') expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # not enough values df = df_orig.copy() @@ -2709,14 +2617,14 @@ def f(): None), 1), (slice(None), ['foo'])] * 5 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( None), 1), (slice(None), ['foo'])] expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() 
rhs.loc[:, ('c', 'bah')] = 10 @@ -2724,7 +2632,7 @@ def f(): df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_multiindex_setitem(self): @@ -2741,7 +2649,7 @@ def test_multiindex_setitem(self): expected = df_orig.loc[['bar']] * 2 df = df_orig.copy() df.loc[['bar']] *= 2 - assert_frame_equal(df.loc[['bar']], expected) + tm.assert_frame_equal(df.loc[['bar']], expected) # raise because these have differing levels def f(): @@ -2768,17 +2676,17 @@ def f(): idx = pd.IndexSlice df = df_orig.copy() df.loc[idx[:, :, 'Stock'], :] *= 2 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[idx[:, :, 'Stock'], 'price'] *= 2 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_getitem_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise # the appropriate error, only in PY3 of course! - index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, - 82]], + index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) @@ -2787,7 +2695,7 @@ def test_getitem_multiindex(self): result = df.val['D'] expected = Series(arr.ravel()[0:3], name='val', index=Index( [26, 37, 57], name='day')) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def f(): df.val['A'] @@ -2800,8 +2708,8 @@ def f(): self.assertRaises(KeyError, f) # A is treated as a special Timestamp - index = MultiIndex(levels=[['A', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, - 82]], + index = MultiIndex(levels=[['A', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) @@ -2809,7 +2717,7 @@ def f(): result = df.val['A'] expected = Series(arr.ravel()[0:3], name='val', index=Index( [26, 37, 57], name='day')) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def f(): df.val['X'] @@ -2818,20 +2726,20 @@ def f(): # GH 7866 # multi-index slicing with missing indexers - s = pd.Series(np.arange(9, dtype='int64'), - index=pd.MultiIndex.from_product( - [['A', 'B', 'C'], ['foo', 'bar', 'baz']], - names=['one', 'two'])).sortlevel() + idx = pd.MultiIndex.from_product([['A', 'B', 'C'], + ['foo', 'bar', 'baz']], + names=['one', 'two']) + s = pd.Series(np.arange(9, dtype='int64'), index=idx).sortlevel() + exp_idx = pd.MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], + names=['one', 'two']) expected = pd.Series(np.arange(3, dtype='int64'), - index=pd.MultiIndex.from_product( - [['A'], ['foo', 'bar', 'baz']], - names=['one', 'two'])).sortlevel() + index=exp_idx).sortlevel() result = s.loc[['A']] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[['A', 'D']] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # not any values found self.assertRaises(KeyError, lambda: s.loc[['D']]) @@ -2839,16 +2747,16 @@ def f(): # empty ok result = s.loc[[]] expected = s.iloc[[]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) idx = pd.IndexSlice expected = pd.Series([0, 3, 6], index=pd.MultiIndex.from_product( [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sortlevel() result = s.loc[idx[:, ['foo']]] - 
assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[idx[:, ['foo', 'bah']]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 8737 # empty indexer @@ -2862,8 +2770,8 @@ def f(): columns=multi_index.reindex([])[0]) result1 = df.loc[:, ([], slice(None))] result2 = df.loc[:, (['foo'], [])] - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) # regression from < 0.14.0 # GH 7914 @@ -2882,11 +2790,9 @@ def test_setitem_dtype_upcast(self): self.assertEqual(df['c'].dtype, np.float64) df.ix[0, 'c'] = 'foo' - expected = DataFrame([{"a": 1, - "c": 'foo'}, {"a": 3, - "b": 2, - "c": np.nan}]) - assert_frame_equal(df, expected) + expected = DataFrame([{"a": 1, "c": 'foo'}, + {"a": 3, "b": 2, "c": np.nan}]) + tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), @@ -2899,7 +2805,7 @@ def test_setitem_dtype_upcast(self): right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'), columns=['foo', 'bar', 'baz']) - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) self.assertTrue(is_integer_dtype(left['foo'])) self.assertTrue(is_integer_dtype(left['baz'])) @@ -2911,23 +2817,22 @@ def test_setitem_dtype_upcast(self): right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'), columns=['foo', 'bar', 'baz']) - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) self.assertTrue(is_float_dtype(left['foo'])) self.assertTrue(is_float_dtype(left['baz'])) def test_setitem_iloc(self): # setitem with an iloc list - df = DataFrame( - np.arange(9).reshape((3, 3)), index=["A", "B", "C"], - columns=["A", "B", "C"]) + df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], + columns=["A", "B", "C"]) df.iloc[[0, 1], [1, 2]] df.iloc[[0, 1], [1, 2]] += 100 expected = DataFrame( np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_dups_fancy_indexing(self): @@ -2951,7 +2856,7 @@ def test_dups_fancy_indexing(self): df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa - assert_frame_equal(df, result) + tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( @@ -2964,10 +2869,10 @@ def test_dups_fancy_indexing(self): 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) result = df.ix[rows] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.ix[Index(rows)] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( @@ -2976,7 +2881,7 @@ def test_dups_fancy_indexing(self): 'other': ['d', 'c', np.nan]}, index=rows) result = df.ix[rows] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] @@ -2986,7 +2891,7 @@ def test_dups_fancy_indexing(self): 'd', 'c', np.nan]}, index=rows) result = df.ix[rows] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when values are # missing @@ -2995,7 +2900,7 @@ def test_dups_fancy_indexing(self): dfnu = DataFrame(randn(5, 3), index=list('AABCD')) result = dfnu.ix[['E']] - assert_frame_equal(result, expected) + 
tm.assert_frame_equal(result, expected) # ToDo: check_index_type can be True after GH 11497 @@ -3003,19 +2908,19 @@ def test_dups_fancy_indexing(self): df = DataFrame({"A": [0, 1, 2]}) result = df.ix[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) - assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) result = df.ix[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) - assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) result = df.ix[['A', 'A', 'E']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 5835 # dups on index and missing values @@ -3026,34 +2931,29 @@ def test_dups_fancy_indexing(self): [df.ix[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], index=df.index)], axis=1) result = df.ix[:, ['A', 'B', 'C']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing - df = DataFrame( - np.random.randn( - 9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) + df = DataFrame(np.random.randn(9, 2), + index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) expected = df.iloc[0:6] result = df.loc[[1, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df result = df.loc[:, ['a', 'b']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] result = df.loc[[1, 2], ['a', 'b']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_indexing_mixed_frame_bug(self): # GH3492 - df = DataFrame({'a': {1: 'aaa', - 2: 'bbb', - 3: 'ccc'}, - 'b': {1: 111, - 2: 222, - 3: 333}}) + df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'}, + 'b': {1: 111, 2: 222, 3: 333}}) # this works, new column is created correctly df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x) @@ -3138,7 +3038,7 @@ def test_set_index_nan(self): result = df.set_index(['year', 'PRuid', 'QC']).reset_index().reindex( columns=df.columns) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) def test_multi_nan_indexing(self): @@ -3153,7 +3053,7 @@ def test_multi_nan_indexing(self): index=[Index(['R1', 'R2', np.nan, 'R4'], name='a'), Index(['C1', 'C2', 'C3', 'C4'], name='b')]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_iloc_panel_issue(self): @@ -3230,13 +3130,12 @@ def test_panel_setitem(self): index = range(3) columns = list('abc') - panel = Panel( - {'A': DataFrame( - np.random.randn(3, 3), index=index, columns=columns), - 'B': DataFrame( - np.random.randn(3, 3), index=index, columns=columns), - 'C': DataFrame( - np.random.randn(3, 3), index=index, columns=columns)}) + panel = Panel({'A': DataFrame(np.random.randn(3, 3), + index=index, columns=columns), + 'B': DataFrame(np.random.randn(3, 3), + index=index, columns=columns), + 'C': DataFrame(np.random.randn(3, 3), + index=index, columns=columns)}) replace = DataFrame(np.eye(3, 3), index=range(3), columns=columns) expected = Panel({'A': replace, 'B': replace, 'C': replace}) @@ -3255,8 +3154,8 @@ def test_panel_setitem_with_multiindex(self): # 10360 # failing with a multi-index - 
arr = np.array( - [[[1, 2, 3], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]]], dtype=np.float64) + arr = np.array([[[1, 2, 3], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]]], + dtype=np.float64) # reg index axes = dict(items=['A', 'B'], major_axis=[0, 1], @@ -3294,14 +3193,12 @@ def test_panel_setitem_with_multiindex(self): def test_panel_assignment(self): # GH3777 - wp = Panel( - randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - wp2 = Panel( - randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + wp2 = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) # TODO: unused? # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] @@ -3329,8 +3226,8 @@ def test_multiindex_assignment(self): arr = np.array([0., 1.]) df.ix[4, 'd'] = arr - assert_series_equal(df.ix[4, 'd'], Series(arr, index=[8, 10], - name='d')) + tm.assert_series_equal(df.ix[4, 'd'], + Series(arr, index=[8, 10], name='d')) # single dtype df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), @@ -3338,13 +3235,13 @@ def test_multiindex_assignment(self): index=[[4, 4, 8], [8, 10, 12]]) df.ix[4, 'c'] = arr - assert_series_equal(df.ix[4, 'c'], Series(arr, index=[8, 10], name='c', - dtype='float64')) + exp = Series(arr, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) # scalar ok df.ix[4, 'c'] = 10 - assert_series_equal(df.ix[4, 'c'], Series(10, index=[8, 10], name='c', - dtype='float64')) + exp = Series(10, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) # invalid assignments def f(): @@ -3360,13 +3257,12 @@ def f(): # groupby example NUM_ROWS = 100 NUM_COLS = 10 - col_names = ['A' + num - for num in map(str, np.arange(NUM_COLS).tolist())] + col_names = ['A' + num for num in + map(str, np.arange(NUM_COLS).tolist())] index_cols = col_names[:5] - df = DataFrame( - np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), dtype=np.int64, - columns=col_names) + df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, columns=col_names) df = df.set_index(index_cols).sort_index() grp = df.groupby(level=index_cols[:4]) df['new_col'] = np.nan @@ -3374,9 +3270,8 @@ def f(): f_index = np.arange(5) def f(name, df2): - return Series( - np.arange(df2.shape[0]), - name=df2.index.values[0]).reindex(f_index) + return Series(np.arange(df2.shape[0]), + name=df2.index.values[0]).reindex(f_index) # TODO(wesm): unused? 
# new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T @@ -3410,17 +3305,17 @@ def test_multi_assign(self): # frame on rhs df2.ix[mask, cols] = dft.ix[mask, cols] - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2.ix[mask, cols] = dft.ix[mask, cols] - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) # with an ndarray on rhs df2 = df.copy() df2.ix[mask, cols] = dft.ix[mask, cols].values - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2.ix[mask, cols] = dft.ix[mask, cols].values - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[ @@ -3432,7 +3327,7 @@ def test_multi_assign(self): expected.loc[mask, col] = df['D'] df.loc[df['A'] == 0, ['A', 'B']] = df['D'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_ix_assign_column_mixed(self): # GH #1142 @@ -3441,7 +3336,7 @@ def test_ix_assign_column_mixed(self): orig = df.ix[:, 'B'].copy() df.ix[:, 'B'] = df.ix[:, 'B'] + 1 - assert_series_equal(df.B, orig + 1) + tm.assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value df = DataFrame({'x': lrange(10), 'y': lrange(10, 20), 'z': 'bar'}) @@ -3454,20 +3349,20 @@ def test_ix_assign_column_mixed(self): self.assertEqual(expected.ix[indexer, 'y'], v) df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 4508, making sure consistency of assignments df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]}) df.ix[[0, 2, ], 'b'] = [100, -100] expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = pd.DataFrame({'a': lrange(4)}) df['b'] = np.nan df.ix[[1, 3], 'b'] = [100, -100] expected = DataFrame({'a': [0, 1, 2, 3], 'b': [np.nan, 100, np.nan, -100]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # ok, but chained assignments are dangerous # if we turn off chained assignement it will work @@ -3475,7 +3370,7 @@ def test_ix_assign_column_mixed(self): df = pd.DataFrame({'a': lrange(4)}) df['b'] = np.nan df['b'].ix[[1, 3]] = [100, -100] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_ix_get_set_consistency(self): @@ -3508,7 +3403,7 @@ def test_setitem_list(self): result = DataFrame(index=[0, 1], columns=[0]) result.ix[1, 0] = [1, 2] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # ix with an object class TO(object): @@ -3534,7 +3429,7 @@ def view(self): result = DataFrame(index=[0, 1], columns=[0]) result.ix[1, 0] = TO(2) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # remains object dtype even after setting it back df = DataFrame(index=[0, 1], columns=[0]) @@ -3542,7 +3437,7 @@ def view(self): df.ix[1, 0] = np.nan result = DataFrame(index=[0, 1], columns=[0]) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) def test_iloc_mask(self): @@ -3556,7 +3451,7 @@ def test_iloc_mask(self): # ndarray ok result = df.iloc[np.array([True] * len(mask), dtype=bool)] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # the possibilities locs = np.arange(4) @@ -3629,7 +3524,7 @@ def test_ix_slicing_strings(self): 3: 'correct', 4: 'aaa'}}) # bug was 4: 'bbb' - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_non_unique_loc(self): # GH3659 
@@ -3637,9 +3532,8 @@ def test_non_unique_loc(self): # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs # these are going to raise becuase the we are non monotonic - df = DataFrame( - {'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) + df = DataFrame({'A': [1, 2, 3, 4, 5, 6], + 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, None)])) self.assertRaises(KeyError, df.loc.__getitem__, @@ -3647,22 +3541,21 @@ def test_non_unique_loc(self): self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, 2)])) # monotonic are ok - df = DataFrame( - {'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]).sort_index( - axis=0) + df = DataFrame({'A': [1, 2, 3, 4, 5, 6], + 'B': [3, 4, 5, 6, 7, 8]}, + index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0) result = df.loc[1:] - expected = DataFrame( - {'A': [2, 4, 5, 6], - 'B': [4, 6, 7, 8]}, index=[1, 1, 2, 3]) - assert_frame_equal(result, expected) + expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]}, + index=[1, 1, 2, 3]) + tm.assert_frame_equal(result, expected) result = df.loc[0:] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) result = df.loc[1:2] - expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, index=[1, 1, 2]) - assert_frame_equal(result, expected) + expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, + index=[1, 1, 2]) + tm.assert_frame_equal(result, expected) def test_loc_name(self): # GH 3880 @@ -3687,7 +3580,7 @@ def test_iloc_non_unique_indexing(self): df3 = pd.concat([df, 2 * df, 3 * df]) result = df3.iloc[idx] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) df2 = pd.concat([df2, 2 * df2, 3 * df2]) @@ -3705,7 +3598,7 @@ def test_iloc_non_unique_indexing(self): expected = pd.concat([expected, DataFrame(index=idx[idx > sidx.max()]) ]) result = df2.loc[idx] - assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_mi_access(self): @@ -3726,10 +3619,10 @@ def test_mi_access(self): expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T result = df2.loc[:, ('A', 'A1')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df2[('A', 'A1')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 4146, not returning a block manager when selecting a unique index # from a duplicate index @@ -3737,15 +3630,14 @@ def test_mi_access(self): # with a non-unique) expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1') result = df2['A']['A1'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # selecting a non_unique from the 2nd level expected = DataFrame([['d', 4, 4], ['e', 5, 5]], - index=Index( - ['B2', 'B2'], name='sub'), + index=Index(['B2', 'B2'], name='sub'), columns=['h1', 'h3', 'h5'], ).T result = df2['A']['B2'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_non_unique_loc_memory_error(self): @@ -3755,11 +3647,10 @@ def test_non_unique_loc_memory_error(self): columns = list('ABCDEFG') def gen_test(l, l2): - return pd.concat([DataFrame( - randn(l, len(columns)), index=lrange( - l), columns=columns), DataFrame( - np.ones((l2, len(columns) - )), index=[0] * l2, columns=columns)]) + return pd.concat([DataFrame(randn(l, len(columns)), + index=lrange(l), 
columns=columns), + DataFrame(np.ones((l2, len(columns))), + index=[0] * l2, columns=columns)]) def gen_expected(df, mask): l = len(mask) @@ -3775,7 +3666,7 @@ def gen_expected(df, mask): mask = np.arange(100) result = df.loc[mask] expected = gen_expected(df, mask) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = gen_test(900000, 100000) self.assertFalse(df.index.is_unique) @@ -3783,61 +3674,57 @@ def gen_expected(df, mask): mask = np.arange(100000) result = df.loc[mask] expected = gen_expected(df, mask) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_astype_assignment(self): # GH4312 (iloc) - df_orig = DataFrame( - [['1', '2', '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) + df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - expected = DataFrame( - [[1, 2, '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True) - expected = DataFrame( - [[1, 2, '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() df.loc[:, 'A'] = df.loc[:, 'A'].astype(np.int64) - expected = DataFrame( - [[1, '2', '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([[1, '2', '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[:, ['B', 'C']] = df.loc[:, ['B', 'C']].astype(np.int64) - expected = DataFrame( - [['1', 2, 3, '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([['1', 2, 3, '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) # full replacements / no nans df = DataFrame({'A': [1., 2., 3., 4.]}) df.iloc[:, 0] = df['A'].astype(np.int64) expected = DataFrame({'A': [1, 2, 3, 4]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'A': [1., 2., 3., 4.]}) df.loc[:, 'A'] = df['A'].astype(np.int64) expected = DataFrame({'A': [1, 2, 3, 4]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_astype_assignment_with_dups(self): # GH 4686 # assignment with dups that has a dtype change - df = DataFrame( - np.arange(3).reshape((1, 3)), - columns=pd.MultiIndex.from_tuples( - [('A', '1'), ('B', '1'), ('A', '2')] - ), - dtype=object - ) + cols = pd.MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')]) + df = DataFrame(np.arange(3).reshape((1, 3)), + columns=cols, dtype=object) index = df.index.copy() df['A'] = df['A'].astype(np.float64) @@ -3857,10 +3744,10 @@ def test_dups_loc(self): index=['a', 'a', 'a', 'a', 'a'], name=1) result = df.iloc[0] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.loc[1] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_partial_setting(self): @@ -3872,22 +3759,22 @@ def test_partial_setting(self): s = s_orig.copy() s[5] = 5 expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + 
tm.assert_series_equal(s, expected) s = s_orig.copy() s.loc[5] = 5 expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) s = s_orig.copy() s[5] = 5. expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) s = s_orig.copy() s.loc[5] = 5. expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) # iloc/iat raise s = s_orig.copy() @@ -3924,96 +3811,97 @@ def f(): expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) df = df_orig.copy() df.iloc[1] = df.iloc[2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) df = df_orig.copy() df.loc[1] = df.loc[2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # like 2578, partial setting with dtype preservation expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})) df = df_orig.copy() df.loc[3] = df.loc[2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # single dtype frame, overwrite expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]})) df = df_orig.copy() df.ix[:, 'B'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])})) df = df_orig.copy() df['B'] = df['B'].astype(np.float64) df.ix[:, 'B'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # single dtype frame, partial setting expected = df_orig.copy() expected['C'] = df['A'] df = df_orig.copy() df.ix[:, 'C'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # mixed frame, partial setting expected = df_orig.copy() expected['C'] = df['A'] df = df_orig.copy() df.ix[:, 'C'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # ## panel ## - p_orig = Panel( - np.arange(16).reshape(2, 4, 2), items=['Item1', 'Item2'], - major_axis=pd.date_range('2001/1/12', periods=4), - minor_axis=['A', 'B'], dtype='float64') + p_orig = Panel(np.arange(16).reshape(2, 4, 2), + items=['Item1', 'Item2'], + major_axis=pd.date_range('2001/1/12', periods=4), + minor_axis=['A', 'B'], dtype='float64') # panel setting via item - p_orig = Panel( - np.arange(16).reshape(2, 4, 2), items=['Item1', 'Item2'], - major_axis=pd.date_range('2001/1/12', periods=4), - minor_axis=['A', 'B'], dtype='float64') + p_orig = Panel(np.arange(16).reshape(2, 4, 2), + items=['Item1', 'Item2'], + major_axis=pd.date_range('2001/1/12', periods=4), + minor_axis=['A', 'B'], dtype='float64') expected = p_orig.copy() expected['Item3'] = expected['Item1'] p = p_orig.copy() p.loc['Item3'] = p['Item1'] - assert_panel_equal(p, expected) + tm.assert_panel_equal(p, expected) # panel with aligned series expected = p_orig.copy() expected = expected.transpose(2, 1, 0) - expected['C'] = DataFrame( - {'Item1': [30, 30, 30, 30], - 'Item2': [32, 32, 32, 32]}, index=p_orig.major_axis) + expected['C'] = DataFrame({'Item1': [30, 30, 30, 30], + 'Item2': [32, 32, 32, 32]}, + index=p_orig.major_axis) expected = expected.transpose(2, 1, 0) p = p_orig.copy() p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items) - assert_panel_equal(p, expected) + tm.assert_panel_equal(p, expected) # GH 8473 dates = date_range('1/1/2000', periods=8) - df_orig = DataFrame( - 
np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df_orig = DataFrame(np.random.randn(8, 4), index=dates, + columns=['A', 'B', 'C', 'D']) expected = pd.concat([df_orig, DataFrame( {'A': 7}, index=[dates[-1] + 1])]) df = df_orig.copy() df.loc[dates[-1] + 1, 'A'] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.at[dates[-1] + 1, 'A'] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + + exp_other = DataFrame({0: 7}, index=[dates[-1] + 1]) + expected = pd.concat([df_orig, exp_other], axis=1) - expected = pd.concat( - [df_orig, DataFrame({0: 7}, index=[dates[-1] + 1])], axis=1) df = df_orig.copy() df.loc[dates[-1] + 1, 0] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.at[dates[-1] + 1, 0] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_partial_setting_mixed_dtype(self): @@ -4026,18 +3914,20 @@ def test_partial_setting_mixed_dtype(self): expected = df.append(s) df.loc[2] = df.loc[1] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # columns will align df = DataFrame(columns=['A', 'B']) df.loc[0] = Series(1, index=range(4)) - assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) + tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) # columns will align df = DataFrame(columns=['A', 'B']) df.loc[0] = Series(1, index=['B']) - assert_frame_equal(df, DataFrame( - [[np.nan, 1]], columns=['A', 'B'], index=[0], dtype='float64')) + + exp = DataFrame([[np.nan, 1]], columns=['A', 'B'], + index=[0], dtype='float64') + tm.assert_frame_equal(df, exp) # list-like must conform df = DataFrame(columns=['A', 'B']) @@ -4050,8 +3940,10 @@ def f(): # these are coerced to float unavoidably (as its a list-like to begin) df = DataFrame(columns=['A', 'B']) df.loc[3] = [6, 7] - assert_frame_equal(df, DataFrame( - [[6, 7]], index=[3], columns=['A', 'B'], dtype='float64')) + + exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], + dtype='float64') + tm.assert_frame_equal(df, exp) def test_partial_setting_with_datetimelike_dtype(self): @@ -4065,7 +3957,7 @@ def test_partial_setting_with_datetimelike_dtype(self): mask = df.A < 1 df.loc[mask, 'C'] = df.loc[mask].index - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_loc_setitem_datetime(self): @@ -4081,7 +3973,7 @@ def test_loc_setitem_datetime(self): df.loc[conv(dt2), 'one'] = 200 expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_series_partial_set(self): # partial set with new index @@ -4091,55 +3983,55 @@ def test_series_partial_set(self): # loc expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) result = ser.loc[[3, 2, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) result = ser.loc[[3, 2, 3, 'x']] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) result = ser.loc[[2, 2, 1]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) result = ser.loc[[2, 2, 'x', 1]] - 
assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) result = ser.loc[[2, 2, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 3, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) result = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]).loc[[7, 2, 2]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[4, 5, 5]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # iloc expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1]) result = ser.iloc[[1, 1, 0, 0]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) def test_series_partial_set_with_name(self): # GH 11497 @@ -4151,23 +4043,23 @@ def test_series_partial_set_with_name(self): exp_idx = Index([3, 2, 3], dtype='int64', name='idx') expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') result = ser.loc[[3, 2, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name='s') result = ser.loc[[3, 2, 3, 'x']] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([2, 2, 1], dtype='int64', name='idx') expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') result = ser.loc[[2, 2, 1]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') result = ser.loc[[2, 2, 'x', 1]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) @@ -4175,47 +4067,47 @@ def test_series_partial_set_with_name(self): exp_idx = Index([2, 2, 3], dtype='int64', name='idx') expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') result = ser.loc[[2, 2, 3]] - assert_series_equal(result, expected, 
check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([3, 4, 4], dtype='int64', name='idx') expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') idx = Index([1, 2, 3], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3], index=idx, name='s').loc[[3, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([5, 3, 3], dtype='int64', name='idx') expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') idx = Index([1, 2, 3, 4], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 3, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([5, 4, 4], dtype='int64', name='idx') expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') idx = Index([1, 2, 3, 4], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([7, 2, 2], dtype='int64', name='idx') expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') idx = Index([4, 5, 6, 7], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[7, 2, 2]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([4, 5, 5], dtype='int64', name='idx') expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') idx = Index([1, 2, 3, 4], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[4, 5, 5]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # iloc exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') result = ser.iloc[[1, 1, 0, 0]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) def test_series_partial_set_datetime(self): # GH 11497 @@ -4225,19 +4117,19 @@ def test_series_partial_set_datetime(self): result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] exp = Series([0.1, 0.2], index=idx, name='s') - assert_series_equal(result, exp, check_index_type=True) + tm.assert_series_equal(result, exp, check_index_type=True) keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), Timestamp('2011-01-01')] exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), Timestamp('2011-01-03')] exp = Series([np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) def test_series_partial_set_period(self): # GH 11497 @@ -4248,14 +4140,14 @@ def test_series_partial_set_period(self): result = ser.loc[[pd.Period('2011-01-01', freq='D'), pd.Period( '2011-01-02', freq='D')]] exp = Series([0.1, 0.2], index=idx, name='s') - assert_series_equal(result, exp, check_index_type=True) + tm.assert_series_equal(result, exp, 
check_index_type=True) keys = [pd.Period('2011-01-02', freq='D'), pd.Period('2011-01-02', freq='D'), pd.Period('2011-01-01', freq='D')] exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) keys = [pd.Period('2011-01-03', freq='D'), pd.Period('2011-01-02', freq='D'), @@ -4263,14 +4155,15 @@ def test_series_partial_set_period(self): exp = Series([np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name='idx'), name='s') result = ser.loc[keys] - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) def test_partial_set_invalid(self): # GH 4940 # allow only setting of 'valid' values - df = tm.makeTimeDataFrame() + orig = tm.makeTimeDataFrame() + df = orig.copy() # don't allow not string inserts def f(): @@ -4294,33 +4187,40 @@ def f(): self.assertRaises(ValueError, f) # allow object conversion here + df = orig.copy() df.loc['a', :] = df.ix[0] + exp = orig.append(pd.Series(df.ix[0], name='a')) + tm.assert_frame_equal(df, exp) + tm.assert_index_equal(df.index, + pd.Index(orig.index.tolist() + ['a'])) + self.assertEqual(df.index.dtype, 'object') - def test_partial_set_empty(self): + def test_partial_set_empty_series(self): # GH5226 - # partially set with an empty object - # series + # partially set with an empty object series s = Series() s.loc[1] = 1 - assert_series_equal(s, Series([1], index=[1])) + tm.assert_series_equal(s, Series([1], index=[1])) s.loc[3] = 3 - assert_series_equal(s, Series([1, 3], index=[1, 3])) + tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) s = Series() s.loc[1] = 1. - assert_series_equal(s, Series([1.], index=[1])) + tm.assert_series_equal(s, Series([1.], index=[1])) s.loc[3] = 3. 
- assert_series_equal(s, Series([1., 3.], index=[1, 3])) + tm.assert_series_equal(s, Series([1., 3.], index=[1, 3])) s = Series() s.loc['foo'] = 1 - assert_series_equal(s, Series([1], index=['foo'])) + tm.assert_series_equal(s, Series([1], index=['foo'])) s.loc['bar'] = 3 - assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) + tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) s.loc[3] = 4 - assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + + def test_partial_set_empty_frame(self): # partially set with an empty object # frame @@ -4352,24 +4252,24 @@ def f(): df['foo'] = Series([], dtype='object') return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() df['foo'] = Series(df.index) return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() df['foo'] = df.index return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) - expected = DataFrame(columns=['foo'], index=pd.Index( - [], dtype='int64')) + expected = DataFrame(columns=['foo'], + index=pd.Index([], dtype='int64')) expected['foo'] = expected['foo'].astype('float64') def f(): @@ -4377,109 +4277,119 @@ def f(): df['foo'] = [] return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() df['foo'] = Series(range(len(df))) return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() + tm.assert_index_equal(df.index, pd.Index([], dtype='object')) df['foo'] = range(len(df)) return df - assert_frame_equal(f(), expected) + expected = DataFrame(columns=['foo'], + index=pd.Index([], dtype='int64')) + expected['foo'] = expected['foo'].astype('float64') + tm.assert_frame_equal(f(), expected) df = DataFrame() + tm.assert_index_equal(df.columns, pd.Index([], dtype=object)) df2 = DataFrame() df2[1] = Series([1], index=['foo']) df.loc[:, 1] = Series([1], index=['foo']) - assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) - assert_frame_equal(df, df2) + tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) + tm.assert_frame_equal(df, df2) # no index to start - expected = DataFrame( - {0: Series(1, index=range(4))}, columns=['A', 'B', 0]) + expected = DataFrame({0: Series(1, index=range(4))}, + columns=['A', 'B', 0]) df = DataFrame(columns=['A', 'B']) df[0] = Series(1, index=range(4)) df.dtypes str(df) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(columns=['A', 'B']) df.loc[:, 0] = Series(1, index=range(4)) df.dtypes str(df) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_partial_set_empty_frame_row(self): # GH5720, GH5744 # don't create rows when empty - expected = DataFrame(columns=['A', 'B', 'New'], index=pd.Index( - [], dtype='int64')) + expected = DataFrame(columns=['A', 'B', 'New'], + index=pd.Index([], dtype='int64')) expected['A'] = expected['A'].astype('int64') expected['B'] = expected['B'].astype('float64') expected['New'] = expected['New'].astype('float64') + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] y['New'] = np.nan - assert_frame_equal(y, expected) - # assert_frame_equal(y,expected) + tm.assert_frame_equal(y, expected) + # tm.assert_frame_equal(y,expected) expected = DataFrame(columns=['a', 'b', 'c c', 'd']) expected['d'] = expected['d'].astype('int64') df = 
DataFrame(columns=['a', 'b', 'c c']) df['d'] = 3 - assert_frame_equal(df, expected) - assert_series_equal(df['c c'], Series(name='c c', dtype=object)) + tm.assert_frame_equal(df, expected) + tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object)) # reindex columns is ok df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] result = y.reindex(columns=['A', 'B', 'C']) - expected = DataFrame(columns=['A', 'B', 'C'], index=pd.Index( - [], dtype='int64')) + expected = DataFrame(columns=['A', 'B', 'C'], + index=pd.Index([], dtype='int64')) expected['A'] = expected['A'].astype('int64') expected['B'] = expected['B'].astype('float64') expected['C'] = expected['C'].astype('float64') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + def test_partial_set_empty_frame_set_series(self): # GH 5756 # setting with empty Series df = DataFrame(Series()) - assert_frame_equal(df, DataFrame({0: Series()})) + tm.assert_frame_equal(df, DataFrame({0: Series()})) df = DataFrame(Series(name='foo')) - assert_frame_equal(df, DataFrame({'foo': Series()})) + tm.assert_frame_equal(df, DataFrame({'foo': Series()})) + def test_partial_set_empty_frame_empty_copy_assignment(self): # GH 5932 # copy on empty with assignment fails df = DataFrame(index=[0]) df = df.copy() df['a'] = 0 expected = DataFrame(0, index=[0], columns=['a']) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_partial_set_empty_frame_empty_consistencies(self): # GH 6171 # consistency on empty frames df = DataFrame(columns=['x', 'y']) df['x'] = [1, 2] expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) - assert_frame_equal(df, expected, check_dtype=False) + tm.assert_frame_equal(df, expected, check_dtype=False) df = DataFrame(columns=['x', 'y']) df['x'] = ['1', '2'] expected = DataFrame( dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(columns=['x', 'y']) df.loc[0, 'x'] = 1 expected = DataFrame(dict(x=[1], y=[np.nan])) - assert_frame_equal(df, expected, check_dtype=False) + tm.assert_frame_equal(df, expected, check_dtype=False) def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem @@ -4528,9 +4438,9 @@ def test_cache_updating(self): expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5)) expected.at[3, 'f'] = 2 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = Series([0, 0, 0, 2, 0], name='f') - assert_series_equal(df.f, expected) + tm.assert_series_equal(df.f, expected) def test_slice_consolidate_invalidate_item_cache(self): @@ -4587,8 +4497,8 @@ def test_setitem_cache_updating(self): for ix, row in df.iterrows(): out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D'] - assert_frame_equal(out, expected) - assert_series_equal(out['A'], expected['A']) + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) # try via a chain indexing # this actually works @@ -4598,16 +4508,16 @@ def test_setitem_cache_updating(self): v = out[row['C']][six:eix] + row['D'] out[row['C']][six:eix] = v - assert_frame_equal(out, expected) - assert_series_equal(out['A'], expected['A']) + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) out = DataFrame({'A': [0, 0, 0]}, index=date_range('5/7/2014', '5/9/2014')) for ix, row in df.iterrows(): out.loc[six:eix, row['C']] += row['D'] - 
assert_frame_equal(out, expected) - assert_series_equal(out['A'], expected['A']) + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) def test_setitem_chained_setfault(self): @@ -4619,31 +4529,31 @@ def test_setitem_chained_setfault(self): df = DataFrame({'response': np.array(data)}) mask = df.response == 'timeout' df.response[mask] = 'none' - assert_frame_equal(df, DataFrame({'response': mdata})) + tm.assert_frame_equal(df, DataFrame({'response': mdata})) recarray = np.rec.fromarrays([data], names=['response']) df = DataFrame(recarray) mask = df.response == 'timeout' df.response[mask] = 'none' - assert_frame_equal(df, DataFrame({'response': mdata})) + tm.assert_frame_equal(df, DataFrame({'response': mdata})) df = DataFrame({'response': data, 'response1': data}) mask = df.response == 'timeout' df.response[mask] = 'none' - assert_frame_equal(df, DataFrame({'response': mdata, - 'response1': data})) + tm.assert_frame_equal(df, DataFrame({'response': mdata, + 'response1': data})) # GH 6056 expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar'])) df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) df['A'].iloc[0] = np.nan result = df.head() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) df.A.iloc[0] = np.nan result = df.head() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_detect_chained_assignment(self): @@ -4651,18 +4561,16 @@ def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB')) - df = DataFrame( - np.arange(4).reshape(2, 2), columns=list('AB'), dtype='int64') + df = DataFrame(np.arange(4).reshape(2, 2), + columns=list('AB'), dtype='int64') self.assertIsNone(df.is_copy) df['A'][0] = -5 df['A'][1] = -6 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # test with the chaining - df = DataFrame({'A': Series( - range(2), dtype='int64'), - 'B': np.array( - np.arange(2, 4), dtype=np.float64)}) + df = DataFrame({'A': Series(range(2), dtype='int64'), + 'B': np.array(np.arange(2, 4), dtype=np.float64)}) self.assertIsNone(df.is_copy) def f(): @@ -4677,10 +4585,8 @@ def f(): self.assertIsNone(df['A'].is_copy) # using a copy (the chain), fails - df = DataFrame({'A': Series( - range(2), dtype='int64'), - 'B': np.array( - np.arange(2, 4), dtype=np.float64)}) + df = DataFrame({'A': Series(range(2), dtype='int64'), + 'B': np.array(np.arange(2, 4), dtype=np.float64)}) def f(): df.loc[0]['A'] = -5 @@ -4688,13 +4594,12 @@ def f(): self.assertRaises(com.SettingWithCopyError, f) # doc example - df = DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six' - ], - 'c': Series( - range(7), dtype='int64')}) + df = DataFrame({'a': ['one', 'one', 'two', 'three', + 'two', 'one', 'six'], + 'c': Series(range(7), dtype='int64')}) self.assertIsNone(df.is_copy) - expected = DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', - 'six'], + expected = DataFrame({'a': ['one', 'one', 'two', 'three', + 'two', 'one', 'six'], 'c': [42, 42, 2, 3, 4, 42, 6]}) def f(): @@ -4717,7 +4622,7 @@ def f(): self.assertRaises(com.SettingWithCopyError, f) df.loc[0, 'A'] = 111 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # make sure that is_copy is picked up reconstruction # GH5475 @@ -4807,8 +4712,8 @@ def f(): df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0].sort_values() - 
assert_series_equal(s, df.iloc[:, 0].sort_values()) - assert_series_equal(s, df[0].sort_values()) + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) + tm.assert_series_equal(s, df[0].sort_values()) # false positives GH6025 df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]}) @@ -5056,40 +4961,45 @@ def test_iloc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty - assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.iloc[[]], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) def test_loc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty - assert_frame_equal(df.loc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.loc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.loc[[]], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) def test_ix_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty - assert_frame_equal(df.ix[:, []], df.iloc[:, :0], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0], + check_index_type=True, + check_column_type=True) # horizontal empty - assert_frame_equal(df.ix[[], :], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) # horizontal empty - assert_frame_equal(df.ix[[]], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) def test_index_type_coercion(self): @@ -5188,23 +5098,23 @@ def run_tests(df, rhs, right): left = df.copy() left.loc[r, c] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.iloc[i, j] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.ix[s, l] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.ix[i, j] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.ix[r, c] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) xs = np.arange(20).reshape(5, 4) cols = ['jim', 'joe', 'jolie', 'joline'] @@ -5231,12 +5141,12 @@ def test_str_label_slicing_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): - 
assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) if not idx.is_integer: # For integer indices, ix and plain getitem are position-based. - assert_series_equal(s[l_slc], s.iloc[i_slc]) - assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) for idx in [_mklbl('A', 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: @@ -5253,9 +5163,9 @@ def test_multiindex_label_slicing_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - assert_series_equal(s[l_slc], s.iloc[i_slc]) - assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) assert_slices_equivalent(SLC[::-1], SLC[::-1]) @@ -5300,8 +5210,8 @@ def test_indexing_dtypes_on_empty(self): df2 = df.ix[[], :] self.assertEqual(df2.loc[:, 'a'].dtype, np.int64) - assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) - assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) + tm.assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) + tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) def test_range_in_series_indexing(self): # range can cause an indexing error @@ -5309,24 +5219,10 @@ def test_range_in_series_indexing(self): for x in [5, 999999, 1000000]: s = pd.Series(index=range(x)) s.loc[range(1)] = 42 - assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) + tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) s.loc[range(2)] = 43 - assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) - - @slow - def test_large_dataframe_indexing(self): - # GH10692 - result = DataFrame({'x': range(10 ** 6)}, dtype='int64') - result.loc[len(result)] = len(result) + 1 - expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') - assert_frame_equal(result, expected) - - @slow - def test_large_mi_dataframe_indexing(self): - # GH10645 - result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) - assert (not (10 ** 6, 0) in result) + tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) def test_non_reducing_slice(self): df = pd.DataFrame([[0, 1], [2, 3]]) @@ -5391,11 +5287,7 @@ def test_coercion_with_setitem(self): start_series[0] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - strict_nan=True) + tm.assert_series_equal(start_series, expected_series) def test_coercion_with_loc_setitem(self): for start_data, expected_result in self.EXPECTED_RESULTS: @@ -5403,11 +5295,7 @@ def test_coercion_with_loc_setitem(self): start_series.loc[0] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - strict_nan=True) + tm.assert_series_equal(start_series, expected_series) def test_coercion_with_setitem_and_series(self): for start_data, expected_result in self.EXPECTED_RESULTS: @@ -5415,11 +5303,7 @@ def test_coercion_with_setitem_and_series(self): start_series[start_series == start_series[0]] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - 
strict_nan=True) + tm.assert_series_equal(start_series, expected_series) def test_coercion_with_loc_and_series(self): for start_data, expected_result in self.EXPECTED_RESULTS: @@ -5427,11 +5311,7 @@ def test_coercion_with_loc_and_series(self): start_series.loc[start_series == start_series[0]] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - strict_nan=True) + tm.assert_series_equal(start_series, expected_series) class TestDataframeNoneCoercion(tm.TestCase): @@ -5454,12 +5334,7 @@ def test_coercion_with_loc(self): start_dataframe.loc[0, ['foo']] = None expected_dataframe = DataFrame({'foo': expected_result}) - - assert_attr_equal('dtype', start_dataframe['foo'], - expected_dataframe['foo']) - tm.assert_numpy_array_equal(start_dataframe['foo'].values, - expected_dataframe['foo'].values, - strict_nan=True) + tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_coercion_with_setitem_and_dataframe(self): for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: @@ -5468,12 +5343,7 @@ def test_coercion_with_setitem_and_dataframe(self): 0]] = None expected_dataframe = DataFrame({'foo': expected_result}) - - assert_attr_equal('dtype', start_dataframe['foo'], - expected_dataframe['foo']) - tm.assert_numpy_array_equal(start_dataframe['foo'].values, - expected_dataframe['foo'].values, - strict_nan=True) + tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_loc_and_dataframe(self): for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: @@ -5482,12 +5352,7 @@ def test_none_coercion_loc_and_dataframe(self): 'foo'][0]] = None expected_dataframe = DataFrame({'foo': expected_result}) - - assert_attr_equal('dtype', start_dataframe['foo'], - expected_dataframe['foo']) - tm.assert_numpy_array_equal(start_dataframe['foo'].values, - expected_dataframe['foo'].values, - strict_nan=True) + tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_mixed_dtypes(self): start_dataframe = DataFrame({ @@ -5499,19 +5364,12 @@ def test_none_coercion_mixed_dtypes(self): }) start_dataframe.iloc[0] = None - expected_dataframe = DataFrame({ - 'a': [np.nan, 2, 3], - 'b': [np.nan, 2.0, 3.0], - 'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], - 'd': [None, 'b', 'c'] - }) - - for column in expected_dataframe.columns: - assert_attr_equal('dtype', start_dataframe[column], - expected_dataframe[column]) - tm.assert_numpy_array_equal(start_dataframe[column].values, - expected_dataframe[column].values, - strict_nan=True) + exp = DataFrame({'a': [np.nan, 2, 3], + 'b': [np.nan, 2.0, 3.0], + 'c': [NaT, datetime(2000, 1, 2), + datetime(2000, 1, 3)], + 'd': [None, 'b', 'c']}) + tm.assert_frame_equal(start_dataframe, exp) if __name__ == '__main__': diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py new file mode 100644 index 0000000000000..5d563e20087b9 --- /dev/null +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +import warnings + +import numpy as np +import pandas as pd +from pandas.core.api import Series, DataFrame, MultiIndex +import pandas.util.testing as tm + + +class TestIndexingSlow(tm.TestCase): + + _multiprocess_can_split_ = True + + @tm.slow + def test_multiindex_get_loc(self): # GH7724, GH2646 + + with warnings.catch_warnings(record=True): + + # test indexing into a multi-index before & past 
the lexsort depth + from numpy.random import randint, choice, randn + cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] + + def validate(mi, df, key): + mask = np.ones(len(df)).astype('bool') + + # test for all partials of this key + for i, k in enumerate(key): + mask &= df.iloc[:, i] == k + + if not mask.any(): + self.assertNotIn(key[:i + 1], mi.index) + continue + + self.assertIn(key[:i + 1], mi.index) + right = df[mask].copy() + + if i + 1 != len(key): # partial key + right.drop(cols[:i + 1], axis=1, inplace=True) + right.set_index(cols[i + 1:-1], inplace=True) + tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + + else: # full key + right.set_index(cols[:-1], inplace=True) + if len(right) == 1: # single hit + right = Series(right['jolia'].values, + name=right.index[0], + index=['jolia']) + tm.assert_series_equal(mi.loc[key[:i + 1]], right) + else: # multi hit + tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + + def loop(mi, df, keys): + for key in keys: + validate(mi, df, key) + + n, m = 1000, 50 + + vals = [randint(0, 10, n), choice( + list('abcdefghij'), n), choice( + pd.date_range('20141009', periods=10).tolist(), n), choice( + list('ZYXWVUTSRQ'), n), randn(n)] + vals = list(map(tuple, zip(*vals))) + + # bunch of keys for testing + keys = [randint(0, 11, m), choice( + list('abcdefghijk'), m), choice( + pd.date_range('20141009', periods=11).tolist(), m), choice( + list('ZYXWVUTSRQP'), m)] + keys = list(map(tuple, zip(*keys))) + keys += list(map(lambda t: t[:-1], vals[::n // m])) + + # covers both unique index and non-unique index + df = pd.DataFrame(vals, columns=cols) + a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) + + for frame in a, b: + for i in range(5): # lexsort depth + df = frame.copy() if i == 0 else frame.sort_values( + by=cols[:i]) + mi = df.set_index(cols[:-1]) + assert not mi.index.lexsort_depth < i + loop(mi, df, keys) + + @tm.slow + def test_large_dataframe_indexing(self): + # GH10692 + result = DataFrame({'x': range(10 ** 6)}, dtype='int64') + result.loc[len(result)] = len(result) + 1 + expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') + tm.assert_frame_equal(result, expected) + + @tm.slow + def test_large_mi_dataframe_indexing(self): + # GH10645 + result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) + assert (not (10 ** 6, 0) in result) From 1f883121c47940cf51fd33f40e64d18908153c71 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 18:13:07 -0400 Subject: [PATCH 253/359] BUG: Sparse indexing with bool sparse may be incorrect Author: sinhrks Closes #13985 from sinhrks/sparse_bool_indexing and squashes the following commits: 0909fa8 [sinhrks] BUG: Sparse indexing with bool sparse may be incorrect --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/indexing.py | 6 ++- pandas/sparse/array.py | 6 ++- pandas/sparse/series.py | 1 + pandas/sparse/tests/test_indexing.py | 61 ++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 411b2b0abaf5a..0fafa6003c945 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -766,6 +766,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously 
forced to float (:issue:`13917`) +- Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`) .. _whatsnew_0190.deprecations: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 933ecd1b8de86..a7cc3b9dddd36 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -10,6 +10,7 @@ is_list_like, is_sequence, is_scalar, + is_sparse, _ensure_platform_int) from pandas.types.missing import isnull, _infer_fill_value @@ -1811,9 +1812,10 @@ def check_bool_indexer(ax, key): mask = isnull(result._values) if mask.any(): raise IndexingError('Unalignable boolean Series key provided') - result = result.astype(bool)._values - + elif is_sparse(result): + result = result.to_dense() + result = np.asarray(result, dtype=bool) else: # is_bool_indexer has already checked for nulls in the case of an # object array key, so no check needed here diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index e22a62ee7f917..d14a8eadddc13 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -17,6 +17,7 @@ from pandas.types.generic import ABCSparseArray, ABCSparseSeries from pandas.types.common import (is_float, is_integer, is_integer_dtype, _ensure_platform_int, + is_bool_dtype, is_list_like, is_scalar, is_dtype_equal) from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, @@ -385,7 +386,10 @@ def __getitem__(self, key): data_slice = self.values[key] else: if isinstance(key, SparseArray): - key = np.asarray(key) + if is_bool_dtype(key): + key = key.to_dense() + else: + key = np.asarray(key) if hasattr(key, '__len__') and len(self) != len(key): return self.take(key) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 9045784287d9c..4ad77b4deab4f 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -609,6 +609,7 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs): ------- taken : ndarray """ + convert = nv.validate_take_with_convert(convert, args, kwargs) new_values = SparseArray.take(self.values, indices) new_index = self.index.take(indices) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 74c3785b06d77..d176d95bb7dbf 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -36,6 +36,10 @@ def test_getitem(self): exp = orig[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_slice(self): orig = self.orig sparse = self.sparse @@ -68,6 +72,10 @@ def test_getitem_fill_value(self): exp = orig[orig % 2 == 1].to_sparse(fill_value=0) tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_ellipsis(self): # GH 9467 s = pd.SparseSeries([1, np.nan, 2, 0, np.nan]) @@ -116,6 +124,10 @@ def test_loc(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_index(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE')) sparse = orig.to_sparse() @@ -137,6 +149,10 @@ def test_loc_index(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 
2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_index_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) sparse = orig.to_sparse(fill_value=0) @@ -368,6 +384,35 @@ def test_reindex_fill_value(self): exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) + def tests_indexing_with_sparse(self): + # GH 13985 + + for kind in ['integer', 'block']: + for fill in [True, False, np.nan]: + arr = pd.SparseArray([1, 2, 3], kind=kind) + indexer = pd.SparseArray([True, False, True], fill_value=fill, + dtype=bool) + + tm.assert_sp_array_equal(pd.SparseArray([1, 3], kind=kind), + arr[indexer]) + + s = pd.SparseSeries(arr, index=['a', 'b', 'c'], + dtype=np.float64) + exp = pd.SparseSeries([1, 3], index=['a', 'c'], + dtype=np.float64, kind=kind) + tm.assert_sp_series_equal(s[indexer], exp) + tm.assert_sp_series_equal(s.loc[indexer], exp) + tm.assert_sp_series_equal(s.iloc[indexer], exp) + + indexer = pd.SparseSeries(indexer, index=['a', 'b', 'c']) + tm.assert_sp_series_equal(s[indexer], exp) + tm.assert_sp_series_equal(s.loc[indexer], exp) + + msg = ("iLocation based boolean indexing cannot use an " + "indexable as a mask") + with tm.assertRaisesRegexp(ValueError, msg): + s.iloc[indexer] + class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): @@ -405,6 +450,10 @@ def test_getitem_multi(self): exp = orig[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_multi_tuple(self): orig = self.orig sparse = self.sparse @@ -454,6 +503,10 @@ def test_loc(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_multi_tuple(self): orig = self.orig sparse = self.sparse @@ -578,6 +631,10 @@ def test_loc(self): exp = orig.loc[orig.x % 2 == 1].to_sparse() tm.assert_sp_frame_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] + tm.assert_sp_frame_equal(result, exp) + def test_loc_index(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], @@ -627,6 +684,10 @@ def test_loc_index(self): exp = orig.loc[orig.x % 2 == 1].to_sparse() tm.assert_sp_frame_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] + tm.assert_sp_frame_equal(result, exp) + def test_loc_slice(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], From 0975509a79d1021a870ea15b8448b8a9e3a0241b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 13 Aug 2016 18:18:49 -0400 Subject: [PATCH 254/359] Use DeprecationWarning for core.common deprecations (GH13634) Related to second question in #13634 (whether to use FutureWarning or DeprecationWarning in deprecating the public pandas.core.common functions). As those functions are mostly used in library code, and less directly by users in their own code, I think a DeprecationWarning is more appropriate in this case. For example, in our own docs, we started to get warnings due to an example with a statsmodels regression that uses patsy using one of those functions. Note that recent IPython also shows DeprecationWarnings when using a deprecated function interactively. 
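For context, a minimal sketch of the behaviour difference being relied on here (the ``deprecated_helper`` / ``third_party_call`` names are hypothetical stand-ins, not pandas API): with Python's stock warning filters a ``DeprecationWarning`` raised from library code is typically suppressed unless the user opts in, whereas a ``FutureWarning`` is always displayed to end users.

```
import warnings


def deprecated_helper():
    # Stand-in for a deprecated wrapper such as the ones in pandas.core.common;
    # library code conventionally raises DeprecationWarning here.
    warnings.warn("import from the public API instead",
                  DeprecationWarning, stacklevel=2)


def third_party_call():
    # Simulates a downstream library (e.g. patsy via statsmodels) hitting
    # the deprecated path on behalf of the user.
    deprecated_helper()


with warnings.catch_warnings(record=True) as caught:
    # Opt in explicitly; with the stock filters a DeprecationWarning raised
    # from library code would normally be silenced, unlike a FutureWarning.
    warnings.simplefilter("always")
    third_party_call()

assert caught and issubclass(caught[-1].category, DeprecationWarning)
```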
Author: Joris Van den Bossche Closes #13990 from jorisvandenbossche/common-depr-warning and squashes the following commits: 2de5d48 [Joris Van den Bossche] Use DeprecationWarning for core.common deprecations (GH13634) --- doc/source/whatsnew/v0.19.0.txt | 6 +++++- pandas/api/tests/test_api.py | 2 +- pandas/core/common.py | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0fafa6003c945..193987cb64136 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -31,7 +31,7 @@ pandas development API As part of making pandas APi more uniform and accessible in the future, we have created a standard sub-package of pandas, ``pandas.api`` to hold public API's. We are starting by exposing type introspection functions in ``pandas.api.types``. More sub-packages and officially sanctioned API's -will be published in future versions of pandas. +will be published in future versions of pandas (:issue:`13147`, :issue:`13634`) The following are now part of this API: @@ -42,6 +42,10 @@ The following are now part of this API: funcs = [ f for f in dir(types) if not f.startswith('_') ] pprint.pprint(funcs) +.. note:: + + Calling these functions from the internal module ``pandas.core.common`` will now show a ``DeprecationWarning`` (:issue:`13990`) + .. _whatsnew_0190.enhancements.asof_merge: ``merge_asof`` for asof-style time-series joining diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index fda81ee6c9045..2537354091ad1 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -163,7 +163,7 @@ def test_types(self): self.check(types, self.allowed) def check_deprecation(self, fold, fnew): - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(DeprecationWarning): try: result = fold('foo') expected = fnew('foo') diff --git a/pandas/core/common.py b/pandas/core/common.py index 054b899f1ded2..341bd3b4cc845 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -31,7 +31,7 @@ def wrapper(*args, **kwargs): warnings.warn("pandas.core.common.{t} is deprecated. " "import from the public API: " "pandas.api.types.{t} instead".format(t=t), - FutureWarning, stacklevel=3) + DeprecationWarning, stacklevel=3) return getattr(types, t)(*args, **kwargs) return wrapper @@ -57,7 +57,7 @@ def wrapper(*args, **kwargs): "These are not longer public API functions, " "but can be imported from " "pandas.types.common.{t} instead".format(t=t), - FutureWarning, stacklevel=3) + DeprecationWarning, stacklevel=3) return getattr(common, t)(*args, **kwargs) return wrapper From 3760f169a9a15f7ae61154c978b544ef8dca15bd Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 18:33:32 -0400 Subject: [PATCH 255/359] BUG: Index may ignore specified datetime/timedelta dtypes - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` - [x] whatsnew entry ``Index`` ignores specified ``dtype`` if it is ``datetime64`` (normal and tz) or ``timedelta64``. This PR makes it consistent with ``DatetimeIndex`` and ``TimedeltaIndex``. ``` pd.Index([1, 2, 3], dtype='datetime64[ns, US/Eastern]') # Index([1, 2, 3], dtype='object') pd.Index([1, 2, 3], dtype='datetime64[ns]') # Index([1, 2, 3], dtype='object') pd.Index([1, 2, 3], dtype='timedelta64[ns]') # Int64Index([1, 2, 3], dtype='int64') ``` Also, fixed ``MultiIndex.get_level_values`` not to pass unnecessary ``tz`` and ``freq``. 
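A minimal usage sketch of the consistency this aims for, mirroring the new tests (assumes the patched behaviour; the sample values built with ``pd.date_range`` / ``pd.timedelta_range`` are purely illustrative):

```
import pandas as pd
import pandas.util.testing as tm

values = pd.date_range('2011-01-01', periods=5).values

# With the fix, the requested dtype is honoured and construction is routed
# to DatetimeIndex, matching what DatetimeIndex builds from the same values.
tm.assert_index_equal(pd.Index(values, dtype='datetime64[ns]'),
                      pd.DatetimeIndex(values))

# The same holds for timedelta64 data and TimedeltaIndex.
tdi = pd.timedelta_range('1 days', periods=5)
tm.assert_index_equal(pd.Index(tdi.values, dtype='timedelta64[ns]'),
                      pd.TimedeltaIndex(tdi.values))
```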
Author: sinhrks Closes #13981 from sinhrks/index_init_datetimelike and squashes the following commits: a922aef [sinhrks] BUG: Index may ignore specified datetime/timedelta dtypes --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/base.py | 16 +++++++----- pandas/indexes/multi.py | 5 +--- pandas/tests/indexes/test_base.py | 39 ++++++++++++++++++++++++++++++ pandas/tests/indexes/test_multi.py | 24 ++++++++++++++++++ pandas/util/testing.py | 5 +--- 6 files changed, 76 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 193987cb64136..d6e46065074fa 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -957,6 +957,7 @@ Bug Fixes - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) +- Bug in ``Index`` may ignores specified ``datetime64`` or ``timedelta64`` ``dtype`` (:issue:`13981`) - Bug in ``RangeIndex`` can be created without no arguments rather than raises ``TypeError`` (:issue:`13793`) - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index de7780d25b1e5..b638e61d8eebe 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -19,7 +19,6 @@ from pandas.types.missing import isnull, array_equivalent from pandas.types.common import (_ensure_int64, _ensure_object, _ensure_platform_int, - is_datetimetz, is_integer, is_float, is_dtype_equal, @@ -27,6 +26,8 @@ is_categorical_dtype, is_bool_dtype, is_integer_dtype, is_float_dtype, + is_datetime64_any_dtype, + is_timedelta64_dtype, needs_i8_conversion, is_iterator, is_list_like, is_scalar) @@ -162,16 +163,19 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): - if (issubclass(data.dtype.type, np.datetime64) or - is_datetimetz(data)): + if (is_datetime64_any_dtype(data) or + (dtype is not None and is_datetime64_any_dtype(dtype)) or + 'tz' in kwargs): from pandas.tseries.index import DatetimeIndex - result = DatetimeIndex(data, copy=copy, name=name, **kwargs) - if dtype is not None and _o_dtype == dtype: + result = DatetimeIndex(data, copy=copy, name=name, + dtype=dtype, **kwargs) + if dtype is not None and is_dtype_equal(_o_dtype, dtype): return Index(result.to_pydatetime(), dtype=_o_dtype) else: return result - elif issubclass(data.dtype.type, np.timedelta64): + elif (is_timedelta64_dtype(data) or + (dtype is not None and is_timedelta64_dtype(dtype))): from pandas.tseries.tdi import TimedeltaIndex result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 95ef18d23a037..cc279076f7a5e 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -687,10 +687,7 @@ def get_level_values(self, level): labels = self.labels[num] filled = algos.take_1d(unique.values, labels, fill_value=unique._na_value) - _simple_new = unique._simple_new - values = _simple_new(filled, name=self.names[num], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + values 
= unique._shallow_copy(filled) return values def format(self, space=2, sparsify=None, adjoin=True, names=False, diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 88e49c4b55c8a..3c9040021fdbf 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -323,6 +323,45 @@ def test_constructor_dtypes(self): self.assertIsInstance(idx, Index) self.assertEqual(idx.dtype, object) + def test_constructor_dtypes_datetime(self): + + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + idx = pd.date_range('2011-01-01', periods=5, tz=tz) + dtype = idx.dtype + + # pass values without timezone, as DatetimeIndex localizes it + for values in [pd.date_range('2011-01-01', periods=5).values, + pd.date_range('2011-01-01', periods=5).asi8]: + + for res in [pd.Index(values, tz=tz), + pd.Index(values, dtype=dtype), + pd.Index(list(values), tz=tz), + pd.Index(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + + # check compat with DatetimeIndex + for res in [pd.DatetimeIndex(values, tz=tz), + pd.DatetimeIndex(values, dtype=dtype), + pd.DatetimeIndex(list(values), tz=tz), + pd.DatetimeIndex(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + + def test_constructor_dtypes_timedelta(self): + + idx = pd.timedelta_range('1 days', periods=5) + dtype = idx.dtype + + for values in [idx.values, idx.asi8]: + + for res in [pd.Index(values, dtype=dtype), + pd.Index(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + + # check compat with TimedeltaIndex + for res in [pd.TimedeltaIndex(values, dtype=dtype), + pd.TimedeltaIndex(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 809e1ab05ef6e..bdca91253e37b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -632,6 +632,30 @@ def test_from_arrays_index_series_period(self): tm.assert_index_equal(result, result2) + def test_from_arrays_index_datetimelike_mixed(self): + idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, + tz='US/Eastern') + idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3) + idx3 = pd.timedelta_range('1 days', freq='D', periods=3) + idx4 = pd.period_range('2011-01-01', freq='D', periods=3) + + result = pd.MultiIndex.from_arrays([idx1, idx2, idx3, idx4]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + tm.assert_index_equal(result.get_level_values(2), idx3) + tm.assert_index_equal(result.get_level_values(3), idx4) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), + pd.Series(idx2), + pd.Series(idx3), + pd.Series(idx4)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + tm.assert_index_equal(result2.get_level_values(2), idx3) + tm.assert_index_equal(result2.get_level_values(3), idx4) + + tm.assert_index_equal(result, result2) + def test_from_arrays_different_lengths(self): # GH13599 idx1 = [1, 2, 3] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index e95808ddc8225..2d1d88b69941b 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -749,10 +749,7 @@ def _get_ilevel_values(index, level): unique = index.levels[level] labels = index.labels[level] filled = take_1d(unique.values, labels, fill_value=unique._na_value) - values = 
unique._simple_new(filled, - name=index.names[level], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + values = unique._shallow_copy(filled, name=index.names[level]) return values # instance validation From 7a8d224b09d81e7ecedd77eba79b91e466fa89b2 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 18:41:53 -0400 Subject: [PATCH 256/359] BUG: freqstr may be parsed incorrectly Author: sinhrks Closes #13930 from sinhrks/freq_invalid and squashes the following commits: e797a2d [sinhrks] BUG: freqstr may be parsed incorrectly --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/tseries/frequencies.py | 11 +- pandas/tseries/tests/test_frequencies.py | 386 ++++++++++++----------- 3 files changed, 216 insertions(+), 184 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d6e46065074fa..722f2613418e8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -957,7 +957,7 @@ Bug Fixes - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) -- Bug in ``Index`` may ignores specified ``datetime64`` or ``timedelta64`` ``dtype`` (:issue:`13981`) +- Bug in ``Index`` may ignore specified ``datetime64`` or ``timedelta64`` passed as ``dtype`` (:issue:`13981`) - Bug in ``RangeIndex`` can be created without no arguments rather than raises ``TypeError`` (:issue:`13793`) - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) @@ -977,6 +977,7 @@ Bug Fixes - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) +- Bug in invalid frequency offset string like "D1", "-2-3H" may not raise ``ValueError (:issue:`13930`) - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3011e8dc0ae3d..ac094c1f545f3 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -464,7 +464,14 @@ def to_offset(freq): delta = None stride_sign = None try: - for stride, name, _ in opattern.findall(freq): + splitted = re.split(opattern, freq) + if splitted[-1] != '' and not splitted[-1].isspace(): + # the last element must be blank + raise ValueError('last element must be blank') + for sep, stride, name in zip(splitted[0::4], splitted[1::4], + splitted[2::4]): + if sep != '' and not sep.isspace(): + raise ValueError('separator must be spaces') offset = get_offset(name) if stride_sign is None: stride_sign = -1 if stride.startswith('-') else 1 @@ -486,7 +493,7 @@ def to_offset(freq): # hack to handle WOM-1MON -opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-@][\dA-Za-z\-]+)?)') +opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)') def _base_and_stride(freqstr): diff --git a/pandas/tseries/tests/test_frequencies.py 
b/pandas/tseries/tests/test_frequencies.py index 268933fada7a2..5ba98f15aed8d 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -18,187 +18,210 @@ from pandas import Timedelta -def test_to_offset_multiple(): - freqstr = '2h30min' - freqstr2 = '2h 30min' - - result = frequencies.to_offset(freqstr) - assert (result == frequencies.to_offset(freqstr2)) - expected = offsets.Minute(150) - assert (result == expected) - - freqstr = '2h30min15s' - result = frequencies.to_offset(freqstr) - expected = offsets.Second(150 * 60 + 15) - assert (result == expected) - - freqstr = '2h 60min' - result = frequencies.to_offset(freqstr) - expected = offsets.Hour(3) - assert (result == expected) - - freqstr = '15l500u' - result = frequencies.to_offset(freqstr) - expected = offsets.Micro(15500) - assert (result == expected) - - freqstr = '10s75L' - result = frequencies.to_offset(freqstr) - expected = offsets.Milli(10075) - assert (result == expected) - - freqstr = '2800N' - result = frequencies.to_offset(freqstr) - expected = offsets.Nano(2800) - assert (result == expected) - - freqstr = '2SM' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthEnd(2) - assert (result == expected) - - freqstr = '2SM-16' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthEnd(2, day_of_month=16) - assert (result == expected) - - freqstr = '2SMS-14' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthBegin(2, day_of_month=14) - assert (result == expected) - - freqstr = '2SMS-15' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthBegin(2) - assert (result == expected) - - # malformed - try: - frequencies.to_offset('2h20m') - except ValueError: - pass - else: - assert (False) - - -def test_to_offset_negative(): - freqstr = '-1S' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - freqstr = '-5min10s' - result = frequencies.to_offset(freqstr) - assert (result.n == -310) - - freqstr = '-2SM' - result = frequencies.to_offset(freqstr) - assert (result.n == -2) - - freqstr = '-1SMS' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - -def test_to_offset_leading_zero(): - freqstr = '00H 00T 01S' - result = frequencies.to_offset(freqstr) - assert (result.n == 1) - - freqstr = '-00H 03T 14S' - result = frequencies.to_offset(freqstr) - assert (result.n == -194) - - -def test_to_offset_pd_timedelta(): - # Tests for #9064 - td = Timedelta(days=1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(86401) - assert (expected == result) - - td = Timedelta(days=-1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(-86399) - assert (expected == result) - - td = Timedelta(hours=1, minutes=10) - result = frequencies.to_offset(td) - expected = offsets.Minute(70) - assert (expected == result) - - td = Timedelta(hours=1, minutes=-10) - result = frequencies.to_offset(td) - expected = offsets.Minute(50) - assert (expected == result) - - td = Timedelta(weeks=1) - result = frequencies.to_offset(td) - expected = offsets.Day(7) - assert (expected == result) - - td1 = Timedelta(hours=1) - result1 = frequencies.to_offset(td1) - result2 = frequencies.to_offset('60min') - assert (result1 == result2) - - td = Timedelta(microseconds=1) - result = frequencies.to_offset(td) - expected = offsets.Micro(1) - assert (expected == result) - - td = Timedelta(microseconds=0) - tm.assertRaises(ValueError, lambda: frequencies.to_offset(td)) - - -def 
test_anchored_shortcuts(): - result = frequencies.to_offset('W') - expected = frequencies.to_offset('W-SUN') - assert (result == expected) - - result1 = frequencies.to_offset('Q') - result2 = frequencies.to_offset('Q-DEC') - expected = offsets.QuarterEnd(startingMonth=12) - assert (result1 == expected) - assert (result2 == expected) - - result1 = frequencies.to_offset('Q-MAY') - expected = offsets.QuarterEnd(startingMonth=5) - assert (result1 == expected) - - result1 = frequencies.to_offset('SM') - result2 = frequencies.to_offset('SM-15') - expected = offsets.SemiMonthEnd(day_of_month=15) - assert (result1 == expected) - assert (result2 == expected) - - result = frequencies.to_offset('SM-1') - expected = offsets.SemiMonthEnd(day_of_month=1) - assert (result == expected) - - result = frequencies.to_offset('SM-27') - expected = offsets.SemiMonthEnd(day_of_month=27) - assert (result == expected) - - result = frequencies.to_offset('SMS-2') - expected = offsets.SemiMonthBegin(day_of_month=2) - assert (result == expected) - - result = frequencies.to_offset('SMS-27') - expected = offsets.SemiMonthBegin(day_of_month=27) - assert (result == expected) - - # ensure invalid cases fail as expected - invalid_anchors = ['SM-0', 'SM-28', 'SM-29', - 'SM-FOO', 'BSM', 'SM--1' - 'SMS-1', 'SMS-28', 'SMS-30', - 'SMS-BAR', 'BSMS', 'SMS--2'] - for invalid_anchor in invalid_anchors: - try: - frequencies.to_offset(invalid_anchor) - except ValueError: - pass - else: - raise AssertionError(invalid_anchor) +class TestToOffset(tm.TestCase): + + def test_to_offset_multiple(self): + freqstr = '2h30min' + freqstr2 = '2h 30min' + + result = frequencies.to_offset(freqstr) + assert (result == frequencies.to_offset(freqstr2)) + expected = offsets.Minute(150) + assert (result == expected) + + freqstr = '2h30min15s' + result = frequencies.to_offset(freqstr) + expected = offsets.Second(150 * 60 + 15) + assert (result == expected) + + freqstr = '2h 60min' + result = frequencies.to_offset(freqstr) + expected = offsets.Hour(3) + assert (result == expected) + + freqstr = '15l500u' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(15500) + assert (result == expected) + + freqstr = '10s75L' + result = frequencies.to_offset(freqstr) + expected = offsets.Milli(10075) + assert (result == expected) + + freqstr = '2800N' + result = frequencies.to_offset(freqstr) + expected = offsets.Nano(2800) + assert (result == expected) + + freqstr = '2SM' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2) + assert (result == expected) + + freqstr = '2SM-16' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2, day_of_month=16) + assert (result == expected) + + freqstr = '2SMS-14' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2, day_of_month=14) + assert (result == expected) + + freqstr = '2SMS-15' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2) + assert (result == expected) + + # malformed + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 2h20m'): + frequencies.to_offset('2h20m') + + def test_to_offset_negative(self): + freqstr = '-1S' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + + freqstr = '-5min10s' + result = frequencies.to_offset(freqstr) + assert (result.n == -310) + + freqstr = '-2SM' + result = frequencies.to_offset(freqstr) + assert (result.n == -2) + + freqstr = '-1SMS' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + + def 
test_to_offset_invalid(self): + # GH 13930 + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: U1'): + frequencies.to_offset('U1') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -U'): + frequencies.to_offset('-U') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 3U1'): + frequencies.to_offset('3U1') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2-3U'): + frequencies.to_offset('-2-3U') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'): + frequencies.to_offset('-2D:3H') + + # ToDo: Must be fixed in #8419 + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'): + frequencies.to_offset('.5S') + + # split offsets with spaces are valid + assert frequencies.to_offset('2D 3H') == offsets.Hour(51) + assert frequencies.to_offset('2 D3 H') == offsets.Hour(51) + assert frequencies.to_offset('2 D 3 H') == offsets.Hour(51) + assert frequencies.to_offset(' 2 D 3 H ') == offsets.Hour(51) + assert frequencies.to_offset(' H ') == offsets.Hour() + assert frequencies.to_offset(' 3 H ') == offsets.Hour(3) + + # special cases + assert frequencies.to_offset('2SMS-15') == offsets.SemiMonthBegin(2) + with tm.assertRaisesRegexp(ValueError, + 'Invalid frequency: 2SMS-15-15'): + frequencies.to_offset('2SMS-15-15') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 2SMS-15D'): + frequencies.to_offset('2SMS-15D') + + def test_to_offset_leading_zero(self): + freqstr = '00H 00T 01S' + result = frequencies.to_offset(freqstr) + assert (result.n == 1) + + freqstr = '-00H 03T 14S' + result = frequencies.to_offset(freqstr) + assert (result.n == -194) + + def test_to_offset_pd_timedelta(self): + # Tests for #9064 + td = Timedelta(days=1, seconds=1) + result = frequencies.to_offset(td) + expected = offsets.Second(86401) + assert (expected == result) + + td = Timedelta(days=-1, seconds=1) + result = frequencies.to_offset(td) + expected = offsets.Second(-86399) + assert (expected == result) + + td = Timedelta(hours=1, minutes=10) + result = frequencies.to_offset(td) + expected = offsets.Minute(70) + assert (expected == result) + + td = Timedelta(hours=1, minutes=-10) + result = frequencies.to_offset(td) + expected = offsets.Minute(50) + assert (expected == result) + + td = Timedelta(weeks=1) + result = frequencies.to_offset(td) + expected = offsets.Day(7) + assert (expected == result) + + td1 = Timedelta(hours=1) + result1 = frequencies.to_offset(td1) + result2 = frequencies.to_offset('60min') + assert (result1 == result2) + + td = Timedelta(microseconds=1) + result = frequencies.to_offset(td) + expected = offsets.Micro(1) + assert (expected == result) + + td = Timedelta(microseconds=0) + tm.assertRaises(ValueError, lambda: frequencies.to_offset(td)) + + def test_anchored_shortcuts(self): + result = frequencies.to_offset('W') + expected = frequencies.to_offset('W-SUN') + assert (result == expected) + + result1 = frequencies.to_offset('Q') + result2 = frequencies.to_offset('Q-DEC') + expected = offsets.QuarterEnd(startingMonth=12) + assert (result1 == expected) + assert (result2 == expected) + + result1 = frequencies.to_offset('Q-MAY') + expected = offsets.QuarterEnd(startingMonth=5) + assert (result1 == expected) + + result1 = frequencies.to_offset('SM') + result2 = frequencies.to_offset('SM-15') + expected = offsets.SemiMonthEnd(day_of_month=15) + assert (result1 == expected) + assert (result2 == expected) + + result = frequencies.to_offset('SM-1') + expected = offsets.SemiMonthEnd(day_of_month=1) + assert (result == expected) + + result 
= frequencies.to_offset('SM-27') + expected = offsets.SemiMonthEnd(day_of_month=27) + assert (result == expected) + + result = frequencies.to_offset('SMS-2') + expected = offsets.SemiMonthBegin(day_of_month=2) + assert (result == expected) + + result = frequencies.to_offset('SMS-27') + expected = offsets.SemiMonthBegin(day_of_month=27) + assert (result == expected) + + # ensure invalid cases fail as expected + invalid_anchors = ['SM-0', 'SM-28', 'SM-29', + 'SM-FOO', 'BSM', 'SM--1' + 'SMS-1', 'SMS-28', 'SMS-30', + 'SMS-BAR', 'BSMS', 'SMS--2'] + for invalid_anchor in invalid_anchors: + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: '): + frequencies.to_offset(invalid_anchor) def test_get_rule_month(): @@ -275,6 +298,7 @@ def _assert_depr(freq, expected, aliases): class TestFrequencyCode(tm.TestCase): + def test_freq_code(self): self.assertEqual(frequencies.get_freq('A'), 1000) self.assertEqual(frequencies.get_freq('3A'), 1000) From 22ef6f04ba1b95966e42bbc1dc86c1328b2eb2d0 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 20:10:07 -0400 Subject: [PATCH 257/359] CLN: Move PeriodIndex.repeat to DatetimeIndexOpsMixin related to #6469 Author: sinhrks Closes #13995 from sinhrks/period_repeat and squashes the following commits: c627549 [sinhrks] CLN: Move PeriodIndex.repeat to DatetimeIndexOpsMixin --- pandas/tseries/base.py | 7 ++- pandas/tseries/period.py | 14 ----- pandas/tseries/tests/test_base.py | 83 ++++++++++++++++++++----- pandas/tseries/tests/test_period.py | 9 --- pandas/tseries/tests/test_timeseries.py | 7 --- 5 files changed, 75 insertions(+), 45 deletions(-) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 188f538372092..d10e77d7ae45d 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -749,7 +749,12 @@ def repeat(self, repeats, *args, **kwargs): Analogous to ndarray.repeat """ nv.validate_repeat(args, kwargs) - return self._shallow_copy(self.values.repeat(repeats), freq=None) + if isinstance(self, ABCPeriodIndex): + freq = self.freq + else: + freq = None + return self._shallow_copy(self.asi8.repeat(repeats), + freq=freq) def where(self, cond, other=None): """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 831d73207bbdf..8126d5f1dbe87 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -34,7 +34,6 @@ from pandas.indexes.base import _index_shared_docs from pandas import compat -from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution from pandas.lib import Timedelta import pandas.tslib as tslib @@ -941,19 +940,6 @@ def append(self, other): for x in to_concat] return Index(com._concat_compat(to_concat), name=name) - def repeat(self, n, *args, **kwargs): - """ - Return a new Index of the values repeated `n` times. 
- - See also - -------- - numpy.ndarray.repeat - """ - nv.validate_repeat(args, kwargs) - - # overwrites method from DatetimeIndexOpsMixin - return self._shallow_copy(self.values.repeat(n)) - def __setstate__(self, state): """Necessary for making this object picklable""" diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 05f7d9d9ce7b8..5e8f8d3fbc01f 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -170,23 +170,39 @@ def test_round(self): tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') - def test_repeat(self): - reps = 2 - - for tz in self.tz: - rng = pd.date_range(start='2016-01-01', periods=2, - freq='30Min', tz=tz) + def test_repeat_range(self): + rng = date_range('1/1/2000', '1/1/2001') - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - ]) + result = rng.repeat(5) + self.assertIsNone(result.freq) + self.assertEqual(len(result), 5 * len(rng)) - tm.assert_index_equal(rng.repeat(reps), expected_rng) + for tz in self.tz: + index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-02', '2001-01-02'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-03', '2001-01-03'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], + tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', + 'NaT', 'NaT', 'NaT', + '2003-01-01', '2003-01-01', '2003-01-01'], + tz=tz) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) - def test_numpy_repeat(self): + def test_repeat(self): reps = 2 msg = "the 'axis' parameter is not supported" @@ -201,6 +217,10 @@ def test_numpy_repeat(self): Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), ]) + res = rng.repeat(reps) + tm.assert_index_equal(res, expected_rng) + self.assertIsNone(res.freq) + tm.assert_index_equal(np.repeat(rng, reps), expected_rng) tm.assertRaisesRegexp(ValueError, msg, np.repeat, rng, reps, axis=1) @@ -1605,6 +1625,21 @@ def test_shift(self): name='xxx') tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + def test_repeat(self): + index = pd.timedelta_range('1 days', periods=2, freq='D') + exp = pd.TimedeltaIndex(['1 days', '1 days', '2 days', '2 days']) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = TimedeltaIndex(['1 days', 'NaT', '3 days']) + exp = TimedeltaIndex(['1 days', '1 days', '1 days', + 'NaT', 'NaT', 'NaT', + '3 days', '3 days', '3 days']) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + class TestPeriodIndexOps(Ops): def setUp(self): @@ -2526,6 +2561,26 @@ def test_shift(self): '2011-01-01 09:00'], name='xxx', freq='H') tm.assert_index_equal(idx.shift(-3), exp) + def test_repeat(self): + index = pd.period_range('2001-01-01', periods=2, freq='D') + 
exp = pd.PeriodIndex(['2001-01-01', '2001-01-01', + '2001-01-02', '2001-01-02'], freq='D') + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + + index = pd.period_range('2001-01-01', periods=2, freq='2D') + exp = pd.PeriodIndex(['2001-01-01', '2001-01-01', + '2001-01-03', '2001-01-03'], freq='2D') + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + + index = pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M') + exp = pd.PeriodIndex(['2001-01', '2001-01', '2001-01', + 'NaT', 'NaT', 'NaT', + '2003-01', '2003-01', '2003-01'], freq='M') + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + if __name__ == '__main__': import nose diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index d7f1a52612819..8baac297fe57b 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -2558,15 +2558,6 @@ def test_constructor(self): vals = np.array(vals) self.assertRaises(ValueError, PeriodIndex, vals) - def test_repeat(self): - index = period_range('20010101', periods=2) - expected = PeriodIndex([ - Period('2001-01-01'), Period('2001-01-01'), - Period('2001-01-02'), Period('2001-01-02'), - ]) - - tm.assert_index_equal(index.repeat(2), expected) - def test_numpy_repeat(self): index = period_range('20010101', periods=2) expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index e5bbb923935e0..0544d8a8e32d4 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1342,13 +1342,6 @@ def test_format_pre_1900_dates(self): ts = Series(1, index=rng) repr(ts) - def test_repeat(self): - rng = date_range('1/1/2000', '1/1/2001') - - result = rng.repeat(5) - self.assertIsNone(result.freq) - self.assertEqual(len(result), 5 * len(rng)) - def test_at_time(self): rng = date_range('1/1/2000', '1/5/2000', freq='5min') ts = Series(np.random.randn(len(rng)), index=rng) From 0236d7501c6dc0ba6b8d0ec6b4357ad12e8fd9af Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 13 Aug 2016 20:11:13 -0400 Subject: [PATCH 258/359] TST: Fix unused tests in tseries/test_base.py Author: sinhrks Closes #13996 from sinhrks/period_base_test and squashes the following commits: 869ef66 [sinhrks] TST: Fix unused tests in tseries/test_base.py --- pandas/tseries/tests/test_base.py | 118 +++++++++++++++--------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 5e8f8d3fbc01f..413d0e8d90445 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -355,7 +355,7 @@ def test_resolution(self): ['day', 'day', 'day', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond']): - for tz in [None, 'Asia/Tokyo', 'US/Eastern']: + for tz in self.tz: idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) self.assertEqual(idx.resolution, expected) @@ -508,7 +508,7 @@ def test_comp_nat(self): def test_value_counts_unique(self): # GH 7735 - for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: + for tz in self.tz: idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), @@ -590,49 +590,50 @@ def test_order(self): self.assertEqual(ordered.freq.n, -1) # without 
freq - idx1 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], name='idx1') - exp1 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], name='idx1') - - idx2 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - tz='Asia/Tokyo', name='idx2') - - # TODO(wesm): unused? - - # exp2 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', - # '2011-01-03', '2011-01-05'], - # tz='Asia/Tokyo', name='idx2') - - # idx3 = DatetimeIndex([pd.NaT, '2011-01-03', '2011-01-05', - # '2011-01-02', pd.NaT], name='idx3') - # exp3 = DatetimeIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - # '2011-01-05'], name='idx3') - - for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: - ordered = idx.sort_values() - self.assert_index_equal(ordered, expected) - self.assertIsNone(ordered.freq) - - ordered = idx.sort_values(ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True) - self.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - self.assert_numpy_array_equal(indexer, exp, check_dtype=False) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0]) - self.assert_numpy_array_equal(indexer, exp, check_dtype=False) - self.assertIsNone(ordered.freq) + for tz in self.tz: + idx1 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + tz=tz, name='idx1') + exp1 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + tz=tz, name='idx1') + + idx2 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + tz=tz, name='idx2') + + exp2 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + tz=tz, name='idx2') + + idx3 = DatetimeIndex([pd.NaT, '2011-01-03', '2011-01-05', + '2011-01-02', pd.NaT], tz=tz, name='idx3') + exp3 = DatetimeIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', + '2011-01-05'], tz=tz, name='idx3') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: + ordered = idx.sort_values() + self.assert_index_equal(ordered, expected) + self.assertIsNone(ordered.freq) + + ordered = idx.sort_values(ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True) + self.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True, + ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) def test_getitem(self): idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -782,7 +783,7 @@ def test_nat_new(self): def test_shift(self): # GH 9903 - for tz in [None, 'US/Eastern', 'Asia/Tokyo']: + for tz in self.tz: idx = pd.DatetimeIndex([], name='xxx', tz=tz) tm.assert_index_equal(idx.shift(0, freq='H'), idx) tm.assert_index_equal(idx.shift(3, freq='H'), idx) @@ -2400,20 +2401,19 @@ def test_order(self): exp1 = PeriodIndex(['2011-01-01', '2011-01-01', 
'2011-01-02', '2011-01-03', '2011-01-05'], freq='D', name='idx1') - # TODO(wesm): unused? - # idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', - # '2011-01-02', '2011-01-01'], - # freq='D', name='idx2') - # exp2 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', - # '2011-01-03', '2011-01-05'], - # freq='D', name='idx2') - - # idx3 = PeriodIndex([pd.NaT, '2011-01-03', '2011-01-05', - # '2011-01-02', pd.NaT], freq='D', name='idx3') - # exp3 = PeriodIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - # '2011-01-05'], freq='D', name='idx3') + idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + freq='D', name='idx2') + exp2 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + freq='D', name='idx2') - for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: + idx3 = PeriodIndex([pd.NaT, '2011-01-03', '2011-01-05', + '2011-01-02', pd.NaT], freq='D', name='idx3') + exp3 = PeriodIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', + '2011-01-05'], freq='D', name='idx3') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: ordered = idx.sort_values() self.assert_index_equal(ordered, expected) self.assertEqual(ordered.freq, 'D') From a0d05dbf7477df863486d82a0cdd4e3e93023864 Mon Sep 17 00:00:00 2001 From: agraboso Date: Sat, 13 Aug 2016 20:15:17 -0400 Subject: [PATCH 259/359] BUG: groupby cumsum with axis=1 computes cumprod closes #13993 Author: agraboso Closes #13994 from agraboso/fix-13993 and squashes the following commits: 56dc1c7 [agraboso] BUG: groupby cumsum with axis=1 computes cumprod --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/groupby.py | 2 +- pandas/tests/test_groupby.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 722f2613418e8..3600b8f52873b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -855,6 +855,7 @@ Bug Fixes ~~~~~~~~~ - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) +- Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. 
(:issue:`13994`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c2ab406e1da65..5c3c5bbfab9a8 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1397,7 +1397,7 @@ def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" nv.validate_groupby_func('cumsum', args, kwargs) if axis != 0: - return self.apply(lambda x: x.cumprod(axis=axis)) + return self.apply(lambda x: x.cumsum(axis=axis)) return self._cython_transform('cumsum') diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index cc588d891b398..bfaf157245c1a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2973,6 +2973,14 @@ def test_cython_api2(self): result = df.groupby('A', as_index=False).cumsum() assert_frame_equal(result, expected) + # GH 13994 + result = df.groupby('A').cumsum(axis=1) + expected = df.cumsum(axis=1) + assert_frame_equal(result, expected) + result = df.groupby('A').cumprod(axis=1) + expected = df.cumprod(axis=1) + assert_frame_equal(result, expected) + def test_grouping_ndarray(self): grouped = self.df.groupby(self.df['A'].values) From 5b0a8b01f575991cda4945fc5574a0c69e4f7f7c Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Mon, 15 Aug 2016 21:29:12 +0900 Subject: [PATCH 260/359] CLN: move _na_value to DatetimeIndexOpsMixin (#13997) --- pandas/tseries/base.py | 3 +++ pandas/tseries/index.py | 3 --- pandas/tseries/period.py | 4 ---- pandas/tseries/tdi.py | 3 --- pandas/tseries/tests/test_base.py | 12 ++++++++++++ 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index d10e77d7ae45d..353823e296cf8 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -353,6 +353,9 @@ def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values) + _na_value = tslib.NaT + """The expected NA value to use with this index.""" + @cache_readonly def _isnan(self): """ return if each value is nan""" diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index aa50fbe316b94..01728889a8595 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -667,9 +667,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return tslib.ints_to_pydatetime(self.asi8, self.tz) - _na_value = tslib.NaT - """The expected NA value to use with this index.""" - @cache_readonly def _is_dates_only(self): from pandas.formats.format import _is_dates_only diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8126d5f1dbe87..af46162038fef 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -320,10 +320,6 @@ def _coerce_scalar_to_index(self, item): """ return PeriodIndex([item], **self._get_attributes_dict()) - @property - def _na_value(self): - return self._box_func(tslib.iNaT) - def __contains__(self, key): if isinstance(key, Period): if key.freq != self.freq: diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 271fee6341324..a17eda3ac4288 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -282,9 +282,6 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): result._reset_identity() return 
result - _na_value = tslib.NaT - """The expected NA value to use with this index.""" - @property def _formatter_func(self): from pandas.formats.format import _get_format_timedelta64 diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 413d0e8d90445..800f9470f9845 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -798,6 +798,10 @@ def test_shift(self): '2011-01-01 09:00'], name='xxx', tz=tz) tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + def test_na_value(self): + self.assertIs(pd.DatetimeIndex._na_value, pd.NaT) + self.assertIs(pd.DatetimeIndex([])._na_value, pd.NaT) + class TestTimedeltaIndexOps(Ops): def setUp(self): @@ -1641,6 +1645,10 @@ def test_repeat(self): tm.assert_index_equal(res, exp) self.assertIsNone(res.freq) + def test_na_value(self): + self.assertIs(pd.TimedeltaIndex._na_value, pd.NaT) + self.assertIs(pd.TimedeltaIndex([])._na_value, pd.NaT) + class TestPeriodIndexOps(Ops): def setUp(self): @@ -2581,6 +2589,10 @@ def test_repeat(self): for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) + def test_na_value(self): + self.assertIs(pd.PeriodIndex._na_value, pd.NaT) + self.assertIs(pd.PeriodIndex([], freq='M')._na_value, pd.NaT) + if __name__ == '__main__': import nose From 5f496380beb81d3986ca48cf41b893d95c227895 Mon Sep 17 00:00:00 2001 From: Alex Vig Date: Mon, 15 Aug 2016 10:00:01 -0400 Subject: [PATCH 261/359] DOC: Added example and notes to NDFrame.where (#13798) --- doc/source/indexing.rst | 11 ++++++++ pandas/core/generic.py | 58 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 7380c543857a2..0a6691936d97d 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -828,6 +828,8 @@ To select a row where each column meets its own criterion: df[row_mask] +.. _indexing.where_mask: + The :meth:`~pandas.DataFrame.where` Method and Masking ------------------------------------------------------ @@ -891,6 +893,15 @@ without creating a copy: df_orig.where(df > 0, -df, inplace=True); df_orig +.. note:: + + The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. + Roughly ``df1.where(m, df2)`` is equivalent to ``np.where(m, df1, df2)``. + + .. ipython:: python + + df.where(df < 0, -df) == np.where(df < 0, df, -df) + **alignment** Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d0295afe990c8..62c99d99dd407 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4690,9 +4690,62 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, Returns ------- wh : same type as caller + + Notes + ----- + The %(name)s method is an application of the if-then idiom. For each + element in the calling DataFrame, if ``cond`` is ``%(cond)s`` the + element is used; otherwise the corresponding element from the DataFrame + ``other`` is used. + + The signature for :func:`DataFrame.where` differs from + :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to + ``np.where(m, df1, df2)``. + + For further details and examples see the ``%(name)s`` documentation in + :ref:`indexing `. 
+ + Examples + -------- + >>> s = pd.Series(range(5)) + >>> s.where(s > 0) + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + 4 4.0 + + >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) + >>> m = df %% 3 == 0 + >>> df.where(m, -df) + A B + 0 0 -1 + 1 -2 3 + 2 -4 -5 + 3 6 -7 + 4 -8 9 + >>> df.where(m, -df) == np.where(m, df, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + >>> df.where(m, -df) == df.mask(~m, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + + See Also + -------- + :func:`DataFrame.%(name_other)s` """) - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True")) + @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True", + name='where', name_other='mask')) def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): @@ -4700,7 +4753,8 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, return self._where(cond, other, inplace, axis, level, try_cast, raise_on_error) - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False")) + @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False", + name='mask', name_other='where')) def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): From 66f9591136492127dd44ecc5bbcf4d727c89d994 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Aug 2016 19:52:44 +0200 Subject: [PATCH 262/359] BUG: RangeIndex accepting length-1 arrays as argument (#13765) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/range.py | 14 ++++++++------ pandas/tests/indexes/test_range.py | 3 ++- pandas/tools/tests/test_concat.py | 14 ++++++++++++++ 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 3600b8f52873b..c58af7ad4e327 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -979,6 +979,7 @@ Bug Fixes - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) - Bug in invalid frequency offset string like "D1", "-2-3H" may not raise ``ValueError (:issue:`13930`) +- Bug in ``concat`` and ``groupby`` for hierarchical frames with ``RangeIndex`` levels (:issue:`13542`). - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 7094f8d589036..465ec4904f7ee 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -58,15 +58,17 @@ def __new__(cls, start=None, stop=None, step=None, name=None, dtype=None, # validate the arguments def _ensure_int(value, field): + msg = ("RangeIndex(...) must be called with integers," + " {value} was passed for {field}") + if not is_scalar(value): + raise TypeError(msg.format(value=type(value).__name__, + field=field)) try: new_value = int(value) assert(new_value == value) - except (ValueError, AssertionError): - raise TypeError("RangeIndex(...) 
must be called with integers," - " {value} was passed for {field}".format( - value=type(value).__name__, - field=field) - ) + except (TypeError, ValueError, AssertionError): + raise TypeError(msg.format(value=type(value).__name__, + field=field)) return new_value diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 8a036def0be1b..51333c46b7b3b 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -102,7 +102,8 @@ def test_constructor(self): # invalid args for i in [Index(['a', 'b']), Series(['a', 'b']), np.array(['a', 'b']), - [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10)]: + [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10), + np.array([1]), [1]]: self.assertRaises(TypeError, lambda: RangeIndex(i)) def test_constructor_same(self): diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index e3cc60e2856c2..17ccfb27d4b42 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -1444,6 +1444,20 @@ def test_default_index(self): tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + def test_concat_multiindex_rangeindex(self): + # GH13542 + # when multi-index levels are RangeIndex objects + # there is a bug in concat with objects of len 1 + + df = DataFrame(np.random.randn(9, 2)) + df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)], + labels=[np.repeat(np.arange(3), 3), + np.tile(np.arange(3), 3)]) + + res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) + exp = df.iloc[[2, 3, 4, 5], :] + tm.assert_frame_equal(res, exp) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From ef61bebecf48c1bd9cd1a344e12b37913cc3d1d6 Mon Sep 17 00:00:00 2001 From: cr3 Date: Mon, 15 Aug 2016 13:53:24 -0400 Subject: [PATCH 263/359] Fixed to_string with line_width and without index (#13998) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/formats/format.py | 7 +++++-- pandas/tests/formats/test_format.py | 8 ++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c58af7ad4e327..d436d4dd3703b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -990,3 +990,4 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`) +- Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. 
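As a quick illustration of the fix, a minimal sketch mirroring the new test added in pandas/tests/formats/test_format.py below (the exact wrapped output is what that test asserts):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    >>> # previously raised UnboundLocalError because ``idx`` was referenced before assignment
    >>> df.to_string(line_width=1, index=False)
    'x \\\n1 \n2 \n3 \n\ny \n4 \n5 \n6'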
diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 50d54ddb95100..f89ceaff2ad64 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -68,7 +68,9 @@ Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row, default True index_names : bool, optional - Prints the names of the indexes, default True""" + Prints the names of the indexes, default True + line_width : int, optional + Width to wrap a line in characters, default no wrap""" justify_docstring = """ justify : {'left', 'right'}, default None @@ -632,7 +634,8 @@ def _join_multiline(self, *strcols): st = 0 for i, ed in enumerate(col_bins): row = strcols[st:ed] - row.insert(0, idx) + if self.index: + row.insert(0, idx) if nbins > 1: if ed <= len(strcols) and i < nbins - 1: row.append([' \\'] + [' '] * (nrows - 1)) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 1580a33fb9456..8a4aca2b320aa 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -1964,6 +1964,14 @@ def test_to_string_no_index(self): self.assertEqual(df_s, expected) + def test_to_string_line_width_no_index(self): + df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = "x \\\n1 \n2 \n3 \n\ny \n4 \n5 \n6" + + self.assertEqual(df_s, expected) + def test_to_string_float_formatting(self): self.reset_display_options() fmt.set_option('display.precision', 5, 'display.column_space', 12, From f93ad1ca828dc70a865445f1555958acbf132af1 Mon Sep 17 00:00:00 2001 From: Chris Grinolds Date: Mon, 15 Aug 2016 13:19:32 -0700 Subject: [PATCH 264/359] Ensure total_rows is always set (#13244) --- pandas/io/gbq.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 6288fdb609962..f4122d8d8b286 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -441,9 +441,8 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): rows = [] remaining_rows = len(dataframe) - if self.verbose: - total_rows = remaining_rows - self._print("\n\n") + total_rows = remaining_rows + self._print("\n\n") for index, row in dataframe.reset_index(drop=True).iterrows(): row_dict = dict() From 8b50d8c854196e3025c9a881cafeedc5f509aaef Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 15 Aug 2016 18:22:40 -0400 Subject: [PATCH 265/359] BUG: Don't error in pd.to_timedelta when errors=ignore Title is self-explanatory. Closes #13613. 
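For illustration, a minimal sketch of the behaviour exercised by the new tests below (exact reprs may differ slightly between versions):

    >>> import pandas as pd
    >>> pd.to_timedelta('apple', errors='ignore')    # unparseable scalar is returned unchanged
    'apple'
    >>> pd.to_timedelta(['apple', '1 days'], errors='ignore')   # list input comes back as an object ndarray
    array(['apple', '1 days'], dtype=object)
    >>> pd.to_timedelta(['apple', '1 days'], errors='coerce')   # coerce still maps bad values to NaT
    TimedeltaIndex([NaT, '1 days'], dtype='timedelta64[ns]', freq=None)

With ``errors='raise'`` (the default) such inputs continue to raise ``ValueError``, and an invalid value for the ``errors`` parameter itself now raises with "errors must be one of ...".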
Author: gfyoung Closes #13832 from gfyoung/to-timedelta-error-bug and squashes the following commits: dc39205 [gfyoung] BUG: Don't error in pd.to_timedelta when errors=ignore --- asv_bench/benchmarks/timedelta.py | 17 +++- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/src/inference.pyx | 2 +- pandas/tseries/tests/test_timedeltas.py | 23 +++++ pandas/tseries/timedeltas.py | 89 ++++++++++++------ pandas/tslib.pxd | 2 +- pandas/tslib.pyx | 120 +++++++++--------------- 7 files changed, 146 insertions(+), 108 deletions(-) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 2f252a4d3e1dc..9719fd87dfb2e 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -31,4 +31,19 @@ def setup(self): self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] def time_timedelta_convert_string_seconds(self): - to_timedelta(self.arr) \ No newline at end of file + to_timedelta(self.arr) + + +class timedelta_convert_bad_parse(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randint(0, 1000, size=10000) + self.arr = ['{0} days'.format(i) for i in self.arr] + self.arr[-1] = 'apple' + + def time_timedelta_convert_coerce(self): + to_timedelta(self.arr, errors='coerce') + + def time_timedelta_convert_ignore(self): + to_timedelta(self.arr, errors='ignore') diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d436d4dd3703b..e3cdefd36b0be 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -858,6 +858,7 @@ Bug Fixes - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. (:issue:`13994`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) +- Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (issue:`9161`, :issue:`13544`) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 039e0df4193b3..62555dc7f178c 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -780,7 +780,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, 'ns', False) + itimedeltas[i] = convert_to_timedelta64(val, 'ns') seen_timedelta = 1 else: seen_object = 1 diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 0bdf8590ec487..159d2b4f52f2a 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -845,6 +845,11 @@ def testit(unit, transform): def test_to_timedelta_invalid(self): + # bad value for errors parameter + msg = "errors must be one of" + tm.assertRaisesRegexp(ValueError, msg, to_timedelta, + ['foo'], errors='never') + # these will error self.assertRaises(ValueError, lambda: to_timedelta([1, 2], unit='foo')) self.assertRaises(ValueError, lambda: to_timedelta(1, unit='foo')) @@ -862,6 +867,24 @@ def 
test_to_timedelta_invalid(self): to_timedelta(['1 day', 'bar', '1 min'], errors='coerce')) + # gh-13613: these should not error because errors='ignore' + invalid_data = 'apple' + self.assertEqual(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + + invalid_data = ['apple', '1 days'] + tm.assert_numpy_array_equal( + np.array(invalid_data, dtype=object), + to_timedelta(invalid_data, errors='ignore')) + + invalid_data = pd.Index(['apple', '1 days']) + tm.assert_index_equal(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + + invalid_data = Series(['apple', '1 days']) + tm.assert_series_equal(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + def test_to_timedelta_via_apply(self): # GH 5458 expected = Series([np.timedelta64(1, 's')]) diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 7f28ec86ec40d..2ca3fcea8005b 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -3,7 +3,9 @@ """ import numpy as np +import pandas as pd import pandas.tslib as tslib + from pandas.types.common import (_ensure_object, is_integer_dtype, is_timedelta64_dtype, @@ -64,37 +66,22 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise', coerce=None): """ unit = _validate_timedelta_unit(unit) - def _convert_listlike(arg, box, unit, name=None): - - if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): - arg = np.array(list(arg), dtype='O') - - # these are shortcutable - if is_timedelta64_dtype(arg): - value = arg.astype('timedelta64[ns]') - elif is_integer_dtype(arg): - value = arg.astype('timedelta64[{0}]'.format( - unit)).astype('timedelta64[ns]', copy=False) - else: - value = tslib.array_to_timedelta64(_ensure_object(arg), - unit=unit, errors=errors) - value = value.astype('timedelta64[ns]', copy=False) - - if box: - from pandas import TimedeltaIndex - value = TimedeltaIndex(value, unit='ns', name=name) - return value + if errors not in ('ignore', 'raise', 'coerce'): + raise ValueError("errors must be one of 'ignore', " + "'raise', or 'coerce'}") if arg is None: return arg elif isinstance(arg, ABCSeries): from pandas import Series - values = _convert_listlike(arg._values, box=False, unit=unit) - return Series(values, index=arg.index, name=arg.name, dtype='m8[ns]') + values = _convert_listlike(arg._values, unit=unit, + box=False, errors=errors) + return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): - return _convert_listlike(arg, box=box, unit=unit, name=arg.name) + return _convert_listlike(arg, unit=unit, box=box, + errors=errors, name=arg.name) elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: - return _convert_listlike(arg, box=box, unit=unit) + return _convert_listlike(arg, unit=unit, box=box, errors=errors) elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, timedelta, list, tuple, ' '1-d array, or Series') @@ -142,13 +129,55 @@ def _validate_timedelta_unit(arg): def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): - """ - convert strings to timedelta; coerce to Timedelta (if box), else - np.timedelta64 - """ + """Convert string 'r' to a timedelta object.""" + + try: + result = tslib.convert_to_timedelta64(r, unit) + except ValueError: + if errors == 'raise': + raise + elif errors == 'ignore': + return r + + # coerce + result = pd.NaT - result = tslib.convert_to_timedelta(r, unit, errors) if box: result = tslib.Timedelta(result) - return result + + +def _convert_listlike(arg, unit='ns', box=True, errors='raise', 
name=None): + """Convert a list of objects to a timedelta index object.""" + + if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): + arg = np.array(list(arg), dtype='O') + + # these are shortcut-able + if is_timedelta64_dtype(arg): + value = arg.astype('timedelta64[ns]') + elif is_integer_dtype(arg): + value = arg.astype('timedelta64[{0}]'.format( + unit)).astype('timedelta64[ns]', copy=False) + else: + try: + value = tslib.array_to_timedelta64(_ensure_object(arg), + unit=unit, errors=errors) + value = value.astype('timedelta64[ns]', copy=False) + except ValueError: + if errors == 'ignore': + return arg + else: + # This else-block accounts for the cases when errors='raise' + # and errors='coerce'. If errors == 'raise', these errors + # should be raised. If errors == 'coerce', we shouldn't + # expect any errors to be raised, since all parsing errors + # cause coercion to pd.NaT. However, if an error / bug is + # introduced that causes an Exception to be raised, we would + # like to surface it. + raise + + if box: + from pandas import TimedeltaIndex + value = TimedeltaIndex(value, unit='ns', name=name) + return value diff --git a/pandas/tslib.pxd b/pandas/tslib.pxd index d6c5810e1d713..aa8cbcb2cedc7 100644 --- a/pandas/tslib.pxd +++ b/pandas/tslib.pxd @@ -1,7 +1,7 @@ from numpy cimport ndarray, int64_t cdef convert_to_tsobject(object, object, object, bint, bint) -cdef convert_to_timedelta64(object, object, object) +cpdef convert_to_timedelta64(object, object) cpdef object maybe_get_tz(object) cdef bint _is_utc(object) cdef bint _is_tzlocal(object) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 3c07cfd2446ed..53c77b2d8f9d7 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2619,7 +2619,7 @@ class Timedelta(_Timedelta): try: nano = kwargs.pop('nanoseconds',0) - value = convert_to_timedelta64(timedelta(**kwargs),'ns',False) + nano + value = convert_to_timedelta64(timedelta(**kwargs),'ns') + nano except TypeError as e: raise ValueError("cannot construct a Timedelta from the passed arguments, allowed keywords are " "[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]") @@ -2627,9 +2627,9 @@ class Timedelta(_Timedelta): if isinstance(value, Timedelta): value = value.value elif util.is_string_object(value): - value = np.timedelta64(parse_timedelta_string(value, False)) + value = np.timedelta64(parse_timedelta_string(value)) elif isinstance(value, timedelta): - value = convert_to_timedelta64(value,'ns',False) + value = convert_to_timedelta64(value,'ns') elif isinstance(value, np.timedelta64): if unit is not None: value = value.astype('timedelta64[{0}]'.format(unit)) @@ -2638,7 +2638,7 @@ class Timedelta(_Timedelta): value = np.timedelta64(_delta_to_nanoseconds(value.delta),'ns') elif is_integer_object(value) or util.is_float_object(value): # unit=None is de-facto 'ns' - value = convert_to_timedelta64(value,unit,False) + value = convert_to_timedelta64(value,unit) elif _checknull_with_nat(value): return NaT else: @@ -3001,37 +3001,41 @@ cdef PyTypeObject* td_type = Timedelta cdef inline bint is_timedelta(object o): return Py_TYPE(o) == td_type # isinstance(o, Timedelta) -def array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): - """ convert an ndarray to an array of ints that are timedeltas - force conversion if errors = 'coerce', - else will raise if cannot convert """ +cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): + """ + Convert an ndarray to an array of timedeltas. 
If errors == 'coerce', + coerce non-convertible objects to NaT. Otherwise, raise. + """ + cdef: Py_ssize_t i, n ndarray[int64_t] iresult - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' - assert is_raise or is_ignore or is_coerce + if errors not in ('ignore', 'raise', 'coerce'): + raise ValueError("errors must be one of 'ignore', " + "'raise', or 'coerce'}") n = values.shape[0] result = np.empty(n, dtype='m8[ns]') iresult = result.view('i8') - # usually we have all strings - # if so then we hit the fast path + # Usually, we have all strings. If so, we hit the fast path. + # If this path fails, we try conversion a different way, and + # this is where all of the error handling will take place. try: for i in range(n): - result[i] = parse_timedelta_string(values[i], is_coerce) + result[i] = parse_timedelta_string(values[i]) except: for i in range(n): - result[i] = convert_to_timedelta64(values[i], unit, is_coerce) - return iresult - - -def convert_to_timedelta(object ts, object unit='ns', errors='raise'): - cdef bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + try: + result[i] = convert_to_timedelta64(values[i], unit) + except ValueError: + if errors == 'coerce': + result[i] = NPY_NAT + else: + raise - assert is_raise or is_ignore or is_coerce - return convert_to_timedelta64(ts, unit, is_coerce) + return iresult cdef dict timedelta_abbrevs = { 'D' : 'd', 'd' : 'd', @@ -3099,15 +3103,10 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): n = ''.join(number) + '.' + ''.join(frac) return cast_from_unit(float(n), unit) -cdef inline parse_timedelta_string(object ts, coerce=False): +cdef inline parse_timedelta_string(object ts): """ - Parse an regular format timedelta string - - Return an int64_t or raise a ValueError on an invalid parse - - if coerce, set a non-valid value to NaT - - Return a ns based int64 + Parse a regular format timedelta string. Return an int64_t (in ns) + or raise a ValueError on an invalid parse. """ cdef: @@ -3163,13 +3162,7 @@ cdef inline parse_timedelta_string(object ts, coerce=False): number.append(c) else: - - try: - r = timedelta_from_spec(number, frac, unit) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) unit, number, frac = [], [c], [] result += timedelta_as_neg(r, neg) @@ -3196,9 +3189,9 @@ cdef inline parse_timedelta_string(object ts, coerce=False): result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: - if coerce: - return NPY_NAT - raise ValueError("expecting hh:mm:ss format, received: {0}".format(ts)) + raise ValueError("expecting hh:mm:ss format, " + "received: {0}".format(ts)) + unit, number = [], [] # after the decimal point @@ -3228,21 +3221,15 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # we had a dot, but we have a fractional # value since we have an unit if have_dot and len(unit): - try: - r = timedelta_from_spec(number, frac, unit) - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) + result += timedelta_as_neg(r, neg) # we have a dot as part of a regular format # e.g. 
hh:mm:ss.fffffff elif have_dot: - if (len(number) or len(frac)) and not len(unit) and current_unit is None: - if coerce: - return NPY_NAT + if ((len(number) or len(frac)) and not len(unit) + and current_unit is None): raise ValueError("no units specified") if len(frac) > 0 and len(frac) <= 3: @@ -3266,38 +3253,24 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # we have a last abbreviation elif len(unit): - if len(number): - try: - r = timedelta_from_spec(number, frac, unit) - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) + result += timedelta_as_neg(r, neg) else: - if coerce: - return NPY_NAT raise ValueError("unit abbreviation w/o a number") # treat as nanoseconds # but only if we don't have anything else else: - if have_value: raise ValueError("have leftover units") if len(number): - try: - r = timedelta_from_spec(number, frac, 'ns') - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, 'ns') + result += timedelta_as_neg(r, neg) return result -cdef inline convert_to_timedelta64(object ts, object unit, object coerce): +cpdef convert_to_timedelta64(object ts, object unit): """ Convert an incoming object to a timedelta64 if possible @@ -3308,9 +3281,7 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): - np.int64 (with unit providing a possible modifier) - None/NaT - if coerce, set a non-valid value to NaT - - Return a ns based int64 + Return an ns based int64 # kludgy here until we have a timedelta scalar # handle the numpy < 1.7 case @@ -3346,16 +3317,15 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): ts = cast_from_unit(ts, unit) ts = np.timedelta64(ts) elif util.is_string_object(ts): - ts = np.timedelta64(parse_timedelta_string(ts, coerce)) + ts = np.timedelta64(parse_timedelta_string(ts)) elif hasattr(ts,'delta'): ts = np.timedelta64(_delta_to_nanoseconds(ts),'ns') if isinstance(ts, timedelta): ts = np.timedelta64(ts) elif not isinstance(ts, np.timedelta64): - if coerce: - return np.timedelta64(NPY_NAT) - raise ValueError("Invalid type for timedelta scalar: %s" % type(ts)) + raise ValueError("Invalid type for timedelta " + "scalar: %s" % type(ts)) return ts.astype('timedelta64[ns]') def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): From 471c4e7eb29e8f0ec05817420be015c4ae8af017 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 15 Aug 2016 19:14:18 -0400 Subject: [PATCH 266/359] ENH: bool sparse now supports logical op Author: sinhrks Closes #14000 from sinhrks/sparse_bool and squashes the following commits: 6db3096 [sinhrks] ENH: bool sparse now supports logical op --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/sparse/array.py | 25 +- pandas/sparse/tests/test_arithmetics.py | 44 + pandas/src/sparse.pyx | 1 + pandas/src/sparse_op_helper.pxi | 1110 +++++++++++++++-------- pandas/src/sparse_op_helper.pxi.in | 44 +- 6 files changed, 811 insertions(+), 414 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e3cdefd36b0be..e1762b46cc30c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -762,6 +762,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` ValueError: unable to coerce current fill_value nan to int64 dtype - Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types 
when slicing or transposing. (:issue:`13787`) +- ``SparseArray`` with ``bool`` dtype now supports logical (bool) operators (:issue:`14000`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index d14a8eadddc13..8d564d0abbf3f 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -98,6 +98,7 @@ def _sparse_array_op(left, right, op, name, series=False): right = right.astype(np.float64) dtype = _maybe_match_dtype(left, right) + result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: result = op(left.get_values(), right.get_values()) @@ -116,13 +117,26 @@ def _sparse_array_op(left, right, op, name, series=False): left, right = right, left name = name[1:] - opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) - sparse_op = getattr(splib, opname) + if name in ('and', 'or') and dtype == 'bool': + opname = 'sparse_{name}_uint8'.format(name=name, dtype=dtype) + # to make template simple, cast here + left_sp_values = left.sp_values.view(np.uint8) + right_sp_values = right.sp_values.view(np.uint8) + result_dtype = np.bool + else: + opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) + left_sp_values = left.sp_values + right_sp_values = right.sp_values - result, index, fill = sparse_op(left.sp_values, left.sp_index, - left.fill_value, right.sp_values, + sparse_op = getattr(splib, opname) + result, index, fill = sparse_op(left_sp_values, left.sp_index, + left.fill_value, right_sp_values, right.sp_index, right.fill_value) - return _wrap_result(name, result, index, fill, dtype=result.dtype) + + if result_dtype is None: + result_dtype = result.dtype + + return _wrap_result(name, result, index, fill, dtype=result_dtype) def _wrap_result(name, data, sparse_index, fill_value, dtype=None): @@ -750,4 +764,5 @@ def _make_index(length, indices, kind): ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method, comp_method=_arith_method, + bool_method=_arith_method, use_numexpr=False) diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index b5945151db678..ec8bc4d8634e6 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -108,6 +108,20 @@ def _check_comparison_ops(self, a, b, a_dense, b_dense): self._check_bool_result(a < b_dense) self._assert((a < b_dense).to_dense(), a_dense < b_dense) + def _check_logical_ops(self, a, b, a_dense, b_dense): + # sparse & sparse + self._check_bool_result(a & b) + self._assert((a & b).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b) + self._assert((a | b).to_dense(), a_dense | b_dense) + # sparse & dense + self._check_bool_result(a & b_dense) + self._assert((a & b_dense).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b_dense) + self._assert((a | b_dense).to_dense(), a_dense | b_dense) + def test_float_scalar(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) @@ -305,6 +319,36 @@ def test_int_array_comparison(self): b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) self._check_comparison_ops(a, b, values, rvalues) + def test_bool_same_index(self): + # GH 14000 + # when sp_index are the same + for kind in ['integer', 'block']: + 
values = self._base([True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, True, True], dtype=np.bool) + + for fill_value in [True, False, np.nan]: + a = self._klass(values, kind=kind, dtype=np.bool, + fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, + fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + def test_bool_array_logical(self): + # GH 14000 + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([True, False, True, False, True, True], + dtype=np.bool) + rvalues = self._base([True, False, False, True, False, True], + dtype=np.bool) + + for fill_value in [True, False, np.nan]: + a = self._klass(values, kind=kind, dtype=np.bool, + fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, + fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 646f9126b984c..88eb4cf13815b 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -758,6 +758,7 @@ cdef class BlockUnion(BlockMerge): include "sparse_op_helper.pxi" + #------------------------------------------------------------------------------- # Indexing operations diff --git a/pandas/src/sparse_op_helper.pxi b/pandas/src/sparse_op_helper.pxi index 5ff96469195e3..8462c31c84679 100644 --- a/pandas/src/sparse_op_helper.pxi +++ b/pandas/src/sparse_op_helper.pxi @@ -248,20 +248,6 @@ cpdef sparse_add_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_add_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = x[i] + y[i] - return out - - cpdef sparse_fill_add_float64(float64_t xfill, float64_t yfill): return xfill + yfill @@ -443,20 +429,6 @@ cpdef sparse_add_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_add_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[int64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.int64) - - for i in range(len(x)): - out[i] = x[i] + y[i] - return out - - cpdef sparse_fill_add_int64(int64_t xfill, int64_t yfill): return xfill + yfill @@ -638,20 +610,6 @@ cpdef sparse_sub_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_sub_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = x[i] - y[i] - return out - - cpdef sparse_fill_sub_float64(float64_t xfill, float64_t yfill): return xfill - yfill @@ -833,20 +791,6 @@ cpdef sparse_sub_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_sub_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[int64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.int64) - - for i in range(len(x)): - out[i] = x[i] - y[i] - return out - - cpdef sparse_fill_sub_int64(int64_t xfill, int64_t yfill): return xfill - yfill @@ -1028,20 +972,6 @@ cpdef sparse_mul_float64(ndarray[float64_t, ndim=1] x, raise 
NotImplementedError -cpdef sparse_align_mul_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = x[i] * y[i] - return out - - cpdef sparse_fill_mul_float64(float64_t xfill, float64_t yfill): return xfill * yfill @@ -1223,20 +1153,6 @@ cpdef sparse_mul_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_mul_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[int64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.int64) - - for i in range(len(x)): - out[i] = x[i] * y[i] - return out - - cpdef sparse_fill_mul_int64(int64_t xfill, int64_t yfill): return xfill * yfill @@ -1418,20 +1334,6 @@ cpdef sparse_div_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_div_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = __div_float64(x[i], y[i]) - return out - - cpdef sparse_fill_div_float64(float64_t xfill, float64_t yfill): return __div_float64(xfill, yfill) @@ -1613,20 +1515,6 @@ cpdef sparse_div_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_div_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = __div_int64(x[i], y[i]) - return out - - cpdef sparse_fill_div_int64(int64_t xfill, int64_t yfill): return __div_int64(xfill, yfill) @@ -1808,20 +1696,6 @@ cpdef sparse_mod_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_mod_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = __mod_float64(x[i], y[i]) - return out - - cpdef sparse_fill_mod_float64(float64_t xfill, float64_t yfill): return __mod_float64(xfill, yfill) @@ -2003,20 +1877,6 @@ cpdef sparse_mod_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_mod_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[int64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.int64) - - for i in range(len(x)): - out[i] = __mod_int64(x[i], y[i]) - return out - - cpdef sparse_fill_mod_int64(int64_t xfill, int64_t yfill): return __mod_int64(xfill, yfill) @@ -2198,20 +2058,6 @@ cpdef sparse_truediv_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_truediv_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = __truediv_float64(x[i], y[i]) - return out - - cpdef sparse_fill_truediv_float64(float64_t xfill, float64_t yfill): return __truediv_float64(xfill, yfill) @@ -2393,20 +2239,6 @@ cpdef 
sparse_truediv_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_truediv_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = __truediv_int64(x[i], y[i]) - return out - - cpdef sparse_fill_truediv_int64(int64_t xfill, int64_t yfill): return __truediv_int64(xfill, yfill) @@ -2588,20 +2420,6 @@ cpdef sparse_floordiv_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_floordiv_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = __floordiv_float64(x[i], y[i]) - return out - - cpdef sparse_fill_floordiv_float64(float64_t xfill, float64_t yfill): return __floordiv_float64(xfill, yfill) @@ -2783,20 +2601,6 @@ cpdef sparse_floordiv_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_floordiv_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[int64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.int64) - - for i in range(len(x)): - out[i] = __floordiv_int64(x[i], y[i]) - return out - - cpdef sparse_fill_floordiv_int64(int64_t xfill, int64_t yfill): return __floordiv_int64(xfill, yfill) @@ -2978,20 +2782,6 @@ cpdef sparse_pow_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_pow_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[float64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.float64) - - for i in range(len(x)): - out[i] = x[i] ** y[i] - return out - - cpdef sparse_fill_pow_float64(float64_t xfill, float64_t yfill): return xfill ** yfill @@ -3173,20 +2963,6 @@ cpdef sparse_pow_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_pow_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[int64_t, ndim=1] out - - out = np.empty(len(x), dtype=np.int64) - - for i in range(len(x)): - out[i] = x[i] ** y[i] - return out - - cpdef sparse_fill_pow_int64(int64_t xfill, int64_t yfill): return xfill ** yfill @@ -3368,20 +3144,6 @@ cpdef sparse_eq_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_eq_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] == y[i] - return out - - cpdef sparse_fill_eq_float64(float64_t xfill, float64_t yfill): return xfill == yfill @@ -3563,20 +3325,6 @@ cpdef sparse_eq_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_eq_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] == y[i] - return out - - cpdef sparse_fill_eq_int64(int64_t xfill, int64_t yfill): return xfill == yfill @@ -3758,20 +3506,6 @@ cpdef 
sparse_ne_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_ne_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] != y[i] - return out - - cpdef sparse_fill_ne_float64(float64_t xfill, float64_t yfill): return xfill != yfill @@ -3953,20 +3687,6 @@ cpdef sparse_ne_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_ne_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] != y[i] - return out - - cpdef sparse_fill_ne_int64(int64_t xfill, int64_t yfill): return xfill != yfill @@ -4148,20 +3868,6 @@ cpdef sparse_lt_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_lt_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] < y[i] - return out - - cpdef sparse_fill_lt_float64(float64_t xfill, float64_t yfill): return xfill < yfill @@ -4343,20 +4049,6 @@ cpdef sparse_lt_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_lt_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] < y[i] - return out - - cpdef sparse_fill_lt_int64(int64_t xfill, int64_t yfill): return xfill < yfill @@ -4538,20 +4230,6 @@ cpdef sparse_gt_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_gt_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] > y[i] - return out - - cpdef sparse_fill_gt_float64(float64_t xfill, float64_t yfill): return xfill > yfill @@ -4733,20 +4411,6 @@ cpdef sparse_gt_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_gt_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] > y[i] - return out - - cpdef sparse_fill_gt_int64(int64_t xfill, int64_t yfill): return xfill > yfill @@ -4928,20 +4592,6 @@ cpdef sparse_le_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_le_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] <= y[i] - return out - - cpdef sparse_fill_le_float64(float64_t xfill, float64_t yfill): return xfill <= yfill @@ -5123,20 +4773,6 @@ cpdef sparse_le_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_le_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - 
""" to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] <= y[i] - return out - - cpdef sparse_fill_le_int64(int64_t xfill, int64_t yfill): return xfill <= yfill @@ -5318,20 +4954,6 @@ cpdef sparse_ge_float64(ndarray[float64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_ge_float64(ndarray[float64_t, ndim=1] x, - ndarray[float64_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[uint8_t, ndim=1] out - - out = np.empty(len(x), dtype=np.uint8) - - for i in range(len(x)): - out[i] = x[i] >= y[i] - return out - - cpdef sparse_fill_ge_float64(float64_t xfill, float64_t yfill): return xfill >= yfill @@ -5513,20 +5135,730 @@ cpdef sparse_ge_int64(ndarray[int64_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_ge_int64(ndarray[int64_t, ndim=1] x, - ndarray[int64_t, ndim=1] y): - """ to return NumPy compat result """ +cpdef sparse_fill_ge_int64(int64_t xfill, + int64_t yfill): + return xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_and_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + cdef: - Py_ssize_t i = 0 + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y ndarray[uint8_t, ndim=1] out - out = np.empty(len(x), dtype=np.uint8) + # to suppress Cython warning + x = x_ + y = y_ - for i in range(len(x)): - out[i] = x[i] >= y[i] - return out + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + # Wow, what a hack job. Need to do something about this -cpdef sparse_fill_ge_int64(int64_t xfill, - int64_t yfill): - return xfill >= yfill + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_and_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + return out, out_index, xfill & yfill + + +cpdef sparse_and_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_and_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_and_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_and_int64(int64_t xfill, + int64_t yfill): + return xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_and_uint8(ndarray x_, + BlockIndex xindex, + uint8_t xfill, + ndarray y_, + BlockIndex yindex, + uint8_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_and_uint8(ndarray x_, IntIndex xindex, + uint8_t xfill, + ndarray y_, IntIndex yindex, + uint8_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + return out, out_index, xfill & yfill + + +cpdef sparse_and_uint8(ndarray[uint8_t, ndim=1] x, + SparseIndex xindex, uint8_t xfill, + ndarray[uint8_t, ndim=1] y, + SparseIndex yindex, uint8_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_and_uint8(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_and_uint8(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_and_uint8(uint8_t xfill, + uint8_t yfill): + return xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_or_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill | yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_or_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + return out, out_index, xfill | yfill + + +cpdef sparse_or_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_or_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_or_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_or_int64(int64_t xfill, + int64_t yfill): + return xfill | yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_or_uint8(ndarray x_, + BlockIndex xindex, + uint8_t xfill, + ndarray y_, + BlockIndex yindex, + uint8_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill | yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_or_uint8(ndarray x_, IntIndex xindex, + uint8_t xfill, + ndarray y_, IntIndex yindex, + uint8_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + return out, out_index, xfill | yfill + + +cpdef sparse_or_uint8(ndarray[uint8_t, ndim=1] x, + SparseIndex xindex, uint8_t xfill, + ndarray[uint8_t, ndim=1] y, + SparseIndex yindex, uint8_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_or_uint8(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_or_uint8(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_or_uint8(uint8_t xfill, + uint8_t yfill): + return xfill | yfill diff --git a/pandas/src/sparse_op_helper.pxi.in b/pandas/src/sparse_op_helper.pxi.in index 1a0e1aa0250f6..d1d9a6f02a72c 100644 --- a/pandas/src/sparse_op_helper.pxi.in +++ b/pandas/src/sparse_op_helper.pxi.in @@ -90,8 +90,12 @@ cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): {{py: -# dtype -dtypes = ['float64', 'int64'] +# dtype, arith_comp_group, logical_group +dtypes = [('float64', True, False), + ('int64', True, True), + ('uint8', False, True)] +# do not generate arithmetic / comparison template for uint8, +# it should be done in fused types def get_op(tup): assert isinstance(tup, tuple) @@ -112,7 +116,10 @@ def get_op(tup): 'lt': '{0} < {1}', 'gt': '{0} > {1}', 'le': '{0} <= {1}', - 'ge': '{0} >= {1}'} + 'ge': '{0} >= {1}', + + 'and': '{0} & {1}', # logical op + 'or': '{0} | {1}'} return ops_dict[opname].format(lval, rval, dtype) @@ -120,19 +127,30 @@ def get_op(tup): def get_dispatch(dtypes): ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv', - 'floordiv', 'pow', 'eq', 'ne', 'lt', 'gt', 'le', 'ge'] + 'floordiv', 'pow', + 'eq', 'ne', 'lt', 'gt', 'le', 'ge', + 'and', 'or'] for opname in ops_list: - for dtype in dtypes: + for dtype, arith_comp_group, logical_group in dtypes: if opname in ('div', 'truediv'): rdtype = 'float64' elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + # comparison op + rdtype = 'uint8' + elif opname in ('and', 'or'): + # logical op rdtype = 'uint8' else: rdtype = dtype - yield opname, dtype, rdtype + if opname in ('and', 'or'): + if logical_group: + yield opname, dtype, rdtype + else: + if arith_comp_group: + yield opname, dtype, rdtype }} @@ -316,20 +334,6 @@ cpdef sparse_{{opname}}_{{dtype}}(ndarray[{{dtype}}_t, ndim=1] x, raise NotImplementedError -cpdef sparse_align_{{opname}}_{{dtype}}(ndarray[{{dtype}}_t, ndim=1] x, - ndarray[{{dtype}}_t, ndim=1] y): - """ to return NumPy compat result """ - cdef: - Py_ssize_t i = 0 - ndarray[{{rdtype}}_t, ndim=1] out - - out = np.empty(len(x), dtype=np.{{rdtype}}) - - for i in range(len(x)): - out[i] = {{(opname, 'x[i]', 'y[i]', dtype) | get_op}} - return out - - cpdef sparse_fill_{{opname}}_{{dtype}}({{dtype}}_t xfill, {{dtype}}_t yfill): return {{(opname, 'xfill', 'yfill', dtype) | get_op}} From 1d7e451ea357c0ecf31f1fb0680b1f83018b70d3 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 16 Aug 2016 05:57:38 -0400 Subject: [PATCH 267/359] 
DEPR: Deprecate SparseList. (#14007) Closes gh-13784. --- doc/source/sparse.rst | 2 + doc/source/whatsnew/v0.19.0.txt | 1 + pandas/sparse/list.py | 6 ++ pandas/sparse/tests/test_list.py | 126 +++++++++++++++++-------------- 4 files changed, 80 insertions(+), 55 deletions(-) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 2496335dc7b71..db9734edde482 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -90,6 +90,8 @@ can be converted back to a regular ndarray by calling ``to_dense``: SparseList ---------- +.. note:: The ``SparseList`` class has been deprecated and will be removed in a future version. + ``SparseList`` is a list-like data structure for managing a dynamic collection of SparseArrays. To create one, simply call the ``SparseList`` constructor with a ``fill_value`` (defaulting to ``NaN``): diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e1762b46cc30c..a3b1007698194 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -780,6 +780,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) - ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) - ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py index 666dae8071053..82de8cd7d3959 100644 --- a/pandas/sparse/list.py +++ b/pandas/sparse/list.py @@ -1,3 +1,4 @@ +import warnings import numpy as np from pandas.core.base import PandasObject from pandas.formats.printing import pprint_thing @@ -20,6 +21,11 @@ class SparseList(PandasObject): """ def __init__(self, data=None, fill_value=np.nan): + + # see gh-13784 + warnings.warn("SparseList is deprecated and will be removed " + "in a future version", FutureWarning, stacklevel=2) + self.fill_value = fill_value self._chunks = [] diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py index 5f8627103e18b..0b933b4f9c6f2 100644 --- a/pandas/sparse/tests/test_list.py +++ b/pandas/sparse/tests/test_list.py @@ -16,83 +16,99 @@ def setUp(self): self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) + def test_deprecation(self): + # see gh-13784 + with tm.assert_produces_warning(FutureWarning): + SparseList() + def test_constructor(self): - lst1 = SparseList(self.na_data[:5]) - exp = SparseList() + with tm.assert_produces_warning(FutureWarning): + lst1 = SparseList(self.na_data[:5]) + with tm.assert_produces_warning(FutureWarning): + exp = SparseList() + exp.append(self.na_data[:5]) tm.assert_sp_list_equal(lst1, exp) def test_len(self): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - self.assertEqual(len(splist), 5) - splist.append(arr[5]) - self.assertEqual(len(splist), 6) - splist.append(arr[6:]) - self.assertEqual(len(splist), 10) + with tm.assert_produces_warning(FutureWarning): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + 
self.assertEqual(len(splist), 5) + splist.append(arr[5]) + self.assertEqual(len(splist), 6) + splist.append(arr[6:]) + self.assertEqual(len(splist), 10) def test_append_na(self): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) + with tm.assert_produces_warning(FutureWarning): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) - sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr)) + sparr = splist.to_array() + tm.assert_sp_array_equal(sparr, SparseArray(arr)) def test_append_zero(self): - arr = self.zero_data - splist = SparseList(fill_value=0) - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) + with tm.assert_produces_warning(FutureWarning): + arr = self.zero_data + splist = SparseList(fill_value=0) + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) - sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0)) + sparr = splist.to_array() + tm.assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0)) def test_consolidate(self): - arr = self.na_data - exp_sparr = SparseArray(arr) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + arr = self.na_data + exp_sparr = SparseArray(arr) - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) - consol = splist.consolidate(inplace=False) - self.assertEqual(consol.nchunks, 1) - self.assertEqual(splist.nchunks, 3) - tm.assert_sp_array_equal(consol.to_array(), exp_sparr) + consol = splist.consolidate(inplace=False) + self.assertEqual(consol.nchunks, 1) + self.assertEqual(splist.nchunks, 3) + tm.assert_sp_array_equal(consol.to_array(), exp_sparr) - splist.consolidate() - self.assertEqual(splist.nchunks, 1) - tm.assert_sp_array_equal(splist.to_array(), exp_sparr) + splist.consolidate() + self.assertEqual(splist.nchunks, 1) + tm.assert_sp_array_equal(splist.to_array(), exp_sparr) def test_copy(self): - arr = self.na_data - exp_sparr = SparseArray(arr) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + arr = self.na_data + exp_sparr = SparseArray(arr) - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) - cp = splist.copy() - cp.append(arr[6:]) - self.assertEqual(splist.nchunks, 2) - tm.assert_sp_array_equal(cp.to_array(), exp_sparr) + cp = splist.copy() + cp.append(arr[6:]) + self.assertEqual(splist.nchunks, 2) + tm.assert_sp_array_equal(cp.to_array(), exp_sparr) def test_getitem(self): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - for i in range(len(arr)): - tm.assert_almost_equal(splist[i], arr[i]) - tm.assert_almost_equal(splist[-i], arr[-i]) + with tm.assert_produces_warning(FutureWarning): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + for i in range(len(arr)): + tm.assert_almost_equal(splist[i], arr[i]) + tm.assert_almost_equal(splist[-i], arr[-i]) if __name__ == '__main__': From 4d6a40a1ed8b73c8b036615360d555736d6a1b2f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 Aug 2016 05:59:23 -0400 Subject: [PATCH 268/359] TST: move SparseList to deprecate in api, xref #14007 
DOC: whatsnew fixes --- doc/source/whatsnew/v0.19.0.txt | 17 +++++++++-------- pandas/api/tests/test_api.py | 5 +++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a3b1007698194..9cac79288ea89 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -411,7 +411,7 @@ Other enhancements df.sort_values(by='row2', axis=1) - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) -- Raise ImportError for in the sql functions when sqlalchemy is not installed and a connection string is used (:issue:`11920`). +- Raise ``ImportError`` in the sql functions when ``sqlalchemy`` is not installed and a connection string is used (:issue:`11920`). .. _whatsnew_0190.api: @@ -505,10 +505,9 @@ New Behavior: .. _whatsnew_0190.api.to_datetime_coerce: -``.to_datetime()`` when coercing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``.to_datetime()`` changes +^^^^^^^^^^^^^^^^^^^^^^^^^^ -A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. Previous Behavior: @@ -524,6 +523,12 @@ This will now convert integers/floats with the default unit of ``ns``. pd.to_datetime([1, 'foo'], errors='coerce') +- Bug in ``pd.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). +- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) +- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) +- Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) +- Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`) + .. _whatsnew_0190.api.merging: Merging changes @@ -929,8 +934,6 @@ Bug Fixes - Bug ``Series.isnull`` and ``Series.notnull`` ignore ``Period('NaT')`` (:issue:`13737`) - Bug ``Series.fillna`` and ``Series.dropna`` don't affect to ``Period('NaT')`` (:issue:`13737`) -- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) -- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) @@ -945,7 +948,6 @@ Bug Fixes - Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`) - Bug in ``.shift`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13926`) - Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) -- Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) @@ -992,5 +994,4 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) -- Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 2537354091ad1..b1bbf18df3e06 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -58,10 +58,11 @@ class TestPDApi(Base, tm.TestCase): # these are already deprecated; awaiting removal deprecated_classes = ['TimeSeries', 'WidePanel', - 'SparseTimeSeries', 'Panel4D'] + 'SparseTimeSeries', 'Panel4D', + 'SparseList'] # these should be deperecated in the future - deprecated_classes_in_future = ['SparseList', 'Term', 'Panel'] + deprecated_classes_in_future = ['Term', 'Panel'] # these should be removed from top-level namespace remove_classes_from_top_level_namespace = ['Expr'] From 5c27c024c2191817aadc86cf86f897bbf5eb5002 Mon Sep 17 00:00:00 2001 From: Douglas McNeil Date: Tue, 16 Aug 2016 06:08:47 -0400 Subject: [PATCH 269/359] BUG: Avoid sentinel-infinity comparison problems (#13445) The problem causing #13445 ultimately traces to the fact that our Infinity/NegInfinity objects were greater than/less than themselves, which violates an assumption numpy makes when sorting. This was separately reported as https://github.com/numpy/numpy/issues/7934, but we can fix and test downstream as well. 
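
The core of the fix is giving each sentinel a self-consistent ordering. A minimal pure-Python sketch of the comparison table (the patch defines the equivalent classes in `pandas/algos.pyx`, with `NegInfinity` as the mirror image):

    class Infinity(object):
        # compares greater than everything except itself
        __lt__ = lambda self, other: False
        __le__ = lambda self, other: self is other
        __eq__ = lambda self, other: self is other
        __ne__ = lambda self, other: self is not other
        __gt__ = lambda self, other: self is not other
        __ge__ = lambda self, other: True

    inf = Infinity()
    assert inf == inf and inf >= inf           # reflexive, as sorting assumes
    assert not inf > inf and not inf < inf     # no longer "greater than itself"
    assert all(inf > x for x in [float("-inf"), 0.0, float("inf")])
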
closes #13445 Author: Douglas McNeil Closes #14006 from dsm054/fix_rank_segfault and squashes the following commits: 7d79370 [Douglas McNeil] BUG: Avoid sentinel-infinity comparison problems (#13445) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/algos.pyx | 31 +++++++++++++++---------------- pandas/tests/test_algos.py | 30 ++++++++++++++++++++++++++++++ pandas/tests/test_stats.py | 7 +++++++ 4 files changed, 53 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9cac79288ea89..0ee56f865f8c8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -900,6 +900,7 @@ Bug Fixes - Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) - Bug in printing ``pd.DataFrame`` where unusual elements with the ``object`` dtype were causing segfaults (:issue:`13717`) +- Bug in ranking ``Series`` which could result in segfaults (:issue:`13445`) - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) - Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 44288ab9621f1..d3e68ad2a5eee 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -567,28 +567,27 @@ cdef inline are_diff(object left, object right): except TypeError: return left != right -_return_false = lambda self, other: False -_return_true = lambda self, other: True class Infinity(object): + """ provide a positive Infinity comparision method for ranking """ - __lt__ = _return_false - __le__ = _return_false - __eq__ = _return_false - __ne__ = _return_true - __gt__ = _return_true - __ge__ = _return_true - __cmp__ = _return_false + __lt__ = lambda self, other: False + __le__ = lambda self, other: self is other + __eq__ = lambda self, other: self is other + __ne__ = lambda self, other: self is not other + __gt__ = lambda self, other: self is not other + __ge__ = lambda self, other: True class NegInfinity(object): + """ provide a negative Infinity comparision method for ranking """ + + __lt__ = lambda self, other: self is not other + __le__ = lambda self, other: True + __eq__ = lambda self, other: self is other + __ne__ = lambda self, other: self is not other + __gt__ = lambda self, other: False + __ge__ = lambda self, other: self is other - __lt__ = _return_true - __le__ = _return_true - __eq__ = _return_false - __ne__ = _return_true - __gt__ = _return_false - __ge__ = _return_false - __cmp__ = _return_true def rank_2d_generic(object in_arr, axis=0, ties_method='average', ascending=True, na_option='keep', pct=False): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 66fd1861f08f9..452355541a79b 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -5,6 +5,7 @@ from numpy.random import RandomState from numpy import nan from datetime import datetime +from itertools import permutations from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd @@ -1270,6 +1271,35 @@ def test_groupsort_indexer(): assert (np.array_equal(result, expected)) +def test_infinity_sort(): + # GH 13445 + # numpy's argsort can be unhappy if something is less than + # itself. 
Instead, let's give our infinities a self-consistent + # ordering, but outside the float extended real line. + + Inf = _algos.Infinity() + NegInf = _algos.NegInfinity() + + ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf] + + assert all(Inf >= x for x in ref_nums) + assert all(Inf > x or x is Inf for x in ref_nums) + assert Inf >= Inf and Inf == Inf + assert not Inf < Inf and not Inf > Inf + + assert all(NegInf <= x for x in ref_nums) + assert all(NegInf < x or x is NegInf for x in ref_nums) + assert NegInf <= NegInf and NegInf == NegInf + assert not NegInf < NegInf and not NegInf > NegInf + + for perm in permutations(ref_nums): + assert sorted(perm) == ref_nums + + # smoke tests + np.array([_algos.Infinity()] * 32).argsort() + np.array([_algos.NegInfinity()] * 32).argsort() + + def test_ensure_platform_int(): arr = np.arange(100) diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index 85ce1d5127512..41d25b9662b5b 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -179,6 +179,13 @@ def test_rank_int(self): expected.index = result.index assert_series_equal(result, expected) + def test_rank_object_bug(self): + # GH 13445 + + # smoke tests + Series([np.nan] * 32).astype(object).rank(ascending=True) + Series([np.nan] * 32).astype(object).rank(ascending=False) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 6b7857b0154353f3fc1e80110876371cb10b2136 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 Aug 2016 06:17:08 -0400 Subject: [PATCH 270/359] TST: separate join tests from algos in test_join.py, xref #13925 --- pandas/tests/test_algos.py | 188 ---------------------------------- pandas/tests/test_join.py | 201 +++++++++++++++++++++++++++++++++++++ 2 files changed, 201 insertions(+), 188 deletions(-) create mode 100644 pandas/tests/test_join.py diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 452355541a79b..282b75c463dda 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -13,7 +13,6 @@ import pandas.algos as _algos from pandas.compat import lrange import pandas.core.algorithms as algos -import pandas._join as _join import pandas.util.testing as tm import pandas.hashtable as hashtable from pandas.compat.numpy import np_array_datetime64_compat @@ -301,46 +300,6 @@ def _test_vector_resize(htable, uniques, dtype, nvals): _test_vector_resize(tbl(), vect(), dtype, 10) -class TestIndexer(tm.TestCase): - _multiprocess_can_split_ = True - - def test_outer_join_indexer(self): - typemap = [('int32', _join.outer_join_indexer_int32), - ('int64', _join.outer_join_indexer_int64), - ('float32', _join.outer_join_indexer_float32), - ('float64', _join.outer_join_indexer_float64), - ('object', _join.outer_join_indexer_object)] - - for dtype, indexer in typemap: - left = np.arange(3, dtype=dtype) - right = np.arange(2, 5, dtype=dtype) - empty = np.array([], dtype=dtype) - - result, lindexer, rindexer = indexer(left, right) - tm.assertIsInstance(result, np.ndarray) - tm.assertIsInstance(lindexer, np.ndarray) - tm.assertIsInstance(rindexer, np.ndarray) - tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) - exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - result, lindexer, rindexer = indexer(empty, right) - tm.assert_numpy_array_equal(result, right) - exp = np.array([-1, -1, -1], 
dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - result, lindexer, rindexer = indexer(left, empty) - tm.assert_numpy_array_equal(result, left) - exp = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - class TestUnique(tm.TestCase): _multiprocess_can_split_ = True @@ -1068,153 +1027,6 @@ def test_pad(self): self.assert_numpy_array_equal(filler, expect_filler) -def test_left_join_indexer_unique(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - - result = _join.left_join_indexer_unique_int64(b, a) - expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_left_outer_join_bug(): - left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, - 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, - 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, - 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, - 3, 1, 2, 0, 2], dtype=np.int64) - - right = np.array([3, 1], dtype=np.int64) - max_groups = 4 - - lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) - - exp_lidx = np.arange(len(left)) - exp_ridx = -np.ones(len(left)) - exp_ridx[left == 1] = 1 - exp_ridx[left == 3] = 0 - - assert (np.array_equal(lidx, exp_lidx)) - assert (np.array_equal(ridx, exp_ridx)) - - -def test_inner_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = _join.inner_join_indexer_int64(a, b) - - index_exp = np.array([3, 5], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([2, 4], dtype=np.int64) - bexp = np.array([1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = _join.inner_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_outer_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = _join.outer_join_indexer_int64(a, b) - - index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) - bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = _join.outer_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_left_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = _join.left_join_indexer_int64(a, b) - - assert_almost_equal(index, a) - - aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) - bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) - 
assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = _join.left_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_left_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_outer_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_inner_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - def test_is_lexsorted(): failure = [ np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py new file mode 100644 index 0000000000000..bfdb77f3fb350 --- /dev/null +++ b/pandas/tests/test_join.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- + +import numpy as np +from pandas import Index + +import pandas._join as _join +import pandas.util.testing as tm +from pandas.util.testing import assert_almost_equal + + +class TestIndexer(tm.TestCase): + _multiprocess_can_split_ = True + + def test_outer_join_indexer(self): + typemap = [('int32', _join.outer_join_indexer_int32), + ('int64', _join.outer_join_indexer_int64), + ('float32', _join.outer_join_indexer_float32), + ('float64', _join.outer_join_indexer_float64), + ('object', _join.outer_join_indexer_object)] + + for dtype, indexer in typemap: + left = np.arange(3, dtype=dtype) + right = np.arange(2, 5, dtype=dtype) + empty = np.array([], dtype=dtype) + + result, lindexer, rindexer = indexer(left, right) + tm.assertIsInstance(result, np.ndarray) + tm.assertIsInstance(lindexer, np.ndarray) + tm.assertIsInstance(rindexer, np.ndarray) + tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) + exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + result, lindexer, rindexer = indexer(empty, right) + tm.assert_numpy_array_equal(result, right) + exp = np.array([-1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + result, lindexer, rindexer = indexer(left, empty) + 
tm.assert_numpy_array_equal(result, left) + exp = np.array([0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([-1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + +def test_left_join_indexer_unique(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + + result = _join.left_join_indexer_unique_int64(b, a) + expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + assert (np.array_equal(result, expected)) + + +def test_left_outer_join_bug(): + left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, + 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, + 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, + 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, + 3, 1, 2, 0, 2], dtype=np.int64) + + right = np.array([3, 1], dtype=np.int64) + max_groups = 4 + + lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) + + exp_lidx = np.arange(len(left)) + exp_ridx = -np.ones(len(left)) + exp_ridx[left == 1] = 1 + exp_ridx[left == 3] = 0 + + assert (np.array_equal(lidx, exp_lidx)) + assert (np.array_equal(ridx, exp_ridx)) + + +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _join.inner_join_indexer_int64(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4], dtype=np.int64) + bexp = np.array([1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _join.inner_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _join.outer_join_indexer_int64(a, b) + + index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _join.outer_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _join.left_join_indexer_int64(a, b) + + assert_almost_equal(index, a) + + aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _join.left_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, 
np.array([0], dtype=np.int64)) + + +def test_left_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_outer_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_inner_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) From 5d791cc7d955c0b074ad602eb03fa32bd3e17503 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 Aug 2016 07:34:05 +0900 Subject: [PATCH 271/359] BUG: handle outofbounds datetimes in DatetimeConverter xref #2579 This at least solves the direct negative consequence (erroring code by importing pandas) of registering our converters by default. 
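
Dates before `Timestamp.min` (1677-09-21) cannot be represented at nanosecond resolution; for those the converter now falls back to matplotlib's ordinal representation instead of leaving the values unconverted. A minimal sketch of the boundary case (assuming matplotlib is installed; purely illustrative):

    from datetime import date
    import pandas as pd
    from matplotlib import dates as mdates

    try:
        pd.Timestamp(date(1677, 1, 1))     # earlier than Timestamp.min
    except ValueError as err:              # OutOfBoundsDatetime is a ValueError
        print("out of bounds:", err)

    # the ordinal fallback the converter now uses for such values
    print(mdates.date2num(date(1677, 1, 1)))
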
Author: Joris Van den Bossche Closes #13801 from jorisvandenbossche/plot-datetime-converter and squashes the following commits: 6b6b08e [Joris Van den Bossche] BUG: handle outofbounds datetimes in DatetimeConverter --- pandas/tests/plotting/test_datetimelike.py | 8 ++++++++ pandas/tseries/converter.py | 2 +- pandas/tseries/tests/test_converter.py | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 3f09317915254..492b9edff0122 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1222,6 +1222,14 @@ def test_secondary_y_irregular_ts_xlim(self): self.assertEqual(left, ts_irregular.index.min().toordinal()) self.assertEqual(right, ts_irregular.index.max().toordinal()) + def test_plot_outofbounds_datetime(self): + # 2579 - checking this does not raise + values = [date(1677, 1, 1), date(1677, 1, 2)] + self.plt.plot(values) + + values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] + self.plt.plot(values) + def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index fc23f4f99449b..a23e8af3e610c 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -216,7 +216,7 @@ def try_parse(values): else: values = [_dt_to_float_ordinal(x) for x in values] except Exception: - pass + values = _dt_to_float_ordinal(values) return values diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index ceb8660efb9cd..37d9c35639c32 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -77,6 +77,24 @@ def test_conversion_float(self): rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) tm.assert_almost_equal(rs, xp, decimals) + def test_conversion_outofbounds_datetime(self): + # 2579 + values = [date(1677, 1, 1), date(1677, 1, 2)] + rs = self.dtc.convert(values, None, None) + xp = converter.dates.date2num(values) + tm.assert_numpy_array_equal(rs, xp) + rs = self.dtc.convert(values[0], None, None) + xp = converter.dates.date2num(values[0]) + self.assertEqual(rs, xp) + + values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] + rs = self.dtc.convert(values, None, None) + xp = converter.dates.date2num(values) + tm.assert_numpy_array_equal(rs, xp) + rs = self.dtc.convert(values[0], None, None) + xp = converter.dates.date2num(values[0]) + self.assertEqual(rs, xp) + def test_time_formatter(self): self.tc(90000) From 07804437d3a7e10d7a5472a7edc95e3dcc31bc6d Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 17 Aug 2016 06:12:34 -0400 Subject: [PATCH 272/359] PERF/COMPAT: define platform int to np.intp AFAIK this only affects 64 bit python on Windows. `numpy` wants an `np.intp` (i8 on Windows) as a indexer for `take`, but pandas defines a "platform int" as a `np.int_` (i4 on Windows). This hits performance twice, because we often start with i8, cast to i4, then numpy will cast back to i8 in its `take`. 
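
For reference, the size mismatch described above can be verified directly. A minimal sketch (the differing item sizes apply to 64-bit Windows; on most other 64-bit platforms the two dtypes coincide):

    import numpy as np

    # np.int_ is a C long (4 bytes on 64-bit Windows), np.intp is pointer-sized
    # (8 bytes there), which is the dtype numpy's take() indexes with natively.
    print(np.dtype(np.int_).itemsize, np.dtype(np.intp).itemsize)

    arr = np.arange(10.0)
    indexer = np.array([1, 3, 5], dtype=np.intp)   # no cast round-trip inside take
    print(arr.take(indexer))
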
Author: Chris Closes #13972 from chris-b1/platform-int and squashes the following commits: Closes #3033 322b11a [Chris] lint fixup fc80938 [Chris] adjust for 32bit 84f38b2 [Chris] adjust test for platform independence 3ced5d5 [Chris] PERF/COMPAT: define platform int to np.intp --- doc/source/whatsnew/v0.19.0.txt | 38 +++++++++++++++++ pandas/core/algorithms.py | 2 +- pandas/hashtable.pyx | 12 +++--- pandas/indexes/base.py | 8 +++- pandas/src/algos_common_helper.pxi | 8 ++-- pandas/src/algos_common_helper.pxi.in | 10 +++-- pandas/src/join.pyx | 16 +++----- pandas/tests/frame/test_operators.py | 2 +- pandas/tests/indexes/common.py | 2 +- pandas/tests/indexes/test_base.py | 31 ++++++++------ pandas/tests/indexes/test_category.py | 4 +- pandas/tests/indexes/test_datetimelike.py | 33 +++++++-------- pandas/tests/indexes/test_multi.py | 20 ++++----- pandas/tests/indexes/test_numeric.py | 50 +++++++++++------------ pandas/tests/indexes/test_range.py | 22 +++++----- pandas/tests/test_algos.py | 38 ++++++++--------- pandas/tests/test_base.py | 10 +++-- pandas/tests/test_groupby.py | 12 +++--- pandas/tools/merge.py | 4 +- pandas/tseries/tests/test_period.py | 6 +-- pandas/tseries/tests/test_timedeltas.py | 4 +- pandas/tseries/tests/test_timeseries.py | 14 +++---- 22 files changed, 200 insertions(+), 146 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0ee56f865f8c8..5001d82117b88 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -778,6 +778,44 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) - Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`) +.. _whatsnew_0190.indexer_dtype: + +Indexer dtype Changes +^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This change only affects 64 bit python running on Windows, and only affects relatively advanced + indexing operations + +Methods such as ``Index.get_indexer`` that return an indexer array, coerce that array to a "platform int", so that it can be +directly used in 3rd party library operations like ``numpy.take``. Previously, a platform int was defined as ``np.int_`` +which corresponds to a C integer, but the correct type, and what is being used now, is ``np.intp``, which corresponds +to the C integer size that can hold a pointer. (:issue:`3033`, :issue:`13972`) + +These types are the same on many platform, but for 64 bit python on Windows, +``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many +operations on that platform. + +Previous behaviour: + +.. code-block:: ipython + + In [1]: i = pd.Index(['a', 'b', 'c']) + + In [2]: i.get_indexer(['b', 'b', 'c']).dtype + Out[2]: dtype('int32') + +New behaviour: + +.. code-block:: ipython + + In [1]: i = pd.Index(['a', 'b', 'c']) + + In [2]: i.get_indexer(['b', 'b', 'c']).dtype + Out[2]: dtype('int64') + + .. 
_whatsnew_0190.deprecations: Deprecations diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7920f05b5e7a1..1f863bf7247a0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -259,7 +259,7 @@ def sort_mixed(values): new_labels = reverse_indexer.take(labels, mode='wrap') np.putmask(new_labels, mask, na_sentinel) - return ordered, new_labels + return ordered, _ensure_platform_int(new_labels) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index d1b6b326d7de6..af694c276b5b7 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -64,10 +64,10 @@ cdef class Factorizer: mask = (labels == na_sentinel) # sort on if sort: - if labels.dtype != np.int_: - labels = labels.astype(np.int_) + if labels.dtype != np.intp: + labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) labels = reverse_indexer.take(labels, mode='clip') labels[mask] = na_sentinel @@ -100,11 +100,11 @@ cdef class Int64Factorizer: # sort on if sort: - if labels.dtype != np.int_: - labels = labels.astype(np.int_) + if labels.dtype != np.intp: + labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) labels = reverse_indexer.take(labels) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b638e61d8eebe..9b378715b8a96 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2820,7 +2820,7 @@ def _get_leaf_sorter(labels): new_levels[level] = new_level if keep_order: # just drop missing values. o.w. keep order - left_indexer = np.arange(len(left)) + left_indexer = np.arange(len(left), dtype=np.intp) mask = new_lev_labels != -1 if not mask.all(): new_labels = [lab[mask] for lab in new_labels] @@ -2863,6 +2863,10 @@ def _get_leaf_sorter(labels): left_indexer, right_indexer = right_indexer, left_indexer if return_indexers: + left_indexer = (None if left_indexer is None + else _ensure_platform_int(left_indexer)) + right_indexer = (None if right_indexer is None + else _ensure_platform_int(right_indexer)) return join_index, left_indexer, right_indexer else: return join_index @@ -2906,6 +2910,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False): join_index = self._wrap_joined_index(join_index, other) if return_indexers: + lidx = None if lidx is None else _ensure_platform_int(lidx) + ridx = None if ridx is None else _ensure_platform_int(ridx) return join_index, lidx, ridx else: return join_index diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi index 59b3ddff46dec..b89a80a73e2dd 100644 --- a/pandas/src/algos_common_helper.pxi +++ b/pandas/src/algos_common_helper.pxi @@ -2848,16 +2848,18 @@ def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values, # ensure_dtype #---------------------------------------------------------------------- -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.intp)).descr.type_num cpdef ensure_platform_int(object arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. 
np.intp if util.is_array(arr): if ( arr).descr.type_num == PLATFORM_INT: return arr else: - return arr.astype(np.int_) + return arr.astype(np.intp) else: - return np.array(arr, dtype=np.int_) + return np.array(arr, dtype=np.intp) cpdef ensure_object(object arr): if util.is_array(arr): diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 2327f10389cb5..1451ffb054e5d 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -548,16 +548,18 @@ def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values, # ensure_dtype #---------------------------------------------------------------------- -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.intp)).descr.type_num cpdef ensure_platform_int(object arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. np.intp if util.is_array(arr): if ( arr).descr.type_num == PLATFORM_INT: return arr else: - return arr.astype(np.int_) + return arr.astype(np.intp) else: - return np.array(arr, dtype=np.int_) + return np.array(arr, dtype=np.intp) cpdef ensure_object(object arr): if util.is_array(arr): @@ -600,4 +602,4 @@ cpdef ensure_{{name}}(object arr): else: return np.array(arr, dtype=np.{{dtype}}) -{{endfor}} \ No newline at end of file +{{endfor}} diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 9281453c643ee..65c790beb5dbf 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -32,7 +32,8 @@ float64 = np.dtype(np.float64) cdef double NaN = np.NaN cdef double nan = NaN -from pandas.algos import groupsort_indexer +from pandas.algos import groupsort_indexer, ensure_platform_int +from pandas.core.algorithms import take_nd include "joins_func_helper.pxi" @@ -148,16 +149,14 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, # no multiple matches for any row on the left # this is a short-cut to avoid groupsort_indexer # otherwise, the `else` path also works in this case - if left_sorter.dtype != np.int_: - left_sorter = left_sorter.astype(np.int_) + left_sorter = ensure_platform_int(left_sorter) - rev = np.empty(len(left), dtype=np.int_) + rev = np.empty(len(left), dtype=np.intp) rev.put(left_sorter, np.arange(len(left))) else: rev, _ = groupsort_indexer(left_indexer, len(left)) - if rev.dtype != np.int_: - rev = rev.astype(np.int_) + rev = ensure_platform_int(rev) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) @@ -228,11 +227,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, def _get_result_indexer(sorter, indexer): - if indexer.dtype != np.int_: - indexer = indexer.astype(np.int_) if len(sorter) > 0: - res = sorter.take(indexer) - np.putmask(res, indexer == -1, -1) + res = take_nd(sorter, indexer, fill_value=-1) else: # length-0 case res = np.empty(len(indexer), dtype=np.int64) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index ce7af25eb0460..5f3eb84f72127 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1204,7 +1204,7 @@ def test_alignment_non_pandas(self): align = pd.core.ops._align_method_FRAME - for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.intp)]: + for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64)]: tm.assert_series_equal(align(df, val, 'index'), Series([1, 2, 3], index=df.index)) diff --git a/pandas/tests/indexes/common.py 
b/pandas/tests/indexes/common.py index 92560363be8fe..a6fde7f85084d 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -110,7 +110,7 @@ def f(): def test_reindex_base(self): idx = self.create_index() - expected = np.arange(idx.size) + expected = np.arange(idx.size, dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 3c9040021fdbf..f0d0d2d49b973 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -975,10 +975,10 @@ def test_get_indexer(self): idx2 = Index([2, 4, 6]) r1 = idx1.get_indexer(idx2) - assert_almost_equal(r1, np.array([1, 3, -1])) + assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) r1 = idx2.get_indexer(idx1, method='pad') - e1 = np.array([-1, 0, 0, 1, 1]) + e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp) assert_almost_equal(r1, e1) r2 = idx2.get_indexer(idx1[::-1], method='pad') @@ -988,7 +988,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') - e1 = np.array([0, 0, 1, 1, 2]) + e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) assert_almost_equal(r1, e1) rbfill1 = idx2.get_indexer(idx1, method='bfill') @@ -1013,25 +1013,30 @@ def test_get_indexer_nearest(self): all_methods = ['pad', 'backfill', 'nearest'] for method in all_methods: actual = idx.get_indexer([0, 5, 9], method=method) - tm.assert_numpy_array_equal(actual, np.array([0, 5, 9])) + tm.assert_numpy_array_equal(actual, np.array([0, 5, 9], + dtype=np.intp)) actual = idx.get_indexer([0, 5, 9], method=method, tolerance=0) - tm.assert_numpy_array_equal(actual, np.array([0, 5, 9])) + tm.assert_numpy_array_equal(actual, np.array([0, 5, 9], + dtype=np.intp)) for method, expected in zip(all_methods, [[0, 1, 8], [1, 2, 9], [0, 2, 9]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) actual = idx.get_indexer([0.2, 1.8, 8.5], method=method, tolerance=1) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) for method, expected in zip(all_methods, [[0, -1, -1], [-1, 2, -1], [0, 2, -1]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method, tolerance=0.2) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) with tm.assertRaisesRegexp(ValueError, 'limit argument'): idx.get_indexer([1, 0], method='nearest', limit=1) @@ -1042,22 +1047,24 @@ def test_get_indexer_nearest_decreasing(self): all_methods = ['pad', 'backfill', 'nearest'] for method in all_methods: actual = idx.get_indexer([0, 5, 9], method=method) - tm.assert_numpy_array_equal(actual, np.array([9, 4, 0])) + tm.assert_numpy_array_equal(actual, np.array([9, 4, 0], + dtype=np.intp)) for method, expected in zip(all_methods, [[8, 7, 0], [9, 8, 1], [9, 7, 0]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) def test_get_indexer_strings(self): idx = pd.Index(['b', 'c']) actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='pad') - expected = np.array([-1, 0, 1, 1]) + expected = np.array([-1, 0, 1, 1], dtype=np.intp) tm.assert_numpy_array_equal(actual, expected) actual = 
idx.get_indexer(['a', 'b', 'c', 'd'], method='backfill') - expected = np.array([0, 0, 1, -1]) + expected = np.array([0, 0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(actual, expected) with tm.assertRaises(TypeError): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index e066842c33126..901b57dcc7bfe 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -336,7 +336,7 @@ def test_reindex_base(self): # determined by cat ordering idx = self.create_index() - expected = np.array([4, 0, 1, 5, 2, 3]) + expected = np.array([4, 0, 1, 5, 2, 3], dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) @@ -403,7 +403,7 @@ def test_get_indexer(self): for indexer in [idx2, list('abf'), Index(list('abf'))]: r1 = idx1.get_indexer(idx2) - assert_almost_equal(r1, np.array([0, 1, 2, -1])) + assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) self.assertRaises(NotImplementedError, lambda: idx2.get_indexer(idx1, method='pad')) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 9371bef8b8f2e..3ff52380a62d8 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -552,20 +552,21 @@ def test_get_loc(self): def test_get_indexer(self): idx = pd.date_range('2000-01-01', periods=3) - tm.assert_numpy_array_equal(idx.get_indexer(idx), np.array([0, 1, 2])) + exp = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1])) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2])) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1])) + np.array([0, 1, 1], dtype=np.intp)) tm.assert_numpy_array_equal( idx.get_indexer(target, 'nearest', tolerance=pd.Timedelta('1 hour')), - np.array([0, -1, 1])) + np.array([0, -1, 1], dtype=np.intp)) with tm.assertRaises(ValueError): idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') @@ -872,19 +873,19 @@ def test_where_other(self): def test_get_indexer(self): idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', '2000-01-02T01'], freq='H') tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.int_)) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', tolerance='1 hour'), - np.array([0, -1, 1], dtype=np.int_)) + np.array([0, -1, 1], dtype=np.intp)) msg = 'Input has different freq from PeriodIndex\\(freq=H\\)' with self.assertRaisesRegexp(ValueError, msg): @@ -892,7 +893,7 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', tolerance='1 day'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) 
def test_repeat(self): # GH10183 @@ -1048,19 +1049,19 @@ def test_get_loc(self): def test_get_indexer(self): idx = pd.to_timedelta(['0 days', '1 days', '2 days']) tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.int_)) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) res = idx.get_indexer(target, 'nearest', tolerance=pd.Timedelta('1 hour')) - tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.int_)) + tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) def test_numeric_compat(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index bdca91253e37b..eedbd108510f7 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -799,7 +799,7 @@ def test_legacy_pickle(self): self.assertTrue(obj.equals(obj2)) res = obj.get_indexer(obj) - exp = np.arange(len(obj)) + exp = np.arange(len(obj), dtype=np.intp) assert_almost_equal(res, exp) res = obj.get_indexer(obj2[::-1]) @@ -818,7 +818,7 @@ def test_legacy_v2_unpickle(self): self.assertTrue(obj.equals(obj2)) res = obj.get_indexer(obj) - exp = np.arange(len(obj)) + exp = np.arange(len(obj), dtype=np.intp) assert_almost_equal(res, exp) res = obj.get_indexer(obj2[::-1]) @@ -1063,8 +1063,8 @@ def test_get_indexer(self): major_axis = Index(lrange(4)) minor_axis = Index(lrange(2)) - major_labels = np.array([0, 0, 1, 2, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 0, 1, 0, 1]) + major_labels = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) + minor_labels = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) @@ -1072,10 +1072,10 @@ def test_get_indexer(self): idx2 = index[[1, 3, 5]] r1 = idx1.get_indexer(idx2) - assert_almost_equal(r1, np.array([1, 3, -1])) + assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) r1 = idx2.get_indexer(idx1, method='pad') - e1 = np.array([-1, 0, 0, 1, 1]) + e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp) assert_almost_equal(r1, e1) r2 = idx2.get_indexer(idx1[::-1], method='pad') @@ -1085,7 +1085,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') - e1 = np.array([0, 0, 1, 1, 2]) + e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) assert_almost_equal(r1, e1) r2 = idx2.get_indexer(idx1[::-1], method='backfill') @@ -1747,8 +1747,8 @@ def test_join_multi(self): jidx, lidx, ridx = midx.join(idx, how='inner', return_indexers=True) exp_idx = pd.MultiIndex.from_product( [np.arange(4), [1, 2]], names=['a', 'b']) - exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.int_) - exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.int64) + exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp) self.assert_index_equal(jidx, exp_idx) self.assert_numpy_array_equal(lidx, exp_lidx) self.assert_numpy_array_equal(ridx, exp_ridx) @@ -1761,7 +1761,7 @@ def test_join_multi(self): # keep MultiIndex jidx, lidx, ridx = midx.join(idx, 
how='left', return_indexers=True) exp_ridx = np.array([-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, - 1, -1], dtype=np.int64) + 1, -1], dtype=np.intp) self.assert_index_equal(jidx, midx) self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 90025fa014b78..f0af43e3513bb 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -284,15 +284,15 @@ def test_equals(self): def test_get_indexer(self): idx = Float64Index([0.0, 1.0, 2.0]) tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) target = [-0.1, 0.5, 1.1] tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.int_)) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) def test_get_loc(self): idx = Float64Index([0.0, 1.0, 2.0]) @@ -560,19 +560,19 @@ def test_identical(self): def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) - expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_pad(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target, method='pad') - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target, method='backfill') - expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_join_outer(self): @@ -588,9 +588,9 @@ def test_join_outer(self): eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.int_) + dtype=np.intp) eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], - dtype=np.int_) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -604,9 +604,9 @@ def test_join_outer(self): self.assert_index_equal(res, noidx_res) elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.int64) + dtype=np.intp) eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], - dtype=np.int64) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -627,8 +627,8 @@ def test_join_inner(self): ridx = ridx.take(ind) eres = Int64Index([2, 12]) - elidx = np.array([1, 6], dtype=np.int_) - eridx = np.array([4, 1], dtype=np.int_) + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([4, 1], dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -642,8 +642,8 @@ def test_join_inner(self): res2 = self.index.intersection(other_mono) self.assert_index_equal(res, res2) - elidx = np.array([1, 6], dtype=np.int64) - eridx = np.array([1, 4], dtype=np.int64) + elidx = np.array([1, 6], dtype=np.intp) + 
eridx = np.array([1, 4], dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -658,7 +658,7 @@ def test_join_left(self): return_indexers=True) eres = self.index eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], - dtype=np.int_) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -669,7 +669,7 @@ def test_join_left(self): res, lidx, ridx = self.index.join(other_mono, how='left', return_indexers=True) eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], - dtype=np.int64) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) self.assertIsNone(lidx) @@ -680,8 +680,8 @@ def test_join_left(self): idx2 = Index([1, 2, 5, 7, 9]) res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 - eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -694,7 +694,7 @@ def test_join_right(self): res, lidx, ridx = self.index.join(other, how='right', return_indexers=True) eres = other - elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.int_) + elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) tm.assertIsInstance(other, Int64Index) self.assert_index_equal(res, eres) @@ -705,7 +705,7 @@ def test_join_right(self): res, lidx, ridx = self.index.join(other_mono, how='right', return_indexers=True) eres = other_mono - elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.int64) + elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) tm.assertIsInstance(other, Int64Index) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -716,8 +716,8 @@ def test_join_right(self): idx2 = Index([1, 2, 5, 7, 9]) res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 - elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -757,10 +757,10 @@ def test_join_non_unique(self): exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) self.assert_index_equal(joined, exp_joined) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.int_) + exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) tm.assert_numpy_array_equal(lidx, exp_lidx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.int_) + exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_self(self): diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 51333c46b7b3b..168ef7fc8d100 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -377,19 +377,19 @@ def test_identical(self): def test_get_indexer(self): target = RangeIndex(10) indexer = self.index.get_indexer(target) - expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) self.assert_numpy_array_equal(indexer, 
expected) def test_get_indexer_pad(self): target = RangeIndex(10) indexer = self.index.get_indexer(target, method='pad') - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) self.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): target = RangeIndex(10) indexer = self.index.get_indexer(target, method='backfill') - expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) self.assert_numpy_array_equal(indexer, expected) def test_join_outer(self): @@ -404,9 +404,9 @@ def test_join_outer(self): eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]) elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, - -1, -1, -1, -1, -1, -1, -1], dtype=np.int_) + -1, -1, -1, -1, -1, -1, -1], dtype=np.intp) eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, - 5, 4, 3, 2, 1, 0], dtype=np.int_) + 5, 4, 3, 2, 1, 0], dtype=np.intp) self.assertIsInstance(res, Int64Index) self.assertFalse(isinstance(res, RangeIndex)) @@ -442,8 +442,8 @@ def test_join_inner(self): ridx = ridx.take(ind) eres = Int64Index([16, 18]) - elidx = np.array([8, 9]) - eridx = np.array([9, 7]) + elidx = np.array([8, 9], dtype=np.intp) + eridx = np.array([9, 7], dtype=np.intp) self.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -468,7 +468,7 @@ def test_join_left(self): res, lidx, ridx = self.index.join(other, how='left', return_indexers=True) eres = self.index - eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.int_) + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.intp) self.assertIsInstance(res, RangeIndex) self.assert_index_equal(res, eres) @@ -494,7 +494,7 @@ def test_join_right(self): return_indexers=True) eres = other elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], - dtype=np.int_) + dtype=np.intp) self.assertIsInstance(other, Int64Index) self.assert_index_equal(res, eres) @@ -546,9 +546,9 @@ def test_join_non_unique(self): res, lidx, ridx = self.index.join(other, return_indexers=True) eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) - elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int_) + elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp) eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], - dtype=np.int_) + dtype=np.intp) self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 282b75c463dda..9543d9bba2a3a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -81,7 +81,7 @@ def test_labels(self): labels = [0, 1, 1, 2, 3, 0, -1, 4] result, result_labels = algos.safe_sort(values, labels) - expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4]) + expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) @@ -89,20 +89,20 @@ def test_labels(self): labels = [0, 1, 1, 2, 3, 0, 99, 4] result, result_labels = algos.safe_sort(values, labels, na_sentinel=99) - expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4]) + expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) # out of bound indices labels = [0, 101, 102, 2, 3, 0, 99, 4] result, result_labels = 
algos.safe_sort(values, labels) - expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4]) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) labels = [] result, result_labels = algos.safe_sort(values, labels) - expected_labels = np.array([], dtype=np.int_) + expected_labels = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) @@ -116,7 +116,7 @@ def test_mixed_integer(self): labels = [0, 1, 2, 3, 0, -1, 1] result, result_labels = algos.safe_sort(values, labels) expected = np.array([0, 1, 'a', 'b'], dtype=object) - expected_labels = np.array([3, 1, 0, 2, 3, -1, 1]) + expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) @@ -155,33 +155,33 @@ def test_basic(self): labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], sort=True) - exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int_) + exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array(['a', 'b', 'c'], dtype=object) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5)))) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int_) + exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5))), sort=True) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int_) + exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(np.arange(5.)))) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int_) + exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int_) + exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64) self.assert_numpy_array_equal(uniques, exp) @@ -192,13 +192,13 @@ def test_mixed(self): x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) labels, uniques = algos.factorize(x) - exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.int_) + exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.Index(['A', 'B', 3.14, np.inf]) tm.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.int_) + exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.Index([3.14, np.inf, 'A', 'B']) tm.assert_index_equal(uniques, exp) @@ -211,13 +211,13 @@ def test_datelike(self): x = Series([v1, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) - exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.DatetimeIndex([v1, v2]) self.assert_index_equal(uniques, exp) labels, uniques = 
algos.factorize(x, sort=True) - exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.int_) + exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.DatetimeIndex([v2, v1]) self.assert_index_equal(uniques, exp) @@ -229,12 +229,12 @@ def test_datelike(self): # periods are not 'sorted' as they are converted back into an index labels, uniques = algos.factorize(x) - exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) @@ -243,12 +243,12 @@ def test_datelike(self): v2 = pd.to_timedelta('1 day') x = Series([v1, v2, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) - exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.int_) + exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) @@ -1113,7 +1113,7 @@ def test_infinity_sort(): def test_ensure_platform_int(): - arr = np.arange(100) + arr = np.arange(100, dtype=np.intp) result = _algos.ensure_platform_int(arr) assert (result is arr) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 2721d8d0e5e69..52cd65af42c5e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -748,11 +748,11 @@ def test_factorize(self): o = orig.copy() if isinstance(o, Index) and o.is_boolean(): - exp_arr = np.array([0, 1] + [0] * 8) + exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp) exp_uniques = o exp_uniques = Index([False, True]) else: - exp_arr = np.array(range(len(o))) + exp_arr = np.array(range(len(o)), dtype=np.intp) exp_uniques = o labels, uniques = o.factorize() @@ -782,7 +782,8 @@ def test_factorize_repeated(self): o = o.take(indexer) n = o[5:].append(o) - exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + dtype=np.intp) labels, uniques = n.factorize(sort=True) self.assert_numpy_array_equal(labels, exp_arr) @@ -792,7 +793,8 @@ def test_factorize_repeated(self): else: self.assert_index_equal(uniques, o, check_names=False) - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4]) + exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], + np.intp) labels, uniques = n.factorize(sort=False) self.assert_numpy_array_equal(labels, exp_arr) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index bfaf157245c1a..6bf1a397c8482 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2576,12 +2576,12 @@ def test_groupby_complex(self): def test_level_preserve_order(self): grouped = self.mframe.groupby(level=0) - exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3]) + exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3], np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) def test_grouping_labels(self): grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) - 
exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3]) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) def test_cython_fail_agg(self): @@ -5966,22 +5966,22 @@ def test_lexsort_indexer(self): # orders=True, na_position='last' result = _lexsort_indexer(keys, orders=True, na_position='last') exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=True, na_position='first' result = _lexsort_indexer(keys, orders=True, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='last' result = _lexsort_indexer(keys, orders=False, na_position='last') exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='first' result = _lexsort_indexer(keys, orders=False, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) def test_nargsort(self): # np.argsort(items) places NaNs last diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 1572363fc6136..ca7288b048427 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -572,7 +572,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if name in self.left: if left_has_missing is None: - left_has_missing = any(left_indexer == -1) + left_has_missing = (left_indexer == -1).any() if left_has_missing: take_right = self.right_join_keys[i] @@ -584,7 +584,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): elif name in self.right: if right_has_missing is None: - right_has_missing = any(right_indexer == -1) + right_has_missing = (right_indexer == -1).any() if right_has_missing: take_left = self.left_join_keys[i] diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 8baac297fe57b..a28312451f6c0 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -3518,7 +3518,7 @@ def test_factorize(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') arr, idx = idx1.factorize() @@ -3532,12 +3532,12 @@ def test_factorize(self): idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') - exp_arr = np.array([2, 2, 1, 0, 2, 0]) + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) arr, idx = idx2.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') arr, idx = idx2.factorize() self.assert_numpy_array_equal(arr, exp_arr) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 159d2b4f52f2a..77e0216c5c79a 100644 --- 
a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1822,7 +1822,7 @@ def test_factorize(self): idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', '3 day']) - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day']) arr, idx = idx1.factorize() @@ -1835,7 +1835,7 @@ def test_factorize(self): # freq must be preserved idx3 = timedelta_range('1 day', periods=4, freq='s') - exp_arr = np.array([0, 1, 2, 3]) + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() self.assert_numpy_array_equal(arr, exp_arr) self.assert_index_equal(idx, idx3) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 0544d8a8e32d4..f3980b4e254f8 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3793,7 +3793,7 @@ def test_factorize(self): idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03']) - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) arr, idx = idx1.factorize() @@ -3815,13 +3815,13 @@ def test_factorize(self): idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01']) - exp_arr = np.array([2, 2, 1, 0, 2, 0]) + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) arr, idx = idx2.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01']) arr, idx = idx2.factorize() self.assert_numpy_array_equal(arr, exp_arr) @@ -3829,7 +3829,7 @@ def test_factorize(self): # freq must be preserved idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo') - exp_arr = np.array([0, 1, 2, 3]) + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) @@ -3840,7 +3840,7 @@ def test_factorize_tz(self): base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) idx = base.repeat(5) - exp_arr = np.arange(100).repeat(5) + exp_arr = np.arange(100, dtype=np.intp).repeat(5) for obj in [idx, pd.Series(idx)]: arr, res = obj.factorize() @@ -3854,7 +3854,7 @@ def test_factorize_dst(self): for obj in [idx, pd.Series(idx)]: arr, res = obj.factorize() - self.assert_numpy_array_equal(arr, np.arange(12)) + self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) idx = pd.date_range('2016-06-13', freq='H', periods=12, @@ -3862,7 +3862,7 @@ def test_factorize_dst(self): for obj in [idx, pd.Series(idx)]: arr, res = obj.factorize() - self.assert_numpy_array_equal(arr, np.arange(12)) + self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) def test_slice_with_negative_step(self): From cb43b6c5a1e66343fcd8696402677de98012d6e0 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 17 Aug 2016 06:23:25 -0400 Subject: [PATCH 273/359] BUG: Parse NULL char as null value Fixes bug in C parser in which the `NULL` character (`'\x00'`) was being interpreted as a true line terminator, escape character, or comment character because it was used to indicate that a user had not specified these 
values. As a result, if the data contains this value, it was being incorrectly parsed. It should be parsed as `NULL`. Closes #14012. Author: gfyoung Closes #14019 from gfyoung/null-char-parse and squashes the following commits: 5d39744 [gfyoung] BUG: Parse NULL char as null value --- doc/source/whatsnew/v0.19.0.txt | 11 ++++++----- pandas/io/tests/parser/c_parser_only.py | 18 ++++++++++++++++++ pandas/src/parser/tokenizer.c | 21 +++++++++++++-------- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5001d82117b88..8a11cbb42a78f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -957,7 +957,8 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) -- Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) +- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`) +- Bug in ``pd.read_csv()`` with ``engine='c'`` in which NULL ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) - Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`) - Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. 
(:issue:`13940`) @@ -970,8 +971,8 @@ Bug Fixes - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) -- Bug ``Series.isnull`` and ``Series.notnull`` ignore ``Period('NaT')`` (:issue:`13737`) -- Bug ``Series.fillna`` and ``Series.dropna`` don't affect to ``Period('NaT')`` (:issue:`13737`) +- Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`) +- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) @@ -1008,8 +1009,8 @@ Bug Fixes - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) - Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`) -- Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) -- Bug in ``.merge`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) +- Bug in ``pd.isnull()`` ``pd.notnull()`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) +- Bug in ``pd.merge()`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) - Bug in ``HDFStore``/``read_hdf()`` discarded ``DatetimeIndex.name`` if ``tz`` was set (:issue:`13884`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 4cea9e1d6b595..09d521e5a7e46 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -543,3 +543,21 @@ def test_parse_trim_buffers(self): # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) + + def test_internal_null_byte(self): + # see gh-14012 + # + # The null byte ('\x00') should not be used as a + # true line terminator, escape character, or comment + # character, only as a placeholder to indicate that + # none was specified. + # + # This test should be moved to common.py ONLY when + # Python's csv class supports parsing '\x00'. 
+ names = ['a', 'b', 'c'] + data = "1,2,3\n4,\x00,6\n7,8,9" + expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6], + [7, 8, 9]], columns=names) + + result = self.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(result, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 3c09933b3ec87..af85b7b894d26 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \ - c == '\n') || c == self->lineterminator) +#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \ + (self->lineterminator != '\0' && \ + c == self->lineterminator)) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) // don't parse '\r' with a custom line terminator #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) +#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar)) + +#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar)) + #define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ self->skipinitialspace)) @@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) self->state = EAT_CRNL; } break; - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_LINE_COMMENT; break; } else if (IS_WHITESPACE(c)) { @@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_QUOTE(c)) { // start quoted field self->state = IN_QUOTED_FIELD; - } else if (c == self->escapechar) { + } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_SKIPPABLE_SPACE(c)) { @@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) // save empty field END_FIELD(); } - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { @@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } else if (c == self->escapechar) { + } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_DELIMITER(c)) { @@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else { self->state = START_FIELD; } - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { @@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) case IN_QUOTED_FIELD: // in quoted field - if (c == self->escapechar) { + if (IS_ESCAPE_CHAR(c)) { // possible escape character self->state = ESCAPE_IN_QUOTED_FIELD; } else if (IS_QUOTE(c)) { From 7c0b74287408a2dcff4cbcafbbebbf9847a6905a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 17 Aug 2016 06:32:46 -0400 Subject: [PATCH 274/359] TST: add comprehensive coercion tests Author: sinhrks Closes #14010 from sinhrks/test_coercion and squashes the following commits: de3eb22 [sinhrks] TST: add comprehensive coercion tests --- pandas/tests/indexing/test_coercion.py | 1398 ++++++++++++++++++------ 1 file changed, 1047 insertions(+), 351 deletions(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 97a5c48b878fe..2eae226073552 100644 --- a/pandas/tests/indexing/test_coercion.py +++ 
b/pandas/tests/indexing/test_coercion.py @@ -13,444 +13,1139 @@ ############################################################### -class TestIndexCoercion(tm.TestCase): +class CoercionBase(object): _multiprocess_can_split_ = True - def test_setitem_index_numeric_coercion_int(self): - # tests setitem with non-existing numeric key - s = pd.Series([1, 2, 3, 4]) - self.assertEqual(s.index.dtype, np.int64) - - # int + int -> int - temp = s.copy() - temp[5] = 5 - tm.assert_series_equal(temp, pd.Series([1, 2, 3, 4, 5], - index=[0, 1, 2, 3, 5])) - self.assertEqual(temp.index.dtype, np.int64) - - # int + float -> float - temp = s.copy() - temp[1.1] = 5 - tm.assert_series_equal(temp, pd.Series([1, 2, 3, 4, 5], - index=[0, 1, 2, 3, 1.1])) - self.assertEqual(temp.index.dtype, np.float64) - - def test_setitem_index_numeric_coercion_float(self): - # tests setitem with non-existing numeric key - s = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) - self.assertEqual(s.index.dtype, np.float64) - - # float + int -> int - temp = s.copy() - # TODO_GH12747 The result must be float - with tm.assertRaises(IndexError): - temp[5] = 5 - - # float + float -> float - temp = s.copy() - temp[5.1] = 5 - exp = pd.Series([1, 2, 3, 4, 5], index=[1.1, 2.1, 3.1, 4.1, 5.1]) - tm.assert_series_equal(temp, exp) - self.assertEqual(temp.index.dtype, np.float64) - - def test_insert_numeric_coercion_int(self): - idx = pd.Int64Index([1, 2, 3, 4]) - self.assertEqual(idx.dtype, np.int64) - - # int + int -> int - res = idx.insert(1, 1) - tm.assert_index_equal(res, pd.Index([1, 1, 2, 3, 4])) - self.assertEqual(res.dtype, np.int64) - - # int + float -> float - res = idx.insert(1, 1.1) - tm.assert_index_equal(res, pd.Index([1, 1.1, 2, 3, 4])) - self.assertEqual(res.dtype, np.float64) - - # int + bool -> int - res = idx.insert(1, False) - tm.assert_index_equal(res, pd.Index([1, 0, 2, 3, 4])) - self.assertEqual(res.dtype, np.int64) - - def test_insert_numeric_coercion_float(self): - idx = pd.Float64Index([1, 2, 3, 4]) - self.assertEqual(idx.dtype, np.float64) - - # float + int -> int - res = idx.insert(1, 1) - tm.assert_index_equal(res, pd.Index([1., 1., 2., 3., 4.])) - self.assertEqual(res.dtype, np.float64) - - # float + float -> float - res = idx.insert(1, 1.1) - tm.assert_index_equal(res, pd.Index([1., 1.1, 2., 3., 4.])) - self.assertEqual(res.dtype, np.float64) - - # float + bool -> float - res = idx.insert(1, False) - tm.assert_index_equal(res, pd.Index([1., 0., 2., 3., 4.])) - self.assertEqual(res.dtype, np.float64) - + klasses = ['index', 'series'] + dtypes = ['object', 'int64', 'float64', 'complex128', 'bool', + 'datetime64', 'datetime64tz', 'timedelta64', 'period'] -class TestSeriesCoercion(tm.TestCase): + @property + def method(self): + raise NotImplementedError(self) - _multiprocess_can_split_ = True - - def setUp(self): - self.rep = {} - self.rep['object'] = ['a', 'b'] - self.rep['int64'] = [4, 5] - self.rep['float64'] = [1.1, 2.2] - self.rep['complex128'] = [1 + 1j, 2 + 2j] - self.rep['bool'] = [True, False] - - def test_setitem_numeric_coercion_int(self): - s = pd.Series([1, 2, 3, 4]) - self.assertEqual(s.dtype, np.int64) + def _assert(self, left, right, dtype): + # explicitly check dtype to avoid any unexpected result + if isinstance(left, pd.Series): + tm.assert_series_equal(left, right) + elif isinstance(left, pd.Index): + tm.assert_index_equal(left, right) + else: + raise NotImplementedError + self.assertEqual(left.dtype, dtype) + self.assertEqual(right.dtype, dtype) + + def test_has_comprehensive_tests(self): + for 
klass in self.klasses: + for dtype in self.dtypes: + method_name = 'test_{0}_{1}_{2}'.format(self.method, + klass, dtype) + if not hasattr(self, method_name): + msg = 'test method is not defined: {0}, {1}' + raise AssertionError(msg.format(type(self), method_name)) + + +class TestSetitemCoercion(CoercionBase, tm.TestCase): + + method = 'setitem' + + def _assert_setitem_series_conversion(self, original_series, loc_value, + expected_series, expected_dtype): + """ test series value's coercion triggered by assignment """ + temp = original_series.copy() + temp[1] = loc_value + tm.assert_series_equal(temp, expected_series) + # check dtype explicitly for sure + self.assertEqual(temp.dtype, expected_dtype) + + # .loc works different rule, temporary disable + # temp = original_series.copy() + # temp.loc[1] = loc_value + # tm.assert_series_equal(temp, expected_series) + + def test_setitem_series_object(self): + obj = pd.Series(list('abcd')) + self.assertEqual(obj.dtype, np.object) + + # object + int -> object + exp = pd.Series(['a', 1, 'c', 'd']) + self._assert_setitem_series_conversion(obj, 1, exp, np.object) + + # object + float -> object + exp = pd.Series(['a', 1.1, 'c', 'd']) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.object) + + # object + complex -> object + exp = pd.Series(['a', 1 + 1j, 'c', 'd']) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.object) + + # object + bool -> object + exp = pd.Series(['a', True, 'c', 'd']) + self._assert_setitem_series_conversion(obj, True, exp, np.object) + + def test_setitem_series_int64(self): + obj = pd.Series([1, 2, 3, 4]) + self.assertEqual(obj.dtype, np.int64) # int + int -> int - temp = s.copy() - temp[1] = 1 - tm.assert_series_equal(temp, pd.Series([1, 1, 3, 4])) - self.assertEqual(temp.dtype, np.int64) + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, 1, exp, np.int64) # int + float -> float # TODO_GH12747 The result must be float - temp = s.copy() - temp[1] = 1.1 # tm.assert_series_equal(temp, pd.Series([1, 1.1, 3, 4])) # self.assertEqual(temp.dtype, np.float64) - tm.assert_series_equal(temp, pd.Series([1, 1, 3, 4])) - self.assertEqual(temp.dtype, np.int64) + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.int64) # int + complex -> complex - temp = s.copy() - temp[1] = 1 + 1j - tm.assert_series_equal(temp, pd.Series([1, 1 + 1j, 3, 4])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1, 1 + 1j, 3, 4]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.complex128) # int + bool -> int - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([1, 1, 3, 4])) - self.assertEqual(temp.dtype, np.int64) + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, True, exp, np.int64) - def test_setitem_numeric_coercion_float(self): - s = pd.Series([1.1, 2.2, 3.3, 4.4]) - self.assertEqual(s.dtype, np.float64) + def test_setitem_series_float64(self): + obj = pd.Series([1.1, 2.2, 3.3, 4.4]) + self.assertEqual(obj.dtype, np.float64) # float + int -> float - temp = s.copy() - temp[1] = 1 - tm.assert_series_equal(temp, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.float64) + exp = pd.Series([1.1, 1.0, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, 1, exp, np.float64) # float + float -> float - temp = s.copy() - temp[1] = 1.1 - tm.assert_series_equal(temp, pd.Series([1.1, 1.1, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.float64) + exp = pd.Series([1.1, 1.1, 3.3, 4.4]) + 
self._assert_setitem_series_conversion(obj, 1.1, exp, np.float64) # float + complex -> complex - temp = s.copy() - temp[1] = 1 + 1j - tm.assert_series_equal(temp, pd.Series([1.1, 1 + 1j, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1.1, 1 + 1j, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, + np.complex128) # float + bool -> float - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.float64) + exp = pd.Series([1.1, 1.0, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, True, exp, np.float64) - def test_setitem_numeric_coercion_complex(self): - s = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) - self.assertEqual(s.dtype, np.complex128) + def test_setitem_series_complex128(self): + obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + self.assertEqual(obj.dtype, np.complex128) # complex + int -> complex - temp = s.copy() - temp[1] = 1 - tm.assert_series_equal(temp, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, True, exp, np.complex128) # complex + float -> complex - temp = s.copy() - temp[1] = 1.1 - tm.assert_series_equal(temp, pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.complex128) # complex + complex -> complex - temp = s.copy() - temp[1] = 1 + 1j - tm.assert_series_equal(temp, - pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.complex128) # complex + bool -> complex - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, True, exp, np.complex128) - def test_setitem_numeric_coercion_bool(self): - s = pd.Series([True, False, True, False]) - self.assertEqual(s.dtype, np.bool) + def test_setitem_series_bool(self): + obj = pd.Series([True, False, True, False]) + self.assertEqual(obj.dtype, np.bool) # bool + int -> int # TODO_GH12747 The result must be int - temp = s.copy() - temp[1] = 1 # tm.assert_series_equal(temp, pd.Series([1, 1, 1, 0])) # self.assertEqual(temp.dtype, np.int64) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 1, exp, np.bool) # TODO_GH12747 The result must be int - temp = s.copy() - temp[1] = 3 # greater than bool + # assigning int greater than bool # tm.assert_series_equal(temp, pd.Series([1, 3, 1, 0])) # self.assertEqual(temp.dtype, np.int64) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 3, exp, np.bool) # bool + float -> float # TODO_GH12747 The result must be float - temp = s.copy() - temp[1] = 1.1 # tm.assert_series_equal(temp, pd.Series([1., 1.1, 1., 0.])) # self.assertEqual(temp.dtype, np.float64) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) + exp = 
pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.bool) # bool + complex -> complex (buggy, results in bool) # TODO_GH12747 The result must be complex - temp = s.copy() - temp[1] = 1 + 1j # tm.assert_series_equal(temp, pd.Series([1, 1 + 1j, 1, 0])) # self.assertEqual(temp.dtype, np.complex128) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) - - # bool + bool -> int - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) - - def test_where_numeric_coercion_int(self): - s = pd.Series([1, 2, 3, 4]) - self.assertEqual(s.dtype, np.int64) - cond = pd.Series([True, False, True, False]) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.bool) + + # bool + bool -> bool + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, True, exp, np.bool) + + def test_setitem_series_datetime64(self): + obj = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 -> datetime64 + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_setitem_series_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # datetime64 + int -> object + # ToDo: The result must be object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp(1), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_setitem_series_conversion(obj, 1, exp, 'datetime64[ns]') + + # ToDo: add more tests once the above issue has been fixed + + def test_setitem_series_datetime64tz(self): + tz = 'US/Eastern' + obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-02', tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz -> datetime64tz + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_setitem_series_conversion(obj, value, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64 + int -> object + # ToDo: The result must be object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp(1).tz_localize(tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_setitem_series_conversion(obj, 1, exp, + 'datetime64[ns, US/Eastern]') + + # ToDo: add more tests once the above issue has been fixed + + def test_setitem_series_timedelta64(self): + pass + + def test_setitem_series_period(self): + pass + + def _assert_setitem_index_conversion(self, original_series, loc_key, + expected_index, expected_dtype): + """ test index's coercion triggered by assign key """ + temp = original_series.copy() + temp[loc_key] = 5 + exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) + tm.assert_series_equal(temp, exp) + # check dtype explicitly for sure + self.assertEqual(temp.index.dtype, expected_dtype) + + temp = original_series.copy() + temp.loc[loc_key] = 5 + exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) + tm.assert_series_equal(temp, exp) + 
# check dtype explicitly for sure + self.assertEqual(temp.index.dtype, expected_dtype) + + def test_setitem_index_object(self): + obj = pd.Series([1, 2, 3, 4], index=list('abcd')) + self.assertEqual(obj.index.dtype, np.object) + + # object + object -> object + exp_index = pd.Index(list('abcdx')) + self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + + # object + int -> IndexError, regarded as location + temp = obj.copy() + with tm.assertRaises(IndexError): + temp[5] = 5 + + # object + float -> object + exp_index = pd.Index(['a', 'b', 'c', 'd', 1.1]) + self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.object) + + def test_setitem_index_int64(self): + # tests setitem with non-existing numeric key + obj = pd.Series([1, 2, 3, 4]) + self.assertEqual(obj.index.dtype, np.int64) + + # int + int -> int + exp_index = pd.Index([0, 1, 2, 3, 5]) + self._assert_setitem_index_conversion(obj, 5, exp_index, np.int64) + + # int + float -> float + exp_index = pd.Index([0, 1, 2, 3, 1.1]) + self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.float64) + + # int + object -> object + exp_index = pd.Index([0, 1, 2, 3, 'x']) + self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + + def test_setitem_index_float64(self): + # tests setitem with non-existing numeric key + obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) + self.assertEqual(obj.index.dtype, np.float64) + + # float + int -> int + temp = obj.copy() + # TODO_GH12747 The result must be float + with tm.assertRaises(IndexError): + temp[5] = 5 + + # float + float -> float + exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, 5.1]) + self._assert_setitem_index_conversion(obj, 5.1, exp_index, np.float64) + + # float + object -> object + exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, 'x']) + self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + + def test_setitem_index_complex128(self): + pass + + def test_setitem_index_bool(self): + pass + + def test_setitem_index_datetime64(self): + pass + + def test_setitem_index_datetime64tz(self): + pass + + def test_setitem_index_timedelta64(self): + pass + + def test_setitem_index_period(self): + pass + + +class TestInsertIndexCoercion(CoercionBase, tm.TestCase): + + klasses = ['index'] + method = 'insert' + + def _assert_insert_conversion(self, original, value, + expected, expected_dtype): + """ test coercion triggered by insert """ + target = original.copy() + res = target.insert(1, value) + tm.assert_index_equal(res, expected) + self.assertEqual(res.dtype, expected_dtype) + + def test_insert_index_object(self): + obj = pd.Index(list('abcd')) + self.assertEqual(obj.dtype, np.object) + + # object + int -> object + exp = pd.Index(['a', 1, 'b', 'c', 'd']) + self._assert_insert_conversion(obj, 1, exp, np.object) + + # object + float -> object + exp = pd.Index(['a', 1.1, 'b', 'c', 'd']) + self._assert_insert_conversion(obj, 1.1, exp, np.object) + + # object + bool -> object + res = obj.insert(1, False) + tm.assert_index_equal(res, pd.Index(['a', False, 'b', 'c', 'd'])) + self.assertEqual(res.dtype, np.object) + + # object + object -> object + exp = pd.Index(['a', 'x', 'b', 'c', 'd']) + self._assert_insert_conversion(obj, 'x', exp, np.object) + + def test_insert_index_int64(self): + obj = pd.Int64Index([1, 2, 3, 4]) + self.assertEqual(obj.dtype, np.int64) + + # int + int -> int + exp = pd.Index([1, 1, 2, 3, 4]) + self._assert_insert_conversion(obj, 1, exp, np.int64) + + # int + float -> float + exp = pd.Index([1, 1.1, 2, 3, 4]) + 
self._assert_insert_conversion(obj, 1.1, exp, np.float64) + + # int + bool -> int + exp = pd.Index([1, 0, 2, 3, 4]) + self._assert_insert_conversion(obj, False, exp, np.int64) + + # int + object -> object + exp = pd.Index([1, 'x', 2, 3, 4]) + self._assert_insert_conversion(obj, 'x', exp, np.object) + + def test_insert_index_float64(self): + obj = pd.Float64Index([1., 2., 3., 4.]) + self.assertEqual(obj.dtype, np.float64) + + # float + int -> int + exp = pd.Index([1., 1., 2., 3., 4.]) + self._assert_insert_conversion(obj, 1, exp, np.float64) + + # float + float -> float + exp = pd.Index([1., 1.1, 2., 3., 4.]) + self._assert_insert_conversion(obj, 1.1, exp, np.float64) + + # float + bool -> float + exp = pd.Index([1., 0., 2., 3., 4.]) + self._assert_insert_conversion(obj, False, exp, np.float64) + + # float + object -> object + exp = pd.Index([1., 'x', 2., 3., 4.]) + self._assert_insert_conversion(obj, 'x', exp, np.object) + + def test_insert_index_complex128(self): + pass + + def test_insert_index_bool(self): + pass + + def test_insert_index_datetime64(self): + obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04']) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 => datetime64 + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', '2011-01-02', + '2011-01-03', '2011-01-04']) + self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # ToDo: must coerce to object + msg = "Passed item and index have different timezone" + with tm.assertRaisesRegexp(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01', tz='US/Eastern')) + + # ToDo: must coerce to object + msg = "cannot insert DatetimeIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, 1) + + def test_insert_index_datetime64tz(self): + obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04'], tz='US/Eastern') + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz => datetime64 + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', '2011-01-02', + '2011-01-03', '2011-01-04'], tz='US/Eastern') + val = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_insert_conversion(obj, val, exp, + 'datetime64[ns, US/Eastern]') + + # ToDo: must coerce to object + msg = "Passed item and index have different timezone" + with tm.assertRaisesRegexp(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01')) + + # ToDo: must coerce to object + msg = "Passed item and index have different timezone" + with tm.assertRaisesRegexp(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01', tz='Asia/Tokyo')) + + # ToDo: must coerce to object + msg = "cannot insert DatetimeIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, 1) + + def test_insert_index_timedelta64(self): + obj = pd.TimedeltaIndex(['1 day', '2 day', '3 day', '4 day']) + self.assertEqual(obj.dtype, 'timedelta64[ns]') + + # timedelta64 + timedelta64 => timedelta64 + exp = pd.TimedeltaIndex(['1 day', '10 day', '2 day', '3 day', '4 day']) + self._assert_insert_conversion(obj, pd.Timedelta('10 day'), + exp, 'timedelta64[ns]') + + # ToDo: must coerce to object + msg = "cannot insert TimedeltaIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, pd.Timestamp('2012-01-01')) + + # ToDo: must coerce to object + msg = "cannot insert TimedeltaIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + 
obj.insert(1, 1) + + def test_insert_index_period(self): + obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq='M') + self.assertEqual(obj.dtype, np.int64) + + # period + period => period + exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', + '2011-03', '2011-04'], freq='M') + self._assert_insert_conversion(obj, pd.Period('2012-01', freq='M'), + exp, np.int64) + + # ToDo: must coerce to object? + exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', + '2011-03', '2011-04'], freq='M') + self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), + exp, np.int64) + + # period + int => object + msg = "Given date string not likely a datetime." + with tm.assertRaisesRegexp(ValueError, msg): + print(obj.insert(1, 1)) + + +class TestWhereCoercion(CoercionBase, tm.TestCase): + + method = 'where' + + def _assert_where_conversion(self, original, cond, values, + expected, expected_dtype): + """ test coercion triggered by where """ + target = original.copy() + res = target.where(cond, values) + self._assert(res, expected, expected_dtype) + + def _where_object_common(self, klass): + obj = klass(list('abcd')) + self.assertEqual(obj.dtype, np.object) + cond = klass([True, False, True, False]) + + # object + int -> object + exp = klass(['a', 1, 'c', 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.object) + + values = klass([5, 6, 7, 8]) + exp = klass(['a', 6, 'c', 8]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + + # object + float -> object + exp = klass(['a', 1.1, 'c', 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.object) + + values = klass([5.5, 6.6, 7.7, 8.8]) + exp = klass(['a', 6.6, 'c', 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + + # object + complex -> object + exp = klass(['a', 1 + 1j, 'c', 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.object) + + values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = klass(['a', 6 + 6j, 'c', 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + + if klass is pd.Series: + exp = klass(['a', 1, 'c', 1]) + self._assert_where_conversion(obj, cond, True, exp, np.object) + + values = klass([True, False, True, True]) + exp = klass(['a', 0, 'c', 1]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + elif klass is pd.Index: + # object + bool -> object + exp = klass(['a', True, 'c', True]) + self._assert_where_conversion(obj, cond, True, exp, np.object) + + values = klass([True, False, True, True]) + exp = klass(['a', False, 'c', True]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + else: + NotImplementedError + + def test_where_series_object(self): + self._where_object_common(pd.Series) + + def test_where_index_object(self): + self._where_object_common(pd.Index) + + def _where_int64_common(self, klass): + obj = klass([1, 2, 3, 4]) + self.assertEqual(obj.dtype, np.int64) + cond = klass([True, False, True, False]) # int + int -> int - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1, 1, 3, 1])) - self.assertEqual(res.dtype, np.int64) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1, 6, 3, 8])) - self.assertEqual(res.dtype, np.int64) + exp = klass([1, 1, 3, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.int64) + + values = klass([5, 6, 7, 8]) + exp = klass([1, 6, 3, 8]) + self._assert_where_conversion(obj, cond, values, exp, np.int64) # int + float -> float - res = s.where(cond, 1.1) - 
tm.assert_series_equal(res, pd.Series([1, 1.1, 3, 1.1])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1, 6.6, 3, 8.8])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1, 1.1, 3, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + + values = klass([5.5, 6.6, 7.7, 8.8]) + exp = klass([1, 6.6, 3, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # int + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, pd.Series([1, 1 + 1j, 3, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, pd.Series([1, 6 + 6j, 3, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + if klass is pd.Series: + exp = klass([1, 1 + 1j, 3, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, + np.complex128) + + values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = klass([1, 6 + 6j, 3, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, + np.complex128) # int + bool -> int - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([1, 1, 3, 1])) - self.assertEqual(res.dtype, np.int64) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([1, 0, 3, 1])) - self.assertEqual(res.dtype, np.int64) - - def test_where_numeric_coercion_float(self): - s = pd.Series([1.1, 2.2, 3.3, 4.4]) - self.assertEqual(s.dtype, np.float64) - cond = pd.Series([True, False, True, False]) + exp = klass([1, 1, 3, 1]) + self._assert_where_conversion(obj, cond, True, exp, np.int64) + + values = klass([True, False, True, True]) + exp = klass([1, 0, 3, 1]) + self._assert_where_conversion(obj, cond, values, exp, np.int64) + + def test_where_series_int64(self): + self._where_int64_common(pd.Series) + + def test_where_index_int64(self): + self._where_int64_common(pd.Index) + + def _where_float64_common(self, klass): + obj = klass([1.1, 2.2, 3.3, 4.4]) + self.assertEqual(obj.dtype, np.float64) + cond = klass([True, False, True, False]) # float + int -> float - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 1.0])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1.1, 6.0, 3.3, 8.0])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.0, 3.3, 1.0]) + self._assert_where_conversion(obj, cond, 1, exp, np.float64) + + values = klass([5, 6, 7, 8]) + exp = klass([1.1, 6.0, 3.3, 8.0]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # float + float -> float - res = s.where(cond, 1.1) - tm.assert_series_equal(res, pd.Series([1.1, 1.1, 3.3, 1.1])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1.1, 6.6, 3.3, 8.8])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.1, 3.3, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + + values = klass([5.5, 6.6, 7.7, 8.8]) + exp = klass([1.1, 6.6, 3.3, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # float + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, pd.Series([1.1, 1 + 1j, 3.3, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, pd.Series([1.1, 6 + 
6j, 3.3, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + if klass is pd.Series: + exp = klass([1.1, 1 + 1j, 3.3, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, + np.complex128) + + values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = klass([1.1, 6 + 6j, 3.3, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, + np.complex128) # float + bool -> float - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 1.0])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([1.1, 0.0, 3.3, 1.0])) - self.assertEqual(res.dtype, np.float64) - - def test_where_numeric_coercion_complex(self): - s = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) - self.assertEqual(s.dtype, np.complex128) + exp = klass([1.1, 1.0, 3.3, 1.0]) + self._assert_where_conversion(obj, cond, True, exp, np.float64) + + values = klass([True, False, True, True]) + exp = klass([1.1, 0.0, 3.3, 1.0]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) + + def test_where_series_float64(self): + self._where_float64_common(pd.Series) + + def test_where_index_float64(self): + self._where_float64_common(pd.Index) + + def test_where_series_complex128(self): + obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + self.assertEqual(obj.dtype, np.complex128) cond = pd.Series([True, False, True, False]) - # complex + int -> float - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 1])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1 + 1j, 6.0, 3 + 3j, 8.0])) - self.assertEqual(res.dtype, np.complex128) - - # complex + float -> float - res = s.where(cond, 1.1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1.1, 3 + 3j, 1.1])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1 + 1j, 6.6, 3 + 3j, 8.8])) - self.assertEqual(res.dtype, np.complex128) + # complex + int -> complex + exp = pd.Series([1 + 1j, 1, 3 + 3j, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.complex128) + + values = pd.Series([5, 6, 7, 8]) + exp = pd.Series([1 + 1j, 6.0, 3 + 3j, 8.0]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) + + # complex + float -> complex + exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.complex128) + + values = pd.Series([5.5, 6.6, 7.7, 8.8]) + exp = pd.Series([1 + 1j, 6.6, 3 + 3j, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) # complex + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, - pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, - pd.Series([1 + 1j, 6 + 6j, 3 + 3j, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.complex128) + + values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = pd.Series([1 + 1j, 6 + 6j, 3 + 3j, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) # complex + bool -> complex - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 1])) - self.assertEqual(res.dtype, np.complex128) - res = 
s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([1 + 1j, 0, 3 + 3j, 1])) - self.assertEqual(res.dtype, np.complex128) - - def test_where_numeric_coercion_bool(self): - s = pd.Series([True, False, True, False]) - self.assertEqual(s.dtype, np.bool) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 1]) + self._assert_where_conversion(obj, cond, True, exp, np.complex128) + + values = pd.Series([True, False, True, True]) + exp = pd.Series([1 + 1j, 0, 3 + 3j, 1]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) + + def test_where_index_complex128(self): + pass + + def test_where_series_bool(self): + obj = pd.Series([True, False, True, False]) + self.assertEqual(obj.dtype, np.bool) cond = pd.Series([True, False, True, False]) # bool + int -> int - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1, 1, 1, 1])) - self.assertEqual(res.dtype, np.int64) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1, 6, 1, 8])) - self.assertEqual(res.dtype, np.int64) + exp = pd.Series([1, 1, 1, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.int64) + + values = pd.Series([5, 6, 7, 8]) + exp = pd.Series([1, 6, 1, 8]) + self._assert_where_conversion(obj, cond, values, exp, np.int64) # bool + float -> float - res = s.where(cond, 1.1) - tm.assert_series_equal(res, pd.Series([1.0, 1.1, 1.0, 1.1])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1.0, 6.6, 1.0, 8.8])) - self.assertEqual(res.dtype, np.float64) + exp = pd.Series([1.0, 1.1, 1.0, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + + values = pd.Series([5.5, 6.6, 7.7, 8.8]) + exp = pd.Series([1.0, 6.6, 1.0, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # bool + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, pd.Series([1, 1 + 1j, 1, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, pd.Series([1, 6 + 6j, 1, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1, 1 + 1j, 1, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.complex128) + + values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = pd.Series([1, 6 + 6j, 1, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) # bool + bool -> bool - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([True, True, True, True])) - self.assertEqual(res.dtype, np.bool) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([True, False, True, True])) - self.assertEqual(res.dtype, np.bool) + exp = pd.Series([True, True, True, True]) + self._assert_where_conversion(obj, cond, True, exp, np.bool) + + values = pd.Series([True, False, True, True]) + exp = pd.Series([True, False, True, True]) + self._assert_where_conversion(obj, cond, values, exp, np.bool) + + def test_where_index_bool(self): + pass + + def test_where_series_datetime64(self): + obj = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + cond = pd.Series([True, False, True, False]) + + # datetime64 + datetime64 -> datetime64 + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01'), + 
pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-01')]) + self._assert_where_conversion(obj, cond, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + values = pd.Series([pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03'), + pd.Timestamp('2012-01-04')]) + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + # ToDo: coerce to object + msg = "cannot coerce a Timestamp with a tz on a naive Block" + with tm.assertRaisesRegexp(TypeError, msg): + obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + + # ToDo: do not coerce to UTC, must be object + values = pd.Series([pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2012-01-02', tz='US/Eastern'), + pd.Timestamp('2012-01-03', tz='US/Eastern'), + pd.Timestamp('2012-01-04', tz='US/Eastern')]) + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02 05:00'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04 05:00')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + def test_where_index_datetime64(self): + obj = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + cond = pd.Index([True, False, True, False]) + + # datetime64 + datetime64 -> datetime64 + # must support scalar + msg = "cannot coerce a Timestamp with a tz on a naive Block" + with tm.assertRaises(TypeError): + obj.where(cond, pd.Timestamp('2012-01-01')) + + values = pd.Index([pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03'), + pd.Timestamp('2012-01-04')]) + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + # ToDo: coerce to object + msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " + "of some kind") + with tm.assertRaisesRegexp(TypeError, msg): + obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + + # ToDo: do not ignore timezone, must be object + values = pd.Index([pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2012-01-02', tz='US/Eastern'), + pd.Timestamp('2012-01-03', tz='US/Eastern'), + pd.Timestamp('2012-01-04', tz='US/Eastern')]) + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + def test_where_series_datetime64tz(self): + pass + + def test_where_series_timedelta64(self): + pass + + def test_where_series_period(self): + pass + + def test_where_index_datetime64tz(self): + pass + + def test_where_index_timedelta64(self): + pass + + def test_where_index_period(self): + pass + + +class TestFillnaSeriesCoercion(CoercionBase, tm.TestCase): # not indexing, but place here for consisntency - def test_fillna_numeric_coercion_int(self): + method = 'fillna' + + def _assert_fillna_conversion(self, original, value, + expected, expected_dtype): + """ test coercion triggered by fillna """ + target = original.copy() + res = target.fillna(value) + self._assert(res, expected, expected_dtype) + + def _fillna_object_common(self, klass): + obj = klass(['a', np.nan, 'c', 'd']) + self.assertEqual(obj.dtype, 
np.object) + + # object + int -> object + exp = klass(['a', 1, 'c', 'd']) + self._assert_fillna_conversion(obj, 1, exp, np.object) + + # object + float -> object + exp = klass(['a', 1.1, 'c', 'd']) + self._assert_fillna_conversion(obj, 1.1, exp, np.object) + + # object + complex -> object + exp = klass(['a', 1 + 1j, 'c', 'd']) + self._assert_fillna_conversion(obj, 1 + 1j, exp, np.object) + + # object + bool -> object + exp = klass(['a', True, 'c', 'd']) + self._assert_fillna_conversion(obj, True, exp, np.object) + + def test_fillna_series_object(self): + self._fillna_object_common(pd.Series) + + def test_fillna_index_object(self): + self._fillna_object_common(pd.Index) + + def test_fillna_series_int64(self): # int can't hold NaN pass - def test_fillna_numeric_coercion_float(self): - s = pd.Series([1.1, np.nan, 3.3, 4.4]) - self.assertEqual(s.dtype, np.float64) + def test_fillna_index_int64(self): + pass + + def _fillna_float64_common(self, klass): + obj = klass([1.1, np.nan, 3.3, 4.4]) + self.assertEqual(obj.dtype, np.float64) # float + int -> float - res = s.fillna(1) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.0, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1, exp, np.float64) # float + float -> float - res = s.fillna(1.1) - tm.assert_series_equal(res, pd.Series([1.1, 1.1, 3.3, 4.4])) - self.assertEqual(res.dtype, np.float64) - - # float + complex -> complex - res = s.fillna(1 + 1j) - tm.assert_series_equal(res, pd.Series([1.1, 1 + 1j, 3.3, 4.4])) - self.assertEqual(res.dtype, np.complex128) + exp = klass([1.1, 1.1, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1.1, exp, np.float64) + + if klass is pd.Series: + # float + complex -> complex + exp = klass([1.1, 1 + 1j, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128) + elif klass is pd.Index: + # float + complex -> object + exp = klass([1.1, 1 + 1j, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1 + 1j, exp, np.object) + else: + NotImplementedError # float + bool -> float - res = s.fillna(True) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.0, 3.3, 4.4]) + self._assert_fillna_conversion(obj, True, exp, np.float64) - def test_fillna_numeric_coercion_complex(self): - s = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) - self.assertEqual(s.dtype, np.complex128) + def test_fillna_series_float64(self): + self._fillna_float64_common(pd.Series) + + def test_fillna_index_float64(self): + self._fillna_float64_common(pd.Index) + + def test_fillna_series_complex128(self): + obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) + self.assertEqual(obj.dtype, np.complex128) # complex + int -> complex - res = s.fillna(1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, 1, exp, np.complex128) # complex + float -> complex - res = s.fillna(1.1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, 1.1, exp, np.complex128) # complex + complex -> complex - res = s.fillna(1 + 1j) - tm.assert_series_equal(res, - pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j]) + 
self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128) # complex + bool -> complex - res = s.fillna(True) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, True, exp, np.complex128) - def test_fillna_numeric_coercion_bool(self): + def test_fillna_index_complex128(self): + self._fillna_float64_common(pd.Index) + + def test_fillna_series_bool(self): # bool can't hold NaN pass + def test_fillna_index_bool(self): + pass + + def test_fillna_series_datetime64(self): + obj = pd.Series([pd.Timestamp('2011-01-01'), + pd.NaT, + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 => datetime64 + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # datetime64 + datetime64tz => object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + value = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64 + int => object + # ToDo: must be coerced to object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp(1), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 1, exp, 'datetime64[ns]') + + # datetime64 + object => object + exp = pd.Series([pd.Timestamp('2011-01-01'), + 'x', + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_series_datetime64tz(self): + tz = 'US/Eastern' + + obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.NaT, + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz => datetime64tz + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_fillna_conversion(obj, value, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64 => object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + datetime64tz(different tz) => object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + int => datetime64tz + # ToDo: must be object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp(1).tz_localize(tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 1, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64tz + object => object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + 'x', + pd.Timestamp('2011-01-03', tz=tz), + 
pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_series_timedelta64(self): + pass + + def test_fillna_series_period(self): + pass + + def test_fillna_index_datetime64(self): + obj = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04']) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 => datetime64 + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', + '2011-01-03', '2011-01-04']) + self._assert_fillna_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # datetime64 + datetime64tz => object + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + value = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64 + int => object + exp = pd.Index([pd.Timestamp('2011-01-01'), + 1, + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 1, exp, np.object) + + # datetime64 + object => object + exp = pd.Index([pd.Timestamp('2011-01-01'), + 'x', + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_index_datetime64tz(self): + tz = 'US/Eastern' + + obj = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04'], tz=tz) + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz => datetime64tz + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', + '2011-01-03', '2011-01-04'], tz=tz) + value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_fillna_conversion(obj, value, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64 => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + datetime64tz(different tz) => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + int => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + 1, + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 1, exp, np.object) + + # datetime64tz + object => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + 'x', + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_index_timedelta64(self): + pass + + def test_fillna_index_period(self): + pass + + +class TestReplaceSeriesCoercion(CoercionBase, tm.TestCase): + + # not indexing, but place here for consisntency + + klasses = ['series'] + method = 'replace' + + def setUp(self): + self.rep = {} + self.rep['object'] = ['a', 'b'] + self.rep['int64'] = [4, 5] + self.rep['float64'] = [1.1, 2.2] + self.rep['complex128'] = [1 + 1j, 2 + 2j] + self.rep['bool'] = [True, False] + def _assert_replace_conversion(self, from_key, to_key, how): index = pd.Index([3, 4], name='xxx') - s = pd.Series(self.rep[from_key], index=index, name='yyy') - 
self.assertEqual(s.dtype, from_key) + obj = pd.Series(self.rep[from_key], index=index, name='yyy') + self.assertEqual(obj.dtype, from_key) if how == 'dict': replacer = dict(zip(self.rep[from_key], self.rep[to_key])) @@ -459,7 +1154,7 @@ def _assert_replace_conversion(self, from_key, to_key, how): else: raise ValueError - result = s.replace(replacer) + result = obj.replace(replacer) # buggy on windows for bool/int64 if (from_key == 'bool' and @@ -495,54 +1190,43 @@ def _assert_replace_conversion(self, from_key, to_key, how): tm.assert_series_equal(result, exp) - def test_replace_conversion_dict_from_object(self): + def test_replace_series_object(self): from_key = 'object' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_dict_from_int(self): - from_key = 'int64' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') + self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_dict_from_float(self): - from_key = 'float64' + def test_replace_series_int64(self): + from_key = 'int64' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_dict_from_complex(self): - from_key = 'complex128' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') + self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_dict_from_bool(self): - from_key = 'bool' + def test_replace_series_float64(self): + from_key = 'float64' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='dict') - # Series - def test_replace_conversion_series_from_object(self): - from_key = 'object' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_series_from_int(self): - from_key = 'int64' + def test_replace_series_complex128(self): + from_key = 'complex128' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') + self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_series_from_float(self): - from_key = 'float64' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_series_from_complex(self): - from_key = 'complex128' + def test_replace_series_bool(self): + from_key = 'bool' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') + self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_series_from_bool(self): - from_key = 'bool' for to_key in self.rep: if compat.PY3: @@ -550,3 +1234,15 @@ def test_replace_conversion_series_from_bool(self): raise nose.SkipTest("doesn't work as in PY3") self._assert_replace_conversion(from_key, to_key, how='series') + + def test_replace_series_datetime64(self): + pass + + def test_replace_series_datetime64tz(self): + pass + + def test_replace_series_timedelta64(self): + pass + + def test_replace_series_period(self): + pass From 15e940190f8d54d5d96f434055fe993a79b75916 Mon Sep 17 00:00:00 2001 From: conquistador1492 Date: Thu, 18 Aug 2016 02:03:59 +0400 Subject: [PATCH 275/359] BUG: Copy index(GH 13522) (#14005) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 2 ++ pandas/tests/frame/test_mutate_columns.py | 7 +++++++ 3 files changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 
8a11cbb42a78f..c8e953c1608a6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -907,6 +907,7 @@ Bug Fixes - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (issue:`9161`, :issue:`13544`) +- Bug in ``DataFrame`` assignment with an object-dtyped ``Index`` where the resultant column is mutable to the original object. (:issue:`13522`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ea83200465582..5db755b0d3dac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2638,6 +2638,8 @@ def reindexer(value): value = com._asarray_tuplesafe(value) elif value.ndim == 2: value = value.copy().T + elif isinstance(value, Index): + value = value.copy(deep=True) else: value = value.copy() diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 2bdd6657eaf18..5beab1565e538 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -156,6 +156,13 @@ def test_insert(self): df.insert(0, 'baz', df['c']) self.assertEqual(df.columns.name, 'some_name') + # GH 13522 + df = DataFrame(index=['A', 'B', 'C']) + df['X'] = df.index + df['X'] = ['x', 'y', 'z'] + exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C']) + assert_frame_equal(df, exp) + def test_delitem(self): del self.frame['A'] self.assertNotIn('A', self.frame) From 6fa2b03d78a8a6e94de66a01852a01a5574af2a9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 17 Aug 2016 18:05:38 -0400 Subject: [PATCH 276/359] ENH: PeriodIndex now has period dtype split from #13755. 
Author: sinhrks Author: Joris Van den Bossche Closes #13941 from sinhrks/period_dtype and squashes the following commits: a968782 [sinhrks] doc / astype updates 13d9592 [Joris Van den Bossche] DOC: some clean-up of wording 06a4f36 [sinhrks] ENH: PeriodIndex now has period dtype --- doc/source/timeseries.rst | 41 ++++++ doc/source/whatsnew/v0.19.0.txt | 37 +++++- pandas/api/tests/test_api.py | 5 +- pandas/core/algorithms.py | 30 ++--- pandas/tests/indexes/common.py | 5 +- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/indexes/test_datetimelike.py | 2 - pandas/tests/test_categorical.py | 36 ++--- pandas/tests/types/test_cast.py | 15 ++- pandas/tests/types/test_common.py | 56 ++++++-- pandas/tests/types/test_dtypes.py | 152 +++++++++++++++++++++- pandas/tseries/index.py | 7 +- pandas/tseries/period.py | 46 +++++-- pandas/tseries/tests/test_base.py | 37 +++--- pandas/tseries/tests/test_period.py | 78 +++++++++++ pandas/types/api.py | 2 + pandas/types/common.py | 33 ++++- pandas/types/dtypes.py | 124 +++++++++++++++++- 18 files changed, 617 insertions(+), 93 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index b8f747757987c..a35b8d561a5a7 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1594,6 +1594,47 @@ objects: idx idx + MonthEnd(3) +``PeriodIndex`` has its own dtype named ``period``, refer to :ref:`Period Dtypes `. + +.. _timeseries.period_dtype: + +Period Dtypes +~~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +``PeriodIndex`` has a custom ``period`` dtype. This is a pandas extension +dtype similar to the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). + +.. _timeseries.timezone_series: + +The ``period`` dtype holds the ``freq`` attribute and is represented with +``period[freq]`` like ``period[D]`` or ``period[M]``, using :ref:`frequency strings `. + +.. ipython:: python + + pi = pd.period_range('2016-01-01', periods=3, freq='M') + pi + pi.dtype + +The ``period`` dtype can be used in ``.astype(...)``. It allows one to change the +``freq`` of a ``PeriodIndex`` like ``.asfreq()`` and convert a +``DatetimeIndex`` to ``PeriodIndex`` like ``to_period()``: + +.. ipython:: python + + # change monthly freq to daily freq + pi.astype('period[D]') + + # convert to DatetimeIndex + pi.astype('datetime64[ns]') + + # convert to PeriodIndex + dti = pd.date_range('2011-01-01', freq='M', periods=3) + dti + dti.astype('period[M]') + + PeriodIndex Partial String Indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c8e953c1608a6..2412b645221ab 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -12,7 +12,7 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` -- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` +- ``PeriodIndex`` now has its own ``period`` dtype. see ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -628,6 +628,41 @@ Furthermore: - Passing duplicated ``percentiles`` will now raise a ``ValueError``. - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) +.. _whatsnew_0190.api.perioddtype: + +``PeriodIndex`` now has ``period`` dtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``PeriodIndex`` now has its own ``period`` dtype. 
The ``period`` dtype is a +pandas extension dtype like ``category`` or :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). (:issue:`13941`). +As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.PeriodIndex(['2016-08-01'], freq='D') + + In [2]: pi + Out[2]: PeriodIndex(['2016-08-01'], dtype='int64', freq='D') + + In [3]: pd.api.types.is_integer_dtype(pi) + Out[3]: True + + In [4]: pi.dtype + Out[4]: dtype('int64') + +New Behavior: + +.. ipython:: python + + pi = pd.PeriodIndex(['2016-08-01'], freq='D') + pi + pd.api.types.is_integer_dtype(pi) + pd.api.types.is_period_dtype(pi) + pi.dtype + type(pi.dtype) + .. _whatsnew_0190.api.periodnat: ``Period('NaT')`` now returns ``pd.NaT`` diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index b1bbf18df3e06..b706d789931b0 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -151,8 +151,9 @@ class TestTypes(Base, tm.TestCase): 'is_floating_dtype', 'is_int64_dtype', 'is_integer', 'is_integer_dtype', 'is_number', 'is_numeric_dtype', 'is_object_dtype', 'is_scalar', 'is_sparse', - 'is_string_dtype', 'is_timedelta64_dtype', - 'is_timedelta64_ns_dtype', + 'is_string_dtype', + 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', + 'is_period', 'is_period_dtype', 'is_re', 'is_re_compilable', 'is_dict_like', 'is_iterator', 'is_list_like', 'is_hashable', diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1f863bf7247a0..ee59d6552bb2f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,15 +8,14 @@ from pandas import compat, lib, tslib, _np_version_under1p8 from pandas.types.cast import _maybe_promote -from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex, - ABCDatetimeIndex) +from pandas.types.generic import ABCSeries, ABCIndex from pandas.types.common import (is_integer_dtype, is_int64_dtype, is_categorical_dtype, is_extension_type, is_datetimetz, + is_period_dtype, is_period_arraylike, - is_datetime_or_timedelta_dtype, is_float_dtype, needs_i8_conversion, is_categorical, @@ -395,8 +394,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False, def _value_counts_arraylike(values, dropna=True): is_datetimetz_type = is_datetimetz(values) - is_period = (isinstance(values, ABCPeriodIndex) or - is_period_arraylike(values)) + is_period_type = (is_period_dtype(values) or + is_period_arraylike(values)) orig = values @@ -404,11 +403,13 @@ def _value_counts_arraylike(values, dropna=True): values = Series(values).values dtype = values.dtype - if is_datetime_or_timedelta_dtype(dtype) or is_period: + if needs_i8_conversion(dtype) or is_period_type: + from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex - if is_period: + if is_period_type: + # values may be an object values = PeriodIndex(values) freq = values.freq @@ -424,12 +425,8 @@ def _value_counts_arraylike(values, dropna=True): # dtype handling if is_datetimetz_type: - if isinstance(orig, ABCDatetimeIndex): - tz = orig.tz - else: - tz = orig.dt.tz - keys = DatetimeIndex._simple_new(keys, tz=tz) - if is_period: + keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz) + if is_period_type: keys = PeriodIndex._simple_new(keys, freq=freq) elif is_integer_dtype(dtype): @@ -472,11 +469,8 @@ def duplicated(values, keep='first'): dtype = values.dtype # no need to revert to original type - if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype): - if 
isinstance(values, (ABCSeries, ABCIndex)): - values = values.values.view(np.int64) - else: - values = values.view(np.int64) + if needs_i8_conversion(dtype): + values = values.view(np.int64) elif is_period_arraylike(values): from pandas.tseries.period import PeriodIndex values = PeriodIndex(values).asi8 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index a6fde7f85084d..26f90a814ab29 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -149,10 +149,7 @@ def test_dtype_str(self): for idx in self.indices.values(): dtype = idx.dtype_str self.assertIsInstance(dtype, compat.string_types) - if isinstance(idx, PeriodIndex): - self.assertEqual(dtype, 'period') - else: - self.assertEqual(dtype, str(idx.dtype)) + self.assertEqual(dtype, str(idx.dtype)) def test_repr_max_seq_item_setting(self): # GH10182 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f0d0d2d49b973..edf7fc444c3e1 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -149,8 +149,8 @@ def test_constructor_from_series(self): expected = DatetimeIndex([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')]) - s = Series([Timestamp('20110101'), Timestamp('20120101'), Timestamp( - '20130101')]) + s = Series([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')]) result = Index(s) self.assert_index_equal(result, expected) result = DatetimeIndex(s) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 3ff52380a62d8..bcc6532fbe0ce 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -791,8 +791,6 @@ def test_astype_raises(self): self.assertRaises(ValueError, idx.astype, float) self.assertRaises(ValueError, idx.astype, 'timedelta64') self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') - self.assertRaises(ValueError, idx.astype, 'datetime64') - self.assertRaises(ValueError, idx.astype, 'datetime64[ns]') def test_shift(self): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 0e37f5bf17405..b630e0914259e 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2276,28 +2276,28 @@ def test_categorical_repr_period(self): idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) c = pd.Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) idx = pd.period_range('2011-01', freq='M', periods=5) c = pd.Categorical(idx) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" +Categories (5, 
period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" self.assertEqual(repr(c), exp) @@ -2305,28 +2305,28 @@ def test_categorical_repr_period_ordered(self): idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) c = pd.Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) idx = pd.period_range('2011-01', freq='M', periods=5) c = pd.Categorical(idx, ordered=True) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" self.assertEqual(repr(c), exp) @@ -2515,8 +2515,8 @@ def test_categorical_series_repr_period(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" self.assertEqual(repr(s), exp) @@ -2528,7 +2528,7 @@ def test_categorical_series_repr_period(self): 3 2011-04 4 2011-05 dtype: category -Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" self.assertEqual(repr(s), exp) @@ -2541,8 +2541,8 @@ def test_categorical_series_repr_period_ordered(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" self.assertEqual(repr(s), exp) @@ -2554,7 +2554,7 @@ def test_categorical_series_repr_period_ordered(self): 3 2011-04 4 2011-05 dtype: category -Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" +Categories (5, period[M]): [2011-01 < 2011-02 
< 2011-03 < 2011-04 < 2011-05]""" self.assertEqual(repr(s), exp) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 46f37bf0ef8c2..2b4998fd64f4a 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -18,7 +18,7 @@ _maybe_convert_scalar, _find_common_type) from pandas.types.dtypes import (CategoricalDtype, - DatetimeTZDtype) + DatetimeTZDtype, PeriodDtype) from pandas.util import testing as tm _multiprocess_can_split_ = True @@ -241,12 +241,13 @@ def test_numpy_dtypes(self): # empty _find_common_type([]) - def test_pandas_dtypes(self): + def test_categorical_dtype(self): dtype = CategoricalDtype() self.assertEqual(_find_common_type([dtype]), 'category') self.assertEqual(_find_common_type([dtype, dtype]), 'category') self.assertEqual(_find_common_type([np.object, dtype]), np.object) + def test_datetimetz_dtype(self): dtype = DatetimeTZDtype(unit='ns', tz='US/Eastern') self.assertEqual(_find_common_type([dtype, dtype]), 'datetime64[ns, US/Eastern]') @@ -256,6 +257,16 @@ def test_pandas_dtypes(self): self.assertEqual(_find_common_type([dtype, dtype2]), np.object) self.assertEqual(_find_common_type([dtype2, dtype]), np.object) + def test_period_dtype(self): + dtype = PeriodDtype(freq='D') + self.assertEqual(_find_common_type([dtype, dtype]), 'period[D]') + + for dtype2 in [DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), + PeriodDtype(freq='2D'), PeriodDtype(freq='H'), + np.dtype('datetime64[ns]'), np.object, np.int64]: + self.assertEqual(_find_common_type([dtype, dtype2]), np.object) + self.assertEqual(_find_common_type([dtype2, dtype]), np.object) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py index 0a586410ad5a0..4d6f50862c562 100644 --- a/pandas/tests/types/test_common.py +++ b/pandas/tests/types/test_common.py @@ -3,19 +3,59 @@ import nose import numpy as np -from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype -from pandas.types.common import pandas_dtype +from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.common import pandas_dtype, is_dtype_equal + +import pandas.util.testing as tm _multiprocess_can_split_ = True -def test_pandas_dtype(): +class TestPandasDtype(tm.TestCase): + + def test_numpy_dtype(self): + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + self.assertEqual(pandas_dtype(dtype), np.dtype(dtype)) + + def test_numpy_string_dtype(self): + # do not parse freq-like string as period dtype + self.assertEqual(pandas_dtype('U'), np.dtype('U')) + self.assertEqual(pandas_dtype('S'), np.dtype('S')) + + def test_datetimetz_dtype(self): + for dtype in ['datetime64[ns, US/Eastern]', + 'datetime64[ns, Asia/Tokyo]', + 'datetime64[ns, UTC]']: + self.assertIs(pandas_dtype(dtype), DatetimeTZDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), DatetimeTZDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), dtype) + + def test_categorical_dtype(self): + self.assertEqual(pandas_dtype('category'), CategoricalDtype()) + + def test_period_dtype(self): + for dtype in ['period[D]', 'period[3M]', 'period[U]', + 'Period[D]', 'Period[3M]', 'Period[U]']: + self.assertIs(pandas_dtype(dtype), PeriodDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), PeriodDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), dtype) + + +def test_dtype_equal(): + assert is_dtype_equal(np.int64, np.int64) + assert not is_dtype_equal(np.int64, 
np.float64) + + p1 = PeriodDtype('D') + p2 = PeriodDtype('D') + assert is_dtype_equal(p1, p2) + assert not is_dtype_equal(np.int64, p1) + + p3 = PeriodDtype('2D') + assert not is_dtype_equal(p1, p3) + + assert not DatetimeTZDtype.is_dtype(np.int64) + assert not PeriodDtype.is_dtype(np.int64) - assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( - 'datetime64[ns, US/Eastern]') - assert pandas_dtype('category') == CategoricalDtype() - for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: - assert pandas_dtype(dtype) == np.dtype(dtype) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index 1743e80ae01a9..dd1a8dbd5c53a 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -3,14 +3,15 @@ import nose import numpy as np +import pandas as pd from pandas import Series, Categorical, date_range -from pandas.types.dtypes import CategoricalDtype -from pandas.types.common import (is_categorical_dtype, - is_categorical, DatetimeTZDtype, +from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.common import (is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, + is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype, + is_datetime64_dtype, is_string_dtype, _coerce_to_dtype) import pandas.util.testing as tm @@ -24,6 +25,7 @@ def test_hash(self): def test_equality_invalid(self): self.assertRaises(self.dtype == 'foo') + self.assertFalse(is_dtype_equal(self.dtype, np.int64)) def test_numpy_informed(self): @@ -206,6 +208,148 @@ def test_parser(self): DatetimeTZDtype('ns', tz), ) + def test_empty(self): + dt = DatetimeTZDtype() + with tm.assertRaises(AttributeError): + str(dt) + + +class TestPeriodDtype(Base, tm.TestCase): + + def setUp(self): + self.dtype = PeriodDtype('D') + + def test_construction(self): + with tm.assertRaises(ValueError): + PeriodDtype('xx') + + for s in ['period[D]', 'Period[D]', 'D']: + dt = PeriodDtype(s) + self.assertEqual(dt.freq, pd.tseries.offsets.Day()) + self.assertTrue(is_period_dtype(dt)) + + for s in ['period[3D]', 'Period[3D]', '3D']: + dt = PeriodDtype(s) + self.assertEqual(dt.freq, pd.tseries.offsets.Day(3)) + self.assertTrue(is_period_dtype(dt)) + + for s in ['period[26H]', 'Period[26H]', '26H', + 'period[1D2H]', 'Period[1D2H]', '1D2H']: + dt = PeriodDtype(s) + self.assertEqual(dt.freq, pd.tseries.offsets.Hour(26)) + self.assertTrue(is_period_dtype(dt)) + + def test_subclass(self): + a = PeriodDtype('period[D]') + b = PeriodDtype('period[3D]') + + self.assertTrue(issubclass(type(a), type(a))) + self.assertTrue(issubclass(type(a), type(b))) + + def test_identity(self): + self.assertEqual(PeriodDtype('period[D]'), + PeriodDtype('period[D]')) + self.assertIs(PeriodDtype('period[D]'), + PeriodDtype('period[D]')) + + self.assertEqual(PeriodDtype('period[3D]'), + PeriodDtype('period[3D]')) + self.assertIs(PeriodDtype('period[3D]'), + PeriodDtype('period[3D]')) + + self.assertEqual(PeriodDtype('period[1S1U]'), + PeriodDtype('period[1000001U]')) + self.assertIs(PeriodDtype('period[1S1U]'), + PeriodDtype('period[1000001U]')) + + def test_coerce_to_dtype(self): + self.assertEqual(_coerce_to_dtype('period[D]'), + PeriodDtype('period[D]')) + self.assertEqual(_coerce_to_dtype('period[3M]'), + PeriodDtype('period[3M]')) + + def test_compat(self): + 
self.assertFalse(is_datetime64_ns_dtype(self.dtype)) + self.assertFalse(is_datetime64_ns_dtype('period[D]')) + self.assertFalse(is_datetime64_dtype(self.dtype)) + self.assertFalse(is_datetime64_dtype('period[D]')) + + def test_construction_from_string(self): + result = PeriodDtype('period[D]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + result = PeriodDtype.construct_from_string('period[D]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('foo') + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('period[foo]') + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('foo[D]') + + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('datetime64[ns]') + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('datetime64[ns, US/Eastern]') + + def test_is_dtype(self): + self.assertTrue(PeriodDtype.is_dtype(self.dtype)) + self.assertTrue(PeriodDtype.is_dtype('period[D]')) + self.assertTrue(PeriodDtype.is_dtype('period[3D]')) + self.assertTrue(PeriodDtype.is_dtype(PeriodDtype('3D'))) + self.assertTrue(PeriodDtype.is_dtype('period[U]')) + self.assertTrue(PeriodDtype.is_dtype('period[S]')) + self.assertTrue(PeriodDtype.is_dtype(PeriodDtype('U'))) + self.assertTrue(PeriodDtype.is_dtype(PeriodDtype('S'))) + + self.assertFalse(PeriodDtype.is_dtype('D')) + self.assertFalse(PeriodDtype.is_dtype('3D')) + self.assertFalse(PeriodDtype.is_dtype('U')) + self.assertFalse(PeriodDtype.is_dtype('S')) + self.assertFalse(PeriodDtype.is_dtype('foo')) + self.assertFalse(PeriodDtype.is_dtype(np.object_)) + self.assertFalse(PeriodDtype.is_dtype(np.int64)) + self.assertFalse(PeriodDtype.is_dtype(np.float64)) + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'period[D]')) + self.assertTrue(is_dtype_equal(self.dtype, PeriodDtype('D'))) + self.assertTrue(is_dtype_equal(self.dtype, PeriodDtype('D'))) + self.assertTrue(is_dtype_equal(PeriodDtype('D'), PeriodDtype('D'))) + + self.assertFalse(is_dtype_equal(self.dtype, 'D')) + self.assertFalse(is_dtype_equal(PeriodDtype('D'), PeriodDtype('2D'))) + + def test_basic(self): + self.assertTrue(is_period_dtype(self.dtype)) + + pidx = pd.period_range('2013-01-01 09:00', periods=5, freq='H') + + self.assertTrue(is_period_dtype(pidx.dtype)) + self.assertTrue(is_period_dtype(pidx)) + self.assertTrue(is_period(pidx)) + + s = Series(pidx, name='A') + # dtypes + # series results in object dtype currently, + # is_period checks period_arraylike + self.assertFalse(is_period_dtype(s.dtype)) + self.assertFalse(is_period_dtype(s)) + self.assertTrue(is_period(s)) + + self.assertFalse(is_period_dtype(np.dtype('float64'))) + self.assertFalse(is_period_dtype(1.0)) + self.assertFalse(is_period(np.dtype('float64'))) + self.assertFalse(is_period(1.0)) + + def test_empty(self): + dt = PeriodDtype() + with tm.assertRaises(AttributeError): + str(dt) + + def test_not_string(self): + # though PeriodDtype has object kind, it cannot be string + self.assertFalse(is_string_dtype(PeriodDtype('D'))) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 01728889a8595..8f50ddc0f9e41 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -13,10 +13,12 @@ is_integer, is_float, is_integer_dtype, is_datetime64_ns_dtype, + is_period_dtype, is_bool_dtype, is_string_dtype, is_list_like, is_scalar, + pandas_dtype, 
_ensure_int64) from pandas.types.generic import ABCSeries from pandas.types.dtypes import DatetimeTZDtype @@ -802,8 +804,7 @@ def to_datetime(self, dayfirst=False): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - dtype = np.dtype(dtype) - + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): @@ -817,6 +818,8 @@ def astype(self, dtype, copy=True): return self elif is_string_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) + elif is_period_dtype(dtype): + return self.to_period(freq=dtype.freq) raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index af46162038fef..486cf52f188a9 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -10,11 +10,15 @@ is_integer_dtype, is_float_dtype, is_scalar, + is_datetime64_dtype, + is_datetime64tz_dtype, is_timedelta64_dtype, + is_period_dtype, is_bool_dtype, + pandas_dtype, _ensure_int64, _ensure_object) - +from pandas.types.dtypes import PeriodDtype from pandas.types.generic import ABCSeries import pandas.tseries.frequencies as frequencies @@ -123,7 +127,6 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): ---------- data : array-like (1-dimensional), optional Optional period-like data to construct index with - dtype : NumPy dtype (default: i8) copy : bool Make a copy of input ndarray freq : string or period object, optional @@ -146,6 +149,7 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): second : int, array, or Series, default None tz : object, default None Timezone for converting datetime64 data to Periods + dtype : str or PeriodDtype, default None Examples -------- @@ -175,7 +179,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): __ge__ = _period_index_cmp('__ge__') def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, - periods=None, copy=False, name=None, tz=None, **kwargs): + periods=None, copy=False, name=None, tz=None, dtype=None, + **kwargs): if periods is not None: if is_float(periods): @@ -187,6 +192,16 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if name is None and hasattr(data, 'name'): name = data.name + if dtype is not None: + dtype = pandas_dtype(dtype) + if not is_period_dtype(dtype): + raise ValueError('dtype must be PeriodDtype') + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + msg = 'specified freq and dtype are different' + raise IncompatibleFrequency(msg) + if data is None: if ordinal is not None: data = np.asarray(ordinal, dtype=np.int64) @@ -372,6 +387,11 @@ def _to_embed(self, keep_tz=False): def _formatter_func(self): return lambda x: "'%s'" % x + @property + def _int64index(self): + # do not cache, same as .asi8 + return Int64Index(self.asi8, name=self.name, fastpath=True) + def asof_locs(self, where, mask): """ where : array of timestamps @@ -393,13 +413,19 @@ def asof_locs(self, where, mask): return result @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - dtype = np.dtype(dtype) + def astype(self, dtype, copy=True, how='start'): + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') + elif is_datetime64_dtype(dtype): + return self.to_timestamp(how=how) + elif is_datetime64tz_dtype(dtype): + return 
self.to_timestamp(how=how).tz_localize(dtype.tz) + elif is_period_dtype(dtype): + return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) @Substitution(klass='PeriodIndex', value='key') @@ -650,9 +676,8 @@ def shift(self, n): return PeriodIndex(data=values, name=self.name, freq=self.freq) @cache_readonly - def dtype_str(self): - """ return the dtype str of the underlying data """ - return self.inferred_type + def dtype(self): + return PeriodDtype.construct_from_string(self.freq) @property def inferred_type(self): @@ -738,7 +763,10 @@ def get_loc(self, key, method=None, tolerance=None): try: ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal - return Index.get_loc(self, ordinal, method, tolerance) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance) + return self._int64index.get_loc(ordinal, method, tolerance) + except KeyError: raise KeyError(key) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 800f9470f9845..45a5feec7c949 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1767,35 +1767,40 @@ def test_representation(self): idx7 = pd.period_range('2013Q1', periods=1, freq="Q") idx8 = pd.period_range('2013Q1', periods=2, freq="Q") idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D') - exp1 = """PeriodIndex([], dtype='int64', freq='D')""" + exp1 = """PeriodIndex([], dtype='period[D]', freq='D')""" - exp2 = """PeriodIndex(['2011-01-01'], dtype='int64', freq='D')""" + exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" - exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='int64', " + exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " "freq='D')") exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='int64', freq='D')") + "dtype='period[D]', freq='D')") - exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='int64', " + exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " "freq='A-DEC')") exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='int64', freq='H')") + "dtype='period[H]', freq='H')") - exp7 = """PeriodIndex(['2013Q1'], dtype='int64', freq='Q-DEC')""" - - exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='int64', " + exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " "freq='Q-DEC')") - exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], dtype='int64', " + exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " "freq='Q-DEC')") + exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " + "dtype='period[Q-DEC]', freq='Q-DEC')") + + exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], " + "dtype='period[3D]', freq='3D')") + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], + idx6, idx7, idx8, idx9, idx10], [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): + exp6, exp7, exp8, exp9, exp10]): for func in ['__repr__', '__unicode__', '__str__']: result = getattr(idx, func)() self.assertEqual(result, expected) @@ -1805,11 +1810,11 @@ def test_representation_to_series(self): idx1 = PeriodIndex([], freq='D') idx2 = PeriodIndex(['2011-01-01'], freq='D') idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], freq='D') idx5 = PeriodIndex(['2011', '2012', '2013'], 
freq='A') - idx6 = PeriodIndex( - ['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', + 'NaT'], freq='H') idx7 = pd.period_range('2013Q1', periods=1, freq="Q") idx8 = pd.period_range('2013Q1', periods=2, freq="Q") diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index a28312451f6c0..2044d44b35d0b 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1783,6 +1783,35 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + def test_constructor_dtype(self): + # passing a dtype with a tz should localize + idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]') + exp = PeriodIndex(['2013-01', '2013-03'], freq='M') + tm.assert_index_equal(idx, exp) + self.assertEqual(idx.dtype, 'period[M]') + + idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]') + exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D') + tm.assert_index_equal(idx, exp) + self.assertEqual(idx.dtype, 'period[3D]') + + # if we already have a freq and its not the same, then asfreq + # (not changed) + idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D') + + res = PeriodIndex(idx, dtype='period[M]') + exp = PeriodIndex(['2013-01', '2013-01'], freq='M') + tm.assert_index_equal(res, exp) + self.assertEqual(res.dtype, 'period[M]') + + res = PeriodIndex(idx, freq='M') + tm.assert_index_equal(res, exp) + self.assertEqual(res.dtype, 'period[M]') + + msg = 'specified freq and dtype are different' + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(['2011-01'], freq='M', dtype='period[D]') + def test_constructor_empty(self): idx = pd.PeriodIndex([], freq='M') tm.assertIsInstance(idx, PeriodIndex) @@ -1970,6 +1999,15 @@ def test_constructor_freq_combined(self): freq='25H') tm.assert_index_equal(pidx, expected) + def test_dtype_str(self): + pi = pd.PeriodIndex([], freq='M') + self.assertEqual(pi.dtype_str, 'period[M]') + self.assertEqual(pi.dtype_str, str(pi.dtype)) + + pi = pd.PeriodIndex([], freq='3M') + self.assertEqual(pi.dtype_str, 'period[3M]') + self.assertEqual(pi.dtype_str, str(pi.dtype)) + def test_view_asi8(self): idx = pd.PeriodIndex([], freq='M') @@ -2314,6 +2352,17 @@ def test_to_timestamp_pi_combined(self): ['2011-01-02 00:00', '2011-01-03 01:00'], name='idx') self.assert_index_equal(result, expected) + def test_to_timestamp_to_period_astype(self): + idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') + + res = idx.astype('period[M]') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') + tm.assert_index_equal(res, exp) + + res = idx.astype('period[3M]') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') + self.assert_index_equal(res, exp) + def test_start_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') @@ -3013,6 +3062,16 @@ def test_range_slice_outofbounds(self): tm.assert_frame_equal(df['2013-06':'2013-09'], empty) tm.assert_frame_equal(df['2013-11':'2013-12'], empty) + def test_astype_asfreq(self): + pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D') + exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + tm.assert_index_equal(pi1.asfreq('M'), exp) + tm.assert_index_equal(pi1.astype('period[M]'), exp) + + exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M') + 
tm.assert_index_equal(pi1.asfreq('3M'), exp) + tm.assert_index_equal(pi1.astype('period[3M]'), exp) + def test_pindex_fieldaccessor_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2012-03', '2012-04'], freq='D') @@ -3037,6 +3096,25 @@ def test_period_dt64_round_trip(self): pi = dti.to_period(freq='H') tm.assert_index_equal(pi.to_timestamp(), dti) + def test_period_astype_to_timestamp(self): + pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) + tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) + tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]') + tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]', how='end') + tm.assert_index_equal(res, exp) + def test_to_period_quarterly(self): # make sure we can make the round trip for month in MONTHS: diff --git a/pandas/types/api.py b/pandas/types/api.py index 2d68e041f632e..096dc2f84aa67 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -18,6 +18,8 @@ is_datetime64_ns_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, + is_period, + is_period_dtype, # string-like is_string_dtype, diff --git a/pandas/types/common.py b/pandas/types/common.py index 39db0be3e416e..2e7a67112e6db 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -5,6 +5,7 @@ from pandas import lib, algos from .dtypes import (CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType, + PeriodDtype, PeriodDtypeType, ExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, @@ -63,6 +64,11 @@ def is_datetimetz(array): is_datetime64tz_dtype(array)) +def is_period(array): + """ return if we are a period array """ + return isinstance(array, ABCPeriodIndex) or is_period_arraylike(array) + + def is_datetime64_dtype(arr_or_dtype): try: tipo = _get_dtype_type(arr_or_dtype) @@ -80,13 +86,17 @@ def is_timedelta64_dtype(arr_or_dtype): return issubclass(tipo, np.timedelta64) +def is_period_dtype(arr_or_dtype): + return PeriodDtype.is_dtype(arr_or_dtype) + + def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) def is_string_dtype(arr_or_dtype): dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('O', 'S', 'U') + return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) def is_period_arraylike(arr): @@ -231,7 +241,7 @@ def is_object(x): def needs_i8_conversion(arr_or_dtype): return (is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) or - isinstance(arr_or_dtype, ABCPeriodIndex)) + is_period_dtype(arr_or_dtype)) def is_numeric_dtype(arr_or_dtype): @@ -290,6 +300,8 @@ def _coerce_to_dtype(dtype): dtype = CategoricalDtype() elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) + elif is_period_dtype(dtype): + dtype = PeriodDtype(dtype) else: dtype = np.dtype(dtype) return dtype @@ -304,11 +316,15 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, DatetimeTZDtype): return arr_or_dtype + elif isinstance(arr_or_dtype, PeriodDtype): + return arr_or_dtype elif isinstance(arr_or_dtype, string_types): if 
is_categorical_dtype(arr_or_dtype): return CategoricalDtype.construct_from_string(arr_or_dtype) elif is_datetime64tz_dtype(arr_or_dtype): return DatetimeTZDtype.construct_from_string(arr_or_dtype) + elif is_period_dtype(arr_or_dtype): + return PeriodDtype.construct_from_string(arr_or_dtype) if hasattr(arr_or_dtype, 'dtype'): arr_or_dtype = arr_or_dtype.dtype @@ -324,11 +340,15 @@ def _get_dtype_type(arr_or_dtype): return CategoricalDtypeType elif isinstance(arr_or_dtype, DatetimeTZDtype): return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, PeriodDtype): + return PeriodDtypeType elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtypeType elif is_datetime64tz_dtype(arr_or_dtype): return DatetimeTZDtypeType + elif is_period_dtype(arr_or_dtype): + return PeriodDtypeType return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type @@ -404,6 +424,8 @@ def pandas_dtype(dtype): """ if isinstance(dtype, DatetimeTZDtype): return dtype + elif isinstance(dtype, PeriodDtype): + return dtype elif isinstance(dtype, CategoricalDtype): return dtype elif isinstance(dtype, string_types): @@ -412,6 +434,13 @@ def pandas_dtype(dtype): except TypeError: pass + if dtype.startswith('period[') or dtype.startswith('Period['): + # do not parse string like U as period[U] + try: + return PeriodDtype.construct_from_string(dtype) + except TypeError: + pass + try: return CategoricalDtype.construct_from_string(dtype) except TypeError: diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index 140d494c3e1b2..5b6d7905d4095 100644 --- a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -244,6 +244,124 @@ def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, DatetimeTZDtype) and \ - self.unit == other.unit and \ - str(self.tz) == str(other.tz) + return (isinstance(other, DatetimeTZDtype) and + self.unit == other.unit and + str(self.tz) == str(other.tz)) + + +class PeriodDtypeType(type): + """ + the type of PeriodDtype, this metaclass determines subclass ability + """ + pass + + +class PeriodDtype(ExtensionDtype): + __metaclass__ = PeriodDtypeType + """ + A Period duck-typed class, suitable for holding a period with freq dtype. + + THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.int64. 
+ """ + type = PeriodDtypeType + kind = 'O' + str = '|O08' + base = np.dtype('O') + num = 102 + _metadata = ['freq'] + _match = re.compile("(P|p)eriod\[(?P.+)\]") + _cache = {} + + def __new__(cls, freq=None): + """ + Parameters + ---------- + freq : frequency + """ + + if isinstance(freq, PeriodDtype): + return freq + + elif freq is None: + # empty constructor for pickle compat + return object.__new__(cls) + + from pandas.tseries.offsets import DateOffset + if not isinstance(freq, DateOffset): + freq = cls._parse_dtype_strict(freq) + + try: + return cls._cache[freq.freqstr] + except KeyError: + u = object.__new__(cls) + u.freq = freq + cls._cache[freq.freqstr] = u + return u + + @classmethod + def _parse_dtype_strict(cls, freq): + if isinstance(freq, compat.string_types): + if freq.startswith('period[') or freq.startswith('Period['): + m = cls._match.search(freq) + if m is not None: + freq = m.group('freq') + from pandas.tseries.frequencies import to_offset + freq = to_offset(freq) + if freq is not None: + return freq + + raise ValueError("could not construct PeriodDtype") + + @classmethod + def construct_from_string(cls, string): + """ + attempt to construct this type from a string, raise a TypeError + if its not possible + """ + from pandas.tseries.offsets import DateOffset + if isinstance(string, (compat.string_types, DateOffset)): + # avoid tuple to be regarded as freq + try: + return cls(freq=string) + except ValueError: + pass + raise TypeError("could not construct PeriodDtype") + + def __unicode__(self): + return "period[{freq}]".format(freq=self.freq.freqstr) + + @property + def name(self): + return str(self) + + def __hash__(self): + # make myself hashable + return hash(str(self)) + + def __eq__(self, other): + if isinstance(other, compat.string_types): + return other == self.name or other == self.name.title() + + return isinstance(other, PeriodDtype) and self.freq == other.freq + + @classmethod + def is_dtype(cls, dtype): + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + + if isinstance(dtype, compat.string_types): + # PeriodDtype can be instanciated from freq string like "U", + # but dosn't regard freq str like "U" as dtype. + if dtype.startswith('period[') or dtype.startswith('Period['): + try: + if cls._parse_dtype_strict(dtype) is not None: + return True + else: + return False + except ValueError: + return False + else: + return False + return super(PeriodDtype, cls).is_dtype(dtype) From c3e24a1cb28533c2287793c55f7e8ceb399dc0fd Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 18 Aug 2016 05:59:12 -0400 Subject: [PATCH 277/359] TST: Fix test_coercion for period dtype xref #13941. 
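The updated expectations reflect behaviour along these lines (a small illustrative sketch, not taken verbatim from the test suite):

.. code-block:: python

   import pandas as pd

   obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], freq='M')
   obj.dtype                                             # now period[M], no longer int64

   # inserting a Period keeps the period dtype
   obj.insert(1, pd.Period('2012-01', freq='M')).dtype   # period[M]
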
Author: sinhrks Closes #14025 from sinhrks/test_coercion_period and squashes the following commits: b873683 [sinhrks] TST: Fix test_coercion for period dtype --- pandas/tests/indexing/test_coercion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2eae226073552..d8d8242fa50c6 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -482,19 +482,19 @@ def test_insert_index_timedelta64(self): def test_insert_index_period(self): obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], freq='M') - self.assertEqual(obj.dtype, np.int64) + self.assertEqual(obj.dtype, 'period[M]') # period + period => period exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', '2011-03', '2011-04'], freq='M') self._assert_insert_conversion(obj, pd.Period('2012-01', freq='M'), - exp, np.int64) + exp, 'period[M]') # ToDo: must coerce to object? exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', '2011-03', '2011-04'], freq='M') self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), - exp, np.int64) + exp, 'period[M]') # period + int => object msg = "Given date string not likely a datetime." From 1919e26ead5d156c2b505a0ad8d233b02eb1b573 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Tariq Date: Thu, 18 Aug 2016 06:48:31 -0400 Subject: [PATCH 278/359] ENH: GbqConnector should be able to fetch default credentials on Google Compute Engine closes #13577 Author: Muhammad Haseeb Tariq Closes #13608 from mhaseebtariq/master and squashes the following commits: a65b3f1 [Muhammad Haseeb Tariq] newline in documentation 6bbdcdc [Muhammad Haseeb Tariq] fixed documentation 6fe392a [Muhammad Haseeb Tariq] API error on google-api-python-client==1.2 4cf26aa [Muhammad Haseeb Tariq] handling GoogleCredentials import error ac48104 [Muhammad Haseeb Tariq] Updated whatsnew entry cf41e76 [Muhammad Haseeb Tariq] modified documentation + restructured tests 62815e1 [Muhammad Haseeb Tariq] Import errors 5fd1775 [Muhammad Haseeb Tariq] added more documentation 64134ac [Muhammad Haseeb Tariq] apiclient import fix 5a2dd15 [Muhammad Haseeb Tariq] feedback changes 17dd814 [Muhammad Haseeb Tariq] splitting integration tests for 3479ed8 [Muhammad Haseeb Tariq] Merge branch 'master' into master d4c7e3d [Muhammad Haseeb Tariq] Issue #13577 - Feedback incorporation 7f1fd26 [Muhammad Haseeb Tariq] Coverage was giving wrong percentage fca8003 [Muhammad Haseeb Tariq] lint errors a02b620 [Muhammad Haseeb Tariq] unit test fix - if libraries can not be imported 9740938 [Muhammad Haseeb Tariq] concav unable to capture coverage 17f4740 [Muhammad Haseeb Tariq] Issue #13577 - flake8 fixes 9ec2a68 [Muhammad Haseeb Tariq] Issue #13577 --- doc/source/io.rst | 9 ++++ doc/source/whatsnew/v0.19.0.txt | 2 + pandas/io/gbq.py | 83 +++++++++++++++++++++++++++++---- pandas/io/tests/test_gbq.py | 39 ++++++++++++++++ 4 files changed, 124 insertions(+), 9 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 2e62a6cf8d855..cc693170f055a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4478,6 +4478,15 @@ Additional information on service accounts can be found You will need to install an additional dependency: `oauth2client `__. +Authentication via ``application default credentials`` is also possible. This is only valid +if the parameter ``private_key`` is not provided. This method also requires that +the credentials can be fetched from the environment the code is running in. 
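For example, code running on Google Compute Engine (or anywhere default credentials are available) can simply omit ``private_key``; a sketch, with ``'my-project'`` standing in for a real project id:

.. code-block:: python

   import pandas as pd

   # with no private_key, GbqConnector first tries the application
   # default credentials available in the environment
   df = pd.read_gbq('SELECT 1', project_id='my-project')
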
+Otherwise, the OAuth2 client-side authentication is used. +Additional information on +`application default credentials `__. + +.. versionadded:: 0.19.0 + .. note:: The `'private_key'` parameter can be set to either the file path of the service account key diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2412b645221ab..9ac265a20073a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -362,6 +362,8 @@ Google BigQuery Enhancements Other enhancements ^^^^^^^^^^^^^^^^^^ +- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch [the application default credentials](https://developers.google.com/identity/protocols/application-default-credentials). See the :ref:`docs ` for more details (:issue:`13577`). + - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index f4122d8d8b286..068cfee2b2aa2 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -160,7 +160,60 @@ def get_credentials(self): if self.private_key: return self.get_service_account_credentials() else: - return self.get_user_account_credentials() + # Try to retrieve Application Default Credentials + credentials = self.get_application_default_credentials() + if not credentials: + credentials = self.get_user_account_credentials() + return credentials + + def get_application_default_credentials(self): + """ + This method tries to retrieve the "default application credentials". + This could be useful for running code on Google Cloud Platform. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + None + + Returns + ------- + - GoogleCredentials, + If the default application credentials can be retrieved + from the environment. The retrieved credentials should also + have access to the project (self.project_id) on BigQuery. + - OR None, + If default application credentials can not be retrieved + from the environment. Or, the retrieved credentials do not + have access to the project (self.project_id) on BigQuery. + """ + import httplib2 + try: + from googleapiclient.discovery import build + except ImportError: + from apiclient.discovery import build + try: + from oauth2client.client import GoogleCredentials + except ImportError: + return None + + try: + credentials = GoogleCredentials.get_application_default() + except: + return None + + http = httplib2.Http() + try: + http = credentials.authorize(http) + bigquery_service = build('bigquery', 'v2', http=http) + # Check if the application has rights to the BigQuery project + jobs = bigquery_service.jobs() + job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} + jobs.insert(projectId=self.project_id, body=job_data).execute() + return credentials + except: + return None def get_user_account_credentials(self): from oauth2client.client import OAuth2WebServerFlow @@ -577,10 +630,16 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, https://developers.google.com/api-client-library/python/apis/bigquery/v2 Authentication to the Google BigQuery service is via OAuth 2.0. - By default user account credentials are used. 
You will be asked to - grant permissions for product name 'pandas GBQ'. It is also posible - to authenticate via service account credentials by using - private_key parameter. + - If "private_key" is not provided: + By default "application default credentials" are used. + + .. versionadded:: 0.19.0 + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + - If "private_key" is provided: + Service account credentials will be used to authenticate. Parameters ---------- @@ -688,10 +747,16 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, https://developers.google.com/api-client-library/python/apis/bigquery/v2 Authentication to the Google BigQuery service is via OAuth 2.0. - By default user account credentials are used. You will be asked to - grant permissions for product name 'pandas GBQ'. It is also posible - to authenticate via service account credentials by using - private_key parameter. + - If "private_key" is not provided: + By default "application default credentials" are used. + + .. versionadded:: 0.19.0 + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + - If "private_key" is provided: + Service account credentials will be used to authenticate. Parameters ---------- diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 0d8512ffb5524..4b71192c907f8 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -151,6 +151,30 @@ def test_requirements(): raise nose.SkipTest(import_exception) +def _check_if_can_get_correct_default_credentials(): + # Checks if "Application Default Credentials" can be fetched + # from the environment the tests are running in. 
+ # See Issue #13577 + test_requirements() + import httplib2 + try: + from googleapiclient.discovery import build + except ImportError: + from apiclient.discovery import build + try: + from oauth2client.client import GoogleCredentials + credentials = GoogleCredentials.get_application_default() + http = httplib2.Http() + http = credentials.authorize(http) + bigquery_service = build('bigquery', 'v2', http=http) + jobs = bigquery_service.jobs() + job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} + jobs.insert(projectId=PROJECT_ID, body=job_data).execute() + return True + except: + return False + + def clean_gbq_environment(private_key=None): dataset = gbq._Dataset(PROJECT_ID, private_key=private_key) @@ -217,6 +241,21 @@ def test_should_be_able_to_get_results_from_query(self): schema, pages = self.sut.run_query('SELECT 1') self.assertTrue(pages is not None) + def test_get_application_default_credentials_does_not_throw_error(self): + if _check_if_can_get_correct_default_credentials(): + raise nose.SkipTest("Can get default_credentials " + "from the environment!") + credentials = self.sut.get_application_default_credentials() + self.assertIsNone(credentials) + + def test_get_application_default_credentials_returns_credentials(self): + if not _check_if_can_get_correct_default_credentials(): + raise nose.SkipTest("Cannot get default_credentials " + "from the environment!") + from oauth2client.client import GoogleCredentials + credentials = self.sut.get_application_default_credentials() + self.assertTrue(isinstance(credentials, GoogleCredentials)) + class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase): def setUp(self): From 6d8044c3047fb0e90a880a194895a43a158f811a Mon Sep 17 00:00:00 2001 From: znmean Date: Fri, 19 Aug 2016 05:50:14 +0900 Subject: [PATCH 279/359] DOC: fixed using IP.prompt_manager which is removed from IPython 5.x (#14004) * fixed using IP.prompt_manager which is removed from IPython 5.x class promptManager has removed in IPython 5.x, but this file still using that class, so it makes AttributeError. --- doc/sphinxext/ipython_sphinxext/ipython_directive.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py index ad7ada8e4eea3..b8b0935cd5b96 100644 --- a/doc/sphinxext/ipython_sphinxext/ipython_directive.py +++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py @@ -802,10 +802,14 @@ def setup(self): # reset the execution count if we haven't processed this doc #NOTE: this may be borked if there are multiple seen_doc tmp files #check time stamp? 
- if not self.state.document.current_source in self.seen_docs: + if self.state.document.current_source not in self.seen_docs: self.shell.IP.history_manager.reset() self.shell.IP.execution_count = 1 - self.shell.IP.prompt_manager.width = 0 + try: + self.shell.IP.prompt_manager.width = 0 + except AttributeError: + # GH14003: class promptManager has removed after IPython 5.x + pass self.seen_docs.add(self.state.document.current_source) # and attach to shell so we don't have to pass them around From 6cc71350bf471f27c2c3f91b83c6703f482af1cb Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 18 Aug 2016 18:24:05 -0400 Subject: [PATCH 280/359] CLN: move PeriodIndex.__getitem__ to DatetimeIndexOpsMixin related to #6469 Author: sinhrks Closes #13987 from sinhrks/period_getitem and squashes the following commits: 48d187d [sinhrks] CLN: move PeriodIndex.__getitem__ to DatetimeOpsMixin --- pandas/tseries/base.py | 21 +++++++++++++++------ pandas/tseries/period.py | 19 ------------------- pandas/tseries/tests/test_period.py | 17 ++++++++++++++++- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 353823e296cf8..ad774d1b92202 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -236,16 +236,25 @@ def __getitem__(self, key): attribs = self._get_attributes_dict() - freq = None - if isinstance(key, slice): - if self.freq is not None and key.step is not None: - freq = key.step * self.freq - else: - freq = self.freq + is_period = isinstance(self, ABCPeriodIndex) + if is_period: + freq = self.freq + else: + freq = None + if isinstance(key, slice): + if self.freq is not None and key.step is not None: + freq = key.step * self.freq + else: + freq = self.freq + attribs['freq'] = freq result = getitem(key) if result.ndim > 1: + # To support MPL which performs slicing with 2 dim + # even though it only has 1 dim by definition + if is_period: + return self._simple_new(result, **attribs) return result return self._simple_new(result, **attribs) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 486cf52f188a9..36bb941e8c668 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -886,25 +886,6 @@ def _apply_meta(self, rawarr): rawarr = PeriodIndex(rawarr, freq=self.freq) return rawarr - def __getitem__(self, key): - getitem = self._data.__getitem__ - if is_scalar(key): - val = getitem(key) - return Period(ordinal=val, freq=self.freq) - else: - if com.is_bool_indexer(key): - key = np.asarray(key) - - result = getitem(key) - if result.ndim > 1: - # MPL kludge - # values = np.asarray(list(values), dtype=object) - # return values.reshape(result.shape) - - return PeriodIndex(result, name=self.name, freq=self.freq) - - return PeriodIndex(result, name=self.name, freq=self.freq) - def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 2044d44b35d0b..5e13d80a7467b 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -2098,8 +2098,23 @@ def test_getitem_ndim2(self): idx = period_range('2007-01', periods=3, freq='M') result = idx[:, None] - # MPL kludge + # MPL kludge, internally has incorrect shape tm.assertIsInstance(result, PeriodIndex) + self.assertEqual(result.shape, (len(idx), 1)) + + def test_getitem_index(self): + idx = period_range('2007-01', periods=10, freq='M', name='x') + + result = idx[[1, 3, 5]] + exp = pd.PeriodIndex(['2007-02', '2007-04', 
'2007-06'], + freq='M', name='x') + tm.assert_index_equal(result, exp) + + result = idx[[True, True, False, False, False, + True, True, False, False, False]] + exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], + freq='M', name='x') + tm.assert_index_equal(result, exp) def test_getitem_partial(self): rng = period_range('2007-01', periods=50, freq='M') From a01e58fa780ed28f983e1a022b07e196e037d42a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 18 Aug 2016 18:26:27 -0400 Subject: [PATCH 281/359] BUG: ufunc with PeriodIndex may raise IncompatibleFrequency Author: sinhrks Closes #13980 from sinhrks/period_numpy_ufunc and squashes the following commits: f94d027 [sinhrks] BUG: ufunc with PeriodIndex may raise IncompatibleFrequency --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/period.py | 17 ++++++++++++++--- pandas/tseries/tests/test_base.py | 9 ++++----- pandas/tseries/tests/test_period.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9ac265a20073a..c7f0beb439596 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1065,6 +1065,7 @@ Bug Fixes - Bug in ``concat`` and ``groupby`` for hierarchical frames with ``RangeIndex`` levels (:issue:`13542`). - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) +- Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 36bb941e8c668..9b2fa705df385 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -359,9 +359,15 @@ def __array_wrap__(self, result, context=None): if isinstance(context, tuple) and len(context) > 0: func = context[0] if (func is np.add): - return self._add_delta(context[1][1]) + try: + return self._add_delta(context[1][1]) + except IncompatibleFrequency: + raise TypeError elif (func is np.subtract): - return self._add_delta(-context[1][1]) + try: + return self._add_delta(-context[1][1]) + except IncompatibleFrequency: + raise TypeError elif isinstance(func, np.ufunc): if 'M->M' not in func.types: msg = "ufunc '{0}' not supported for the PeriodIndex" @@ -371,7 +377,7 @@ def __array_wrap__(self, result, context=None): if is_bool_dtype(result): return result - return PeriodIndex(result, freq=self.freq, name=self.name) + return self._shallow_copy(result) @property def _box_func(self): @@ -628,6 +634,11 @@ def _maybe_convert_timedelta(self, other): offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos + elif is_integer(other): + # integer is passed to .shift via + # _add_datetimelike_methods basically + # but ufunc may pass integer to _add_delta + return other # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr)) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 45a5feec7c949..0d6c991f00c8b 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1758,12 +1758,11 @@ def 
test_representation(self): idx1 = PeriodIndex([], freq='D') idx2 = PeriodIndex(['2011-01-01'], freq='D') idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + freq='D') idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex( - ['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') - + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', + 'NaT'], freq='H') idx7 = pd.period_range('2013Q1', periods=1, freq="Q") idx8 = pd.period_range('2013Q1', periods=2, freq="Q") idx9 = pd.period_range('2013Q1', periods=3, freq="Q") diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 5e13d80a7467b..fe6dcf69e0b4e 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -4140,6 +4140,7 @@ def test_pi_ops_errors(self): s = pd.Series(idx) msg = "unsupported operand type\(s\)" + for obj in [idx, s]: for ng in ["str", 1.5]: with tm.assertRaisesRegexp(TypeError, msg): @@ -4152,6 +4153,20 @@ def test_pi_ops_errors(self): with tm.assertRaisesRegexp(TypeError, msg): obj - ng + # ToDo: currently, it accepts float because PeriodIndex.values + # is internally int. Should be fixed after GH13988 + # msg is different depending on NumPy version + if not _np_version_under1p9: + for ng in ["str"]: + with tm.assertRaises(TypeError): + np.add(obj, ng) + + with tm.assertRaises(TypeError): + np.add(ng, obj) + + with tm.assertRaises(TypeError): + np.subtract(ng, obj) + def test_pi_ops_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') @@ -4159,8 +4174,22 @@ def test_pi_ops_nat(self): 'NaT', '2011-06'], freq='M', name='idx') self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + + # freq with mult + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='2M', name='idx') + expected = PeriodIndex(['2011-07', '2011-08', + 'NaT', '2011-10'], freq='2M', name='idx') + self._check(idx, lambda x: x + 3, expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) def test_pi_ops_array_int(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', From 5c955cb6154cf60954019522bf6e3cc338c6248b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 19 Aug 2016 05:02:46 -0400 Subject: [PATCH 282/359] CLN: Drop lags parameter from Panel.shift (#14041) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/panel.py | 3 +-- pandas/tests/test_panel.py | 14 -------------- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c7f0beb439596..cc3cc631b9575 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -891,6 +891,7 @@ Removal of prior version deprecations/changes - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) - ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) +- 
``Panel.shift()`` has dropped the ``lags`` parameter in favour of ``periods`` (:issue:`14041`) - ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) - ``Series.to_csv`` has dropped the ``nanRep`` parameter in favor of ``na_rep`` (:issue:`13804`) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b8cd9b90e7989..b2082ce29545e 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -35,7 +35,7 @@ from pandas.core.ops import _op_descriptions from pandas.core.series import Series from pandas.tools.util import cartesian_product -from pandas.util.decorators import (deprecate, Appender, deprecate_kwarg) +from pandas.util.decorators import (deprecate, Appender) _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', @@ -1234,7 +1234,6 @@ def count(self, axis='major'): return self._wrap_result(result, axis) - @deprecate_kwarg(old_arg_name='lags', new_arg_name='periods') def shift(self, periods=1, freq=None, axis='major'): """ Shift index by desired number of periods with an optional time freq. diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index a93f2ae5651b4..1f9ca4635b585 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -881,20 +881,6 @@ def setUp(self): self.panel.minor_axis.name = None self.panel.items.name = None - def test_panel_warnings(self): - with tm.assert_produces_warning(FutureWarning): - shifted1 = self.panel.shift(lags=1) - - with tm.assert_produces_warning(False): - shifted2 = self.panel.shift(periods=1) - - tm.assert_panel_equal(shifted1, shifted2) - - with tm.assert_produces_warning(False): - shifted3 = self.panel.shift() - - tm.assert_panel_equal(shifted1, shifted3) - def test_constructor(self): # with BlockManager wp = Panel(self.panel._data) From 453bc26e07ebbfea33dda9ff2f4efab683718506 Mon Sep 17 00:00:00 2001 From: John Zwinck Date: Fri, 19 Aug 2016 06:46:38 -0400 Subject: [PATCH 283/359] DOC: NDFrame.to_hdf(data_columns) documented (#13061). closes #13061 Author: John Zwinck Closes #13951 from jzwinck/fix-13061 and squashes the following commits: cfcca2f [John Zwinck] DOC: NDFrame.to_hdf(data_columns) documented (#13061). --- pandas/core/generic.py | 12 +++++++----- pandas/io/pytables.py | 7 ++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62c99d99dd407..b0045054a822a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1088,17 +1088,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', lines=lines) def to_hdf(self, path_or_buf, key, **kwargs): - """Activate the HDFStore. + """Write the contained data to an HDF5 file using HDFStore. Parameters ---------- path_or_buf : the path (string) or HDFStore object key : string indentifier for the group in the store - mode : optional, {'a', 'w', 'r', 'r+'}, default 'a' + mode : optional, {'a', 'w', 'r+'}, default 'a' - ``'r'`` - Read-only; no data can be modified. ``'w'`` Write; a new file is created (an existing file with the same name would be deleted). @@ -1116,6 +1114,11 @@ def to_hdf(self, path_or_buf, key, **kwargs): / selecting subsets of the data append : boolean, default False For Table formats, append the input data to the existing + data_columns : list of columns to create as data columns, or True to + use all columns. See + `here `__ # noqa + + Applicable only to format='table'. 
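The ``data_columns`` behaviour documented above can be sketched roughly as follows; this is an illustration only (the file name ``store.h5`` and the frame are hypothetical, not part of the patch), and it assumes PyTables is available:

    import pandas as pd

    df = pd.DataFrame({'A': range(5), 'B': list('abcde')})

    # data_columns only takes effect with format='table'; the listed columns
    # become queryable on disk
    df.to_hdf('store.h5', 'df', format='table', data_columns=['B'])
    subset = pd.read_hdf('store.h5', 'df', where="B == 'c'")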
complevel : int, 1-9, default 0 If a complib is specified compression will be applied where possible @@ -1126,7 +1129,6 @@ def to_hdf(self, path_or_buf, key, **kwargs): If applying compression use the fletcher32 checksum dropna : boolean, default False. If true, ALL nan rows will not be written to store. - """ from pandas.io import pytables diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index aa38958f6c92e..9b3cbb635b454 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -859,6 +859,9 @@ def put(self, key, value, format=None, append=False, **kwargs): append : boolean, default False This will force Table format, append the input data to the existing. + data_columns : list of columns to create as data columns, or True to + use all columns. See + `here `__ # noqa encoding : default None, provide an encoding for strings dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' @@ -936,7 +939,8 @@ def append(self, key, value, format=None, append=True, columns=None, append : boolean, default True, append the input data to the existing data_columns : list of columns to create as data columns, or True to - use all columns + use all columns. See + `here `__ # noqa min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan represenation chunksize : size to chunk the writing @@ -944,6 +948,7 @@ def append(self, key, value, format=None, append=True, columns=None, encoding : default None, provide an encoding for strings dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' + Notes ----- Does *not* check if data being appended overlaps with existing From 49af018a98be0da108ced3054c0a02ff1a4f5fc7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 19 Aug 2016 22:41:37 +0200 Subject: [PATCH 284/359] DOC: add data_columns to doc string of `df.to_hdf()` and `HDFStore.append()` (#14046) --- pandas/core/generic.py | 12 +++++++----- pandas/io/pytables.py | 8 +++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b0045054a822a..8e295174771c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1105,18 +1105,20 @@ def to_hdf(self, path_or_buf, key, **kwargs): and if the file does not exist it is created. ``'r+'`` It is similar to ``'a'``, but the file must already exist. - format : 'fixed(f)|table(t)', default is 'fixed' + format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format Fast writing/reading. Not-appendable, nor searchable table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data - append : boolean, default False + append : boolean, default False For Table formats, append the input data to the existing - data_columns : list of columns to create as data columns, or True to - use all columns. See - `here `__ # noqa + data_columns : list of columns, or True, default None + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See `here + `__. Applicable only to format='table'. 
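A similar hedged sketch for the ``HDFStore`` methods whose docstrings are updated here (again, the path and data are made up for illustration):

    import pandas as pd

    df = pd.DataFrame({'A': range(5), 'B': list('abcde')})

    with pd.HDFStore('store.h5') as store:
        # columns named in data_columns can be used in 'where' selections
        store.append('df', df, data_columns=['B'])
        hits = store.select('df', where="B == 'c'")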
complevel : int, 1-9, default 0 diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9b3cbb635b454..5229936bd8a04 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -938,9 +938,11 @@ def append(self, key, value, format=None, append=True, columns=None, / selecting subsets of the data append : boolean, default True, append the input data to the existing - data_columns : list of columns to create as data columns, or True to - use all columns. See - `here `__ # noqa + data_columns : list of columns, or True, default None + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See `here + `__. min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan represenation chunksize : size to chunk the writing From 53447f4148998ff7d47639eb95432237fd0d656e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 19 Aug 2016 17:36:29 -0400 Subject: [PATCH 285/359] CLN: Removed DataFrame.to_wide (#14039) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cc3cc631b9575..f6a03a613f75f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -894,6 +894,7 @@ Removal of prior version deprecations/changes - ``Panel.shift()`` has dropped the ``lags`` parameter in favour of ``periods`` (:issue:`14041`) - ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) +- ``pd.DataFrame`` has dropped the ``to_wide`` method in favour of ``to_panel`` (:issue:`14039`) - ``Series.to_csv`` has dropped the ``nanRep`` parameter in favor of ``na_rep`` (:issue:`13804`) - ``Series.xs``, ``DataFrame.xs``, ``Panel.xs``, ``Panel.major_xs``, and ``Panel.minor_xs`` have dropped the ``copy`` parameter (:issue:`13781`) - ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5db755b0d3dac..fa46ee4829cb9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -78,8 +78,7 @@ OrderedDict, raise_with_traceback) from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import (deprecate, Appender, Substitution, - deprecate_kwarg) +from pandas.util.decorators import deprecate_kwarg, Appender, Substitution from pandas.tseries.period import PeriodIndex from pandas.tseries.index import DatetimeIndex @@ -1300,8 +1299,6 @@ def to_panel(self): return self._constructor_expanddim(new_mgr) - to_wide = deprecate('to_wide', to_panel) - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, From 5c78ee6b457439c9521aa3d17113b519740a9cdc Mon Sep 17 00:00:00 2001 From: OXPHOS Date: Sat, 20 Aug 2016 09:20:53 -0400 Subject: [PATCH 286/359] DOC: update docs for read_csv().na_values and keep_default_na closes #13967 Author: OXPHOS Closes #14030 from OXPHOS/excel_na and squashes the following commits: ff1964b [OXPHOS] GH13967: move around _NA_VALUES and add doc for read_csv().na_values --- pandas/io/common.py | 8 ++ pandas/io/excel.py | 185 +++++++++++++++++--------------- pandas/io/parsers.py | 9 +- pandas/io/tests/data/test5.xls | Bin 0 -> 20480 bytes pandas/io/tests/data/test5.xlsm | Bin 0 -> 8017 
bytes pandas/io/tests/data/test5.xlsx | Bin 0 -> 8002 bytes pandas/io/tests/test_excel.py | 15 +++ 7 files changed, 120 insertions(+), 97 deletions(-) create mode 100644 pandas/io/tests/data/test5.xls create mode 100644 pandas/io/tests/data/test5.xlsm create mode 100644 pandas/io/tests/data/test5.xlsx diff --git a/pandas/io/common.py b/pandas/io/common.py index b7ac183b7ab41..127ebc4839fd3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -14,6 +14,14 @@ from pandas.core.common import AbstractMethodError from pandas.types.common import is_number +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', +_NA_VALUES = set([ + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', + 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' +]) + try: import pathlib _PATHLIB_INSTALLED = True diff --git a/pandas/io/excel.py b/pandas/io/excel.py index b415661c99438..c713cafc0e110 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -16,7 +16,8 @@ from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, - EmptyDataError, get_filepath_or_buffer) + EmptyDataError, get_filepath_or_buffer, + _NA_VALUES) from pandas.tseries.period import Period from pandas import json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, @@ -27,12 +28,105 @@ import pandas.compat.openpyxl_compat as openpyxl_compat from warnings import warn from distutils.version import LooseVersion +from pandas.util.decorators import Appender __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] _writer_extensions = ["xlsx", "xls", "xlsm"] _writers = {} +_read_excel_doc = """ +Read an Excel table into a pandas DataFrame + +Parameters +---------- +io : string, path object (pathlib.Path or py._path.local.LocalPath), + file-like object, pandas ExcelFile, or xlrd workbook. + The string could be a URL. Valid URL schemes include http, ftp, s3, + and file. For file URLs, a host is expected. For instance, a local + file could be file://localhost/path/to/workbook.xlsx +sheetname : string, int, mixed list of strings/ints, or None, default 0 + + Strings are used for sheet names, Integers are used in zero-indexed + sheet positions. + + Lists of strings/integers are used to request multiple sheets. + + Specify None to get all sheets. + + str|int -> DataFrame is returned. + list|None -> Dict of DataFrames is returned, with keys representing + sheets. + + Available Cases + + * Defaults to 0 -> 1st sheet as a DataFrame + * 1 -> 2nd sheet as a DataFrame + * "Sheet1" -> 1st sheet as a DataFrame + * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames + * None -> All sheets as a dictionary of DataFrames + +header : int, list of ints, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex`` +skiprows : list-like + Rows to skip at the beginning (0-indexed) +skip_footer : int, default 0 + Rows at the end to skip (0-indexed) +index_col : int, list of ints, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex`` +names : array-like, default None + List of column names to use. 
If file contains no header row, + then you should explicitly pass header=None +converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. +parse_cols : int or list, default None + * If None then parse all columns, + * If int then indicates last column to be parsed + * If list of ints then indicates list of column numbers to be parsed + * If string then indicates comma separated list of column names and + column ranges (e.g. "A:E" or "A,C,E:F") +squeeze : boolean, default False + If the parsed data only contains one column then return a Series +na_values : str or list-like or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '""" + "', '".join(sorted(_NA_VALUES)) + """'. +thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. +keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to. +verbose : boolean, default False + Indicate number of NA values placed in non-numeric columns +engine: string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or xlrd +convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally +has_index_names : boolean, default None + DEPRECATED: for version 0.17+ index names will be automatically + inferred based on index_col. To read Excel output from 0.16.2 and + prior that had saved index names, use True. + +Returns +------- +parsed : DataFrame or Dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheetname + argument for more information on when a Dict of Dataframes is returned. +""" + def register_writer(klass): """Adds engine to the excel writer registry. You must use this method to @@ -74,100 +168,13 @@ def get_writer(engine_name): raise ValueError("No Excel writer '%s'" % engine_name) +@Appender(_read_excel_doc) def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, has_index_names=None, converters=None, engine=None, squeeze=False, **kwds): - """ - Read an Excel table into a pandas DataFrame - - Parameters - ---------- - io : string, path object (pathlib.Path or py._path.local.LocalPath), - file-like object, pandas ExcelFile, or xlrd workbook. - The string could be a URL. Valid URL schemes include http, ftp, s3, - and file. For file URLs, a host is expected. For instance, a local - file could be file://localhost/path/to/workbook.xlsx - sheetname : string, int, mixed list of strings/ints, or None, default 0 - - Strings are used for sheet names, Integers are used in zero-indexed - sheet positions. - - Lists of strings/integers are used to request multiple sheets. - - Specify None to get all sheets. - - str|int -> DataFrame is returned. 
- list|None -> Dict of DataFrames is returned, with keys representing - sheets. - - Available Cases - - * Defaults to 0 -> 1st sheet as a DataFrame - * 1 -> 2nd sheet as a DataFrame - * "Sheet1" -> 1st sheet as a DataFrame - * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames - * None -> All sheets as a dictionary of DataFrames - - header : int, list of ints, default 0 - Row (0-indexed) to use for the column labels of the parsed - DataFrame. If a list of integers is passed those row positions will - be combined into a ``MultiIndex`` - skiprows : list-like - Rows to skip at the beginning (0-indexed) - skip_footer : int, default 0 - Rows at the end to skip (0-indexed) - index_col : int, list of ints, default None - Column (0-indexed) to use as the row labels of the DataFrame. - Pass None if there is no such column. If a list is passed, - those columns will be combined into a ``MultiIndex`` - names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None - converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. - parse_cols : int or list, default None - * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed - * If string then indicates comma separated list of column names and - column ranges (e.g. "A:E" or "A,C,E:F") - squeeze : boolean, default False - If the parsed data only contains one column then return a Series - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - thousands : str, default None - Thousands separator for parsing string columns to numeric. Note that - this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format. - keep_default_na : bool, default True - If na_values are specified and keep_default_na is False the default NaN - values are overridden, otherwise they're appended to - verbose : boolean, default False - Indicate number of NA values placed in non-numeric columns - engine: string, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd - convert_float : boolean, default True - convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally - has_index_names : boolean, default None - DEPRECATED: for version 0.17+ index names will be automatically - inferred based on index_col. To read Excel output from 0.16.2 and - prior that had saved index names, use True. - Returns - ------- - parsed : DataFrame or Dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheetname - argument for more information on when a Dict of Dataframes is returned. 
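The ``na_values`` / ``keep_default_na`` behaviour this patch documents can be sketched as below, mirroring the new test against the bundled ``test5`` workbook (the file name and cell values are assumptions based on that test, not a verbatim excerpt):

    import pandas as pd

    # only the explicitly listed strings are treated as NaN, so values such as
    # '1.#QNAN' and 'nan' survive as text
    no_defaults = pd.read_excel('test5.xlsx', 'Sheet1',
                                keep_default_na=False, na_values=['apple'])

    # the extra na_values are appended to the default NA strings
    with_defaults = pd.read_excel('test5.xlsx', 'Sheet1',
                                  keep_default_na=True, na_values=['apple'])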
- """ if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5372203318d69..e74ad78ed5940 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -25,7 +25,7 @@ from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, _get_handle, UnicodeReader, UTF8Recoder, BaseIterator, CParserError, EmptyDataError, - ParserWarning) + ParserWarning, _NA_VALUES) from pandas.tseries import tools from pandas.util.decorators import Appender @@ -33,13 +33,6 @@ import pandas.lib as lib import pandas.parser as _parser -# common NA values -# no longer excluding inf representations -# '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = set([ - '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' -]) # BOM character (byte order mark) # This exists at the beginning of a file to indicate endianness diff --git a/pandas/io/tests/data/test5.xls b/pandas/io/tests/data/test5.xls new file mode 100644 index 0000000000000000000000000000000000000000..4bb7cd4767dd7f0632d971f43e1e87147dc42c29 GIT binary patch literal 20480 zcmeHP2V4``*1rh>0;qr@q9}%r0+A-Lq9RgkAgI_2QbH2}5fFFf!78lS3$6;HVnGEJ zcI~~c?U_|9pom!4wya&BYt47g3<;AViQ#)++wZrN-<`~yIro3gJ@?#m%iN^$qT!u& zKX^}4+&g=7(!|dsSP9}NNph*Luv=f1X6oQ9Uz%P>IkV5 zBr`~zAqgOvL$ZK`K7`4Jas3xE<6l+&weg> zF9h!|ajf7rn`R>;NfJpRlSl-7C&71fkJg0CLBEyipiX`iZQY0F)Fi`58kr2AaL{8q z89)+AG^G<#Qb` zXsDOddyvMFWQ+wNK7$2*{)77o{wI#U zln&t>)S(xgIiv_Eh7eDX0<{k3Hq^O-;yFZf!jocnLZXO=Cw@b|QQSTh-va7vL7iK{ znM3wcd2Ub?XNXESfuBznV|8-L1#ONmct8SpKm>S39QnV)(Uru4j_y)TNlTWd0ul)N z#*kR>Ce+#iJk*^WLX$yKR4(3EIc|czRAZp^3}regl7bwqzRJ<_s~o+*!oeZlDocxH zar+7f0geVwP-rp4(D^Ec?pH9l&`*9%3jeE8bd+M4@k}5-L*Hy+Wnt-HY3bxO&QW@4 z#_I$Y;~MF5{Jn?+frif%1PZWZ(UeAU z-rkJ@-H1Ku#tV$%{WJ@z-ByaQbGhQ9?Dv5 zMjNt;n9DPQU=|)w%y{->Gl3x!Q!H{SK&9Ay$rjQZN+)Q@AhoK%i$QA8H%yL*Rz^af zFo79QNJ^nz0EU^rtALr2t`}L3L8J`F0t_jXO+YDmF%1w+=pY`fErW9>TOqPFDj?;T z>Ci}EG zoo5{6xkHVq33)34zmXls1W3n`Iz^Xu<>YTVZKD629WXBk+o$y4vT$}aJm8he+h0bn zP(q)hguYb?T^avcCH&iz(3R=4TnT@n61p;cS|6P8QGO`1!%7OLDV^4*DV^4*DV^4* zDg9?9_{#KA#=lz$&R!+-ol59Ol+bB?aONy;pY0Ss&Y&s!GDE`^dMoq@kG2EOsA+n1 z^wG{oI};wQk0!)CivA}sP>0k%OO?>el+d>*q~i>p!e_-3oc&XDS^bt#`fE$+S4jxW z=U^Z~>w~KX6hB4B#Q>U)D+vvB0bIgj0*lU)iz^Kjzsx?g|3LJX;!K1~SX`iSaCJgn zPnJIy85$bv8H1hac89ANQheIZ(srRk(^+1iaTUxn!c`B`QMTos}48T>Q& zMq~yWNOtrlmD=lsnS4i*DMv>JF08f~Q8P;p-Zpe3tswGqn`S_kW@VtK1I>_tQ`}}r zzNS+N2OiZ>MA7JS(t%YbpACYcRSghyL**dYuGIiR*HjLI!B!0rbXVmd z7VT-O9bB|lLlf1tgR9e|7Te>Ff<*ABiqPyVc@P9Se0p=$A9f8z8=l`>IXow>!!~tkeThT|3t5fvBz>8}&d`*N&}vAgXJ} zP6mSGr6QswksF6$+rbBtGZ!R{hF!wyyt)8~Hr06{?}tpH806KfS27Uo2H>o$EQxd_ zfzO}`8V(%>4Y0EzndmZTd$tu5sIUo{e6~_uft6MY2X@rK9AQdJR#H;ZL@94??INW< zj?knyP2IUWWN2(--rnhojVKWpG;!zux;DnH6nB{^6gGTmgO+4-WJ)!44-YwzB9RjJ zkkl73K-jI_Q)V8tWs~`&)W;+eu?$2Sf_CiK(WJE_josRj2H4qI2OhrmOtF}DY%vNv%~FLJrl*Mm|2)54u^1D!7zLhYt3nLZ)5L+b zwY7@Hv}cP^;A!?M#4tTg94L8Mpjb=?wipGT)=h;Nrl*Mmcb$HuSd1xKi~>({RUwAy zY2v`scXlZj(-FnEtM)Vjh;diL(_CCvop?x#f!2gOYGg636I+Y|PqR`XhUsZ8u5W(I zQ7p!cEk=Q-*{Kl2^fVXObLY+}7Sow6MuDd}s1U>SG#6J%O|)V$0=5_hp5~-N4AawG zTo;x$v>SHan6t$w@H96SVwj%h;#yj{P*G`G5DOTt5Lv(J!|FFXNwk&rm9ToYs!B1a zWg{qf6dTl>1hZ_Ao5# z8y>Zk|BZajb4(=~q%Ta|qGUa=CXT*fQ2_0T*SK=f5J?Q+#XhXz`-Ue(rX_@@#w8^# z<$o^^$TgPe5qkR>ZVgNT=*F#x+SEt_L2+O#6@aS>lmIQ*V}%cKR>J`^Fz@(scRF%+ z`CNV2r2}sxd}BqClLWq^goLI1ES6sFB)XtiEDQ!CVY8+Hu0^0-0u;MZD~0YO)e6V$ zS~9KZJE>Omom4COE}zRI_SqVerF?2g(K$^nOW6;HX~mxSS3W9_a^0OJcbyt}RHhK$sM(6^!>`eXPGIJSr|R zMgVgKmZBO4VA=kl5c+-`C>jF_(sKla(}o7)7aC8)p(|>GW^^FT=eB}L1_K;o9~<0I 
[... GIT binary patch data truncated ...]
zp~2T9&)RQ)dm+uBz~JLv_~j<~dLsM9?Iz5;i!@H3vUO?DkN2J)1iQl3;k%ZecuL55@)lid zx76g|q|9frmNh|K^(GS!A_YTKC?t5=!KGuxP`-5=&_J+{dl(lp$8zH*gV3F~bvOXD z+|an*<$jU$%QNG9$jQJ>s!x-4ap^8a2VPG4zbKg!*HD_nh^d=qwx+cac zTD}2ASvHn@R^J*RNaxxK-Lij}nIC1ej*-yPiw)NNfz2qV*q65UojfK?82dwU7 z_iM;^rw*t{5??oEtLU(J`mDVkZW!h*z$1^4@g{ex4w6=v@9u4Djd4lx8m5KpGb)$$cTc_-g@ zNqqNOQxNP+AZ{EeZ=0Gf+HdaEuF}OI<%2MBK5!T2-2-h;d}Mh&*K8)1z%oyU_UMe6 zF@DgM1d0(hQ>2Nhm|jL_iA~~c<`5*?@h+Qdu1u@C3%zhn-~VPT)>%>hw&+82IY--j zplI3jQJu01_PA3){$}=+c0OUjNbl2Q0uZH$p5gi4N}va`Md{PMnsh&K5Nn+AQDtVTw3S%)0l=J0mh1q;l_y@nkg-nz zQJ;QEm#L&saZflk9K9GDTAc4Sp&;_5q~S3*td}*rzVuacQD~8sF7KNVDCJb>T32{e zyEPdMl4KpEp7%uC8O;7>c8X;cijJP%Go;}*qzMFBZw(4qKS$OO5_}jmIGTA&wMaQW zsLcqPIM%fnT27qAHT4+w%z97ko>EDk%v|i72d`>s(Qu%c+M@}T$7xyPf~cFZEnVD_ zE2rD+mr!H_m&@!N*jl~gwz(dc|2=cHhPrk6TaXrFn>@8hn{6DSTRrTIu&&=qR8DhW zqXUuC1eyyqi*B^tU}+fO(l9&w$fQ*Ftta1e(QWUnr6=3>dJb&TOnN-FBIl)NMH3$uK#y*g8FG7?FK`Q+ zD8h5*A>*k^d28baN2O!lFSj4x!h9WNCgm@z{QT?^Rkr@!w-F@00wYlsoEBwMUHh|& zyL#E0|196RajFrsJiy@Hpzp-@jw4}}#tm&>Y4H7nYCwWPCg%FiotqsBE8{z5yS?tI zff7~25fc1V5>(CnZ0R^Qy^6tw&Foe+(^~Xtdy(#Tihk5zk24@)?~FSlJt}f@;D_}M zXBsjTkvW%GwFUYF<8eayOtn0;jRo%HEsH1L z_f1g$5=Qr$E#3ndw%3d?jz6*yh)6kTos;CuTTF$LA%#So0%G75MZo!!yMTnY_WaNl z-9gHX`VqHbR#m6s55ge_pqHIS`z77QdAF3^I9fwfs(5XnUTgpisQ>0OLi=bU{VyF; zOGqNHh-rRPt68mowEhLs8Y6|MBzz_W8)rScM?{BAj_uvO0Q=|-;+77FZ8VLwB1-{> z<`ryJL1|r#GKO5?JkEb=m=`S4hbp%2-x-Mcx6C~7(qqh1>FMi97K))IpJybt`gyIIpAKduS)4N!j zo4dMj|D1kyK5!jlhZG)=Z${PS!&8a27xB5F8hNjapw`vah?&PdM$gIMueN=3#xBmd z}sl+xrys5~lNnbxZ zF|%hEy&;Rh#?NEfgj`AeT985Kw z9UNV_O&y%g|8&#;vm#I`^@%lBZ|9*1+EqO#R^FALrBg_@laXz@F75iXvvH>7-SS5+ zyS0^tSvwv&D(+dA`GWN(EAjb6*>Y@Cmp3FbPcX4E0~6~Fm_Li81T%o{_av+a>AuLk zd0#+nXb7gsn{{m^2q2(f7a~`j7aBI4wu?up90@G z(;Cp5srp*MU3kie{Mrp6UZ1Do3l_)fsuMhVN2QpV0*TM@Fy^e{)J{Z`+Hx-TBJfS# z-F;*Xk0w0gBd|vd?E1z8G|lH76o))?K6AXWvXJ~(2s(pv7h96rrkFZ@^5O$yO11#K zD==0{A%rq=s_R+I(v$CpeLgp9njyz%7c4~5eE1s>h9nMpxSV_^#w|PNjHfRS!M~%V zDL?Eq4n;{C3;+Q5D;q_K6Aw08>a4$&A4b2;>d*F>1_-Wm8_%c{l}Zq~YEKq_&XTq)1< z@ofR~nY+Lgn@G~@&U)zsD$Weo(p7P&7zStQ7_PCja^Y%TCJiw7$gWLpv}hd~9qc>M zd!a_qL^jP2BzhS1na;p56@W7x4|f8G=3K0;5Zn$`QDSb>q3)=6rSxAkMNG;>KM? zoVNg2@yeN$t+u%ZOORZeYXyx<8(7%l#JOSYJe$xza4@Y4EZCSk+P9DKdp|ZgXsu*I zWO#|9>mN-Z8agM+w)y>OWYyQpIrH1l91N`%3Kzsl`Z_&+zp gU*Rj1zrg?88a0%$P}v0l@KLX4sES@f{o~XB0g6JUX#fBK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 34e47ebcfcf5a..449c27482e0a5 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -244,6 +244,21 @@ def test_excel_passes_na(self): columns=['Test']) tm.assert_frame_equal(parsed, expected) + # 13967 + excel = self.get_excelfile('test5') + + parsed = read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + def test_excel_table_sheet_by_index(self): excel = self.get_excelfile('test1') From 51b20de44fdcb998f25392558e029621e10c451c Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 20 Aug 2016 09:31:43 -0400 Subject: [PATCH 287/359] API: SparseSeries comparison now returns sparse Because #13985 has been fixed, ``SparseSeries`` comparison op now can return ``SparseSeries`` (it returns normal ``Series`` on current master). 
also fixed a bug when ``SparseArray`` created from ``SparseSeries`` may not inherit ``dtype``. Author: sinhrks Closes #13999 from sinhrks/sparse_comparison and squashes the following commits: eafc94c [sinhrks] API: SparseSeries comparison now returns sparse --- doc/source/whatsnew/v0.19.0.txt | 3 +++ pandas/sparse/array.py | 11 ++++++++-- pandas/sparse/series.py | 2 +- pandas/sparse/tests/test_arithmetics.py | 5 ----- pandas/sparse/tests/test_array.py | 28 ++++++++++++++++++++++++- 5 files changed, 40 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f6a03a613f75f..7b261f6249e04 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -814,6 +814,9 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) - Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`) +- Bug in ``SparseArray`` created from ``SparseSeries`` may lose ``dtype`` (:issue:`13999`) +- Bug in ``SparseSeries`` comparison with dense returns normal ``Series`` rather than ``SparseSeries`` (:issue:`13999`) + .. _whatsnew_0190.indexer_dtype: diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 8d564d0abbf3f..74d592c32d3aa 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -188,9 +188,17 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', values.fill(data) data = values + if isinstance(data, ABCSparseSeries): + data = data.values + is_sparse_array = isinstance(data, SparseArray) + if dtype is not None: dtype = np.dtype(dtype) - is_sparse_array = isinstance(data, SparseArray) + if is_sparse_array: + # temp, always inherit passed SparseArray dtype + # can be removed after GH 13849 + dtype = data.dtype + if fill_value is None: if is_sparse_array: fill_value = data.fill_value @@ -211,7 +219,6 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', raise AssertionError("Non array-like type {0} must have" " the same length as the" " index".format(type(values))) - # Create array, do *not* copy data by default if copy: try: diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 4ad77b4deab4f..94834ac22166b 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -806,7 +806,7 @@ def from_coo(cls, A, dense_index=False): # overwrite basic arithmetic to use SparseSeries version # force methods to overwrite previous definitions. 
ops.add_special_arithmetic_methods(SparseSeries, _arith_method, - comp_method=None, + comp_method=_arith_method, bool_method=None, use_numexpr=False, force=True) diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index ec8bc4d8634e6..ab84308e02e7c 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -358,11 +358,6 @@ class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): def _assert(self, a, b): tm.assert_series_equal(a, b) - def _check_bool_result(self, res): - # ToDo: Must return SparseSeries after GH 667 - tm.assertIsInstance(res, self._base) - self.assertEqual(res.dtype, np.bool) - def test_alignment(self): da = pd.Series(np.arange(4)) db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 70cda5acc3f4c..e702b7ed5e349 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -7,7 +7,7 @@ import numpy as np from pandas import _np_version_under1p8 -from pandas.sparse.api import SparseArray +from pandas.sparse.api import SparseArray, SparseSeries from pandas._sparse import IntIndex from pandas.util.testing import assert_almost_equal, assertRaisesRegexp import pandas.util.testing as tm @@ -102,6 +102,32 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) + def test_sparseseries_roundtrip(self): + # GH 13999 + for kind in ['integer', 'block']: + for fill in [1, np.nan, 0]: + arr = SparseArray([np.nan, 1, np.nan, 2, 3], kind=kind, + fill_value=fill) + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + + arr = SparseArray([0, 0, 0, 1, 1, 2], dtype=np.int64, + kind=kind, fill_value=fill) + res = SparseArray(SparseSeries(arr), dtype=np.int64) + tm.assert_sp_array_equal(arr, res) + + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + + for fill in [True, False, np.nan]: + arr = SparseArray([True, False, True, True], dtype=np.bool, + kind=kind, fill_value=fill) + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + def test_get_item(self): self.assertTrue(np.isnan(self.arr[1])) From ce61b3f1c85c1541cfbe1b3bb594431b38689946 Mon Sep 17 00:00:00 2001 From: Robert Kern Date: Sun, 21 Aug 2016 09:27:41 -0400 Subject: [PATCH 288/359] ENH: Fine-grained errstate handling closes #13109 closes #13135 The precise strategy to be taken here is open for discussion. I tried to be reasonably fine-grained rather than slap a generic decorator over everything because it's easier to go that direction than the reverse. The `errstate()` blocks in the tests were added *after* fixing all of the library code. Unfortunately, these are less fine-grained than I would like because some of the tests have many lines of the form `assert_array_equal(pandas_expression_to_test, expected_raw_numpy_expression)` where `expected_raw_numpy_expression` is what is triggering the warning. It was tedious to try to rewrite all of that to wrap just `expected_raw_numpy_expression`. I think I got everything exercised by the test suite except for parts of the test suite that are skipped on my machine due to dependencies. We'll see how things go in the CI. I haven't added any new tests yet. Could do if requested. 
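The pattern applied throughout the patch is numpy's own ``errstate`` context manager; a minimal sketch of the idea, not taken verbatim from the diff:

    import numpy as np

    values = np.array([1.0, np.nan, 3.0])

    # wrap only the operation that is expected to warn, so legitimate warnings
    # from surrounding user code are left untouched
    with np.errstate(invalid='ignore'):
        mask = values > 2.0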
Author: Robert Kern Author: Robert Kern Closes #13145 from rkern/fix/errstate and squashes the following commits: ef9c001 [Robert Kern] BUG: whoops, wrong function. 7fd2e86 [Robert Kern] ENH: More whatsnew documentation. 44805db [Robert Kern] ENH: Rearrange expression to avoid generating a warning that would need to be silenced. 1fe1bc2 [Robert Kern] pep8 bf1f662 [Robert Kern] BUG: New fixes after master rebase. e7adc03 [Robert Kern] BUG: wrong function. a59cfa7 [Robert Kern] ENH: Avoiding the bounds error is better than silencing the warning. 0e1ea81 [Robert Kern] BUG: A few more stragglers. 863ac93 [Robert Kern] TST: Add a new test to ensure that boolean comparisons are errstate-protected. 6932851 [Robert Kern] TST: Basic check that the global errstate remains unchanged. c9df7b3 [Robert Kern] BUG: removed debugging print 3b12f08 [Robert Kern] ENH: Silence numpy warnings from certain expressions computed during tests. eca512c [Robert Kern] BUG: Handle NaT explicitly. 6fbc9ce [Robert Kern] BUG: First pass at fine-grained errstate. --- doc/source/whatsnew/v0.19.0.txt | 13 +++ pandas/compat/numpy/__init__.py | 2 - pandas/computation/align.py | 2 +- pandas/computation/expressions.py | 3 +- pandas/computation/ops.py | 3 +- pandas/computation/tests/test_eval.py | 6 +- pandas/core/frame.py | 9 +- pandas/core/groupby.py | 12 ++- pandas/core/internals.py | 6 +- pandas/core/nanops.py | 24 +++-- pandas/core/ops.py | 18 ++-- pandas/core/panel.py | 29 +++--- pandas/core/series.py | 27 +++-- pandas/core/window.py | 23 +++-- pandas/formats/format.py | 23 +++-- pandas/indexes/base.py | 11 +- pandas/indexes/range.py | 12 ++- pandas/sparse/array.py | 28 +++-- pandas/sparse/series.py | 6 +- pandas/sparse/tests/test_arithmetics.py | 132 +++++++++++++----------- pandas/sparse/tests/test_array.py | 20 ++-- pandas/tests/formats/test_format.py | 2 +- pandas/tests/frame/test_apply.py | 30 +++--- pandas/tests/frame/test_misc_api.py | 3 +- pandas/tests/frame/test_operators.py | 26 ++++- pandas/tests/indexes/common.py | 11 +- pandas/tests/series/test_analytics.py | 55 +++++----- pandas/tests/series/test_apply.py | 23 +++-- pandas/tests/series/test_operators.py | 119 ++++++++++----------- pandas/tests/test_groupby.py | 4 +- pandas/tests/test_nanops.py | 14 +-- pandas/tests/test_panel.py | 18 ++-- pandas/tests/test_panel4d.py | 13 +-- pandas/tests/test_util.py | 10 ++ pandas/tslib.pyx | 26 +++-- 35 files changed, 449 insertions(+), 314 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 7b261f6249e04..c2afb6619cb5c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -7,6 +7,10 @@ This is a major release from 0.18.1 and includes a small number of API changes, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. +.. warning:: + + pandas >= 0.19.0 will no longer silence numpy ufunc warnings upon import, see :ref:`here `. (:issue:`13109`, :issue:`13145`) + Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` @@ -357,6 +361,15 @@ Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). +.. 
_whatsnew_0190.errstate: + +Fine-grained numpy errstate +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported (:issue:`13109`). Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as NaNs. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner only around where these operations are actually used in the pandas codebase. + +After upgrading pandas, you may see "new" ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. + .. _whatsnew_0190.enhancements.other: Other enhancements diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 8ecc5dc979792..f2d837a4c9908 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -5,8 +5,6 @@ from distutils.version import LooseVersion from pandas.compat import string_types, string_and_binary_types -# turn off all numpy warnings -np.seterr(all='ignore') # numpy versioning _np_version = np.version.short_version diff --git a/pandas/computation/align.py b/pandas/computation/align.py index a117342fdefe2..4e12d58a4ab85 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -95,7 +95,7 @@ def _align_core(terms): term_axis_size = len(ti.axes[axis]) reindexer_size = len(reindexer) - ordm = np.log10(abs(reindexer_size - term_axis_size)) + ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) if ordm >= 1 and reindexer_size >= 10000: warnings.warn('Alignment difference on axis {0} is larger ' 'than an order of magnitude on term {1!r}, ' diff --git a/pandas/computation/expressions.py b/pandas/computation/expressions.py index 086e92dbde1a0..8fd9ab3477b74 100644 --- a/pandas/computation/expressions.py +++ b/pandas/computation/expressions.py @@ -59,7 +59,8 @@ def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): """ standard evaluation """ if _TEST_MODE: _store_test_result(False) - return op(a, b) + with np.errstate(all='ignore'): + return op(a, b) def _can_use_numexpr(op, op_str, a, b, dtype_check): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 96a04cff9372e..9446e84d891c4 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -523,7 +523,8 @@ def __init__(self, func, args): def __call__(self, env): operands = [op(env) for op in self.operands] - return self.func.func(*operands) + with np.errstate(all='ignore'): + return self.func.func(*operands) def __unicode__(self): operands = map(str, self.operands) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 066df0521fef6..c50944f0a4d3b 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1613,7 +1613,8 @@ def test_unary_functions(self): for fn in self.unary_fns: expr = "{0}(a)".format(fn) got = self.eval(expr) - expect = getattr(np, fn)(a) + with np.errstate(all='ignore'): + expect = getattr(np, fn)(a) tm.assert_series_equal(got, expect, check_names=False) def 
test_binary_functions(self): @@ -1624,7 +1625,8 @@ def test_binary_functions(self): for fn in self.binary_fns: expr = "{0}(a, b)".format(fn) got = self.eval(expr) - expect = getattr(np, fn)(a, b) + with np.errstate(all='ignore'): + expect = getattr(np, fn)(a, b) tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fa46ee4829cb9..501f4e443b1fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3810,7 +3810,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, this = self[col].values that = other[col].values if filter_func is not None: - mask = ~filter_func(this) | isnull(that) + with np.errstate(all='ignore'): + mask = ~filter_func(this) | isnull(that) else: if raise_conflict: mask_this = notnull(that) @@ -4105,7 +4106,8 @@ def f(x): return self._apply_empty_result(func, axis, reduce, *args, **kwds) if isinstance(f, np.ufunc): - results = f(self.values) + with np.errstate(all='ignore'): + results = f(self.values) return self._constructor(data=results, index=self.index, columns=self.columns, copy=False) else: @@ -4931,7 +4933,8 @@ def f(x): "type %s not implemented." % filter_type) raise_with_traceback(e) - result = f(data.values) + with np.errstate(all='ignore'): + result = f(data.values) labels = data._get_agg_axis(axis) else: if numeric_only: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5c3c5bbfab9a8..9436257b88941 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -678,7 +678,8 @@ def apply(self, func, *args, **kwargs): @wraps(func) def f(g): - return func(g, *args, **kwargs) + with np.errstate(all='ignore'): + return func(g, *args, **kwargs) else: raise ValueError('func must be a callable if args or ' 'kwargs are supplied') @@ -4126,7 +4127,10 @@ def loop(labels, shape): out = stride * labels[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): - stride //= shape[i] + if shape[i] == 0: + stride = 0 + else: + stride //= shape[i] out += labels[i] * stride if xnull: # exclude nulls @@ -4365,7 +4369,9 @@ def _get_group_index_sorter(group_index, ngroups): count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters - if alpha + beta * ngroups < count * np.log(count): + do_groupsort = (count > 0 and ((alpha + beta * ngroups) < + (count * np.log(count)))) + if do_groupsort: sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), ngroups) return _ensure_platform_int(sorter) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e9b45e444d8d8..e11fd4086347f 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -348,7 +348,8 @@ def apply(self, func, mgr=None, **kwargs): """ apply the function to my values; return a block if we are not one """ - result = func(self.values, **kwargs) + with np.errstate(all='ignore'): + result = func(self.values, **kwargs) if not isinstance(result, Block): result = self.make_block(values=_block_shape(result, ndim=self.ndim)) @@ -1156,7 +1157,8 @@ def handle_error(): # get the result try: - result = get_result(other) + with np.errstate(all='ignore'): + result = get_result(other) # if we have an invalid shape/broadcast error # GH4576, so raise instead of allowing to pass through diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 2199daf549824..a76e348b7dee2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -45,7 +45,8 @@ def _f(*args, 
**kwargs): 'this dtype'.format( f.__name__.replace('nan', ''))) try: - return f(*args, **kwargs) + with np.errstate(invalid='ignore'): + return f(*args, **kwargs) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError @@ -513,7 +514,8 @@ def nanskew(values, axis=None, skipna=True): m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) - result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) + with np.errstate(invalid='ignore', divide='ignore'): + result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) dtype = values.dtype if is_float_dtype(dtype): @@ -562,10 +564,11 @@ def nankurt(values, axis=None, skipna=True): m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) - adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) - numer = count * (count + 1) * (count - 1) * m4 - denom = (count - 2) * (count - 3) * m2**2 - result = numer / denom - adj + with np.errstate(invalid='ignore', divide='ignore'): + adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) + numer = count * (count + 1) * (count - 1) * m4 + denom = (count - 2) * (count - 3) * m2**2 + result = numer / denom - adj # floating point error numer = _zero_out_fperr(numer) @@ -579,7 +582,8 @@ def nankurt(values, axis=None, skipna=True): if denom == 0: return 0 - result = numer / denom - adj + with np.errstate(invalid='ignore', divide='ignore'): + result = numer / denom - adj dtype = values.dtype if is_float_dtype(dtype): @@ -658,7 +662,8 @@ def _maybe_null_out(result, axis, mask): def _zero_out_fperr(arg): if isinstance(arg, np.ndarray): - return np.where(np.abs(arg) < 1e-14, 0, arg) + with np.errstate(invalid='ignore'): + return np.where(np.abs(arg) < 1e-14, 0, arg) else: return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg @@ -760,7 +765,8 @@ def f(x, y): ymask = isnull(y) mask = xmask | ymask - result = op(x, y) + with np.errstate(all='ignore'): + result = op(x, y) if mask.any(): if is_bool_dtype(result): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 66d9391d2facf..8d49e41284a7b 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -636,7 +636,8 @@ def na_op(x, y): def safe_na_op(lvalues, rvalues): try: - return na_op(lvalues, rvalues) + with np.errstate(all='ignore'): + return na_op(lvalues, rvalues) except Exception: if isinstance(rvalues, ABCSeries): if is_object_dtype(rvalues): @@ -743,7 +744,8 @@ def na_op(x, y): x = x.view('i8') try: - result = getattr(x, name)(y) + with np.errstate(all='ignore'): + result = getattr(x, name)(y) if result is NotImplemented: raise TypeError("invalid type comparison") except AttributeError: @@ -796,13 +798,15 @@ def wrapper(self, other, axis=None): # which would then not take categories ordering into account # we can go directly to op, as the na_op would just test again and # dispatch to it. 
- res = op(self.values, other) + with np.errstate(all='ignore'): + res = op(self.values, other) else: values = self.get_values() if isinstance(other, (list, np.ndarray)): other = np.asarray(other) - res = na_op(values, other) + with np.errstate(all='ignore'): + res = na_op(values, other) if isscalar(res): raise TypeError('Could not compare %s type with Series' % type(other)) @@ -1096,13 +1100,15 @@ def na_op(x, y): xrav = xrav[mask] yrav = yrav[mask] if np.prod(xrav.shape) and np.prod(yrav.shape): - result[mask] = op(xrav, yrav) + with np.errstate(all='ignore'): + result[mask] = op(xrav, yrav) elif hasattr(x, 'size'): result = np.empty(x.size, dtype=x.dtype) mask = notnull(xrav) xrav = xrav[mask] if np.prod(xrav.shape): - result[mask] = op(xrav, y) + with np.errstate(all='ignore'): + result[mask] = op(xrav, y) else: raise TypeError("cannot perform operation {op} between " "objects of type {x} and {y}".format( diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b2082ce29545e..b2f318d825db6 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -713,7 +713,8 @@ def _combine(self, other, func, axis=0): (str(type(other)), str(type(self)))) def _combine_const(self, other, func): - new_values = func(self.values, other) + with np.errstate(all='ignore'): + new_values = func(self.values, other) d = self._construct_axes_dict() return self._constructor(new_values, **d) @@ -723,14 +724,15 @@ def _combine_frame(self, other, func, axis=0): other = other.reindex(index=index, columns=columns) - if axis == 0: - new_values = func(self.values, other.values) - elif axis == 1: - new_values = func(self.values.swapaxes(0, 1), other.values.T) - new_values = new_values.swapaxes(0, 1) - elif axis == 2: - new_values = func(self.values.swapaxes(0, 2), other.values) - new_values = new_values.swapaxes(0, 2) + with np.errstate(all='ignore'): + if axis == 0: + new_values = func(self.values, other.values) + elif axis == 1: + new_values = func(self.values.swapaxes(0, 1), other.values.T) + new_values = new_values.swapaxes(0, 1) + elif axis == 2: + new_values = func(self.values.swapaxes(0, 2), other.values) + new_values = new_values.swapaxes(0, 2) return self._constructor(new_values, self.items, self.major_axis, self.minor_axis) @@ -744,7 +746,8 @@ def _combine_panel(self, other, func): this = self.reindex(items=items, major=major, minor=minor) other = other.reindex(items=items, major=major, minor=minor) - result_values = func(this.values, other.values) + with np.errstate(all='ignore'): + result_values = func(this.values, other.values) return self._constructor(result_values, items, major, minor) @@ -1011,7 +1014,8 @@ def apply(self, func, axis='major', **kwargs): # try ufunc like if isinstance(f, np.ufunc): try: - result = np.apply_along_axis(func, axis, self.values) + with np.errstate(all='ignore'): + result = np.apply_along_axis(func, axis, self.values) return self._wrap_result(result, axis=axis) except (AttributeError): pass @@ -1113,7 +1117,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, axis_number = self._get_axis_number(axis_name) f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) - result = f(self.values) + with np.errstate(all='ignore'): + result = f(self.values) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: diff --git a/pandas/core/series.py b/pandas/core/series.py index e388683012a66..32edcf6e698a3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1626,7 +1626,8 @@ def _binop(self, other, 
func, level=None, fill_value=None): this_vals[this_mask & mask] = fill_value other_vals[other_mask & mask] = fill_value - result = func(this_vals, other_vals) + with np.errstate(all='ignore'): + result = func(this_vals, other_vals) name = _maybe_match_name(self, other) result = self._constructor(result, index=new_index, name=name) result = result.__finalize__(self) @@ -1658,10 +1659,12 @@ def combine(self, other, func, fill_value=nan): for i, idx in enumerate(new_index): lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) - new_values[i] = func(lv, rv) + with np.errstate(all='ignore'): + new_values[i] = func(lv, rv) else: new_index = self.index - new_values = func(self._values, other) + with np.errstate(all='ignore'): + new_values = func(self._values, other) new_name = self.name return self._constructor(new_values, index=new_index, name=new_name) @@ -2240,14 +2243,15 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): else: f = func - if isinstance(f, np.ufunc): - return f(self) + with np.errstate(all='ignore'): + if isinstance(f, np.ufunc): + return f(self) - if is_extension_type(self.dtype): - mapped = self._values.map(f) - else: - values = self.asobject - mapped = lib.map_infer(values, f, convert=convert_dtype) + if is_extension_type(self.dtype): + mapped = self._values.map(f) + else: + values = self.asobject + mapped = lib.map_infer(values, f, convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): from pandas.core.frame import DataFrame @@ -2272,7 +2276,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) - return op(delegate, skipna=skipna, **kwds) + with np.errstate(all='ignore'): + return op(delegate, skipna=skipna, **kwds) return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, diff --git a/pandas/core/window.py b/pandas/core/window.py index 9e2a27adc25a7..b7276aed506de 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -733,10 +733,11 @@ def calc(x): def calc(x): return func(x, window, min_periods=self.min_periods) - if values.ndim > 1: - result = np.apply_along_axis(calc, self.axis, values) - else: - result = calc(values) + with np.errstate(all='ignore'): + if values.ndim > 1: + result = np.apply_along_axis(calc, self.axis, values) + else: + result = calc(values) if center: result = self._center_window(result, window) @@ -1617,10 +1618,11 @@ def _cov(x, y): x_values = X._prep_values() y_values = Y._prep_values() - cov = _cov(x_values, y_values) - x_var = _cov(x_values, x_values) - y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) + with np.errstate(all='ignore'): + cov = _cov(x_values, y_values) + x_var = _cov(x_values, x_values) + y_var = _cov(y_values, y_values) + corr = cov / _zsqrt(x_var * y_var) return X._wrap_result(corr) return _flex_binary_moment(self._selected_obj, other._selected_obj, @@ -1757,8 +1759,9 @@ def _use_window(minp, window): def _zsqrt(x): - result = np.sqrt(x) - mask = x < 0 + with np.errstate(all='ignore'): + result = np.sqrt(x) + mask = x < 0 from pandas import DataFrame if isinstance(x, DataFrame): diff --git a/pandas/formats/format.py b/pandas/formats/format.py index f89ceaff2ad64..b83e3c4e73fdb 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2094,14 +2094,14 @@ def format_values_with(float_format): else: too_long = False - abs_vals = np.abs(self.values) - - # this is pretty arbitrary for now 
- # large values: more that 8 characters including decimal symbol - # and first digit, hence > 1e6 - has_large_values = (abs_vals > 1e6).any() - has_small_values = ((abs_vals < 10**(-self.digits)) & - (abs_vals > 0)).any() + with np.errstate(invalid='ignore'): + abs_vals = np.abs(self.values) + # this is pretty arbitrary for now + # large values: more that 8 characters including decimal symbol + # and first digit, hence > 1e6 + has_large_values = (abs_vals > 1e6).any() + has_small_values = ((abs_vals < 10**(-self.digits)) & + (abs_vals > 0)).any() if has_small_values or (too_long and has_large_values): float_format = '%% .%de' % self.digits @@ -2211,9 +2211,10 @@ def format_percentiles(percentiles): percentiles = np.asarray(percentiles) # It checks for np.NaN as well - if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ - or not np.all(percentiles <= 1): - raise ValueError("percentiles should all be in the interval [0,1]") + with np.errstate(invalid='ignore'): + if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ + or not np.all(percentiles <= 1): + raise ValueError("percentiles should all be in the interval [0,1]") percentiles = 100 * percentiles int_idx = (percentiles.astype(int) == percentiles) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 9b378715b8a96..e4e5a4e4cfec7 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -3303,9 +3303,12 @@ def _evaluate_compare(self, other): if is_object_dtype(self) and self.nlevels == 1: # don't pass MultiIndex - result = _comp_method_OBJECT_ARRAY(op, self.values, other) + with np.errstate(all='ignore'): + result = _comp_method_OBJECT_ARRAY( + op, self.values, other) else: - result = op(self.values, np.asarray(other)) + with np.errstate(all='ignore'): + result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index # for now just return the indexing array directly @@ -3450,7 +3453,9 @@ def _evaluate_numeric_binop(self, other): attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) - return Index(op(values, other), **attrs) + with np.errstate(all='ignore'): + result = op(values, other) + return Index(result, **attrs) return _evaluate_numeric_binop diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 465ec4904f7ee..76166e7155bd0 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -576,7 +576,8 @@ def _evaluate_numeric_binop(self, other): try: # alppy if we have an override if step: - rstep = step(self._step, other) + with np.errstate(all='ignore'): + rstep = step(self._step, other) # we don't have a representable op # so return a base index @@ -586,8 +587,9 @@ def _evaluate_numeric_binop(self, other): else: rstep = self._step - rstart = op(self._start, other) - rstop = op(self._stop, other) + with np.errstate(all='ignore'): + rstart = op(self._start, other) + rstop = op(self._stop, other) result = RangeIndex(rstart, rstop, @@ -612,7 +614,9 @@ def _evaluate_numeric_binop(self, other): if isinstance(other, RangeIndex): other = other.values - return Index(op(self, other), **attrs) + with np.errstate(all='ignore'): + results = op(self, other) + return Index(results, **attrs) return _evaluate_numeric_binop diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 74d592c32d3aa..ca9d5efe2fbe5 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -55,9 +55,11 @@ def wrapper(self, other): dtype=dtype) return _sparse_array_op(self, other, op, name) elif is_scalar(other): - fill = 
op(_get_fill(self), np.asarray(other)) - return _wrap_result(name, op(self.sp_values, other), - self.sp_index, fill) + with np.errstate(all='ignore'): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) + + return _wrap_result(name, result, self.sp_index, fill) else: # pragma: no cover raise TypeError('operation with %s not supported' % type(other)) @@ -101,17 +103,19 @@ def _sparse_array_op(left, right, op, name, series=False): result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: - result = op(left.get_values(), right.get_values()) + with np.errstate(all='ignore'): + result = op(left.get_values(), right.get_values()) + fill = op(_get_fill(left), _get_fill(right)) if left.sp_index.ngaps == 0: index = left.sp_index else: index = right.sp_index - fill = op(_get_fill(left), _get_fill(right)) elif left.sp_index.equals(right.sp_index): - result = op(left.sp_values, right.sp_values) + with np.errstate(all='ignore'): + result = op(left.sp_values, right.sp_values) + fill = op(_get_fill(left), _get_fill(right)) index = left.sp_index - fill = op(_get_fill(left), _get_fill(right)) else: if name[0] == 'r': left, right = right, left @@ -129,9 +133,10 @@ def _sparse_array_op(left, right, op, name, series=False): right_sp_values = right.sp_values sparse_op = getattr(splib, opname) - result, index, fill = sparse_op(left_sp_values, left.sp_index, - left.fill_value, right_sp_values, - right.sp_index, right.fill_value) + with np.errstate(all='ignore'): + result, index, fill = sparse_op(left_sp_values, left.sp_index, + left.fill_value, right_sp_values, + right.sp_index, right.fill_value) if result_dtype is None: result_dtype = result.dtype @@ -295,7 +300,8 @@ def __array_wrap__(self, out_arr, context=None): ufunc, args, domain = context # to apply ufunc only to fill_value (to avoid recursive call) args = [getattr(a, 'fill_value', a) for a in args] - fill_value = ufunc(self.fill_value, *args[1:]) + with np.errstate(all='ignore'): + fill_value = ufunc(self.fill_value, *args[1:]) else: fill_value = self.fill_value diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 94834ac22166b..888dbde8ffb0f 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -57,7 +57,8 @@ def wrapper(self, other): elif isinstance(other, DataFrame): return NotImplemented elif is_scalar(other): - new_values = op(self.values, other) + with np.errstate(all='ignore'): + new_values = op(self.values, other) return self._constructor(new_values, index=self.index, name=self.name) @@ -310,7 +311,8 @@ def __array_wrap__(self, result, context=None): if isinstance(context, tuple) and len(context) == 3: ufunc, args, domain = context args = [getattr(a, 'fill_value', a) for a in args] - fill_value = ufunc(self.fill_value, *args[1:]) + with np.errstate(all='ignore'): + fill_value = ufunc(self.fill_value, *args[1:]) else: fill_value = self.fill_value diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index ab84308e02e7c..def3d15a43f0f 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -14,55 +14,59 @@ def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) def _check_numeric_ops(self, a, b, a_dense, b_dense): - # sparse & sparse - self._assert((a + b).to_dense(), a_dense + b_dense) - self._assert((b + a).to_dense(), b_dense + a_dense) + with np.errstate(invalid='ignore', divide='ignore'): + # Unfortunately, trying to wrap the computation of each expected + # value is with 
np.errstate() is too tedious. - self._assert((a - b).to_dense(), a_dense - b_dense) - self._assert((b - a).to_dense(), b_dense - a_dense) + # sparse & sparse + self._assert((a + b).to_dense(), a_dense + b_dense) + self._assert((b + a).to_dense(), b_dense + a_dense) - self._assert((a * b).to_dense(), a_dense * b_dense) - self._assert((b * a).to_dense(), b_dense * a_dense) + self._assert((a - b).to_dense(), a_dense - b_dense) + self._assert((b - a).to_dense(), b_dense - a_dense) - # pandas uses future division - self._assert((a / b).to_dense(), a_dense * 1.0 / b_dense) - self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) + self._assert((a * b).to_dense(), a_dense * b_dense) + self._assert((b * a).to_dense(), b_dense * a_dense) - # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and a.dtype == 'int64'): - self._assert((a // b).to_dense(), a_dense // b_dense) - self._assert((b // a).to_dense(), b_dense // a_dense) + # pandas uses future division + self._assert((a / b).to_dense(), a_dense * 1.0 / b_dense) + self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) - self._assert((a % b).to_dense(), a_dense % b_dense) - self._assert((b % a).to_dense(), b_dense % a_dense) + # ToDo: FIXME in GH 13843 + if not (self._base == pd.Series and a.dtype == 'int64'): + self._assert((a // b).to_dense(), a_dense // b_dense) + self._assert((b // a).to_dense(), b_dense // a_dense) - self._assert((a ** b).to_dense(), a_dense ** b_dense) - self._assert((b ** a).to_dense(), b_dense ** a_dense) + self._assert((a % b).to_dense(), a_dense % b_dense) + self._assert((b % a).to_dense(), b_dense % a_dense) - # sparse & dense - self._assert((a + b_dense).to_dense(), a_dense + b_dense) - self._assert((b_dense + a).to_dense(), b_dense + a_dense) + self._assert((a ** b).to_dense(), a_dense ** b_dense) + self._assert((b ** a).to_dense(), b_dense ** a_dense) + + # sparse & dense + self._assert((a + b_dense).to_dense(), a_dense + b_dense) + self._assert((b_dense + a).to_dense(), b_dense + a_dense) - self._assert((a - b_dense).to_dense(), a_dense - b_dense) - self._assert((b_dense - a).to_dense(), b_dense - a_dense) + self._assert((a - b_dense).to_dense(), a_dense - b_dense) + self._assert((b_dense - a).to_dense(), b_dense - a_dense) - self._assert((a * b_dense).to_dense(), a_dense * b_dense) - self._assert((b_dense * a).to_dense(), b_dense * a_dense) + self._assert((a * b_dense).to_dense(), a_dense * b_dense) + self._assert((b_dense * a).to_dense(), b_dense * a_dense) - # pandas uses future division - self._assert((a / b_dense).to_dense(), a_dense * 1.0 / b_dense) - self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) + # pandas uses future division + self._assert((a / b_dense).to_dense(), a_dense * 1.0 / b_dense) + self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) - # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and a.dtype == 'int64'): - self._assert((a // b_dense).to_dense(), a_dense // b_dense) - self._assert((b_dense // a).to_dense(), b_dense // a_dense) + # ToDo: FIXME in GH 13843 + if not (self._base == pd.Series and a.dtype == 'int64'): + self._assert((a // b_dense).to_dense(), a_dense // b_dense) + self._assert((b_dense // a).to_dense(), b_dense // a_dense) - self._assert((a % b_dense).to_dense(), a_dense % b_dense) - self._assert((b_dense % a).to_dense(), b_dense % a_dense) + self._assert((a % b_dense).to_dense(), a_dense % b_dense) + self._assert((b_dense % a).to_dense(), b_dense % a_dense) - self._assert((a ** b_dense).to_dense(), a_dense ** b_dense) - 
self._assert((b_dense ** a).to_dense(), b_dense ** a_dense) + self._assert((a ** b_dense).to_dense(), a_dense ** b_dense) + self._assert((b_dense ** a).to_dense(), b_dense ** a_dense) def _check_bool_result(self, res): tm.assertIsInstance(res, self._klass) @@ -70,43 +74,47 @@ def _check_bool_result(self, res): self.assertIsInstance(res.fill_value, bool) def _check_comparison_ops(self, a, b, a_dense, b_dense): - # sparse & sparse - self._check_bool_result(a == b) - self._assert((a == b).to_dense(), a_dense == b_dense) + with np.errstate(invalid='ignore'): + # Unfortunately, trying to wrap the computation of each expected + # value is with np.errstate() is too tedious. + # + # sparse & sparse + self._check_bool_result(a == b) + self._assert((a == b).to_dense(), a_dense == b_dense) - self._check_bool_result(a != b) - self._assert((a != b).to_dense(), a_dense != b_dense) + self._check_bool_result(a != b) + self._assert((a != b).to_dense(), a_dense != b_dense) - self._check_bool_result(a >= b) - self._assert((a >= b).to_dense(), a_dense >= b_dense) + self._check_bool_result(a >= b) + self._assert((a >= b).to_dense(), a_dense >= b_dense) - self._check_bool_result(a <= b) - self._assert((a <= b).to_dense(), a_dense <= b_dense) + self._check_bool_result(a <= b) + self._assert((a <= b).to_dense(), a_dense <= b_dense) - self._check_bool_result(a > b) - self._assert((a > b).to_dense(), a_dense > b_dense) + self._check_bool_result(a > b) + self._assert((a > b).to_dense(), a_dense > b_dense) - self._check_bool_result(a < b) - self._assert((a < b).to_dense(), a_dense < b_dense) + self._check_bool_result(a < b) + self._assert((a < b).to_dense(), a_dense < b_dense) - # sparse & dense - self._check_bool_result(a == b_dense) - self._assert((a == b_dense).to_dense(), a_dense == b_dense) + # sparse & dense + self._check_bool_result(a == b_dense) + self._assert((a == b_dense).to_dense(), a_dense == b_dense) - self._check_bool_result(a != b_dense) - self._assert((a != b_dense).to_dense(), a_dense != b_dense) + self._check_bool_result(a != b_dense) + self._assert((a != b_dense).to_dense(), a_dense != b_dense) - self._check_bool_result(a >= b_dense) - self._assert((a >= b_dense).to_dense(), a_dense >= b_dense) + self._check_bool_result(a >= b_dense) + self._assert((a >= b_dense).to_dense(), a_dense >= b_dense) - self._check_bool_result(a <= b_dense) - self._assert((a <= b_dense).to_dense(), a_dense <= b_dense) + self._check_bool_result(a <= b_dense) + self._assert((a <= b_dense).to_dense(), a_dense <= b_dense) - self._check_bool_result(a > b_dense) - self._assert((a > b_dense).to_dense(), a_dense > b_dense) + self._check_bool_result(a > b_dense) + self._assert((a > b_dense).to_dense(), a_dense > b_dense) - self._check_bool_result(a < b_dense) - self._assert((a < b_dense).to_dense(), a_dense < b_dense) + self._check_bool_result(a < b_dense) + self._assert((a < b_dense).to_dense(), a_dense < b_dense) def _check_logical_ops(self, a, b, a_dense, b_dense): # sparse & sparse diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index e702b7ed5e349..63e29656b66ea 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -542,15 +542,17 @@ def _check_inplace_op(op): tmp = arr1.copy() self.assertRaises(NotImplementedError, op, tmp, arr2) - bin_ops = [operator.add, operator.sub, operator.mul, operator.truediv, - operator.floordiv, operator.pow] - for op in bin_ops: - _check_op(op, arr1, arr2) - _check_op(op, farr1, farr2) - - inplace_ops = ['iadd', 'isub', 
'imul', 'itruediv', 'ifloordiv', 'ipow'] - for op in inplace_ops: - _check_inplace_op(getattr(operator, op)) + with np.errstate(all='ignore'): + bin_ops = [operator.add, operator.sub, operator.mul, + operator.truediv, operator.floordiv, operator.pow] + for op in bin_ops: + _check_op(op, arr1, arr2) + _check_op(op, farr1, farr2) + + inplace_ops = ['iadd', 'isub', 'imul', 'itruediv', 'ifloordiv', + 'ipow'] + for op in inplace_ops: + _check_inplace_op(getattr(operator, op)) def test_pickle(self): def _check_roundtrip(obj): diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 8a4aca2b320aa..e6147737e9a1d 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -1668,7 +1668,7 @@ def test_string_repr_encoding(self): def test_repr_corner(self): # representing infs poses no problems - df = DataFrame({'foo': np.inf * np.empty(10)}) + df = DataFrame({'foo': [-np.inf, np.inf]}) repr(df) def test_frame_info_encoding(self): diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 020b7f1f1ab9d..5cadb4dba577f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -22,18 +22,19 @@ class TestDataFrameApply(tm.TestCase, TestData): _multiprocess_can_split_ = True def test_apply(self): - # ufunc - applied = self.frame.apply(np.sqrt) - assert_series_equal(np.sqrt(self.frame['A']), applied['A']) + with np.errstate(all='ignore'): + # ufunc + applied = self.frame.apply(np.sqrt) + assert_series_equal(np.sqrt(self.frame['A']), applied['A']) - # aggregator - applied = self.frame.apply(np.mean) - self.assertEqual(applied['A'], np.mean(self.frame['A'])) + # aggregator + applied = self.frame.apply(np.mean) + self.assertEqual(applied['A'], np.mean(self.frame['A'])) - d = self.frame.index[0] - applied = self.frame.apply(np.mean, axis=1) - self.assertEqual(applied[d], np.mean(self.frame.xs(d))) - self.assertIs(applied.index, self.frame.index) # want this + d = self.frame.index[0] + applied = self.frame.apply(np.mean, axis=1) + self.assertEqual(applied[d], np.mean(self.frame.xs(d))) + self.assertIs(applied.index, self.frame.index) # want this # invalid axis df = DataFrame( @@ -187,10 +188,11 @@ def _checkit(axis=0, raw=False): _checkit(raw=True) _checkit(axis=0, raw=True) - _check(no_cols, lambda x: x) - _check(no_cols, lambda x: x.mean()) - _check(no_index, lambda x: x) - _check(no_index, lambda x: x.mean()) + with np.errstate(all='ignore'): + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) result = no_cols.apply(lambda x: x.mean(), broadcast=True) tm.assertIsInstance(result, DataFrame) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 03b3c0a5e65d0..089b71b30119b 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -207,7 +207,8 @@ def test_new_empty_index(self): self.assertIsNone(df2.index.name) def test_array_interface(self): - result = np.sqrt(self.frame) + with np.errstate(all='ignore'): + result = np.sqrt(self.frame) tm.assertIsInstance(result, type(self.frame)) self.assertIs(result.index, self.frame.index) self.assertIs(result.columns, self.frame.columns) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 5f3eb84f72127..85aadee8b0900 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -217,7 +217,9 @@ def 
test_modulo(self): assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement - result2 = DataFrame(p.values % p.values, index=p.index, + with np.errstate(all='ignore'): + arr = p.values % p.values + result2 = DataFrame(arr, index=p.index, columns=p.columns, dtype='float64') result2.iloc[0:3, 1] = np.nan assert_frame_equal(result2, expected) @@ -227,8 +229,9 @@ def test_modulo(self): assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement - result2 = DataFrame(p.values.astype('float64') % - 0, index=p.index, columns=p.columns) + with np.errstate(all='ignore'): + arr = p.values.astype('float64') % 0 + result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) # not commutative with series @@ -248,7 +251,9 @@ def test_div(self): 'second': Series([nan, nan, nan, 1])}) assert_frame_equal(result, expected) - result2 = DataFrame(p.values.astype('float') / p.values, index=p.index, + with np.errstate(all='ignore'): + arr = p.values.astype('float') / p.values + result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) @@ -258,7 +263,9 @@ def test_div(self): assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement - result2 = DataFrame(p.values.astype('float64') / 0, index=p.index, + with np.errstate(all='ignore'): + arr = p.values.astype('float64') / 0 + result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) @@ -922,6 +929,15 @@ def test_comp(func): test_comp(operator.ge) test_comp(operator.le) + def test_comparison_protected_from_errstate(self): + missing_df = tm.makeDataFrame() + missing_df.iloc[0]['A'] = np.nan + with np.errstate(invalid='ignore'): + expected = missing_df.values < 0 + with np.errstate(invalid='raise'): + result = (missing_df < 0).values + self.assert_numpy_array_equal(result, expected) + def test_string_comparison(self): df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) mask_a = df.a > 1 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 26f90a814ab29..59b98ebcff82a 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -709,11 +709,13 @@ def test_numpy_ufuncs(self): # raise TypeError or ValueError (PeriodIndex) # PeriodIndex behavior should be changed in future version with tm.assertRaises(Exception): - func(idx) + with np.errstate(all='ignore'): + func(idx) elif isinstance(idx, (Float64Index, Int64Index)): # coerces to float (e.g. 
np.sin) - result = func(idx) - exp = Index(func(idx.values), name=idx.name) + with np.errstate(all='ignore'): + result = func(idx) + exp = Index(func(idx.values), name=idx.name) self.assert_index_equal(result, exp) self.assertIsInstance(result, pd.Float64Index) else: @@ -722,7 +724,8 @@ def test_numpy_ufuncs(self): continue else: with tm.assertRaises(Exception): - func(idx) + with np.errstate(all='ignore'): + func(idx) for func in [np.isfinite, np.isinf, np.isnan, np.signbit]: if isinstance(idx, pd.tseries.base.DatetimeIndexOpsMixin): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6575c106f006f..24e3a0ff5f325 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -622,39 +622,40 @@ def test_all_any_params(self): self.assertRaises(NotImplementedError, s.all, bool_only=True) def test_modulo(self): + with np.errstate(all='ignore'): + + # GH3590, modulo as ints + p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values, + dtype='float64') + expected.iloc[0:3] = np.nan + assert_series_equal(result, expected) - # GH3590, modulo as ints - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.nan - assert_series_equal(result, expected) - - result = p['first'] % 0 - expected = Series(np.nan, index=p.index, name='first') - assert_series_equal(result, expected) + result = p['first'] % 0 + expected = Series(np.nan, index=p.index, name='first') + assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values) - assert_series_equal(result, expected) + p = p.astype('float64') + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values) + assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - result2 = p['second'] % p['first'] - self.assertFalse(np.array_equal(result, result2)) + p = p.astype('float64') + result = p['first'] % p['second'] + result2 = p['second'] % p['first'] + self.assertFalse(np.array_equal(result, result2)) - # GH 9144 - s = Series([0, 1]) + # GH 9144 + s = Series([0, 1]) - result = s % 0 - expected = Series([nan, nan]) - assert_series_equal(result, expected) + result = s % 0 + expected = Series([nan, nan]) + assert_series_equal(result, expected) - result = 0 % s - expected = Series([nan, 0.0]) - assert_series_equal(result, expected) + result = 0 % s + expected = Series([nan, 0.0]) + assert_series_equal(result, expected) def test_ops_consistency_on_empty(self): diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 26fc80c3ef988..8d7676bef4d72 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -18,17 +18,18 @@ class TestSeriesApply(TestData, tm.TestCase): _multiprocess_can_split_ = True def test_apply(self): - assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) - - # elementwise-apply - import math - assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) - - # how to handle Series result, #2316 - result = self.ts.apply(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) - expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) - tm.assert_frame_equal(result, expected) + with 
np.errstate(all='ignore'): + assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) + + # elementwise-apply + import math + assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) + + # how to handle Series result, #2316 + result = self.ts.apply(lambda x: Series( + [x, x ** 2], index=['x', 'x^2'])) + expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) + tm.assert_frame_equal(result, expected) # empty series s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 5ebe528ff8cab..5fc44fe1dc608 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -34,7 +34,8 @@ def test_comparisons(self): left[:3] = np.nan result = nanops.nangt(left, right) - expected = (left > right).astype('O') + with np.errstate(invalid='ignore'): + expected = (left > right).astype('O') expected[:3] = np.nan assert_almost_equal(result, expected) @@ -81,62 +82,63 @@ def test_invert(self): assert_series_equal(-(self.series < 0), ~(self.series < 0)) def test_div(self): + with np.errstate(all='ignore'): + # no longer do integer div for any ops, but deal with the 0's + p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] / p['second'] + expected = Series( + p['first'].values.astype(float) / p['second'].values, + dtype='float64') + expected.iloc[0:3] = np.inf + assert_series_equal(result, expected) - # no longer do integer div for any ops, but deal with the 0's - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] / p['second'] - expected = Series(p['first'].values.astype(float) / p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.inf - assert_series_equal(result, expected) - - result = p['first'] / 0 - expected = Series(np.inf, index=p.index, name='first') - assert_series_equal(result, expected) + result = p['first'] / 0 + expected = Series(np.inf, index=p.index, name='first') + assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] / p['second'] - expected = Series(p['first'].values / p['second'].values) - assert_series_equal(result, expected) + p = p.astype('float64') + result = p['first'] / p['second'] + expected = Series(p['first'].values / p['second'].values) + assert_series_equal(result, expected) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) - result = p['first'] / p['second'] - assert_series_equal(result, p['first'].astype('float64'), - check_names=False) - self.assertTrue(result.name is None) - self.assertFalse(np.array_equal(result, p['second'] / p['first'])) - - # inf signing - s = Series([np.nan, 1., -1.]) - result = s / 0 - expected = Series([np.nan, np.inf, -np.inf]) - assert_series_equal(result, expected) + p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) + result = p['first'] / p['second'] + assert_series_equal(result, p['first'].astype('float64'), + check_names=False) + self.assertTrue(result.name is None) + self.assertFalse(np.array_equal(result, p['second'] / p['first'])) + + # inf signing + s = Series([np.nan, 1., -1.]) + result = s / 0 + expected = Series([np.nan, np.inf, -np.inf]) + assert_series_equal(result, expected) - # float/integer issue - # GH 7785 - p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)}) - expected = Series([-0.01, -np.inf]) + # float/integer issue + # GH 7785 + p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)}) + expected = Series([-0.01, -np.inf]) - result = 
p['second'].div(p['first']) - assert_series_equal(result, expected, check_names=False) + result = p['second'].div(p['first']) + assert_series_equal(result, expected, check_names=False) - result = p['second'] / p['first'] - assert_series_equal(result, expected) + result = p['second'] / p['first'] + assert_series_equal(result, expected) - # GH 9144 - s = Series([-1, 0, 1]) + # GH 9144 + s = Series([-1, 0, 1]) - result = 0 / s - expected = Series([0.0, nan, 0.0]) - assert_series_equal(result, expected) + result = 0 / s + expected = Series([0.0, nan, 0.0]) + assert_series_equal(result, expected) - result = s / 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + result = s / 0 + expected = Series([-inf, nan, inf]) + assert_series_equal(result, expected) - result = s // 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + result = s // 0 + expected = Series([-inf, nan, inf]) + assert_series_equal(result, expected) def test_operators(self): def _check_op(series, other, op, pos_only=False, @@ -1432,18 +1434,19 @@ def _check_fill(meth, op, a, b, fill_value=0): exp_values = [] for i in range(len(exp_index)): - if amask[i]: - if bmask[i]: - exp_values.append(nan) - continue - exp_values.append(op(fill_value, b[i])) - elif bmask[i]: + with np.errstate(all='ignore'): if amask[i]: - exp_values.append(nan) - continue - exp_values.append(op(a[i], fill_value)) - else: - exp_values.append(op(a[i], b[i])) + if bmask[i]: + exp_values.append(nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) result = meth(a, b, fill_value=fill_value) expected = Series(exp_values, exp_index) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6bf1a397c8482..9a82332621933 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2595,9 +2595,11 @@ def test_cython_fail_agg(self): def test_apply_series_to_frame(self): def f(piece): + with np.errstate(invalid='ignore'): + logged = np.log(piece) return DataFrame({'value': piece, 'demeaned': piece - piece.mean(), - 'logged': np.log(piece)}) + 'logged': logged}) dr = bdate_range('1/1/2000', periods=100) ts = Series(np.random.randn(100), index=dr) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index eeeddc278c714..dd3a49de55d73 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -58,12 +58,14 @@ def setUp(self): 'O'), self.arr_utf.astype('O'), self.arr_date.astype('O'), self.arr_tdelta.astype('O')]) - self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j - self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj]) - - self.arr_nan_infj = self.arr_inf * 1j - self.arr_complex_nan_infj = np.vstack([self.arr_complex, - self.arr_nan_infj]) + with np.errstate(invalid='ignore'): + self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j + self.arr_complex_nan = np.vstack([self.arr_complex, + self.arr_nan_nanj]) + + self.arr_nan_infj = self.arr_inf * 1j + self.arr_complex_nan_infj = np.vstack([self.arr_complex, + self.arr_nan_infj]) self.arr_float_2d = self.arr_float[:, :, 0] self.arr_float1_2d = self.arr_float1[:, :, 0] diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 1f9ca4635b585..10a6693525590 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -824,12 +824,13 @@ def test_comp(func): 
self.assert_numpy_array_equal(result3.values, func(self.panel.values, 0)) - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) + with np.errstate(invalid='ignore'): + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) def test_get_value(self): for item in self.panel.items: @@ -1186,8 +1187,9 @@ def test_apply(self): # ufunc applied = self.panel.apply(np.sqrt) - self.assertTrue(assert_almost_equal(applied.values, np.sqrt( - self.panel.values))) + with np.errstate(invalid='ignore'): + expected = np.sqrt(self.panel.values) + assert_almost_equal(applied.values, expected) # ufunc same shape result = self.panel.apply(lambda x: x * 2, axis='items') diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 50ede3f2c2367..493889e579af2 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -461,12 +461,13 @@ def test_comp(func): self.assert_numpy_array_equal(result3.values, func(self.panel4d.values, 0)) - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) + with np.errstate(invalid='ignore'): + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) def test_major_xs(self): ref = self.panel4d['l1']['ItemA'] diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index d6baa720bac19..9193880df7feb 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -326,6 +326,16 @@ def test_exactly_one_ref(self): self.assertEqual(bytearray(as_stolen_buf), b'test') +def test_numpy_errstate_is_default(): + # The defaults since numpy 1.6.0 + expected = {'over': 'warn', 'divide': 'warn', 'invalid': 'warn', + 'under': 'ignore'} + import numpy as np + from pandas.compat import numpy # noqa + # The errstate should be unchanged after that import. 
+ tm.assert_equal(np.geterr(), expected) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 53c77b2d8f9d7..c1b990c417553 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1516,8 +1516,12 @@ cdef inline void _localize_tso(_TSObject obj, object tz): dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, obj.dts.min, obj.dts.sec, obj.dts.us, tz) delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 - pandas_datetime_to_datetimestruct(obj.value + delta, - PANDAS_FR_ns, &obj.dts) + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + delta, + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz else: # Adjust datetime64 timestamp, recompute datetimestruct @@ -1529,7 +1533,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz): # static/pytz/dateutil specific code if _is_fixed_offset(tz): # statictzinfo - if len(deltas) > 0: + if len(deltas) > 0 and obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + deltas[0], PANDAS_FR_ns, &obj.dts) else: @@ -1537,12 +1541,20 @@ cdef inline void _localize_tso(_TSObject obj, object tz): obj.tzinfo = tz elif _treat_tz_as_pytz(tz): inf = tz._transition_info[pos] - pandas_datetime_to_datetimestruct(obj.value + deltas[pos], - PANDAS_FR_ns, &obj.dts) + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz._tzinfos[inf] elif _treat_tz_as_dateutil(tz): - pandas_datetime_to_datetimestruct(obj.value + deltas[pos], - PANDAS_FR_ns, &obj.dts) + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz else: obj.tzinfo = tz From 86a36f766bc06c8d20a393d804a6bceea0f08b27 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 21 Aug 2016 09:38:13 -0400 Subject: [PATCH 289/359] DOC: whatsnew fix --- doc/source/whatsnew/v0.19.0.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c2afb6619cb5c..8dcc5a00436aa 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -9,7 +9,7 @@ users upgrade to this version. .. warning:: - pandas >= 0.19.0 will no longer silence numpy ufunc warnings upon import, see :ref:`here `. (:issue:`13109`, :issue:`13145`) + pandas >= 0.19.0 will no longer silence numpy ufunc warnings upon import, see :ref:`here `. Highlights include: @@ -366,9 +366,9 @@ Google BigQuery Enhancements Fine-grained numpy errstate ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported (:issue:`13109`). Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as NaNs. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner only around where these operations are actually used in the pandas codebase. 
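As a rough, illustrative sketch of that recommendation (not part of the patch itself), the suppression can be scoped to just the call that triggers the warning::

    import numpy as np

    arr = np.array([-1.0, 0.0, 2.0])

    # np.log warns on the negative and zero entries
    # ("invalid value encountered", "divide by zero encountered");
    # a scoped errstate silences only this call, leaving the rest
    # of the application's error handling untouched.
    with np.errstate(invalid='ignore', divide='ignore'):
        result = np.log(arr)  # array([nan, -inf, 0.6931...])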
+Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported. Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as ``NaN`` s. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner, only around where these operations are actually used in the pandas codebase. (:issue:`13109`, :issue:`13145`) -After upgrading pandas, you may see "new" ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. +After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. .. _whatsnew_0190.enhancements.other: From ae4ffac560f9f6fbf1139dd4f391da52d12cdcce Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 21 Aug 2016 10:17:47 -0400 Subject: [PATCH 290/359] BUG: Don't error when usecols is a numpy array (#14055) Closes gh-12546. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/parsers.py | 1 + pandas/io/tests/parser/usecols.py | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8dcc5a00436aa..e60ac7f3773f0 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -958,6 +958,7 @@ Bug Fixes - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. 
(:issue:`13994`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`) - Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e74ad78ed5940..9a7c966031044 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -972,6 +972,7 @@ def _validate_usecols_arg(usecols): 'string', 'unicode'): raise ValueError(msg) + return set(usecols) return usecols diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 8e34018df279b..ac32c20034c66 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -8,6 +8,7 @@ from datetime import datetime import nose +import numpy as np import pandas.util.testing as tm from pandas import DataFrame @@ -361,3 +362,12 @@ def test_empty_usecols(self): expected = DataFrame() result = self.read_csv(StringIO(data), usecols=set([])) tm.assert_frame_equal(result, expected) + + def test_np_array_usecols(self): + # See gh-12546 + data = 'a,b,c\n1,2,3' + usecols = np.array(['a', 'b']) + + expected = DataFrame([[1, 2]], columns=usecols) + result = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) From 447df80ac69efee840376d060ef541e0247427fc Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 21 Aug 2016 15:34:41 -0400 Subject: [PATCH 291/359] BUG, DOC: Fix inconsistencies with scalar na_values in read_csv (#14056) Update documentation to state that scalars are accepted for na_values. In addition, accept scalars for the values when a dictionary is passed in for na_values. Closes gh-12224. --- doc/source/io.rst | 2 +- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/excel.py | 2 +- pandas/io/parsers.py | 10 ++++++---- pandas/io/tests/parser/na_values.py | 16 ++++++++++++++++ 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index cc693170f055a..26e928020b893 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -208,7 +208,7 @@ memory_map : boolean, default False NA and Missing Data Handling ++++++++++++++++++++++++++++ -na_values : str, list-like or dict, default ``None`` +na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e60ac7f3773f0..08b59390339aa 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -957,6 +957,7 @@ Bug Fixes - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. 
(:issue:`13994`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()``, which caused errors to be raised when a dictionary containing scalars is passed in for ``na_values`` (:issue:`12224`) - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) - Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`) - Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index c713cafc0e110..5e4dd4379a8e3 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -94,7 +94,7 @@ column ranges (e.g. "A:E" or "A,C,E:F") squeeze : boolean, default False If the parsed data only contains one column then return a Series -na_values : str or list-like or dict, default None +na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" + "', '".join(sorted(_NA_VALUES)) + """'. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9a7c966031044..e40ea611fcd0a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -129,7 +129,7 @@ DEPRECATED: use the `skipfooter` parameter instead, as they are identical nrows : int, default None Number of rows of file to read. Useful for reading pieces of large files -na_values : str or list-like or dict, default None +na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: `'""" + "'`, `'".join(sorted(_NA_VALUES)) + """'`. @@ -1604,8 +1604,8 @@ def TextParser(*args, **kwds): has_index_names: boolean, default False True if the cols defined in index_col have an index name and are not in the header - na_values : iterable, default None - Custom NA values + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. 
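An illustrative sketch of the per-column form described above (the literal data is only an example and mirrors the test added further down)::

    from pandas.compat import StringIO
    import pandas as pd

    # Scalars in the dict are treated like one-element lists:
    # the value 2 in column 'a' and 1 in column 'b' are read back as NaN.
    df = pd.read_csv(StringIO('1,2\n2,1'), names=['a', 'b'],
                     na_values={'a': 2, 'b': 1})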
keep_default_na : bool, default True thousands : str, default None Thousands separator @@ -2687,7 +2687,9 @@ def _clean_na_values(na_values, keep_default_na=True): elif isinstance(na_values, dict): if keep_default_na: for k, v in compat.iteritems(na_values): - v = set(list(v)) | _NA_VALUES + if not is_list_like(v): + v = [v] + v = set(v) | _NA_VALUES na_values[k] = v na_fvalues = dict([ (k, _floatify_na_values(v)) for k, v in na_values.items() # noqa diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 2a8c934abce61..92107cf2e82a7 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -250,3 +250,19 @@ def test_na_trailing_columns(self): result = self.read_csv(StringIO(data)) self.assertEqual(result['Date'][1], '2012-05-12') self.assertTrue(result['UnitPrice'].isnull().all()) + + def test_na_values_scalar(self): + # see gh-12224 + names = ['a', 'b'] + data = '1,2\n2,1' + + expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]], + columns=names) + out = self.read_csv(StringIO(data), names=names, na_values=1) + tm.assert_frame_equal(out, expected) + + expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], + columns=names) + out = self.read_csv(StringIO(data), names=names, + na_values={'a': 2, 'b': 1}) + tm.assert_frame_equal(out, expected) From df2d9ab917a1ef4732cfb6c3317432f60c071074 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 21 Aug 2016 15:53:49 -0400 Subject: [PATCH 292/359] BUG: Validate the ordered parameter for Categorical (#14059) Closes gh-14058. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 24 ++++++++++++++++++++++-- pandas/indexes/category.py | 1 + pandas/tests/test_categorical.py | 18 ++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 08b59390339aa..222bd250034d8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -967,6 +967,7 @@ Bug Fixes - Bug in ``DataFrame`` assignment with an object-dtyped ``Index`` where the resultant column is mutable to the original object. (:issue:`13522`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) +- Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6ea0a5e96672d..3ec1c7085c87d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -231,6 +231,8 @@ class Categorical(PandasObject): def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False): + self._validate_ordered(ordered) + if fastpath: # fast path self._codes = _coerce_indexer_dtype(values, categories) @@ -502,6 +504,25 @@ def _get_labels(self): _categories = None + @classmethod + def _validate_ordered(cls, ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. 
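For illustration (a hypothetical call, mirroring the test added below), anything other than a plain bool is now rejected up front::

    import numpy as np
    import pandas as pd

    # an array (or any other non-boolean) raises immediately
    pd.Categorical([1, 2, 3], ordered=np.array([0, 1, 2]))
    # TypeError: 'ordered' must either be 'True' or 'False'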
+ + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. + """ + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + @classmethod def _validate_categories(cls, categories, fastpath=False): """ @@ -588,8 +609,7 @@ def set_ordered(self, value, inplace=False): Whether or not to set the ordered attribute inplace or return a copy of this categorical with ordered set to the value """ - if not is_bool(value): - raise TypeError("ordered must be a boolean value") + self._validate_ordered(value) cat = self if inplace else self.copy() cat._ordered = value if not inplace: diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index f1d4fe2f26bdd..23c534624930f 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -123,6 +123,7 @@ def _create_categorical(self, data, categories=None, ordered=None): Categorical """ if not isinstance(data, ABCCategorical): + ordered = False if ordered is None else ordered from pandas.core.categorical import Categorical data = Categorical(data, categories=categories, ordered=ordered) else: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index b630e0914259e..70e07b1e4930a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -398,6 +398,24 @@ def f(): codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) pd.Categorical.from_codes(codes, categories=["train", "test"]) + def test_validate_ordered(self): + # see gh-14058 + exp_msg = "'ordered' must either be 'True' or 'False'" + exp_err = TypeError + + # This should be a boolean. + ordered = np.array([0, 1, 2]) + + with tm.assertRaisesRegexp(exp_err, exp_msg): + Categorical([1, 2, 3], ordered=ordered) + + with tm.assertRaisesRegexp(exp_err, exp_msg): + Categorical.from_array([1, 2, 3], ordered=ordered) + + with tm.assertRaisesRegexp(exp_err, exp_msg): + Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], + ordered=ordered) + def test_comparisons(self): result = self.factor[self.factor == 'a'] From ba2df222205fd795597a8ca3c4be3523abf9d0f3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Aug 2016 15:35:12 -0500 Subject: [PATCH 293/359] COMPAT/TST Matplotlib 2.0 compatability (#13662) --- ci/install_travis.sh | 4 ++ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tests/plotting/common.py | 18 +++-- pandas/tests/plotting/test_datetimelike.py | 16 +++-- pandas/tests/plotting/test_frame.py | 76 ++++++++++++++++------ pandas/tests/plotting/test_misc.py | 10 ++- pandas/tests/plotting/test_series.py | 5 +- pandas/tools/plotting.py | 8 +++ 8 files changed, 106 insertions(+), 33 deletions(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 3d9651d4f579b..98ce36acc096e 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -138,5 +138,9 @@ else fi +if [ "$JOB_NAME" == "34_slow" ]; then + conda install -c conda-forge/label/rc -c conda-forge matplotlib +fi + echo "done" exit 0 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 222bd250034d8..0483cb184ee19 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -427,7 +427,7 @@ Other enhancements - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) - Raise ``ImportError`` in the sql functions when ``sqlalchemy`` is not installed and a connection string is used (:issue:`11920`). 
- +- Compatibility with matplotlib 2.0. Older versions of pandas should also work with matplotlib 2.0 (:issue:`13333`) .. _whatsnew_0190.api: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index faf16430fc94f..7dcc3d6e5734f 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -52,6 +52,7 @@ def setUp(self): self.mpl_ge_1_3_1 = plotting._mpl_ge_1_3_1() self.mpl_ge_1_4_0 = plotting._mpl_ge_1_4_0() self.mpl_ge_1_5_0 = plotting._mpl_ge_1_5_0() + self.mpl_ge_2_0_0 = plotting._mpl_ge_2_0_0() if self.mpl_ge_1_4_0: self.bp_n_objects = 7 @@ -64,6 +65,11 @@ def setUp(self): else: self.polycollection_factor = 1 + if self.mpl_ge_2_0_0: + self.default_figsize = (6.4, 4.8) + else: + self.default_figsize = (8.0, 6.0) + self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' # common test data from pandas import read_csv path = os.path.join(os.path.dirname(curpath()), 'data', 'iris.csv') @@ -189,7 +195,9 @@ def _check_colors(self, collections, linecolors=None, facecolors=None, """ from matplotlib.lines import Line2D - from matplotlib.collections import Collection, PolyCollection + from matplotlib.collections import ( + Collection, PolyCollection, LineCollection + ) conv = self.colorconverter if linecolors is not None: @@ -203,7 +211,7 @@ def _check_colors(self, collections, linecolors=None, facecolors=None, result = patch.get_color() # Line2D may contains string color expression result = conv.to_rgba(result) - elif isinstance(patch, PolyCollection): + elif isinstance(patch, (PolyCollection, LineCollection)): result = tuple(patch.get_edgecolor()[0]) else: result = patch.get_edgecolor() @@ -318,7 +326,7 @@ def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'): self.assertEqual(ax.yaxis.get_scale(), yaxis) def _check_axes_shape(self, axes, axes_num=None, layout=None, - figsize=(8.0, 6.0)): + figsize=None): """ Check expected number of axes is drawn in expected layout @@ -333,6 +341,8 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize : tuple expected figsize. default is matplotlib default """ + if figsize is None: + figsize = self.default_figsize visible_axes = self._flatten_visible(axes) if axes_num is not None: @@ -346,7 +356,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, self.assertEqual(result, layout) self.assert_numpy_array_equal( - np.round(visible_axes[0].figure.get_size_inches()), + visible_axes[0].figure.get_size_inches(), np.array(figsize, dtype=np.float64)) def _get_axes_layout(self, axes): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 492b9edff0122..0f7bc02e24915 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -26,6 +26,7 @@ class TestTSPlot(TestPlotBase): def setUp(self): TestPlotBase.setUp(self) + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'A'] idx = [period_range('12/31/1999', freq=x, periods=100) for x in freq] self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] @@ -122,7 +123,8 @@ def test_tsplot(self): _check_plot_works(s.plot, ax=ax) ax = ts.plot(style='k') - self.assertEqual((0., 0., 0.), ax.get_lines()[0].get_color()) + color = (0., 0., 0., 1) if self.mpl_ge_2_0_0 else (0., 0., 0.) 
+ self.assertEqual(color, ax.get_lines()[0].get_color()) def test_both_style_and_color(self): import matplotlib.pyplot as plt # noqa @@ -575,7 +577,8 @@ def test_secondary_y(self): plt.close(fig) ax2 = ser2.plot() - self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default') + self.assertEqual(ax2.get_yaxis().get_ticks_position(), + self.default_tick_position) plt.close(ax2.get_figure()) ax = ser2.plot() @@ -605,7 +608,8 @@ def test_secondary_y_ts(self): plt.close(fig) ax2 = ser2.plot() - self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default') + self.assertEqual(ax2.get_yaxis().get_ticks_position(), + self.default_tick_position) plt.close(ax2.get_figure()) ax = ser2.plot() @@ -639,7 +643,8 @@ def test_secondary_frame(self): df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) axes = df.plot(secondary_y=['a', 'c'], subplots=True) self.assertEqual(axes[0].get_yaxis().get_ticks_position(), 'right') - self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'default') + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), + self.default_tick_position) self.assertEqual(axes[2].get_yaxis().get_ticks_position(), 'right') @slow @@ -647,7 +652,8 @@ def test_secondary_bar_frame(self): df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True) self.assertEqual(axes[0].get_yaxis().get_ticks_position(), 'right') - self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'default') + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), + self.default_tick_position) self.assertEqual(axes[2].get_yaxis().get_ticks_position(), 'right') def test_mixed_freq_regular_first(self): diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 11180c3e9b4f7..91be0a7a73e35 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range, bdate_range) +from pandas.types.api import is_list_like from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, PY3) from pandas.formats.printing import pprint_thing import pandas.util.testing as tm @@ -952,9 +953,12 @@ def test_scatter_colors(self): with tm.assertRaises(TypeError): df.plot.scatter(x='a', y='b', c='c', color='green') + default_colors = self._maybe_unpack_cycler(self.plt.rcParams) + ax = df.plot.scatter(x='a', y='b', c='c') - tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - np.array([0, 0, 1, 1], dtype=np.float64)) + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array(self.colorconverter.to_rgba(default_colors[0]))) ax = df.plot.scatter(x='a', y='b', color='white') tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], @@ -1623,6 +1627,8 @@ def test_line_colors_and_styles_subplots(self): axes = df.plot(subplots=True) for ax, c in zip(axes, list(default_colors)): + if self.mpl_ge_2_0_0: + c = [c] self._check_colors(ax.get_lines(), linecolors=c) tm.close() @@ -1703,9 +1709,14 @@ def test_area_colors(self): self._check_colors(poly, facecolors=custom_colors) handles, labels = ax.get_legend_handles_labels() - # legend is stored as Line2D, thus check linecolors - linehandles = [x for x in handles if not isinstance(x, PolyCollection)] - self._check_colors(linehandles, linecolors=custom_colors) + if self.mpl_ge_1_5_0: + self._check_colors(handles, facecolors=custom_colors) + else: + # legend is stored as Line2D, thus check 
linecolors + linehandles = [x for x in handles + if not isinstance(x, PolyCollection)] + self._check_colors(linehandles, linecolors=custom_colors) + for h in handles: self.assertTrue(h.get_alpha() is None) tm.close() @@ -1717,8 +1728,12 @@ def test_area_colors(self): self._check_colors(poly, facecolors=jet_colors) handles, labels = ax.get_legend_handles_labels() - linehandles = [x for x in handles if not isinstance(x, PolyCollection)] - self._check_colors(linehandles, linecolors=jet_colors) + if self.mpl_ge_1_5_0: + self._check_colors(handles, facecolors=jet_colors) + else: + linehandles = [x for x in handles + if not isinstance(x, PolyCollection)] + self._check_colors(linehandles, linecolors=jet_colors) for h in handles: self.assertTrue(h.get_alpha() is None) tm.close() @@ -1731,8 +1746,12 @@ def test_area_colors(self): self._check_colors(poly, facecolors=jet_with_alpha) handles, labels = ax.get_legend_handles_labels() - # Line2D can't have alpha in its linecolor - self._check_colors(handles[:len(jet_colors)], linecolors=jet_colors) + if self.mpl_ge_1_5_0: + linecolors = jet_with_alpha + else: + # Line2D can't have alpha in its linecolor + linecolors = jet_colors + self._check_colors(handles[:len(jet_colors)], linecolors=linecolors) for h in handles: self.assertEqual(h.get_alpha(), 0.5) @@ -1855,7 +1874,10 @@ def test_kde_colors_and_styles_subplots(self): @slow def test_boxplot_colors(self): def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', - fliers_c='b'): + fliers_c=None): + # TODO: outside this func? + if fliers_c is None: + fliers_c = 'k' if self.mpl_ge_2_0_0 else 'b' self._check_colors(bp['boxes'], linecolors=[box_c] * len(bp['boxes'])) self._check_colors(bp['whiskers'], @@ -2232,16 +2254,24 @@ def test_errorbar_asymmetrical(self): np.random.seed(0) err = np.random.rand(3, 2, 5) - data = np.random.randn(5, 3) - df = DataFrame(data) + # each column is [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]... 
+ df = DataFrame(np.arange(15).reshape(3, 5)).T + data = df.values ax = df.plot(yerr=err, xerr=err / 2) - self.assertEqual(ax.lines[7].get_ydata()[0], data[0, 1] - err[1, 0, 0]) - self.assertEqual(ax.lines[8].get_ydata()[0], data[0, 1] + err[1, 1, 0]) + if self.mpl_ge_2_0_0: + yerr_0_0 = ax.collections[1].get_paths()[0].vertices[:, 1] + expected_0_0 = err[0, :, 0] * np.array([-1, 1]) + tm.assert_almost_equal(yerr_0_0, expected_0_0) + else: + self.assertEqual(ax.lines[7].get_ydata()[0], + data[0, 1] - err[1, 0, 0]) + self.assertEqual(ax.lines[8].get_ydata()[0], + data[0, 1] + err[1, 1, 0]) - self.assertEqual(ax.lines[5].get_xdata()[0], -err[1, 0, 0] / 2) - self.assertEqual(ax.lines[6].get_xdata()[0], err[1, 1, 0] / 2) + self.assertEqual(ax.lines[5].get_xdata()[0], -err[1, 0, 0] / 2) + self.assertEqual(ax.lines[6].get_xdata()[0], err[1, 1, 0] / 2) with tm.assertRaises(ValueError): df.plot(yerr=err.T) @@ -2277,9 +2307,17 @@ def test_errorbar_scatter(self): self._check_has_errorbars(ax, xerr=1, yerr=1) def _check_errorbar_color(containers, expected, has_err='has_xerr'): - errs = [c.lines[1][0] - for c in ax.containers if getattr(c, has_err, False)] - self._check_colors(errs, linecolors=[expected] * len(errs)) + lines = [] + errs = [c.lines + for c in ax.containers if getattr(c, has_err, False)][0] + for el in errs: + if is_list_like(el): + lines.extend(el) + else: + lines.append(el) + err_lines = [x for x in lines if x in ax.collections] + self._check_colors( + err_lines, linecolors=np.array([expected] * len(err_lines))) # GH 8081 df = DataFrame( diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 8b9a4fe05bb2e..a484217da5969 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -103,7 +103,10 @@ def test_scatter_matrix_axis(self): axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ['-2', '-1', '0', '1', '2'] + if self.mpl_ge_2_0_0: + expected = ['-2', '0', '2'] + else: + expected = ['-2', '-1', '0', '1', '2'] self._check_text_labels(axes0_labels, expected) self._check_ticks_props( axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @@ -115,7 +118,10 @@ def test_scatter_matrix_axis(self): axes = _check_plot_works(scatter_matrix, filterwarnings='always', frame=df, range_padding=.1) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ['-1.2', '-1.0', '-0.8', '-0.6', '-0.4', '-0.2', '0.0'] + if self.mpl_ge_2_0_0: + expected = ['-1.0', '-0.5', '0.0'] + else: + expected = ['-1.2', '-1.0', '-0.8', '-0.6', '-0.4', '-0.2', '0.0'] self._check_text_labels(axes0_labels, expected) self._check_ticks_props( axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 2bd2f8255569d..e752197c6ad77 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -218,12 +218,13 @@ def test_bar_log(self): expected = np.hstack((1.0e-04, expected, 1.0e+01)) ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') - self.assertEqual(ax.get_ylim(), (0.001, 0.10000000000000001)) + ymax = 0.12589254117941673 if self.mpl_ge_2_0_0 else .10000000000000001 + self.assertEqual(ax.get_ylim(), (0.001, ymax)) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) tm.close() ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') - self.assertEqual(ax.get_xlim(), (0.001, 0.10000000000000001)) + self.assertEqual(ax.get_xlim(), (0.001, ymax)) 
tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) @slow diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index a61a21d259e57..1abd11017dbfe 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -141,6 +141,14 @@ def _mpl_ge_1_5_0(): except ImportError: return False + +def _mpl_ge_2_0_0(): + try: + import matplotlib + return matplotlib.__version__ >= LooseVersion('2.0') + except ImportError: + return False + if _mpl_ge_1_5_0(): # Compat with mp 1.5, which uses cycler. import cycler From 6645b2b11a82343e5f07b15a25a250f411067819 Mon Sep 17 00:00:00 2001 From: Nate George Date: Mon, 22 Aug 2016 14:47:18 -0600 Subject: [PATCH 294/359] BUG: fix read_csv c engine to accept unicode aliases for encoding (#14060) --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/io/parsers.py | 3 +++ pandas/io/tests/parser/common.py | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0483cb184ee19..d7fe44e046d8e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1095,3 +1095,5 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. + +- Bug in ``read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e40ea611fcd0a..e765ebc36e33e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -343,6 +343,9 @@ def _validate_nrows(nrows): def _read(filepath_or_buffer, kwds): "Generic reader of line files." encoding = kwds.get('encoding', None) + if encoding is not None: + encoding = re.sub('_', '-', encoding).lower() + kwds['encoding'] = encoding # If the input could be a filename, check for a recognizable compression # extension. If we're reading from a URL, the `get_filepath_or_buffer` diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 96eb0ec6fd7a2..7777a9f2fadb5 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1583,3 +1583,13 @@ def test_temporary_file(self): new_file.close() expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) + + def test_read_csv_utf_aliases(self): + # see gh issue 13549 + expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']}) + for byte in [8, 16]: + for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']: + encoding = fmt.format(byte) + data = 'mb_num,multibyte\n4.8,test'.encode(encoding) + result = self.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) From fb6fbaeed160b19909c4f2b454643fc2b7145ed6 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 24 Aug 2016 19:27:44 -0400 Subject: [PATCH 295/359] TST: Add tests for internal EOF in read_csv (#14069) Closes gh-5500. 
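A minimal sketch of the behaviour these tests pin down (illustrative only; the
input string and the expected frame are taken from the test added below): a
0x1a byte that appears inside a field is kept as ordinary data rather than
being treated as an end-of-file marker.

    from io import StringIO
    import pandas as pd

    data = "a,b\n1\x1a,2"              # 0x1a sits inside the first field
    result = pd.read_csv(StringIO(data))
    # expected: a single row where column 'a' holds the literal string "1\x1a"
    # and column 'b' holds the integer 2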
--- pandas/io/tests/parser/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 7777a9f2fadb5..b90fc304e125e 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1593,3 +1593,11 @@ def test_read_csv_utf_aliases(self): data = 'mb_num,multibyte\n4.8,test'.encode(encoding) result = self.read_csv(BytesIO(data), encoding=encoding) tm.assert_frame_equal(result, expected) + + def test_internal_eof_byte(self): + # see gh-5500 + data = "a,b\n1\x1a,2" + + expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b']) + result = self.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) From 3923fcd163cb1b6f7911b0e11b110593f6e0c9e4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 24 Aug 2016 19:28:44 -0400 Subject: [PATCH 296/359] BUG: Align to_csv signatures with DataFrame and Series (#14063) Closes gh-14054. --- pandas/core/series.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 32edcf6e698a3..7979a230eed84 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2538,16 +2538,17 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, return result - def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, - header=False, index_label=None, mode='w', encoding=None, - date_format=None, decimal='.'): + def to_csv(self, path=None, index=True, sep=",", na_rep='', + float_format=None, header=False, index_label=None, + mode='w', encoding=None, date_format=None, decimal='.'): """ Write Series to a comma-separated values (csv) file Parameters ---------- - path : string file path or file handle / StringIO. If None is provided - the result is returned as a string. + path : string or file handle, default None + File path or object, if None is provided the result is returned as + a string. na_rep : string, default '' Missing data representation float_format : string, default None From e23e6f164209167c0fba0d32c862c5e75e6d4a8a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 24 Aug 2016 19:33:51 -0400 Subject: [PATCH 297/359] API: PeriodIndex.values now return array of Period objects split from #13941 (comment) Author: sinhrks Closes #13988 from sinhrks/period_values and squashes the following commits: d7637c9 [sinhrks] API: PeriodIndex.values now return array of Period objects --- doc/source/whatsnew/v0.19.0.txt | 26 ++++- pandas/indexes/base.py | 27 +++-- pandas/io/pytables.py | 24 ++-- pandas/tests/indexes/common.py | 15 ++- pandas/tests/indexes/test_datetimelike.py | 2 +- pandas/tests/indexing/test_coercion.py | 28 +++-- pandas/tests/indexing/test_indexing.py | 4 +- pandas/tests/test_base.py | 2 +- pandas/tseries/base.py | 2 +- pandas/tseries/converter.py | 6 +- pandas/tseries/period.py | 128 ++++++++++++++-------- pandas/tseries/resample.py | 5 +- pandas/tseries/tests/test_base.py | 4 +- pandas/tseries/tests/test_period.py | 55 +++++++--- 14 files changed, 219 insertions(+), 109 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d7fe44e046d8e..4039dafd323c4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -16,7 +16,7 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` -- ``PeriodIndex`` now has its own ``period`` dtype. 
see ref:`here ` +- ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -643,10 +643,13 @@ Furthermore: - Passing duplicated ``percentiles`` will now raise a ``ValueError``. - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) -.. _whatsnew_0190.api.perioddtype: +.. _whatsnew_0190.api.period: + +``Period`` changes +^^^^^^^^^^^^^^^^^^ ``PeriodIndex`` now has ``period`` dtype -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +"""""""""""""""""""""""""""""""""""""""" ``PeriodIndex`` now has its own ``period`` dtype. The ``period`` dtype is a pandas extension dtype like ``category`` or :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). (:issue:`13941`). @@ -681,7 +684,7 @@ New Behavior: .. _whatsnew_0190.api.periodnat: ``Period('NaT')`` now returns ``pd.NaT`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +"""""""""""""""""""""""""""""""""""""""" Previously, ``Period`` has its own ``Period('NaT')`` representation different from ``pd.NaT``. Now ``Period('NaT')`` has been changed to return ``pd.NaT``. (:issue:`12759`, :issue:`13582`) @@ -719,6 +722,21 @@ New Behavior: pd.NaT + 1 pd.NaT - 1 +``PeriodIndex.values`` now returns array of ``Period`` object +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +``.values`` is changed to return array of ``Period`` object, rather than array +of ``int64`` (:issue:`13988`) + +.. code-block:: ipython + In [6]: pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + In [7]: pi.values + array([492, 493]) + +.. ipython:: python + + pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi.values .. _whatsnew_0190.api.difference: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index e4e5a4e4cfec7..49b16ec9a71ab 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1251,7 +1251,7 @@ def _constructor(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self.values, len(self)) + return self._engine_type(lambda: self._values, len(self)) def _validate_index_level(self, level): """ @@ -1823,13 +1823,13 @@ def union(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self.values, other._values)[0] + result = self._outer_indexer(self._values, other._values)[0] except TypeError: # incomparable objects - result = list(self.values) + result = list(self._values) # worth making this faster? 
a very unusual case - value_set = set(self.values) + value_set = set(self._values) result.extend([x for x in other._values if x not in value_set]) else: indexer = self.get_indexer(other) @@ -1838,10 +1838,10 @@ def union(self, other): if len(indexer) > 0: other_diff = algos.take_nd(other._values, indexer, allow_fill=False) - result = _concat._concat_compat((self.values, other_diff)) + result = _concat._concat_compat((self._values, other_diff)) try: - self.values[0] < other_diff[0] + self._values[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, @@ -1853,7 +1853,7 @@ def union(self, other): result.sort() else: - result = self.values + result = self._values try: result = np.sort(result) @@ -1906,17 +1906,17 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self.values, other._values)[0] + result = self._inner_indexer(self._values, other._values)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(self.values).get_indexer(other._values) + indexer = Index(self._values).get_indexer(other._values) indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = Index(self.values).get_indexer_non_unique( + indexer = Index(self._values).get_indexer_non_unique( other._values)[0].unique() indexer = indexer[indexer != -1] @@ -2536,7 +2536,7 @@ def _reindex_non_unique(self, target): missing = _ensure_platform_int(missing) missing_labels = target.take(missing) missing_indexer = _ensure_int64(l[~check]) - cur_labels = self.take(indexer[check])._values + cur_labels = self.take(indexer[check]).values cur_indexer = _ensure_int64(l[check]) new_labels = np.empty(tuple([len(indexer)]), dtype=object) @@ -2556,7 +2556,7 @@ def _reindex_non_unique(self, target): else: # need to retake to have the same size as the indexer - indexer = indexer._values + indexer = indexer.values indexer[~check] = 0 # reset the new indexer to account for the new size @@ -2879,7 +2879,7 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return ret_index - sv = self.values + sv = self._values ov = other._values if self.is_unique and other.is_unique: @@ -3185,7 +3185,6 @@ def insert(self, loc, item): """ _self = np.asarray(self) item = self._coerce_scalar_to_index(item)._values - idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5229936bd8a04..f77076e54f34d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2349,6 +2349,11 @@ def f(values, freq=None, tz=None): return DatetimeIndex._simple_new(values, None, freq=freq, tz=tz) return f + elif klass == PeriodIndex: + def f(values, freq=None, tz=None): + return PeriodIndex._simple_new(values, None, freq=freq) + return f + return klass def validate_read(self, kwargs): @@ -2450,7 +2455,9 @@ def write_index(self, key, index): setattr(self.attrs, '%s_variety' % key, 'regular') converted = _convert_index(index, self.encoding, self.format_type).set_name('index') + self.write_array(key, converted.values) + node = getattr(self.group, key) node._v_attrs.kind = converted.kind node._v_attrs.name = index.name @@ -2552,12 +2559,12 @@ def read_index_node(self, node, start=None, stop=None): kwargs['tz'] = node._v_attrs['tz'] if kind in (u('date'), u('datetime')): - index = factory( - _unconvert_index(data, kind, encoding=self.encoding), - 
dtype=object, **kwargs) + index = factory(_unconvert_index(data, kind, + encoding=self.encoding), + dtype=object, **kwargs) else: - index = factory( - _unconvert_index(data, kind, encoding=self.encoding), **kwargs) + index = factory(_unconvert_index(data, kind, + encoding=self.encoding), **kwargs) index.name = name @@ -4377,9 +4384,10 @@ def _convert_index(index, encoding=None, format_type=None): index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol( - index.values, 'integer', atom, freq=getattr(index, 'freq', None), - index_name=index_name) + # avoid to store ndarray of Period objects + return IndexCol(index._values, 'integer', atom, + freq=getattr(index, 'freq', None), + index_name=index_name) if isinstance(index, MultiIndex): raise TypeError('MultiIndex not supported here!') diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 59b98ebcff82a..f7e8a4e858441 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -245,9 +245,18 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index.values, result.values, check_same='copy') - result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index.values, result.values, - check_same='same') + if not isinstance(index, PeriodIndex): + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='same') + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') + else: + # .values an object array of Period, thus copied + result = index_type(ordinal=index.asi8, copy=False, + **init_kwargs) + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') def test_copy_and_deepcopy(self): from copy import copy, deepcopy diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index bcc6532fbe0ce..7502a4ce26b04 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -781,7 +781,7 @@ def test_astype(self): idx = period_range('1990', '2009', freq='A') result = idx.astype('i8') self.assert_index_equal(result, Index(idx.asi8)) - self.assert_numpy_array_equal(result.values, idx.values) + self.assert_numpy_array_equal(result.values, idx.asi8) def test_astype_raises(self): # GH 13149, GH 13209 diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d8d8242fa50c6..5fbaea6c5efcb 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -490,16 +490,30 @@ def test_insert_index_period(self): self._assert_insert_conversion(obj, pd.Period('2012-01', freq='M'), exp, 'period[M]') - # ToDo: must coerce to object? - exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', - '2011-03', '2011-04'], freq='M') + # period + datetime64 => object + exp = pd.Index([pd.Period('2011-01', freq='M'), + pd.Timestamp('2012-01-01'), + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), - exp, 'period[M]') + exp, np.object) # period + int => object - msg = "Given date string not likely a datetime." 
- with tm.assertRaisesRegexp(ValueError, msg): - print(obj.insert(1, 1)) + exp = pd.Index([pd.Period('2011-01', freq='M'), + 1, + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') + self._assert_insert_conversion(obj, 1, exp, np.object) + + # period + object => object + exp = pd.Index([pd.Period('2011-01', freq='M'), + 'x', + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') + self._assert_insert_conversion(obj, 'x', exp, np.object) class TestWhereCoercion(CoercionBase, tm.TestCase): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b051b92e15540..e0d63d5aa0c44 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -4137,8 +4137,8 @@ def test_series_partial_set_period(self): idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') ser = Series([0.1, 0.2], index=idx, name='s') - result = ser.loc[[pd.Period('2011-01-01', freq='D'), pd.Period( - '2011-01-02', freq='D')]] + result = ser.loc[[pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-02', freq='D')]] exp = Series([0.1, 0.2], index=idx, name='s') tm.assert_series_equal(result, exp, check_index_type=True) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 52cd65af42c5e..66216758ca091 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -393,7 +393,7 @@ def test_ops(self): if not isinstance(o, PeriodIndex): expected = getattr(o.values, op)() else: - expected = pd.Period(ordinal=getattr(o.values, op)(), + expected = pd.Period(ordinal=getattr(o._values, op)(), freq=o.freq) try: self.assertEqual(result, expected) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index ad774d1b92202..e64a0d2ebaf5e 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -323,7 +323,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self.values) + sorted_values = np.sort(self._values) attribs = self._get_attributes_dict() freq = attribs['freq'] diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index a23e8af3e610c..8f8519a498a31 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -141,11 +141,11 @@ def convert(values, units, axis): is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).values + return values.asfreq(axis.freq)._values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) if is_period_arraylike(values): - return PeriodIndex(values, freq=axis.freq).values + return PeriodIndex(values, freq=axis.freq)._values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -518,7 +518,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros(span, dtype=[('val', np.int64), ('maj', bool), ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_.values + info['val'][:] = dates_._values info['fmt'][:] = '' info['maj'][[0, -1]] = True # .. 
and set some shortcuts diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 9b2fa705df385..8bce01b0759fc 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -35,11 +35,11 @@ _quarter_to_myear) from pandas.core.base import _shared_docs -from pandas.indexes.base import _index_shared_docs +from pandas.indexes.base import _index_shared_docs, _ensure_index from pandas import compat from pandas.util.decorators import Appender, cache_readonly, Substitution -from pandas.lib import Timedelta +from pandas.lib import infer_dtype import pandas.tslib as tslib from pandas.compat import zip, u @@ -47,7 +47,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - return get_period_field_arr(alias, self.values, base) + return get_period_field_arr(alias, self._values, base) f.__name__ = name f.__doc__ = docstring return property(f) @@ -73,7 +73,7 @@ def _period_index_cmp(opname, nat_result=False): def wrapper(self, other): if isinstance(other, Period): - func = getattr(self.values, opname) + func = getattr(self._values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) @@ -85,7 +85,7 @@ def wrapper(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = getattr(self.values, opname)(other.values) + result = getattr(self._values, opname)(other._values) mask = self._isnan | other._isnan if mask.any(): @@ -93,11 +93,11 @@ def wrapper(self, other): return result elif other is tslib.NaT: - result = np.empty(len(self.values), dtype=bool) + result = np.empty(len(self._values), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self.values, opname) + func = getattr(self._values, opname) result = func(other.ordinal) if self.hasnans: @@ -265,13 +265,17 @@ def _from_arraylike(cls, data, freq, tz): if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: freq = data.freq - data = data.values + data = data._values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data.values, + data = period.period_asfreq_arr(data._values, base1, base2, 1) else: + if is_object_dtype(data): + inferred = infer_dtype(data) + if inferred == 'integer': + data = data.astype(np.int64) if freq is None and is_object_dtype(data): # must contain Period instance and thus extract ordinals @@ -286,11 +290,8 @@ def _from_arraylike(cls, data, freq, tz): if np.issubdtype(data.dtype, np.datetime64): data = dt64arr_to_periodarr(data, freq, tz) else: - try: - data = _ensure_int64(data) - except (TypeError, ValueError): - data = _ensure_object(data) - data = period.extract_ordinals(data, freq) + data = _ensure_object(data) + data = period.extract_ordinals(data, freq) return data, freq @@ -349,6 +350,29 @@ def __contains__(self, key): return False return False + @property + def asi8(self): + return self._values.view('i8') + + @property + def _int64index(self): + # do not cache, same as .asi8 + return Int64Index(self.asi8, name=self.name, fastpath=True) + + @property + def values(self): + return self.asobject.values + + @property + def _values(self): + return self._data + + def __array__(self, dtype=None): + if is_integer_dtype(dtype): + return self.asi8 + else: + return self.asobject.values + def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. 
Needs additional handling as @@ -359,15 +383,17 @@ def __array_wrap__(self, result, context=None): if isinstance(context, tuple) and len(context) > 0: func = context[0] if (func is np.add): - try: - return self._add_delta(context[1][1]) - except IncompatibleFrequency: - raise TypeError + pass elif (func is np.subtract): - try: - return self._add_delta(-context[1][1]) - except IncompatibleFrequency: - raise TypeError + name = self.name + left = context[1][0] + right = context[1][1] + if (isinstance(left, PeriodIndex) and + isinstance(right, PeriodIndex)): + name = left.name if left.name == right.name else None + return Index(result, name=name) + elif isinstance(left, Period) or isinstance(right, Period): + return Index(result, name=name) elif isinstance(func, np.ufunc): if 'M->M' not in func.types: msg = "ufunc '{0}' not supported for the PeriodIndex" @@ -377,7 +403,9 @@ def __array_wrap__(self, result, context=None): if is_bool_dtype(result): return result - return self._shallow_copy(result) + # the result is object dtype array of Period + # cannot pass _simple_new as it is + return PeriodIndex(result, freq=self.freq, name=self.name) @property def _box_func(self): @@ -393,11 +421,6 @@ def _to_embed(self, keep_tz=False): def _formatter_func(self): return lambda x: "'%s'" % x - @property - def _int64index(self): - # do not cache, same as .asi8 - return Int64Index(self.asi8, name=self.name, fastpath=True) - def asof_locs(self, where, mask): """ where : array of timestamps @@ -408,13 +431,13 @@ def asof_locs(self, where, mask): if isinstance(where_idx, DatetimeIndex): where_idx = PeriodIndex(where_idx.values, freq=self.freq) - locs = self.values[mask].searchsorted(where_idx.values, side='right') + locs = self._values[mask].searchsorted(where_idx._values, side='right') locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx.values < self.values[first])] = -1 + result[(locs == 0) & (where_idx._values < self._values[first])] = -1 return result @@ -424,8 +447,10 @@ def astype(self, dtype, copy=True, how='start'): if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): - return Index(self.values.astype('i8', copy=copy), name=self.name, - dtype='i8') + if copy: + return self._int64index.copy() + else: + return self._int64index elif is_datetime64_dtype(dtype): return self.to_timestamp(how=how) elif is_datetime64tz_dtype(dtype): @@ -445,7 +470,7 @@ def searchsorted(self, key, side='left', sorter=None): elif isinstance(key, compat.string_types): key = Period(key, freq=self.freq).ordinal - return self.values.searchsorted(key, side=side, sorter=sorter) + return self._values.searchsorted(key, side=side, sorter=sorter) @property def is_all_dates(self): @@ -570,8 +595,7 @@ def equals(self, other): if self.is_(other): return True - if (not hasattr(other, 'inferred_type') or - other.inferred_type != 'int64'): + if not isinstance(other, PeriodIndex): try: other = PeriodIndex(other) except: @@ -605,12 +629,11 @@ def to_timestamp(self, freq=None, how='start'): base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - new_data = period.periodarr_to_dt64arr(new_data.values, base) + new_data = period.periodarr_to_dt64arr(new_data._values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) def _maybe_convert_timedelta(self, other): - if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, Timedelta)): + if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): offset = 
frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -681,7 +704,7 @@ def shift(self, n): ------- shifted : PeriodIndex """ - values = self.values + n * self.freq.n + values = self._values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT return PeriodIndex(data=values, name=self.name, freq=self.freq) @@ -712,7 +735,7 @@ def get_value(self, series, key): grp = frequencies.Resolution.get_freq_group(reso) freqn = frequencies.get_freq_group(self.freq) - vals = self.values + vals = self._values # if our data is higher resolution than requested key, slice if grp < freqn: @@ -723,7 +746,7 @@ def get_value(self, series, key): if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - pos = np.searchsorted(self.values, [ord1, ord2]) + pos = np.searchsorted(self._values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: @@ -740,10 +763,19 @@ def get_value(self, series, key): series, key) def get_indexer(self, target, method=None, limit=None, tolerance=None): + target = _ensure_index(target) + if hasattr(target, 'freq') and target.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, target.freqstr) raise IncompatibleFrequency(msg) - return Index.get_indexer(self, target, method, limit, tolerance) + + if isinstance(target, PeriodIndex): + target = target.asi8 + + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance) + return Index.get_indexer(self._int64index, target, method, + limit, tolerance) def get_loc(self, key, method=None, tolerance=None): """ @@ -862,6 +894,14 @@ def _convert_tolerance(self, tolerance): tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance) return self._maybe_convert_timedelta(tolerance) + def insert(self, loc, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.asobject.insert(loc, item) + + idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]), + self[loc:].asi8)) + return self._shallow_copy(idx) + def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join @@ -949,10 +989,10 @@ def append(self, other): # box to_concat = [x.asobject.values for x in to_concat] else: - cat_values = np.concatenate([x.values for x in to_concat]) + cat_values = np.concatenate([x._values for x in to_concat]) return PeriodIndex(cat_values, freq=self.freq, name=name) - to_concat = [x.values if isinstance(x, Index) else x + to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] return Index(com._concat_compat(to_concat), name=name) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 38c2e009a01f3..5c4bfe5360fac 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -784,7 +784,7 @@ def _get_new_index(self): else: start = ax[0].asfreq(self.freq, how=self.convention) end = ax[-1].asfreq(self.freq, how='end') - values = period_range(start, end, freq=self.freq).values + values = period_range(start, end, freq=self.freq).asi8 return ax._shallow_copy(values, freq=self.freq) @@ -815,7 +815,8 @@ def _downsample(self, how, **kwargs): if len(new_index) == 0: bins = [] else: - rng = np.arange(memb.values[0], memb.values[-1] + 1) + i8 = memb.asi8 + rng = np.arange(i8[0], i8[-1] + 1) bins = memb.searchsorted(rng, side='right') grouper = BinGrouper(bins, new_index) return self._groupby_and_aggregate(how, grouper=grouper) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 
0d6c991f00c8b..4d3c60ce39291 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -2364,8 +2364,8 @@ def _check_freq(index, expected_index): freq='D') result = pidx.sort_values() - expected = PeriodIndex( - ['NaT', '2011', '2011', '2013'], name='pidx', freq='D') + expected = PeriodIndex(['NaT', '2011', '2011', '2013'], + name='pidx', freq='D') self.assert_index_equal(result, expected) self.assertEqual(result.freq, 'D') diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index fe6dcf69e0b4e..1ddcc11c15a59 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1747,8 +1747,12 @@ def test_constructor_corner(self): def test_constructor_fromarraylike(self): idx = period_range('2007-01', periods=20, freq='M') - self.assertRaises(ValueError, PeriodIndex, idx.values) - self.assertRaises(ValueError, PeriodIndex, list(idx.values)) + # values is an array of Period, thus can retrieve freq + tm.assert_index_equal(PeriodIndex(idx.values), idx) + tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) + + self.assertRaises(ValueError, PeriodIndex, idx._values) + self.assertRaises(ValueError, PeriodIndex, list(idx._values)) self.assertRaises(ValueError, PeriodIndex, data=Period('2007', freq='A')) @@ -2027,26 +2031,29 @@ def test_view_asi8(self): tm.assert_numpy_array_equal(idx.asi8, exp) def test_values(self): - # ToDo: .values and .get_values() should return Period as object - # dtype array. ._values shouldn't be changed idx = pd.PeriodIndex([], freq='M') - exp = np.array([], dtype=np.int64) + exp = np.array([], dtype=np.object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([], dtype=np.int64) tm.assert_numpy_array_equal(idx._values, exp) idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') - exp = np.array([492, -9223372036854775808], dtype=np.int64) + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([492, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._values, exp) - exp = np.array([14975, -9223372036854775808], dtype=np.int64) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([14975, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._values, exp) def test_asobject_like(self): @@ -2100,7 +2107,7 @@ def test_getitem_ndim2(self): result = idx[:, None] # MPL kludge, internally has incorrect shape tm.assertIsInstance(result, PeriodIndex) - self.assertEqual(result.shape, (len(idx), 1)) + self.assertEqual(result.shape, (len(idx), )) def test_getitem_index(self): idx = period_range('2007-01', periods=10, freq='M', name='x') @@ -4153,19 +4160,23 @@ def test_pi_ops_errors(self): with tm.assertRaisesRegexp(TypeError, msg): obj - ng - # ToDo: currently, it accepts float because PeriodIndex.values - # is internally int. 
Should be fixed after GH13988 - # msg is different depending on NumPy version - if not _np_version_under1p9: - for ng in ["str"]: - with tm.assertRaises(TypeError): - np.add(obj, ng) + with tm.assertRaises(TypeError): + np.add(obj, ng) + if _np_version_under1p9: + self.assertIs(np.add(ng, obj), NotImplemented) + else: with tm.assertRaises(TypeError): np.add(ng, obj) + with tm.assertRaises(TypeError): + np.subtract(obj, ng) + + if _np_version_under1p9: + self.assertIs(np.subtract(ng, obj), NotImplemented) + else: with tm.assertRaises(TypeError): - np.subtract(ng, obj) + np.subtract(ng, obj) def test_pi_ops_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', @@ -4260,10 +4271,19 @@ def test_pi_sub_period(self): exp = pd.Index([-12, -11, -10, -9], name='idx') tm.assert_index_equal(result, exp) + result = np.subtract(idx, pd.Period('2012-01', freq='M')) + tm.assert_index_equal(result, exp) + result = pd.Period('2012-01', freq='M') - idx exp = pd.Index([12, 11, 10, 9], name='idx') tm.assert_index_equal(result, exp) + result = np.subtract(pd.Period('2012-01', freq='M'), idx) + if _np_version_under1p9: + self.assertIs(result, NotImplemented) + else: + tm.assert_index_equal(result, exp) + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) @@ -4407,7 +4427,8 @@ def test_nanosecondly(self): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - self.assert_numpy_array_equal(rng.values, exp) + self.assert_numpy_array_equal(rng._values, exp) + self.assert_numpy_array_equal(rng.asi8, exp) def test_negone_ordinals(self): freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] From 5152cdd77ba4799a276d2716535255c3301e9741 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Thu, 25 Aug 2016 19:20:52 +0900 Subject: [PATCH 298/359] API/BUG: Fix Series ops inconsistencies (#13894) - series comparison operator to check whether labels are identical (currently: ignores labels) - series boolean operator to align with labels (currently: only keeps left index) --- doc/source/whatsnew/v0.19.0.txt | 138 +++++++++++++++ pandas/core/ops.py | 85 +++++++-- pandas/io/tests/json/test_ujson.py | 34 ++-- pandas/tests/indexes/common.py | 3 +- pandas/tests/series/test_operators.py | 240 ++++++++++++++++++++++++-- 5 files changed, 450 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 4039dafd323c4..2811e31128156 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -488,6 +488,143 @@ New Behavior: type(s.tolist()[0]) +.. _whatsnew_0190.api.series_ops: + +``Series`` operators for different indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Following ``Series`` operators has been changed to make all operators consistent, +including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`) + +- ``Series`` comparison operators now raise ``ValueError`` when ``index`` are different. +- ``Series`` logical operators align both ``index``. + +.. warning:: + Until 0.18.1, comparing ``Series`` with the same length has been succeeded even if + these ``index`` are different (the result ignores ``index``). As of 0.19.0, it raises ``ValueError`` to be more strict. This section also describes how to keep previous behaviour or align different indexes using flexible comparison methods like ``.eq``. 
+ + +As a result, ``Series`` and ``DataFrame`` operators behave as below: + +Arithmetic operators +"""""""""""""""""""" + +Arithmetic operators align both ``index`` (no changes). + +.. ipython:: python + + s1 = pd.Series([1, 2, 3], index=list('ABC')) + s2 = pd.Series([2, 2, 2], index=list('ABD')) + s1 + s2 + + df1 = pd.DataFrame([1, 2, 3], index=list('ABC')) + df2 = pd.DataFrame([2, 2, 2], index=list('ABD')) + df1 + df2 + +Comparison operators +"""""""""""""""""""" + +Comparison operators raise ``ValueError`` when ``index`` are different. + +Previous Behavior (``Series``): + +``Series`` compares values ignoring ``index`` as long as both lengthes are the same. + +.. code-block:: ipython + + In [1]: s1 == s2 + Out[1]: + A False + B True + C False + dtype: bool + +New Behavior (``Series``): + +.. code-block:: ipython + + In [2]: s1 == s2 + Out[2]: + ValueError: Can only compare identically-labeled Series objects + +.. note:: + To achieve the same result as previous versions (compare values based on locations ignoring ``index``), compare both ``.values``. + + .. ipython:: python + + s1.values == s2.values + + If you want to compare ``Series`` aligning its ``index``, see flexible comparison methods section below. + +Current Behavior (``DataFrame``, no change): + +.. code-block:: ipython + + In [3]: df1 == df2 + Out[3]: + ValueError: Can only compare identically-labeled DataFrame objects + +Logical operators +""""""""""""""""" + +Logical operators align both ``index``. + +Previous Behavior (``Series``): + +Only left hand side ``index`` is kept. + +.. code-block:: ipython + + In [4]: s1 = pd.Series([True, False, True], index=list('ABC')) + In [5]: s2 = pd.Series([True, True, True], index=list('ABD')) + In [6]: s1 & s2 + Out[6]: + A True + B False + C False + dtype: bool + +New Behavior (``Series``): + +.. ipython:: python + + s1 = pd.Series([True, False, True], index=list('ABC')) + s2 = pd.Series([True, True, True], index=list('ABD')) + s1 & s2 + +.. note:: + ``Series`` logical operators fill ``NaN`` result with ``False``. + +.. note:: + To achieve the same result as previous versions (compare values based on locations ignoring ``index``), compare both ``.values``. + + .. ipython:: python + + s1.values & s2.values + +Current Behavior (``DataFrame``, no change): + +.. ipython:: python + + df1 = pd.DataFrame([True, False, True], index=list('ABC')) + df2 = pd.DataFrame([True, True, True], index=list('ABD')) + df1 & df2 + +Flexible comparison methods +""""""""""""""""""""""""""" + +``Series`` flexible comparison methods like ``eq``, ``ne``, ``le``, ``lt``, ``ge`` and ``gt`` now align both ``index``. Use these operators if you want to compare two ``Series`` +which has the different ``index``. + +.. ipython:: python + + s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + s2 = pd.Series([2, 2, 2], index=['b', 'c', 'd']) + s1.eq(s2) + s1.ge(s2) + +Previously, it worked as the same as comparison operators (see above). + .. _whatsnew_0190.api.promote: ``Series`` type promotion on assignment @@ -1107,6 +1244,7 @@ Bug Fixes - Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. 
Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) +- Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 8d49e41284a7b..c8d074d3d3bdf 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -311,17 +311,6 @@ def get_op(cls, left, right, name, na_op): is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) - if isinstance(left, ABCSeries) and isinstance(right, ABCSeries): - # avoid repated alignment - if not left.index.equals(right.index): - left, right = left.align(right, copy=False) - - index, lidx, ridx = left.index.join(right.index, how='outer', - return_indexers=True) - # if DatetimeIndex have different tz, convert to UTC - left.index = index - right.index = index - if not (is_datetime_lhs or is_timedelta_lhs): return _Op(left, right, name, na_op) else: @@ -603,6 +592,33 @@ def _is_offset(self, arr_or_obj): return False +def _align_method_SERIES(left, right, align_asobject=False): + """ align lhs and rhs Series """ + + # ToDo: Different from _align_method_FRAME, list, tuple and ndarray + # are not coerced here + # because Series has inconsistencies described in #13637 + + if isinstance(right, ABCSeries): + # avoid repeated alignment + if not left.index.equals(right.index): + + if align_asobject: + # to keep original value's dtype for bool ops + left = left.astype(object) + right = right.astype(object) + + left, right = left.align(right, copy=False) + + index, lidx, ridx = left.index.join(right.index, how='outer', + return_indexers=True) + # if DatetimeIndex have different tz, convert to UTC + left.index = index + right.index = index + + return left, right + + def _arith_method_SERIES(op, name, str_rep, fill_zeros=None, default_axis=None, **eval_kwargs): """ @@ -655,6 +671,8 @@ def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): return NotImplemented + left, right = _align_method_SERIES(left, right) + converted = _Op.get_op(left, right, name, na_op) left, right = converted.left, converted.right @@ -763,8 +781,9 @@ def wrapper(self, other, axis=None): if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) - if len(self) != len(other): - raise ValueError('Series lengths must match to compare') + if not self._indexed_same(other): + msg = 'Can only compare identically-labeled Series objects' + raise ValueError(msg) return self._constructor(na_op(self.values, other.values), index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover @@ -786,6 +805,7 @@ def wrapper(self, other, axis=None): return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) + elif isinstance(other, pd.Categorical): if not is_categorical_dtype(self): msg = ("Cannot compare a Categorical for op {op} with Series " @@ -860,9 +880,10 @@ def wrapper(self, other): fill_int = lambda x: x.fillna(0) fill_bool = lambda x: x.fillna(False).astype(bool) + self, other = _align_method_SERIES(self, other, align_asobject=True) + if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) - other = other.reindex_like(self) is_other_int_dtype = 
is_integer_dtype(other.dtype) other = fill_int(other) if is_other_int_dtype else fill_bool(other) @@ -912,7 +933,32 @@ def wrapper(self, other): 'floordiv': {'op': '//', 'desc': 'Integer division', 'reversed': False, - 'reverse': 'rfloordiv'}} + 'reverse': 'rfloordiv'}, + + 'eq': {'op': '==', + 'desc': 'Equal to', + 'reversed': False, + 'reverse': None}, + 'ne': {'op': '!=', + 'desc': 'Not equal to', + 'reversed': False, + 'reverse': None}, + 'lt': {'op': '<', + 'desc': 'Less than', + 'reversed': False, + 'reverse': None}, + 'le': {'op': '<=', + 'desc': 'Less than or equal to', + 'reversed': False, + 'reverse': None}, + 'gt': {'op': '>', + 'desc': 'Greater than', + 'reversed': False, + 'reverse': None}, + 'ge': {'op': '>=', + 'desc': 'Greater than or equal to', + 'reversed': False, + 'reverse': None}} _op_names = list(_op_descriptions.keys()) for k in _op_names: @@ -963,10 +1009,11 @@ def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # validate axis - self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if isinstance(other, ABCSeries): return self._binop(other, op, level=level, fill_value=fill_value) - elif isinstance(other, (np.ndarray, ABCSeries, list, tuple)): + elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError('Lengths must be equal') return self._binop(self._constructor(other, self.index), op, @@ -975,7 +1022,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): if fill_value is not None: self = self.fillna(fill_value) - return self._constructor(op(self.values, other), + return self._constructor(op(self, other), self.index).__finalize__(self) flex_wrapper.__name__ = name @@ -983,7 +1030,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): series_flex_funcs = dict(flex_arith_method=_flex_method_SERIES, - flex_comp_method=_comp_method_SERIES) + flex_comp_method=_flex_method_SERIES) series_special_funcs = dict(arith_method=_arith_method_SERIES, comp_method=_comp_method_SERIES, diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 13b2dafec9c89..0dda6ead2a3b9 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -1306,43 +1306,45 @@ def testSeries(self): # column indexed outp = Series(ujson.decode(ujson.encode(s))).sort_values() - self.assertTrue((s == outp).values.all()) + exp = Series([10, 20, 30, 40, 50, 60], + index=['6', '7', '8', '9', '10', '15']) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode(s), numpy=True)).sort_values() - self.assertTrue((s == outp).values.all()) + tm.assert_series_equal(outp, exp) dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"))) outp = Series(**dec) - self.assertTrue((s == outp).values.all()) - self.assertTrue(s.name == outp.name) + tm.assert_series_equal(outp, s) dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), numpy=True)) outp = Series(**dec) - self.assertTrue((s == outp).values.all()) - self.assertTrue(s.name == outp.name) - outp = Series(ujson.decode(ujson.encode( - s, orient="records"), numpy=True)) - self.assertTrue((s == outp).values.all()) + outp = Series(ujson.decode(ujson.encode(s, orient="records"), + numpy=True)) + exp = Series([10, 20, 30, 40, 50, 60]) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode(s, orient="records"))) - self.assertTrue((s == outp).values.all()) + 
tm.assert_series_equal(outp, exp) - outp = Series(ujson.decode( - ujson.encode(s, orient="values"), numpy=True)) - self.assertTrue((s == outp).values.all()) + outp = Series(ujson.decode(ujson.encode(s, orient="values"), + numpy=True)) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode(s, orient="values"))) - self.assertTrue((s == outp).values.all()) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode( s, orient="index"))).sort_values() - self.assertTrue((s == outp).values.all()) + exp = Series([10, 20, 30, 40, 50, 60], + index=['6', '7', '8', '9', '10', '15']) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode( s, orient="index"), numpy=True)).sort_values() - self.assertTrue((s == outp).values.all()) + tm.assert_series_equal(outp, exp) def testSeriesNested(self): s = Series([10, 20, 30, 40, 50, 60], name="series", diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index f7e8a4e858441..687782172693a 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -685,7 +685,8 @@ def test_equals_op(self): index_a == series_d with tm.assertRaisesRegexp(ValueError, "Lengths must match"): index_a == array_d - with tm.assertRaisesRegexp(ValueError, "Series lengths must match"): + msg = "Can only compare identically-labeled Series objects" + with tm.assertRaisesRegexp(ValueError, msg): series_a == series_d with tm.assertRaisesRegexp(ValueError, "Lengths must match"): series_a == array_d diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 5fc44fe1dc608..f7fc45d78af97 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -44,8 +44,9 @@ def test_comparisons(self): s2 = Series([False, True, False]) # it works! 
- s == s2 - s2 == s + exp = Series([False, False, False]) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) def test_op_method(self): def check(series, other, check_reverse=False): @@ -1082,15 +1083,15 @@ def test_comparison_label_based(self): a = Series([True, False, True], list('bca')) b = Series([False, True, False], list('abc')) - expected = Series([True, False, False], list('bca')) + expected = Series([False, True, False], list('abc')) result = a & b assert_series_equal(result, expected) - expected = Series([True, False, True], list('bca')) + expected = Series([True, True, False], list('abc')) result = a | b assert_series_equal(result, expected) - expected = Series([False, False, True], list('bca')) + expected = Series([True, False, False], list('abc')) result = a ^ b assert_series_equal(result, expected) @@ -1098,11 +1099,11 @@ def test_comparison_label_based(self): a = Series([True, False, True], list('bca')) b = Series([False, True, False, True], list('abcd')) - expected = Series([True, False, False], list('bca')) + expected = Series([False, True, False, False], list('abcd')) result = a & b assert_series_equal(result, expected) - expected = Series([True, False, True], list('bca')) + expected = Series([True, True, False, False], list('abcd')) result = a | b assert_series_equal(result, expected) @@ -1119,20 +1120,28 @@ def test_comparison_label_based(self): # vs non-matching result = a & Series([1], ['z']) - expected = Series([False, False, False], list('bca')) + expected = Series([False, False, False, False], list('abcz')) assert_series_equal(result, expected) result = a | Series([1], ['z']) - expected = Series([True, False, True], list('bca')) + expected = Series([True, True, False, False], list('abcz')) assert_series_equal(result, expected) # identity # we would like s[s|e] == s to hold for any e, whether empty or not - for e in [Series([]), Series([1], ['z']), Series(['z']), + for e in [Series([]), Series([1], ['z']), Series(np.nan, b.index), Series(np.nan, a.index)]: result = a[a | e] assert_series_equal(result, a[a]) + for e in [Series(['z'])]: + if compat.PY3: + with tm.assert_produces_warning(RuntimeWarning): + result = a[a | e] + else: + result = a[a | e] + assert_series_equal(result, a[a]) + # vs scalars index = list('bca') t = Series([True, False, True]) @@ -1162,6 +1171,76 @@ def test_comparison_label_based(self): for v in [np.nan]: self.assertRaises(TypeError, lambda: t & v) + def test_comparison_flex_basic(self): + left = pd.Series(np.random.randn(10)) + right = pd.Series(np.random.randn(10)) + + tm.assert_series_equal(left.eq(right), left == right) + tm.assert_series_equal(left.ne(right), left != right) + tm.assert_series_equal(left.le(right), left < right) + tm.assert_series_equal(left.lt(right), left <= right) + tm.assert_series_equal(left.gt(right), left > right) + tm.assert_series_equal(left.ge(right), left >= right) + + # axis + for axis in [0, None, 'index']: + tm.assert_series_equal(left.eq(right, axis=axis), left == right) + tm.assert_series_equal(left.ne(right, axis=axis), left != right) + tm.assert_series_equal(left.le(right, axis=axis), left < right) + tm.assert_series_equal(left.lt(right, axis=axis), left <= right) + tm.assert_series_equal(left.gt(right, axis=axis), left > right) + tm.assert_series_equal(left.ge(right, axis=axis), left >= right) + + # + msg = 'No axis named 1 for object type' + for op in ['eq', 'ne', 'le', 'le', 'gt', 'ge']: + with tm.assertRaisesRegexp(ValueError, msg): + getattr(left, op)(right, axis=1) + + def 
test_comparison_flex_alignment(self): + left = Series([1, 3, 2], index=list('abc')) + right = Series([2, 2, 2], index=list('bcd')) + + exp = pd.Series([False, False, True, False], index=list('abcd')) + tm.assert_series_equal(left.eq(right), exp) + + exp = pd.Series([True, True, False, True], index=list('abcd')) + tm.assert_series_equal(left.ne(right), exp) + + exp = pd.Series([False, False, True, False], index=list('abcd')) + tm.assert_series_equal(left.le(right), exp) + + exp = pd.Series([False, False, False, False], index=list('abcd')) + tm.assert_series_equal(left.lt(right), exp) + + exp = pd.Series([False, True, True, False], index=list('abcd')) + tm.assert_series_equal(left.ge(right), exp) + + exp = pd.Series([False, True, False, False], index=list('abcd')) + tm.assert_series_equal(left.gt(right), exp) + + def test_comparison_flex_alignment_fill(self): + left = Series([1, 3, 2], index=list('abc')) + right = Series([2, 2, 2], index=list('bcd')) + + exp = pd.Series([False, False, True, True], index=list('abcd')) + tm.assert_series_equal(left.eq(right, fill_value=2), exp) + + exp = pd.Series([True, True, False, False], index=list('abcd')) + tm.assert_series_equal(left.ne(right, fill_value=2), exp) + + exp = pd.Series([False, False, True, True], index=list('abcd')) + tm.assert_series_equal(left.le(right, fill_value=0), exp) + + exp = pd.Series([False, False, False, True], index=list('abcd')) + tm.assert_series_equal(left.lt(right, fill_value=0), exp) + + exp = pd.Series([True, True, True, False], index=list('abcd')) + tm.assert_series_equal(left.ge(right, fill_value=0), exp) + + exp = pd.Series([True, True, False, False], index=list('abcd')) + tm.assert_series_equal(left.gt(right, fill_value=0), exp) + def test_operators_bitwise(self): # GH 9016: support bitwise op for integer types index = list('bca') @@ -1197,11 +1276,11 @@ def test_operators_bitwise(self): s_a0b1c0 = Series([1], list('b')) res = s_tft & s_a0b1c0 - expected = s_tff + expected = s_tff.reindex(list('abc')) assert_series_equal(res, expected) res = s_tft | s_a0b1c0 - expected = s_tft + expected = s_tft.reindex(list('abc')) assert_series_equal(res, expected) n0 = 0 @@ -1238,9 +1317,25 @@ def test_operators_bitwise(self): self.assertRaises(TypeError, lambda: s_0123 & [0.1, 4, 3.14, 2]) # s_0123 will be all false now because of reindexing like s_tft - assert_series_equal(s_tft & s_0123, Series([False] * 3, list('bca'))) + if compat.PY3: + # unable to sort incompatible object via .union. + exp = Series([False] * 7, index=['b', 'c', 'a', 0, 1, 2, 3]) + with tm.assert_produces_warning(RuntimeWarning): + assert_series_equal(s_tft & s_0123, exp) + else: + exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + assert_series_equal(s_tft & s_0123, exp) + # s_tft will be all false now because of reindexing like s_0123 - assert_series_equal(s_0123 & s_tft, Series([False] * 4)) + if compat.PY3: + # unable to sort incompatible object via .union. 
+ exp = Series([False] * 7, index=[0, 1, 2, 3, 'b', 'c', 'a']) + with tm.assert_produces_warning(RuntimeWarning): + assert_series_equal(s_0123 & s_tft, exp) + else: + exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + assert_series_equal(s_0123 & s_tft, exp) + assert_series_equal(s_0123 & False, Series([False] * 4)) assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) assert_series_equal(s_0123 & [False], Series([False] * 4)) @@ -1324,6 +1419,123 @@ def _check_op(arr, op): _check_op(arr, operator.truediv) _check_op(arr, operator.floordiv) + def test_arith_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + + exp = pd.Series([3.0, 4.0, np.nan, np.nan], + index=list('ABCD'), name='x') + tm.assert_series_equal(s1 + s2, exp) + tm.assert_series_equal(s2 + s1, exp) + + exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, + index=list('ABCD')) + tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) + tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) + + # different length + s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') + + exp = pd.Series([3, 4, 5, np.nan], + index=list('ABCD'), name='x') + tm.assert_series_equal(s3 + s4, exp) + tm.assert_series_equal(s4 + s3, exp) + + exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, + index=list('ABCD')) + tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) + tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) + + def test_comp_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + + s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') + + for l, r in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: + + msg = "Can only compare identically-labeled Series objects" + with tm.assertRaisesRegexp(ValueError, msg): + l == r + + with tm.assertRaisesRegexp(ValueError, msg): + l != r + + with tm.assertRaisesRegexp(ValueError, msg): + l < r + + msg = "Can only compare identically-labeled DataFrame objects" + with tm.assertRaisesRegexp(ValueError, msg): + l.to_frame() == r.to_frame() + + with tm.assertRaisesRegexp(ValueError, msg): + l.to_frame() != r.to_frame() + + with tm.assertRaisesRegexp(ValueError, msg): + l.to_frame() < r.to_frame() + + def test_bool_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([True, False, True], index=list('ABC'), name='x') + s2 = pd.Series([True, True, False], index=list('ABD'), name='x') + + exp = pd.Series([True, False, False, False], + index=list('ABCD'), name='x') + tm.assert_series_equal(s1 & s2, exp) + tm.assert_series_equal(s2 & s1, exp) + + # True | np.nan => True + exp = pd.Series([True, True, True, False], + index=list('ABCD'), name='x') + tm.assert_series_equal(s1 | s2, exp) + # np.nan | True => np.nan, filled with False + exp = pd.Series([True, True, False, False], + index=list('ABCD'), name='x') + tm.assert_series_equal(s2 | s1, exp) + + # DataFrame doesn't fill nan with False + exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, + index=list('ABCD')) + tm.assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) + tm.assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) + + exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, + index=list('ABCD')) + tm.assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) + tm.assert_frame_equal(s2.to_frame() | s1.to_frame(), 
exp) + + # different length + s3 = pd.Series([True, False, True], index=list('ABC'), name='x') + s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x') + + exp = pd.Series([True, False, True, False], + index=list('ABCD'), name='x') + tm.assert_series_equal(s3 & s4, exp) + tm.assert_series_equal(s4 & s3, exp) + + # np.nan | True => np.nan, filled with False + exp = pd.Series([True, True, True, False], + index=list('ABCD'), name='x') + tm.assert_series_equal(s3 | s4, exp) + # True | np.nan => True + exp = pd.Series([True, True, True, True], + index=list('ABCD'), name='x') + tm.assert_series_equal(s4 | s3, exp) + + exp = pd.DataFrame({'x': [True, False, True, np.nan]}, + index=list('ABCD')) + tm.assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) + tm.assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) + + exp = pd.DataFrame({'x': [True, True, True, np.nan]}, + index=list('ABCD')) + tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) + tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) + def test_series_frame_radd_bug(self): # GH 353 vals = Series(tm.rands_array(5, 10)) From 185fcbebd8c65706a9e3fea2d1ae28aa2d8baa93 Mon Sep 17 00:00:00 2001 From: Grant Roch Date: Thu, 25 Aug 2016 18:20:12 -0400 Subject: [PATCH 299/359] TST/DOC: apply date() with timezones (#14085) --- doc/source/timeseries.rst | 4 ++-- pandas/tests/series/test_datetime_values.py | 14 +++++++++++++- pandas/tseries/index.py | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index a35b8d561a5a7..6f44ee0c87945 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -544,8 +544,8 @@ There are several time/date properties that one can access from ``Timestamp`` or second,"The seconds of the datetime" microsecond,"The microseconds of the datetime" nanosecond,"The nanoseconds of the datetime" - date,"Returns datetime.date" - time,"Returns datetime.time" + date,"Returns datetime.date (does not contain timezone information)" + time,"Returns datetime.time (does not contain timezone information)" dayofyear,"The ordinal day of year" weekofyear,"The week ordinal of the year" week,"The week ordinal of the year" diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 6211597b4a91b..8f2ab0ed28839 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -1,7 +1,7 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -from datetime import datetime +from datetime import datetime, date import numpy as np import pandas as pd @@ -410,3 +410,15 @@ def test_between(self): result = s[s.between(s[3], s[17], inclusive=False)] expected = s[5:16].dropna() assert_series_equal(result, expected) + + def test_date_tz(self): + # GH11757 + rng = pd.DatetimeIndex(['2014-04-04 23:56', + '2014-07-18 21:24', + '2015-11-22 22:14'], tz="US/Eastern") + s = Series(rng) + expected = Series([date(2014, 4, 4), + date(2014, 7, 18), + date(2015, 11, 22)]) + assert_series_equal(s.dt.date, expected) + assert_series_equal(s.apply(lambda x: x.date()), expected) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 8f50ddc0f9e41..f78574521ffeb 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1597,7 +1597,8 @@ def time(self): @property def date(self): """ - Returns numpy array of datetime.date. The date part of the Timestamps. 
+ Returns numpy array of python datetime.date objects (namely, the date + part of Timestamps without timezone information). """ return self._maybe_mask_results(_algos.arrmap_object( self.asobject.values, lambda x: x.date())) From 042b6f00ad691345812e61bb7e86e52476805602 Mon Sep 17 00:00:00 2001 From: Kernc Date: Fri, 26 Aug 2016 15:58:24 -0400 Subject: [PATCH 300/359] BUG: yield correct Series subclass in df.iterrows() (#13977) closes #13977 Author: Kernc Closes #13978 from kernc/iterrows-with-constructor-sliced and squashes the following commits: 9aaac80 [Kernc] BUG: yield correct Series subclass in df.iterrows() (#13977) f8f4230 [Kernc] DOC: tm.assert_series_equal() fix docstring default values --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 3 ++- pandas/tests/frame/test_subclass.py | 7 +++++++ pandas/util/testing.py | 4 ++-- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2811e31128156..4a2468012e069 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1209,6 +1209,7 @@ Bug Fixes - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) - Bug in ``.to_html``, ``.to_latex`` and ``.to_string`` silently ignore custom datetime formatter passed through the ``formatters`` key word (:issue:`10690`) +- Bug in ``DataFrame.iterrows()``, not yielding a ``Series`` subclasse if defined (:issue:`13977`) - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 501f4e443b1fc..205af5c805877 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -694,8 +694,9 @@ def iterrows(self): """ columns = self.columns + klass = self._constructor_sliced for k, v in zip(self.index, self.values): - s = Series(v, index=columns, name=k) + s = klass(v, index=columns, name=k) yield k, s def itertuples(self, index=True, name="Pandas"): diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 0e0ee75a30c84..6a57f67a6cb3d 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -211,6 +211,13 @@ def test_subclass_align_combinations(self): tm.assertIsInstance(res2, tm.SubclassedDataFrame) tm.assert_frame_equal(res2, exp1) + def test_subclass_iterrows(self): + # GH 13977 + df = tm.SubclassedDataFrame({'a': [1]}) + for i, row in df.iterrows(): + tm.assertIsInstance(row, tm.SubclassedSeries) + tm.assert_series_equal(row, df.loc[i]) + def test_subclass_sparse_slice(self): rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] ssdf = tm.SubclassedSparseDataFrame(rows) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2d1d88b69941b..94de8cb034024 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1104,10 +1104,10 @@ def assert_series_equal(left, right, check_dtype=True, right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. 
- check_index_type : bool / string {'equiv'}, default False + check_index_type : bool / string {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. - check_series_type : bool, default False + check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. From e31f981d23c932c1679e7a893b01d7f6d936af5a Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Fri, 26 Aug 2016 16:12:09 -0400 Subject: [PATCH 301/359] BUG: Series indexing with tuple-valued data and a numeric index closes #13509 Author: Ben Kandel Closes #14092 from bkandel/fix-floatindex-tuple and squashes the following commits: 2b77554 [Ben Kandel] Fix bug in Float64Index.get_value() for tuples. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/numeric.py | 10 +--------- pandas/tests/indexing/test_floats.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 4a2468012e069..0ff63117ed121 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1150,6 +1150,7 @@ Bug Fixes - Bug in ``DatetimeTZDtype`` dtype with ``dateutil.tz.tzlocal`` cannot be regarded as valid dtype (:issue:`13583`) - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) - Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) +- Bug in ``Series`` indexing with tuple-valued data and a numeric index (:issue:`13509`) - Bug in printing ``pd.DataFrame`` where unusual elements with the ``object`` dtype were causing segfaults (:issue:`13717`) - Bug in ranking ``Series`` which could result in segfaults (:issue:`13445`) diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 82a6ec0b28ac9..e1ac0939812f6 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -293,19 +293,11 @@ def get_value(self, series, key): if not is_scalar(key): raise InvalidIndexError - from pandas.core.indexing import maybe_droplevels - from pandas.core.series import Series - k = _values_from_object(key) loc = self.get_loc(k) new_values = _values_from_object(series)[loc] - if is_scalar(new_values) or new_values is None: - return new_values - - new_index = self[loc] - new_index = maybe_droplevels(new_index, k) - return Series(new_values, index=new_index, name=series.name) + return new_values def equals(self, other): """ diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 29f3889d20bd0..920aefa24b576 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -676,3 +676,14 @@ def test_floating_misc(self): assert_series_equal(result1, result2) assert_series_equal(result1, result3) assert_series_equal(result1, Series([1], index=[2.5])) + + def test_floating_tuples(self): + # GH13509 + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name='foo') + result = s[0.0] + self.assertEqual(result, (1, 1)) + + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name='foo') + result = s[0.0] + expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name='foo') + assert_series_equal(result, expected) From 
0db43045508c474f1fcaf8c3f10a306c0e571c91 Mon Sep 17 00:00:00 2001 From: Tom Bird Date: Wed, 3 Aug 2016 18:48:45 +0100 Subject: [PATCH 302/359] BUG: Empty lists shouldn't be counted as DateOffsets. closes #13844 closes #13889 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/ops.py | 5 ++--- pandas/tests/series/test_timeseries.py | 9 +++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0ff63117ed121..9ccbf5772aaef 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1133,6 +1133,7 @@ Bug Fixes - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) +- Bug where empty ``Series`` were incorrectly coerced in datetime-like numeric operations (:issue:`13844`) - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index c8d074d3d3bdf..b84eb0ba4cbf9 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -586,10 +586,9 @@ def _is_offset(self, arr_or_obj): """ check if obj or all elements of list-like is DateOffset """ if isinstance(arr_or_obj, pd.DateOffset): return True - elif is_list_like(arr_or_obj): + elif is_list_like(arr_or_obj) and len(arr_or_obj): return all(isinstance(x, pd.DateOffset) for x in arr_or_obj) - else: - return False + return False def _align_method_SERIES(left, right, align_asobject=False): diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 341d18f987abc..07d2abc1bcbb2 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -552,3 +552,12 @@ def test_timeseries_coercion(self): self.assertTrue(ser.is_time_series) self.assertTrue(ser.index.is_all_dates) self.assertIsInstance(ser.index, DatetimeIndex) + + def test_empty_series_ops(self): + # see issue #13844 + a = Series(dtype='M8[ns]') + b = Series(dtype='m8[ns]') + assert_series_equal(a, a + b) + assert_series_equal(a, a - b) + assert_series_equal(a, b + a) + self.assertRaises(TypeError, lambda x, y: x - y, b, a) From 0e61847e111a3ba181f16c8b9b974c74d360ad2e Mon Sep 17 00:00:00 2001 From: John Liekezer Date: Fri, 26 Aug 2016 16:45:13 -0400 Subject: [PATCH 303/359] BUG: Dataframe.fillna with np.nan for dtype=category(GH 14021) closes #14021 Author: John Liekezer Closes #14051 from conquistador1492/issue_14021 and squashes the following commits: a405777 [John Liekezer] BUG: Dataframe.fillna with np.nan for dtype=category(GH 14021) --- doc/source/whatsnew/v0.19.0.txt | 3 ++- pandas/core/categorical.py | 5 ++++- pandas/tests/test_categorical.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9ccbf5772aaef..37c1be0289d40 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1188,7 +1188,8 @@ Bug Fixes - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) - Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`) -- Bug ``Series.fillna()`` and 
``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`) +- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737` +- Bug in ``.fillna(value=np.nan)`` incorrectly raises ``KeyError`` on a ``category`` dtyped ``Series`` (:issue:`14021`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 3ec1c7085c87d..2c89e4c05c633 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1464,7 +1464,10 @@ def fillna(self, value=None, method=None, limit=None): mask = values == -1 if mask.any(): values = values.copy() - values[mask] = self.categories.get_loc(value) + if isnull(value): + values[mask] = -1 + else: + values[mask] = self.categories.get_loc(value) return self._constructor(values, categories=self.categories, ordered=self.ordered, fastpath=True) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 70e07b1e4930a..d703ee7c1d1c2 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4118,6 +4118,37 @@ def f(): res = df.fillna("a") tm.assert_frame_equal(res, df_exp) + # GH 14021 + # np.nan should always be a is a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + res = df.fillna(df.median()) + v_exp = [np.nan, np.nan, np.nan] + df_exp = pd.DataFrame({"cats": [2, 2, 2], "vals": v_exp}, + dtype='category') + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', + '2011-01-01 09:00', pd.NaT, pd.NaT]) + df = DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', + pd.NaT, pd.NaT], freq='M') + df = DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.TimedeltaIndex(['1 days', '2 days', + '1 days', pd.NaT, pd.NaT]) + df = pd.DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + def test_astype_to_other(self): s = self.cat['value_group'] From 670435ab346d6b35e65b3b1de2b56d850ff25cc6 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 27 Aug 2016 05:21:45 -0400 Subject: [PATCH 304/359] DEPR: Deprecated Index.to_datetime (#14096) Partially addresses gh-8254. Closes gh-8612 because pd.to_datetime has a format arg. 
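A minimal sketch of the migration this deprecation suggests, assuming a plain string Index as in the updated tests; the explicit format string is only illustrative:

```python
import pandas as pd

idx = pd.Index(['1/1/2000', '1/2/2000', '1/3/2000'])

# deprecated after this change: idx.to_datetime() emits a FutureWarning
# preferred replacement: pd.to_datetime, which also accepts a format argument
result = pd.to_datetime(idx, format='%m/%d/%Y')

print(result)
# DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'],
#               dtype='datetime64[ns]', freq=None)
```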
--- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/base.py | 5 ++++ pandas/io/tests/parser/parse_dates.py | 38 +++++++------------------ pandas/tseries/tests/test_timeseries.py | 20 +++++++------ 4 files changed, 29 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 37c1be0289d40..7e857b254d0a6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1031,6 +1031,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favour of ``pd.to_datetime`` (:issue:`8254`) - ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) - ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) - ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 49b16ec9a71ab..8c1378c07a1d2 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -851,9 +851,14 @@ def _to_safe_for_reshape(self): def to_datetime(self, dayfirst=False): """ + DEPRECATED: use :meth:`pandas.to_datetime` instead. + For an Index containing strings or datetime.datetime objects, attempt conversion to DatetimeIndex """ + warnings.warn("to_datetime is deprecated. Use pd.to_datetime(...)", + FutureWarning, stacklevel=2) + from pandas.tseries.index import DatetimeIndex if self.inferred_type == 'string': from dateutil.parser import parse diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index 01816bde66120..ef48cc8a65620 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -20,8 +20,7 @@ from pandas import DataFrame, Series, Index, DatetimeIndex from pandas import compat -from pandas.compat import(parse_date, StringIO, - lrange, lmap) +from pandas.compat import parse_date, StringIO, lrange from pandas.tseries.index import date_range @@ -291,33 +290,18 @@ def test_yy_format_with_yearfirst(self): tm.assert_frame_equal(rs, xp) def test_parse_dates_column_list(self): - from pandas.core.datetools import to_datetime - - data = '''date;destination;ventilationcode;unitcode;units;aux_date -01/01/2010;P;P;50;1;12/1/2011 -01/01/2010;P;R;50;1;13/1/2011 -15/01/2010;P;P;50;1;14/1/2011 -01/05/2010;P;P;50;1;15/1/2011''' - - expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4)) - - lev = expected.index.levels[0] - levels = list(expected.index.levels) - levels[0] = lev.to_datetime(dayfirst=True) - # hack to get this to work - remove for final test - levels[0].name = lev.name - expected.index.set_levels(levels, inplace=True) - expected['aux_date'] = to_datetime(expected['aux_date'], - dayfirst=True) - expected['aux_date'] = lmap(Timestamp, expected['aux_date']) - tm.assertIsInstance(expected['aux_date'][0], datetime) - - df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), - parse_dates=[0, 5], dayfirst=True) + data = 'a,b,c\n01/01/2010,1,15/02/2010' + + expected = DataFrame({'a': [datetime(2010, 1, 1)], 'b': [1], + 'c': [datetime(2010, 2, 15)]}) + expected = expected.set_index(['a', 'b']) + + df = self.read_csv(StringIO(data), 
index_col=[0, 1], + parse_dates=[0, 2], dayfirst=True) tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), - parse_dates=['date', 'aux_date'], dayfirst=True) + df = self.read_csv(StringIO(data), index_col=[0, 1], + parse_dates=['a', 'c'], dayfirst=True) tm.assert_frame_equal(df, expected) def test_multi_index_parse_dates(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f3980b4e254f8..c19de2ff7ca35 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2535,15 +2535,19 @@ def test_unit_mixed(self): def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) - result = idx.to_datetime() - expected = DatetimeIndex(datetools.to_datetime(idx.values)) - tm.assert_index_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = idx.to_datetime() + expected = DatetimeIndex(datetools.to_datetime(idx.values)) + tm.assert_index_equal(result, expected) - today = datetime.today() - idx = Index([today], dtype=object) - result = idx.to_datetime() - expected = DatetimeIndex([today]) - tm.assert_index_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + today = datetime.today() + idx = Index([today], dtype=object) + result = idx.to_datetime() + expected = DatetimeIndex([today]) + tm.assert_index_equal(result, expected) def test_dataframe(self): From 9d10b76fa22382b6017741a1344922c627c881d3 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 27 Aug 2016 09:31:30 -0400 Subject: [PATCH 305/359] BUG: Don't parse index column as numeric when parse_dates=True (#14077) When a thousands parameter is specified, if the index column data contains that thousands value for date purposes (e.g. '.'), do not interpret those characters as the thousands parameter. Closes gh-14066. 
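A small usage sketch of the fixed behaviour, mirroring the new test below (the column name `a` and the sample date are illustrative):

```python
import pandas as pd
from pandas.compat import StringIO

data = 'a\n04.15.2016'

# with this fix the index column is parsed as a date; the thousands='.'
# setting is no longer applied to it as if it were numeric data
df = pd.read_csv(StringIO(data), index_col=0, parse_dates=True, thousands='.')

print(df.index)
# DatetimeIndex(['2016-04-15'], dtype='datetime64[ns]', name='a', freq=None)
```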
--- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/parsers.py | 15 +++++++++++ pandas/io/tests/parser/parse_dates.py | 32 +++++++++++++++++++++++ pandas/io/tests/parser/usecols.py | 37 ++++++++++++--------------- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 7e857b254d0a6..9c5c36528d31b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1126,6 +1126,7 @@ Bug Fixes - Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) +- Bug in ``pd.read_csv()`` where the index columns were being incorrectly parsed when parsed as dates with a ``thousands`` parameter (:issue:`14066`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e765ebc36e33e..62f2ad1419d92 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1474,6 +1474,13 @@ def _set(x): else: _set(val) + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + def set_error_bad_lines(self, status): self._reader.set_error_bad_lines(int(status)) @@ -1856,6 +1863,14 @@ def _set(x): _set(k) else: _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + return noconvert_columns def _make_reader(self, f): diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index ef48cc8a65620..9fe49f616c5f2 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -458,3 +458,35 @@ def test_parse_dates_empty_string(self): result = self.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) self.assertTrue(result['Date'].isnull()[1]) + + def test_parse_dates_noconvert_thousands(self): + # see gh-14066 + data = 'a\n04.15.2016' + + expected = DataFrame([datetime(2016, 4, 15)], columns=['a']) + result = self.read_csv(StringIO(data), parse_dates=['a'], + thousands='.') + tm.assert_frame_equal(result, expected) + + exp_index = DatetimeIndex(['2016-04-15'], name='a') + expected = DataFrame(index=exp_index) + result = self.read_csv(StringIO(data), index_col=0, + parse_dates=True, thousands='.') + tm.assert_frame_equal(result, expected) + + data = 'a,b\n04.15.2016,09.16.2013' + + expected = DataFrame([[datetime(2016, 4, 15), + datetime(2013, 9, 16)]], + columns=['a', 'b']) + result = self.read_csv(StringIO(data), parse_dates=['a', 'b'], + thousands='.') + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[datetime(2016, 4, 15), + datetime(2013, 9, 16)]], + columns=['a', 'b']) + expected = expected.set_index(['a', 'b']) + result = self.read_csv(StringIO(data), index_col=[0, 1], + parse_dates=True, thousands='.') + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index ac32c20034c66..16a19c50be960 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -5,13 +5,12 @@ for all of the parsers defined in parsers.py """ -from 
datetime import datetime import nose import numpy as np import pandas.util.testing as tm -from pandas import DataFrame +from pandas import DataFrame, Index from pandas.lib import Timestamp from pandas.compat import StringIO @@ -99,35 +98,31 @@ def test_usecols_index_col_False(self): def test_usecols_index_col_conflict(self): # see gh-4201: test that index_col as integer reflects usecols - data = """SecId,Time,Price,P2,P3 -10000,2013-5-11,100,10,1 -500,2013-5-12,101,11,1 -""" - expected = DataFrame({'Price': [100, 101]}, index=[ - datetime(2013, 5, 11), datetime(2013, 5, 12)]) - expected.index.name = 'Time' + data = 'a,b,c,d\nA,a,1,one\nB,b,2,two' + expected = DataFrame({'c': [1, 2]}, index=Index( + ['a', 'b'], name='b')) - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=['b', 'c'], + index_col=0) tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=['b', 'c'], + index_col='b') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=[1, 2], + index_col='b') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=[1, 2], + index_col=0) tm.assert_frame_equal(expected, df) expected = DataFrame( - {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) - expected = expected.set_index(['Price', 'P2']) - df = self.read_csv(StringIO(data), usecols=[ - 'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')}) + expected = expected.set_index(['b', 'c']) + df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'], + index_col=['b', 'c']) tm.assert_frame_equal(expected, df) def test_usecols_implicit_index_col(self): From a0151a7b1287e77bcaf0340eba51a2efd7fa8bba Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Aug 2016 15:34:27 +0200 Subject: [PATCH 306/359] DOC: unpin IPython version (GH13639) (#14037) --- ci/requirements-2.7_DOC_BUILD.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index cde0719aa027e..b87a41df4191d 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,4 +1,4 @@ -ipython=4.2.0 +ipython ipykernel sphinx nbconvert From 8d1646c36a289e3fe921aeeb5fd5ab60c7a10ae7 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Sun, 28 Aug 2016 08:14:55 -0500 Subject: [PATCH 307/359] COMPAT: int dtype in json tests (#14100) --- pandas/io/tests/json/test_ujson.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 0dda6ead2a3b9..704023bd847b7 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -1321,20 +1321,22 @@ def testSeries(self): numpy=True)) outp = Series(**dec) + exp_np = Series(np.array([10, 20, 30, 40, 50, 60])) + exp_pd = Series([10, 20, 30, 40, 50, 60]) outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True)) - exp = Series([10, 20, 30, 40, 50, 60]) - tm.assert_series_equal(outp, exp) + tm.assert_series_equal(outp, exp_np) outp = Series(ujson.decode(ujson.encode(s, orient="records"))) - 
tm.assert_series_equal(outp, exp) + exp = Series([10, 20, 30, 40, 50, 60]) + tm.assert_series_equal(outp, exp_pd) outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True)) - tm.assert_series_equal(outp, exp) + tm.assert_series_equal(outp, exp_np) outp = Series(ujson.decode(ujson.encode(s, orient="values"))) - tm.assert_series_equal(outp, exp) + tm.assert_series_equal(outp, exp_pd) outp = Series(ujson.decode(ujson.encode( s, orient="index"))).sort_values() From be61825986ba565bc038beb2f5df2750fc1aca30 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 28 Aug 2016 13:26:12 -0500 Subject: [PATCH 308/359] DOC: Update README to link to install instructions. (#13882) --- README.md | 89 +++++-------------------------------------------------- 1 file changed, 7 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index e1149ac10795e..1334d9696764b 100644 --- a/README.md +++ b/README.md @@ -129,102 +129,27 @@ Here are just a few of the things that pandas does well: The source code is currently hosted on GitHub at: http://github.com/pydata/pandas -Binary installers for the latest released version are available at the Python -package index - - http://pypi.python.org/pypi/pandas/ - -And via `easy_install`: +Binary installers for the latest released version are available at the [Python +package index](http://pypi.python.org/pypi/pandas/) and on conda. ```sh -easy_install pandas +# conda +conda install pandas ``` -or `pip`: - ```sh +# or PyPI pip install pandas ``` -or `conda`: - -```sh -conda install pandas -``` - ## Dependencies - [NumPy](http://www.numpy.org): 1.7.0 or higher - [python-dateutil](http://labix.org/python-dateutil): 1.5 or higher - [pytz](http://pytz.sourceforge.net) - Needed for time zone support with ``pandas.date_range`` -### Highly Recommended Dependencies -- [numexpr](https://github.com/pydata/numexpr) - - Needed to accelerate some expression evaluation operations - - Required by PyTables -- [bottleneck](http://berkeleyanalytics.com/bottleneck) - - Needed to accelerate certain numerical operations - -### Optional dependencies -- [Cython](http://www.cython.org): Only necessary to build development version. Version 0.17.1 or higher. -- [SciPy](http://www.scipy.org): miscellaneous statistical functions -- [PyTables](http://www.pytables.org): necessary for HDF5-based storage -- [SQLAlchemy](http://www.sqlalchemy.org): for SQL database support. Version 0.8.1 or higher recommended. -- [matplotlib](http://matplotlib.org/): for plotting -- [statsmodels](http://www.statsmodels.org/) - - Needed for parts of `pandas.stats` -- For Excel I/O: - - [xlrd/xlwt](http://www.python-excel.org/) - - Excel reading (xlrd) and writing (xlwt) - - [openpyxl](http://packages.python.org/openpyxl/) - - openpyxl version 1.6.1 or higher, but lower than 2.0.0, for - writing .xlsx files - - xlrd >= 0.9.0 - - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) - - Alternative Excel writer. -- [Google bq Command Line Tool](https://cloud.google.com/bigquery/bq-command-line-tool) - - Needed for `pandas.io.gbq` -- [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. -- One of the following combinations of libraries is needed to use the - top-level [`pandas.read_html`][read-html-docs] function: - - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] (Any - recent version of [html5lib][html5lib] is okay.) 
- - [BeautifulSoup4][BeautifulSoup4] and [lxml][lxml] - - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] and [lxml][lxml] - - Only [lxml][lxml], although see [HTML reading gotchas][html-gotchas] - for reasons as to why you should probably **not** take this approach. - -#### Notes about HTML parsing libraries -- If you install [BeautifulSoup4][BeautifulSoup4] you must install - either [lxml][lxml] or [html5lib][html5lib] or both. - `pandas.read_html` will **not** work with *only* `BeautifulSoup4` - installed. -- You are strongly encouraged to read [HTML reading - gotchas][html-gotchas]. It explains issues surrounding the - installation and usage of the above three libraries. -- You may need to install an older version of - [BeautifulSoup4][BeautifulSoup4]: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian -- Additionally, if you're using [Anaconda][Anaconda] you should - definitely read [the gotchas about HTML parsing][html-gotchas] - libraries -- If you're on a system with `apt-get` you can do - - ```sh - sudo apt-get build-dep python-lxml - ``` - - to get the necessary dependencies for installation of [lxml][lxml]. - This will prevent further headaches down the line. - - [html5lib]: https://github.com/html5lib/html5lib-python "html5lib" - [BeautifulSoup4]: http://www.crummy.com/software/BeautifulSoup "BeautifulSoup4" - [lxml]: http://lxml.de - [Anaconda]: https://store.continuum.io/cshop/anaconda - [NumPy]: http://numpy.scipy.org/ - [html-gotchas]: http://pandas.pydata.org/pandas-docs/stable/gotchas.html#html-table-parsing - [read-html-docs]: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.html.read_html.html#pandas.io.html.read_html +See the [full installation instructions](http://pandas.pydata.org/pandas-docs/stable/install.html) +for recommended and optional dependencies. ## Installation from sources To install pandas from source you need Cython in addition to the normal From ca2b1043fb03575d08c23ab825641e31132fe3cd Mon Sep 17 00:00:00 2001 From: jackieleng Date: Mon, 29 Aug 2016 14:24:29 +0200 Subject: [PATCH 309/359] Added consistent pandas imports in io documentation (#14097) --- doc/source/io.rst | 203 +++++++++++++++++++++++----------------------- 1 file changed, 101 insertions(+), 102 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 26e928020b893..96ec624f4fd3c 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -19,11 +19,10 @@ import matplotlib.pyplot as plt plt.close('all') - from pandas import * - options.display.max_rows=15 import pandas.util.testing as tm - clipdf = DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, - index=['x','y','z']) + pd.options.display.max_rows=15 + clipdf = pd.DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, + index=['x','y','z']) =============================== IO Tools (Text, CSV, HDF5, ...) @@ -1390,7 +1389,7 @@ class of the csv module. For this, you have to specify ``sep=None``. .. ipython:: python :suppress: - df = DataFrame(np.random.randn(10, 4)) + df = pd.DataFrame(np.random.randn(10, 4)) df.to_csv('tmp.sv', sep='|') df.to_csv('tmp2.sv', sep=':') @@ -1571,7 +1570,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet .. ipython:: python - dfj = DataFrame(randn(5, 2), columns=list('AB')) + dfj = pd.DataFrame(randn(5, 2), columns=list('AB')) json = dfj.to_json() json @@ -1583,10 +1582,10 @@ file / string. Consider the following DataFrame and Series: .. 
ipython:: python - dfjo = DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), - columns=list('ABC'), index=list('xyz')) + dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), + columns=list('ABC'), index=list('xyz')) dfjo - sjo = Series(dict(x=15, y=16, z=17), name='D') + sjo = pd.Series(dict(x=15, y=16, z=17), name='D') sjo **Column oriented** (the default for ``DataFrame``) serializes the data as @@ -1643,8 +1642,8 @@ Writing in ISO date format .. ipython:: python - dfd = DataFrame(randn(5, 2), columns=list('AB')) - dfd['date'] = Timestamp('20130101') + dfd = pd.DataFrame(randn(5, 2), columns=list('AB')) + dfd['date'] = pd.Timestamp('20130101') dfd = dfd.sort_index(1, ascending=False) json = dfd.to_json(date_format='iso') json @@ -1668,10 +1667,10 @@ Writing to a file, with a date index and a date column .. ipython:: python dfj2 = dfj.copy() - dfj2['date'] = Timestamp('20130101') + dfj2['date'] = pd.Timestamp('20130101') dfj2['ints'] = list(range(5)) dfj2['bools'] = True - dfj2.index = date_range('20130101', periods=5) + dfj2.index = pd.date_range('20130101', periods=5) dfj2.to_json('test.json') open('test.json').read() @@ -1707,7 +1706,7 @@ can be dealt with by specifying a simple ``default_handler``: .. ipython:: python - DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json(default_handler=str) + pd.DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json(default_handler=str) .. _io.json_reader: @@ -1820,7 +1819,7 @@ Preserve string indices: .. ipython:: python - si = DataFrame(np.zeros((4, 4)), + si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)]) si @@ -1868,17 +1867,17 @@ data: randfloats = np.random.uniform(-100, 1000, 10000) randfloats.shape = (1000, 10) - dffloats = DataFrame(randfloats, columns=list('ABCDEFGHIJ')) + dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ')) jsonfloats = dffloats.to_json() .. ipython:: python - timeit read_json(jsonfloats) + timeit pd.read_json(jsonfloats) .. ipython:: python - timeit read_json(jsonfloats, numpy=True) + timeit pd.read_json(jsonfloats, numpy=True) The speedup is less noticeable for smaller datasets: @@ -1888,11 +1887,11 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python - timeit read_json(jsonfloats) + timeit pd.read_json(jsonfloats) .. ipython:: python - timeit read_json(jsonfloats, numpy=True) + timeit pd.read_json(jsonfloats, numpy=True) .. warning:: @@ -1997,7 +1996,7 @@ Read a URL with no options .. ipython:: python url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' - dfs = read_html(url) + dfs = pd.read_html(url) dfs .. note:: @@ -2017,7 +2016,7 @@ as a string .. ipython:: python with open(file_path, 'r') as f: - dfs = read_html(f.read()) + dfs = pd.read_html(f.read()) dfs You can even pass in an instance of ``StringIO`` if you so desire @@ -2027,7 +2026,7 @@ You can even pass in an instance of ``StringIO`` if you so desire with open(file_path, 'r') as f: sio = StringIO(f.read()) - dfs = read_html(sio) + dfs = pd.read_html(sio) dfs .. note:: @@ -2044,7 +2043,7 @@ Read a URL and match a table that contains specific text .. code-block:: python match = 'Metcalf Bank' - df_list = read_html(url, match=match) + df_list = pd.read_html(url, match=match) Specify a header row (by default ```` elements are used to form the column index); if specified, the header row is taken from the data minus the parsed @@ -2052,40 +2051,40 @@ header elements (```` elements). .. 
code-block:: python - dfs = read_html(url, header=0) + dfs = pd.read_html(url, header=0) Specify an index column .. code-block:: python - dfs = read_html(url, index_col=0) + dfs = pd.read_html(url, index_col=0) Specify a number of rows to skip .. code-block:: python - dfs = read_html(url, skiprows=0) + dfs = pd.read_html(url, skiprows=0) Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works as well) .. code-block:: python - dfs = read_html(url, skiprows=range(2)) + dfs = pd.read_html(url, skiprows=range(2)) Specify an HTML attribute .. code-block:: python - dfs1 = read_html(url, attrs={'id': 'table'}) - dfs2 = read_html(url, attrs={'class': 'sortable'}) + dfs1 = pd.read_html(url, attrs={'id': 'table'}) + dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True Specify values that should be converted to NaN .. code-block:: python - dfs = read_html(url, na_values=['No Acquirer']) + dfs = pd.read_html(url, na_values=['No Acquirer']) .. versionadded:: 0.19 @@ -2093,7 +2092,7 @@ Specify whether to keep the default set of NaN values .. code-block:: python - dfs = read_html(url, keep_default_na=False) + dfs = pd.read_html(url, keep_default_na=False) .. versionadded:: 0.19 @@ -2105,7 +2104,7 @@ columns to strings. .. code-block:: python url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' - dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': + dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': str}) .. versionadded:: 0.19 @@ -2114,15 +2113,15 @@ Use some combination of the above .. code-block:: python - dfs = read_html(url, match='Metcalf Bank', index_col=0) + dfs = pd.read_html(url, match='Metcalf Bank', index_col=0) Read in pandas ``to_html`` output (with some loss of floating point precision) .. code-block:: python - df = DataFrame(randn(2, 2)) + df = pd.DataFrame(randn(2, 2)) s = df.to_html(float_format='{0:.40g}'.format) - dfin = read_html(s, index_col=0) + dfin = pd.read_html(s, index_col=0) The ``lxml`` backend will raise an error on a failed parse if that is the only parser you provide (if you only have a single parser you can provide just a @@ -2131,13 +2130,13 @@ for example, the function expects a sequence of strings) .. code-block:: python - dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) + dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) or .. code-block:: python - dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') + dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', 'bs4']`` then the parse will most likely succeed. Note that *as soon as a parse @@ -2145,7 +2144,7 @@ succeeds, the function will return*. .. code-block:: python - dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) .. _io.html: @@ -2173,7 +2172,7 @@ in the method ``to_string`` described above. .. ipython:: python - df = DataFrame(randn(2, 2)) + df = pd.DataFrame(randn(2, 2)) df print(df.to_html()) # raw html @@ -2249,7 +2248,7 @@ Finally, the ``escape`` argument allows you to control whether the .. ipython:: python - df = DataFrame({'a': list('&<>'), 'b': randn(3)}) + df = pd.DataFrame({'a': list('&<>'), 'b': randn(3)}) .. ipython:: python @@ -2701,7 +2700,7 @@ DataFrame into clipboard and reading it back. .. 
ipython:: python - df=pd.DataFrame(randn(5,3)) + df = pd.DataFrame(randn(5,3)) df df.to_clipboard() pd.read_clipboard() @@ -2731,7 +2730,7 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python - read_pickle('foo.pkl') + pd.read_pickle('foo.pkl') .. ipython:: python :suppress: @@ -2795,10 +2794,10 @@ both on the writing (serialization), and reading (deserialization). .. ipython:: python - df = DataFrame(np.random.rand(5,2),columns=list('AB')) + df = pd.DataFrame(np.random.rand(5,2),columns=list('AB')) df.to_msgpack('foo.msg') pd.read_msgpack('foo.msg') - s = Series(np.random.rand(5),index=date_range('20130101',periods=5)) + s = pd.Series(np.random.rand(5),index=pd.date_range('20130101',periods=5)) You can pass a list of objects and you will receive them back on deserialization. @@ -2883,7 +2882,7 @@ for some advanced strategies .. ipython:: python - store = HDFStore('store.h5') + store = pd.HDFStore('store.h5') print(store) Objects can be written to the file just like adding key-value pairs to a @@ -2892,13 +2891,13 @@ dict: .. ipython:: python np.random.seed(1234) - index = date_range('1/1/2000', periods=8) - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = DataFrame(randn(8, 3), index=index, - columns=['A', 'B', 'C']) - wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + index = pd.date_range('1/1/2000', periods=8) + s = pd.Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = pd.DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = pd.Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) # store.put('s', s) is an equivalent method store['s'] = s @@ -2941,7 +2940,7 @@ Closing a Store, Context Manager # Working with, and automatically closing the store with the context # manager - with HDFStore('store.h5') as store: + with pd.HDFStore('store.h5') as store: store.keys() .. ipython:: python @@ -2961,9 +2960,9 @@ similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0) .. ipython:: python - df_tl = DataFrame(dict(A=list(range(5)), B=list(range(5)))) + df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) df_tl.to_hdf('store_tl.h5','table',append=True) - read_hdf('store_tl.h5', 'table', where = ['index>2']) + pd.read_hdf('store_tl.h5', 'table', where = ['index>2']) .. ipython:: python :suppress: @@ -3008,7 +3007,7 @@ This is also true for the major axis of a ``Panel``: [[np.nan, np.nan, np.nan], [np.nan,5,6]], [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]] - panel_with_major_axis_all_missing = Panel(matrix, + panel_with_major_axis_all_missing = pd.Panel(matrix, items=['Item1', 'Item2','Item3'], major_axis=[1,2], minor_axis=['A', 'B', 'C']) @@ -3019,7 +3018,7 @@ This is also true for the major axis of a ``Panel``: dropna = True, format='table', mode='w') - reloaded = read_hdf('file.h5', 'panel') + reloaded = pd.read_hdf('file.h5', 'panel') reloaded @@ -3052,7 +3051,7 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. code-block:: python - DataFrame(randn(10,2)).to_hdf('test_fixed.h5','df') + pd.DataFrame(randn(10,2)).to_hdf('test_fixed.h5','df') pd.read_hdf('test_fixed.h5','df',where='index>5') TypeError: cannot pass a where specification when reading a fixed format. @@ -3084,7 +3083,7 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format. .. 
ipython:: python - store = HDFStore('store.h5') + store = pd.HDFStore('store.h5') df1 = df[0:4] df2 = df[4:] @@ -3172,14 +3171,14 @@ defaults to `nan`. .. ipython:: python - df_mixed = DataFrame({ 'A' : randn(8), - 'B' : randn(8), - 'C' : np.array(randn(8),dtype='float32'), - 'string' :'string', - 'int' : 1, - 'bool' : True, - 'datetime64' : Timestamp('20010102')}, - index=list(range(8))) + df_mixed = pd.DataFrame({ 'A' : randn(8), + 'B' : randn(8), + 'C' : np.array(randn(8),dtype='float32'), + 'string' :'string', + 'int' : 1, + 'bool' : True, + 'datetime64' : pd.Timestamp('20010102')}, + index=list(range(8))) df_mixed.ix[3:5,['A', 'B', 'string', 'datetime64']] = np.nan store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) @@ -3198,13 +3197,13 @@ storing/selecting from homogeneous index DataFrames. .. ipython:: python - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df_mi = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) df_mi store.append('df_mi',df_mi) @@ -3319,14 +3318,14 @@ Here are some examples: .. ipython:: python - dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq = pd.DataFrame(randn(10,4),columns=list('ABCD'),index=pd.date_range('20130101',periods=10)) store.append('dfq',dfq,format='table',data_columns=True) Use boolean expressions, with in-line function evaluation. .. ipython:: python - store.select('dfq',"index>Timestamp('20130104') & columns=['A', 'B']") + store.select('dfq',"index>pd.Timestamp('20130104') & columns=['A', 'B']") Use and inline column reference @@ -3340,7 +3339,7 @@ Works with a Panel as well. store.append('wp',wp) store - store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']") + store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3385,7 +3384,7 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) + dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) dftd['C'] = dftd['A']-dftd['B'] dftd store.append('dftd',dftd,data_columns=True) @@ -3421,8 +3420,8 @@ Oftentimes when appending large amounts of data to a store, it is useful to turn .. 
ipython:: python - df_1 = DataFrame(randn(10,2),columns=list('AB')) - df_2 = DataFrame(randn(10,2),columns=list('AB')) + df_1 = pd.DataFrame(randn(10,2),columns=list('AB')) + df_2 = pd.DataFrame(randn(10,2),columns=list('AB')) st = pd.HDFStore('appends.h5',mode='w') st.append('df', df_1, data_columns=['B'], index=False) @@ -3468,7 +3467,7 @@ be data_columns # on-disk operations store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) - store.select('df_dc', [ Term('B>0') ]) + store.select('df_dc', [ pd.Term('B>0') ]) # getting creative store.select('df_dc', 'B > 0 & C > 0 & string == foo') @@ -3507,7 +3506,7 @@ The default is 50,000 rows returned in a chunk. .. code-block:: python - for df in read_hdf('store.h5','df', chunksize=3): + for df in pd.read_hdf('store.h5','df', chunksize=3): print(df) Note, that the chunksize keyword applies to the **source** rows. So if you @@ -3519,7 +3518,7 @@ chunks. .. ipython:: python - dfeq = DataFrame({'number': np.arange(1,11)}) + dfeq = pd.DataFrame({'number': np.arange(1,11)}) dfeq store.append('dfeq', dfeq, data_columns=['number']) @@ -3559,7 +3558,7 @@ Sometimes you want to get the coordinates (a.k.a the index locations) of your qu .. ipython:: python - df_coord = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + df_coord = pd.DataFrame(np.random.randn(1000,2),index=pd.date_range('20000101',periods=1000)) store.append('df_coord',df_coord) c = store.select_as_coordinates('df_coord','index>20020101') c.summary() @@ -3576,10 +3575,10 @@ a datetimeindex which are 5. .. ipython:: python - df_mask = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + df_mask = pd.DataFrame(np.random.randn(1000,2),index=pd.date_range('20000101',periods=1000)) store.append('df_mask',df_mask) c = store.select_column('df_mask','index') - where = c[DatetimeIndex(c).month==5].index + where = c[pd.DatetimeIndex(c).month==5].index store.select('df_mask',where=where) Storer Object @@ -3624,8 +3623,8 @@ results. .. ipython:: python - df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' df_mt.ix[1, ('A', 'B')] = np.nan @@ -3716,7 +3715,7 @@ Compression for all objects within the file .. code-block:: python - store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc') + store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc') Or on-the-fly compression (this only applies to tables). You can turn off file compression for a specific table by passing ``complevel=0`` @@ -3814,8 +3813,8 @@ stored in a more efficient manner. .. ipython:: python - dfcat = DataFrame({ 'A' : Series(list('aabbcdba')).astype('category'), - 'B' : np.random.randn(8) }) + dfcat = pd.DataFrame({ 'A' : pd.Series(list('aabbcdba')).astype('category'), + 'B' : np.random.randn(8) }) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') @@ -3872,7 +3871,7 @@ Starting in 0.11.0, passing a ``min_itemsize`` dict will cause all passed column .. ipython:: python - dfs = DataFrame(dict(A = 'foo', B = 'bar'),index=list(range(5))) + dfs = pd.DataFrame(dict(A = 'foo', B = 'bar'),index=list(range(5))) dfs # A and B have a size of 30 @@ -3891,7 +3890,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. 
ipython:: python - dfss = DataFrame(dict(A = ['foo','bar','nan'])) + dfss = pd.DataFrame(dict(A = ['foo','bar','nan'])) dfss store.append('dfss', dfss) @@ -3925,7 +3924,7 @@ It is possible to write an ``HDFStore`` object that can easily be imported into index=range(100)) df_for_r.head() - store_export = HDFStore('export.h5') + store_export = pd.HDFStore('export.h5') store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns) store_export @@ -4015,7 +4014,7 @@ number of options, please see the docstring. :okwarning: # a legacy store - legacy_store = HDFStore(legacy_file_path,'r') + legacy_store = pd.HDFStore(legacy_file_path,'r') legacy_store # copy (and return the new handle) @@ -4062,7 +4061,7 @@ HDFStore supports ``Panel4D`` storage. .. ipython:: python :okwarning: - p4d = Panel4D({ 'l1' : wp }) + p4d = pd.Panel4D({ 'l1' : wp }) p4d store.append('p4d', p4d) store @@ -4079,7 +4078,7 @@ object). This cannot be changed after table creation. store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis']) store - store.select('p4d2', [ Term('labels=l1'), Term('items=Item1'), Term('minor_axis=A_big_strings') ]) + store.select('p4d2', [ pd.Term('labels=l1'), pd.Term('items=Item1'), pd.Term('minor_axis=A_big_strings') ]) .. ipython:: python :suppress: @@ -4181,7 +4180,7 @@ the database using :func:`~pandas.DataFrame.to_sql`. (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] - data = DataFrame(d, columns=c) + data = pd.DataFrame(d, columns=c) .. ipython:: python @@ -4677,7 +4676,7 @@ into a .dta file. The format version of this file is always 115 (Stata 12). .. ipython:: python - df = DataFrame(randn(10, 2), columns=list('AB')) + df = pd.DataFrame(randn(10, 2), columns=list('AB')) df.to_stata('stata.dta') *Stata* data files have limited data type support; only strings with @@ -4902,7 +4901,7 @@ This is an informal comparison of various IO methods, using pandas 0.13.1. .. code-block:: ipython - In [1]: df = DataFrame(randn(1000000,2),columns=list('AB')) + In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB')) In [2]: df.info() @@ -4976,7 +4975,7 @@ And here's the code import os from pandas.io import sql - df = DataFrame(randn(1000000,2),columns=list('AB')) + df = pd.DataFrame(randn(1000000,2),columns=list('AB')) def test_sql_write(df): if os.path.exists('test.sql'): From 5a20ea262eadbabc487fbe1d83ab4b6085f4c7c0 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Mon, 29 Aug 2016 21:29:04 +0900 Subject: [PATCH 310/359] API: change unique to return Index (#13979) --- doc/source/whatsnew/v0.19.0.txt | 33 ++++++- pandas/core/base.py | 31 +++--- pandas/core/series.py | 12 ++- pandas/indexes/base.py | 8 +- pandas/indexes/category.py | 8 ++ pandas/tests/indexes/test_category.py | 1 + pandas/tests/indexes/test_multi.py | 32 +++++++ pandas/tests/test_base.py | 133 +++++++++++++++----------- pandas/tests/test_categorical.py | 24 +++++ pandas/tseries/base.py | 13 --- pandas/util/testing.py | 2 +- 11 files changed, 209 insertions(+), 88 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9c5c36528d31b..29d6d972a7a55 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -457,7 +457,7 @@ API changes - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. 
(:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) - +- ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) @@ -904,6 +904,35 @@ New Behavior: idx1.difference(idx2) idx1.symmetric_difference(idx2) +.. _whatsnew_0190.api.unique_index: + +``Index.unique`` consistently returns ``Index`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Index.unique()`` now returns unique values as an +``Index`` of the appropriate ``dtype``. (:issue:`13395`) + +Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex``, +``TimedeltaIndex`` and ``PeriodIndex`` returned ``Index`` to keep metadata like timezone. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pd.Index([1, 2, 3]).unique() + Out[1]: array([1, 2, 3]) + In [2]: pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() + Out[2]: DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00', + '2011-01-03 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq=None) + +New Behavior: + +.. ipython:: python + + pd.Index([1, 2, 3]).unique() + pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() + .. _whatsnew_0190.api.autogenerated_chunksize_index: ``read_csv`` will progressively enumerate chunks @@ -1181,6 +1210,7 @@ Bug Fixes - Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`) - Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`) + - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) @@ -1248,7 +1278,6 @@ Bug Fixes - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) - Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. 
Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) - - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) - Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0f9eb14be40db..b9a70292498e4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,8 +7,7 @@ from pandas.types.missing import isnull from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.types.common import (is_object_dtype, - is_list_like, is_scalar) +from pandas.types.common import is_object_dtype, is_list_like, is_scalar from pandas.core import common as com import pandas.core.nanops as nanops @@ -21,7 +20,7 @@ _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', - duplicated='IndexOpsMixin') + unique='IndexOpsMixin', duplicated='IndexOpsMixin') class StringMixin(object): @@ -952,21 +951,27 @@ def value_counts(self, normalize=False, sort=True, ascending=False, normalize=normalize, bins=bins, dropna=dropna) return result - def unique(self): + _shared_docs['unique'] = ( """ - Return array of unique values in the object. Significantly faster than - numpy.unique. Includes NA values. + Return %(unique)s of unique values in the object. + Significantly faster than numpy.unique. Includes NA values. + The order of the original is preserved. Returns ------- - uniques : ndarray - """ - from pandas.core.nanops import unique1d - values = self.values - if hasattr(values, 'unique'): - return values.unique() + uniques : %(unique)s + """) - return unique1d(values) + @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) + def unique(self): + values = self._values + + if hasattr(values, 'unique'): + result = values.unique() + else: + from pandas.core.nanops import unique1d + result = unique1d(values) + return result def nunique(self, dropna=True): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 7979a230eed84..01d6f6f078d17 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -18,6 +18,7 @@ is_float_dtype, is_extension_type, is_datetimetz, is_datetimelike, + is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, is_hashable, @@ -77,7 +78,7 @@ axes='index', klass='Series', axes_single_arg="{0, 'index'}", inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - duplicated='Series', + unique='np.ndarray', duplicated='Series', optional_by='') @@ -1231,6 +1232,15 @@ def mode(self): # TODO: Add option for bins like value_counts() return algos.mode(self) + @Appender(base._shared_docs['unique'] % _shared_doc_kwargs) + def unique(self): + result = super(Series, self).unique() + if is_datetime64tz_dtype(self.dtype): + # to return array of Timestamp with tz + # ToDo: it must return DatetimeArray with tz in pandas 2.0 + return result.asobject.values + return result + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 8c1378c07a1d2..15cd2064624d9 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -60,7 +60,8 @@ _unsortable_types = frozenset(('mixed', 'mixed-integer')) -_index_doc_kwargs = dict(klass='Index', inplace='', duplicated='np.array') +_index_doc_kwargs = 
dict(klass='Index', inplace='', + unique='Index', duplicated='np.ndarray') _index_shared_docs = dict() @@ -3217,6 +3218,11 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) + @Appender(base._shared_docs['unique'] % _index_doc_kwargs) + def unique(self): + result = super(Index, self).unique() + return self._shallow_copy(result) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 23c534624930f..251886ebdd974 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -283,6 +283,14 @@ def _engine(self): def is_unique(self): return not self.duplicated().any() + @Appender(base._shared_docs['unique'] % ibase._index_doc_kwargs) + def unique(self): + result = base.IndexOpsMixin.unique(self) + # CategoricalIndex._shallow_copy uses keeps original categories + # and ordered if not otherwise specified + return self._shallow_copy(result, categories=result.categories, + ordered=result.ordered) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 901b57dcc7bfe..b0e50491b8e9d 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -395,6 +395,7 @@ def test_duplicates(self): expected = CategoricalIndex([0], name='foo') self.assert_index_equal(idx.drop_duplicates(), expected) + self.assert_index_equal(idx.unique(), expected) def test_get_indexer(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index eedbd108510f7..c72cab32d198b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1927,6 +1927,38 @@ def test_get_unique_index(self): self.assertTrue(result.unique) self.assert_index_equal(result, expected) + def test_unique(self): + mi = pd.MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]]) + + res = mi.unique() + exp = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]]) + tm.assert_index_equal(res, exp) + + mi = pd.MultiIndex.from_arrays([list('aaaa'), list('abab')]) + res = mi.unique() + exp = pd.MultiIndex.from_arrays([list('aa'), list('ab')]) + tm.assert_index_equal(res, exp) + + mi = pd.MultiIndex.from_arrays([list('aaaa'), list('aaaa')]) + res = mi.unique() + exp = pd.MultiIndex.from_arrays([['a'], ['a']]) + tm.assert_index_equal(res, exp) + + def test_unique_datetimelike(self): + idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01', + '2015-01-01', 'NaT', 'NaT']) + idx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02', + '2015-01-02', 'NaT', '2015-01-01'], + tz='Asia/Tokyo') + result = pd.MultiIndex.from_arrays([idx1, idx2]).unique() + + eidx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT']) + eidx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-02', + 'NaT', '2015-01-01'], + tz='Asia/Tokyo') + exp = pd.MultiIndex.from_arrays([eidx1, eidx2]) + tm.assert_index_equal(result, exp) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 66216758ca091..83b1cd141a61b 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -17,6 +17,7 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base 
import (FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin) +from pandas.types.common import is_datetime64_dtype from pandas.tseries.base import DatetimeIndexOpsMixin @@ -452,43 +453,29 @@ def test_value_counts_unique_nunique(self): o = orig.copy() klass = type(o) - values = o.values - - # create repeated values, 'n'th element is repeated by n+1 times - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq ambiguous - - # resets name from Index - expected_index = pd.Index(o[::-1]) - expected_index.name = None - - # attach name to klass - o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' + values = o._values - elif isinstance(o, DatetimeIndex): + if isinstance(values, Index): + # reset name not to affect latter process + values.name = None - # resets name from Index - expected_index = pd.Index(o[::-1]) - expected_index.name = None - - # attach name to klass - o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' - - # don't test boolean - elif isinstance(o, Index) and o.is_boolean(): + # create repeated values, 'n'th element is repeated by n+1 times + # skip boolean, because it only has 2 values at most + if isinstance(o, Index) and o.is_boolean(): continue elif isinstance(o, Index): - expected_index = pd.Index(values[::-1]) + expected_index = pd.Index(o[::-1]) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' else: expected_index = pd.Index(values[::-1]) idx = o.index.repeat(range(1, len(o) + 1)) - o = klass(np.repeat(values, range(1, len(o) + 1)), - index=idx, name='a') + rep = np.repeat(values, range(1, len(o) + 1)) + o = klass(rep, index=idx, name='a') + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64', name='a') @@ -499,15 +486,23 @@ def test_value_counts_unique_nunique(self): self.assertEqual(result.name, 'a') result = o.unique() - if isinstance(o, (DatetimeIndex, PeriodIndex)): + if isinstance(o, Index): self.assertTrue(isinstance(result, o.__class__)) - self.assertEqual(result.freq, o.freq) self.assert_index_equal(result, orig) + elif is_datetimetz(o): + # datetimetz Series returns array of Timestamp + self.assertEqual(result[0], orig[0]) + for r in result: + self.assertIsInstance(r, pd.Timestamp) + tm.assert_numpy_array_equal(result, + orig._values.asobject.values) else: - self.assert_numpy_array_equal(result, values) + tm.assert_numpy_array_equal(result, orig.values) self.assertEqual(o.nunique(), len(np.unique(o.values))) + def test_value_counts_unique_nunique_null(self): + for null_obj in [np.nan, None]: for o in self.objs: klass = type(o) @@ -525,12 +520,14 @@ def test_value_counts_unique_nunique(self): else: o = o.copy() o[0:2] = pd.tslib.iNaT - values = o.values - elif o.values.dtype == 'datetime64[ns]' or isinstance( - o, PeriodIndex): + values = o._values + + elif is_datetime64_dtype(o) or isinstance(o, PeriodIndex): values[0:2] = pd.tslib.iNaT else: values[0:2] = null_obj + # check values has the same dtype as the original + self.assertEqual(values.dtype, o.dtype) # create repeated values, 'n'th element is repeated by n+1 # times @@ -570,13 +567,17 @@ def test_value_counts_unique_nunique(self): self.assertTrue(result_s.index.name is None) self.assertEqual(result_s.name, 'a') - # numpy_array_equal cannot compare arrays includes nan result = o.unique() - self.assert_numpy_array_equal(result[1:], values[2:]) - - if isinstance(o, (DatetimeIndex, PeriodIndex)): - self.assertTrue(result.asi8[0] == 
pd.tslib.iNaT) + if isinstance(o, Index): + tm.assert_index_equal(result, + Index(values[1:], name='a')) + elif is_datetimetz(o): + # unable to compare NaT / nan + tm.assert_numpy_array_equal(result[1:], + values[2:].asobject.values) + self.assertIs(result[0], pd.NaT) else: + tm.assert_numpy_array_equal(result[1:], values[2:]) self.assertTrue(pd.isnull(result[0])) self.assertEqual(o.nunique(), 8) @@ -590,8 +591,13 @@ def test_value_counts_inferred(self): expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) - exp = np.unique(np.array(s_values, dtype=np.object_)) - self.assert_numpy_array_equal(s.unique(), exp) + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is # platform-dep @@ -627,8 +633,12 @@ def test_value_counts_bins(self): exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) - self.assert_numpy_array_equal(s1.unique(), - np.array([1, 2, 3], dtype=np.int64)) + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) @@ -652,8 +662,12 @@ def test_value_counts_bins(self): expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) - exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_) - self.assert_numpy_array_equal(s.unique(), exp) + if isinstance(s, Index): + exp = Index(['a', 'b', np.nan, 'd']) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) self.assertEqual(s.nunique(), 3) s = klass({}) @@ -661,8 +675,13 @@ def test_value_counts_bins(self): tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original - self.assert_numpy_array_equal(s.unique(), np.array([]), - check_dtype=False) + if isinstance(s, Index): + self.assert_index_equal(s.unique(), Index([]), + exact=False) + else: + self.assert_numpy_array_equal(s.unique(), np.array([]), + check_dtype=False) + self.assertEqual(s.nunique(), 0) def test_value_counts_datetime64(self): @@ -691,10 +710,10 @@ def test_value_counts_datetime64(self): '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'], dtype='datetime64[ns]') - if isinstance(s, DatetimeIndex): - self.assert_index_equal(s.unique(), DatetimeIndex(expected)) + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: - self.assert_numpy_array_equal(s.unique(), expected) + tm.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) @@ -714,12 +733,12 @@ def test_value_counts_datetime64(self): self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT - if isinstance(s, DatetimeIndex): - self.assert_index_equal(unique[:3], DatetimeIndex(expected)) + if isinstance(s, Index): + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + tm.assert_index_equal(unique, exp_idx) else: - self.assert_numpy_array_equal(unique[:3], expected) - self.assertTrue(unique[3] is pd.NaT or - unique[3].astype('int64') == pd.tslib.iNaT) + tm.assert_numpy_array_equal(unique[:3], 
expected) + self.assertTrue(pd.isnull(unique[3])) self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) @@ -733,10 +752,10 @@ def test_value_counts_datetime64(self): tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days'], name='dt') - if isinstance(td, TimedeltaIndex): - self.assert_index_equal(td.unique(), expected) + if isinstance(td, Index): + tm.assert_index_equal(td.unique(), expected) else: - self.assert_numpy_array_equal(td.unique(), expected.values) + tm.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d703ee7c1d1c2..781c9b786328d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1302,6 +1302,30 @@ def test_unique_ordered(self): ordered=True) tm.assert_categorical_equal(res, exp_cat) + def test_unique_index_series(self): + c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) + # Categorical.unique sorts categories by appearance order + # if ordered=False + exp = Categorical([3, 1, 2], categories=[3, 1, 2]) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(pd.Series(c).unique(), exp) + + c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) + exp = Categorical([1, 2], categories=[1, 2]) + tm.assert_categorical_equal(c.unique(), exp) + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(pd.Series(c).unique(), exp) + + c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True) + # Categorical.unique keeps categories order if ordered=True + exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(pd.Series(c).unique(), exp) + def test_mode(self): s = Categorical([1, 1, 2, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1], ordered=True) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index e64a0d2ebaf5e..c08bb53238e5c 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -743,19 +743,6 @@ def shift(self, n, freq=None): attribs['end'] = end return type(self)(**attribs) - def unique(self): - """ - Index.unique with handling for DatetimeIndex/PeriodIndex metadata - - Returns - ------- - result : DatetimeIndex or PeriodIndex - """ - from pandas.core.index import Int64Index - result = Int64Index.unique(self) - return self._simple_new(result, name=self.name, freq=self.freq, - tz=getattr(self, 'tz', None)) - def repeat(self, repeats, *args, **kwargs): """ Analogous to ndarray.repeat diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 94de8cb034024..d39569ea0b826 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1534,7 +1534,7 @@ def makeStringIndex(k=10, name=None): def makeUnicodeIndex(k=10, name=None): - return Index(randu_array(nchars=10, size=k)) + return Index(randu_array(nchars=10, size=k), name=name) def makeCategoricalIndex(k=10, n=3, name=None): From a9c15d356641e77553586154cccbdada50d7629d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 29 Aug 2016 08:42:43 -0400 Subject: [PATCH 311/359] DEPR: Deprecate Timestamp.to_datetime (#14101) * DEPR: Deprecate Timestamp.to_datetime * API: Issue real warning in to_pydatetime * DEPR: Deprecate NaT.to_datetime Closes gh-8254. 
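As an illustrative aside (editorial sketch, not part of this patch's diff), the migration path for callers looks roughly like the following; it assumes a pandas build that includes this change and uses only the names shown in the diff below (``Timestamp.to_pydatetime`` and its ``warn`` keyword):

.. code-block:: python

    import pandas as pd

    ts = pd.Timestamp('2011-01-01 09:00:00.123456789')

    # Before: deprecated, now emits a FutureWarning
    # dt = ts.to_datetime()

    # After: convert explicitly to a stdlib datetime.datetime.
    # A UserWarning is issued because the nonzero nanoseconds are discarded.
    dt = ts.to_pydatetime()

    # Pass warn=False when the nanosecond truncation is intentional
    dt = ts.to_pydatetime(warn=False)
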
--- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/io/tests/json/test_pandas.py | 6 ++-- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/test_groupby.py | 4 +-- pandas/tseries/tests/test_base.py | 4 +-- pandas/tseries/tests/test_offsets.py | 30 ++++++++++++++--- pandas/tseries/tests/test_timeseries.py | 8 ++++- pandas/tseries/tests/test_timezones.py | 2 +- pandas/tseries/tests/test_tslib.py | 37 ++++++++++++++++---- pandas/tslib.pyx | 45 ++++++++++++++----------- 10 files changed, 100 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 29d6d972a7a55..45cdd23140487 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -436,6 +436,7 @@ API changes ~~~~~~~~~~~ +- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds (:issue:`14101`) - ``Panel.to_sparse`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) - ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) @@ -1060,6 +1061,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``Timestamp.to_datetime`` has been deprecated in favour of ``Timestamp.to_pydatetime`` (:issue:`8254`) - ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favour of ``pd.to_datetime`` (:issue:`8254`) - ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) - ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 96756a0b2d74b..47bdd25572fc7 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -908,17 +908,17 @@ def test_tz_is_utc(self): ts = Timestamp('2013-01-10 05:00:00Z') self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) - dt = ts.to_datetime() + dt = ts.to_pydatetime() self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern') self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) - dt = ts.to_datetime() + dt = ts.to_pydatetime() self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) ts = Timestamp('2013-01-10 00:00:00-0500') self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) - dt = ts.to_datetime() + dt = ts.to_pydatetime() self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) def test_tz_range_is_utc(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index edf7fc444c3e1..66a5a155dd7a5 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -502,7 +502,7 @@ def test_asof(self): d = self.dateIndex[-1] self.assertEqual(self.dateIndex.asof(d + timedelta(1)), d) - d = self.dateIndex[0].to_datetime() + d = self.dateIndex[0].to_pydatetime() tm.assertIsInstance(self.dateIndex.asof(d), Timestamp) def test_asof_datetime_partial(self): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 9a82332621933..6b33fa747d8ba 100644 --- 
a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -779,7 +779,7 @@ def test_get_group(self): g = df.groupby('DATE') key = list(g.groups)[0] result1 = g.get_group(key) - result2 = g.get_group(Timestamp(key).to_datetime()) + result2 = g.get_group(Timestamp(key).to_pydatetime()) result3 = g.get_group(str(Timestamp(key))) assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) @@ -788,7 +788,7 @@ def test_get_group(self): key = list(g.groups)[0] result1 = g.get_group(key) - result2 = g.get_group((Timestamp(key[0]).to_datetime(), key[1])) + result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) result3 = g.get_group((str(Timestamp(key[0])), key[1])) assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 4d3c60ce39291..26e77d3ad79f3 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1127,11 +1127,11 @@ def test_subtraction_ops_with_tz(self): # check that dt/dti subtraction ops with tz are validated dti = date_range('20130101', periods=3) ts = Timestamp('20130101') - dt = ts.to_datetime() + dt = ts.to_pydatetime() dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') ts_tz = Timestamp('20130101').tz_localize('US/Eastern') ts_tz2 = Timestamp('20130101').tz_localize('CET') - dt_tz = ts_tz.to_datetime() + dt_tz = ts_tz.to_pydatetime() td = Timedelta('1 days') def _check(result, expected): diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 3ec07c27ef854..6ea6382a9904a 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -261,8 +261,19 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, self.assertTrue(isinstance(result, Timestamp)) self.assertEqual(result, expected) - # test nano second is preserved - result = func(Timestamp(dt) + Nano(5)) + # see gh-14101 + exp_warning = None + ts = Timestamp(dt) + Nano(5) + + if (offset_s.__class__.__name__ == 'DateOffset' and + (funcname == 'apply' or normalize) and + ts.nanosecond > 0): + exp_warning = UserWarning + + # test nanosecond is preserved + with tm.assert_produces_warning(exp_warning, + check_stacklevel=False): + result = func(ts) self.assertTrue(isinstance(result, Timestamp)) if normalize is False: self.assertEqual(result, expected + Nano(5)) @@ -289,8 +300,19 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, self.assertTrue(isinstance(result, Timestamp)) self.assertEqual(result, expected_localize) - # test nano second is preserved - result = func(Timestamp(dt, tz=tz) + Nano(5)) + # see gh-14101 + exp_warning = None + ts = Timestamp(dt, tz=tz) + Nano(5) + + if (offset_s.__class__.__name__ == 'DateOffset' and + (funcname == 'apply' or normalize) and + ts.nanosecond > 0): + exp_warning = UserWarning + + # test nanosecond is preserved + with tm.assert_produces_warning(exp_warning, + check_stacklevel=False): + result = func(ts) self.assertTrue(isinstance(result, Timestamp)) if normalize is False: self.assertEqual(result, expected_localize + Nano(5)) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c19de2ff7ca35..5eb46684d1860 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1020,7 +1020,13 @@ def test_NaT_methods(self): for method in nat_methods: if hasattr(NaT, method): - self.assertIs(getattr(NaT, method)(), NaT) + # see 
gh-8254 + exp_warning = None + if method == 'to_datetime': + exp_warning = FutureWarning + with tm.assert_produces_warning( + exp_warning, check_stacklevel=False): + self.assertIs(getattr(NaT, method)(), NaT) # GH 12300 self.assertEqual(NaT.isoformat(), 'NaT') diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 470aafafec547..7ec0d09c20841 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -169,7 +169,7 @@ def test_timestamp_to_datetime_tzoffset(self): from dateutil.tz import tzoffset tzinfo = tzoffset(None, 7200) expected = Timestamp('3/11/2012 04:00', tz=tzinfo) - result = Timestamp(expected.to_datetime()) + result = Timestamp(expected.to_pydatetime()) self.assertEqual(expected, result) def test_timedelta_push_over_dst_boundary(self): diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 22bb3bddbc742..6cee45df2a63c 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -47,12 +47,17 @@ def test_max_valid(self): def test_to_datetime_bijective(self): # Ensure that converting to datetime and back only loses precision # by going from nanoseconds to microseconds. - self.assertEqual( - Timestamp(Timestamp.max.to_pydatetime()).value / 1000, - Timestamp.max.value / 1000) - self.assertEqual( - Timestamp(Timestamp.min.to_pydatetime()).value / 1000, - Timestamp.min.value / 1000) + exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + self.assertEqual( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000, + Timestamp.max.value / 1000) + + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + self.assertEqual( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000, + Timestamp.min.value / 1000) class TestTimestamp(tm.TestCase): @@ -616,6 +621,26 @@ def test_pprint(self): 'foo': 1}""" self.assertEqual(result, expected) + def to_datetime_depr(self): + # see gh-8254 + ts = Timestamp('2011-01-01') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = datetime.datetime(2011, 1, 1) + result = ts.to_datetime() + self.assertEqual(result, expected) + + def to_pydatetime_nonzero_nano(self): + ts = Timestamp('2011-01-01 9:00:00.123456789') + + # Warn the user of data loss (nanoseconds). 
+ with tm.assert_produces_warning(UserWarning, + check_stacklevel=False): + expected = datetime.datetime(2011, 1, 1, 9, 0, 0, 123456) + result = ts.to_pydatetime() + self.assertEqual(result, expected) + class TestDatetimeParsingWrappers(tm.TestCase): def test_does_not_convert_mixed_integer(self): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c1b990c417553..c9e85c5741410 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1,5 +1,7 @@ # cython: profile=False +import warnings + cimport numpy as np from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) @@ -637,22 +639,6 @@ class Timestamp(_Timestamp): return Timestamp(datetime.replace(self, **kwds), freq=self.freq) - def to_pydatetime(self, warn=True): - """ - If warn=True, issue warning if nanoseconds is nonzero - """ - cdef: - pandas_datetimestruct dts - _TSObject ts - - if self.nanosecond != 0 and warn: - print 'Warning: discarding nonzero nanoseconds' - ts = convert_to_tsobject(self, self.tzinfo, None, 0, 0) - - return datetime(ts.dts.year, ts.dts.month, ts.dts.day, - ts.dts.hour, ts.dts.min, ts.dts.sec, - ts.dts.us, ts.tzinfo) - def isoformat(self, sep='T'): base = super(_Timestamp, self).isoformat(sep=sep) if self.nanosecond == 0: @@ -805,11 +791,11 @@ def _make_nan_func(func_name): f.__name__ = func_name return f -_nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today'] +_nat_methods = ['date', 'now', 'replace', 'to_pydatetime', 'today'] _nan_methods = ['weekday', 'isoweekday', 'total_seconds'] -_implemented_methods = ['to_datetime64', 'isoformat'] +_implemented_methods = ['to_datetime', 'to_datetime64', 'isoformat'] _implemented_methods.extend(_nat_methods) _implemented_methods.extend(_nan_methods) @@ -986,7 +972,7 @@ cdef class _Timestamp(datetime): ots = other elif isinstance(other, datetime): if self.nanosecond == 0: - val = self.to_datetime() + val = self.to_pydatetime() return PyObject_RichCompareBool(val, other, op) try: @@ -1048,7 +1034,7 @@ cdef class _Timestamp(datetime): cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: - cdef datetime dtval = self.to_datetime() + cdef datetime dtval = self.to_pydatetime() self._assert_tzawareness_compat(other) @@ -1078,9 +1064,28 @@ cdef class _Timestamp(datetime): raise TypeError('Cannot compare tz-naive and tz-aware timestamps') cpdef datetime to_datetime(_Timestamp self): + """ + DEPRECATED: use :meth:`to_pydatetime` instead. + + Convert a Timestamp object to a native Python datetime object. + """ + warnings.warn("to_datetime is deprecated. Use self.to_pydatetime()", + FutureWarning, stacklevel=2) + return self.to_pydatetime(warn=False) + + cpdef datetime to_pydatetime(_Timestamp self, warn=True): + """ + Convert a Timestamp object to a native Python datetime object. + + If warn=True, issue a warning if nanoseconds is nonzero. 
+ """ cdef: pandas_datetimestruct dts _TSObject ts + + if self.nanosecond != 0 and warn: + warnings.warn("Discarding nonzero nanoseconds in conversion", + UserWarning, stacklevel=2) ts = convert_to_tsobject(self, self.tzinfo, None, 0, 0) dts = ts.dts return datetime(dts.year, dts.month, dts.day, From 10bf7213240eaa0960f50f86c7790d64d8973060 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Aug 2016 14:44:19 +0200 Subject: [PATCH 312/359] DOC: small update to install.rst page (#14115) Move some content from the removed content on dependencies from the README (see GH13882) --- README.md | 2 +- doc/source/install.rst | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1334d9696764b..6ebc287fa2cf6 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ pip install pandas - [pytz](http://pytz.sourceforge.net) - Needed for time zone support with ``pandas.date_range`` -See the [full installation instructions](http://pandas.pydata.org/pandas-docs/stable/install.html) +See the [full installation instructions](http://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for recommended and optional dependencies. ## Installation from sources diff --git a/doc/source/install.rst b/doc/source/install.rst index 82d2dcd1cc709..f8ee0542ea17e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -254,11 +254,14 @@ Optional Dependencies - `SQLite `__: for SQLite, this is included in Python's standard library by default. * `matplotlib `__: for plotting -* `openpyxl `__, `xlrd/xlwt `__: Needed for Excel I/O -* `XlsxWriter `__: Alternative Excel writer +* For Excel I/O: + * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) + * `openpyxl `__: openpyxl version 1.6.1 + or higher (but lower than 2.0.0), or version 2.2 or higher, for writing .xlsx files (xlrd >= 0.9.0) + * `XlsxWriter `__: Alternative Excel writer + * `Jinja2 `__: Template engine for conditional HTML formatting. -* `boto `__: necessary for Amazon S3 - access. +* `boto `__: necessary for Amazon S3 access. * `blosc `__: for msgpack compression using ``blosc`` * One of `PyQt4 `__, `PySide @@ -266,7 +269,7 @@ Optional Dependencies `__, `xsel `__, or `xclip `__: necessary to use - :func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. + :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. * Google's `python-gflags <`__ , `oauth2client `__ , `httplib2 `__ @@ -274,7 +277,7 @@ Optional Dependencies : Needed for :mod:`~pandas.io.gbq` * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the - top-level :func:`~pandas.io.html.read_html` function: + top-level :func:`~pandas.read_html` function: * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is okay.) @@ -287,14 +290,14 @@ Optional Dependencies * if you install `BeautifulSoup4`_ you must install either `lxml`_ or `html5lib`_ or both. - :func:`~pandas.io.html.read_html` will **not** work with *only* + :func:`~pandas.read_html` will **not** work with *only* `BeautifulSoup4`_ installed. * You are highly encouraged to read :ref:`HTML reading gotchas `. 
It explains issues surrounding the installation and usage of the above three libraries * You may need to install an older version of `BeautifulSoup4`_: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian + - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and + 32-bit Ubuntu/Debian * Additionally, if you're using `Anaconda`_ you should definitely read :ref:`the gotchas about HTML parsing libraries ` From 0c1e052af40ad8bf0c8a11fa3014aa6e0986def0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Aug 2016 09:52:49 +0200 Subject: [PATCH 313/359] BUG: series resample with timedelta values looses dtype (GH13119) (#14118) _possibly_downcast_to_dtype did not work for timedelta data --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tests/types/test_cast.py | 76 ++++++++++++++++----------- pandas/tseries/tests/test_resample.py | 17 ++++++ pandas/types/cast.py | 4 +- 4 files changed, 64 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 45cdd23140487..1e14a2308e7e9 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1260,7 +1260,7 @@ Bug Fixes - Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) - Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`) - +- Bug in ``resample`` with timedelta data where data was casted to float (:issue:`13119`). - Bug in ``pd.isnull()`` ``pd.notnull()`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) - Bug in ``pd.merge()`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 2b4998fd64f4a..56a14a51105ca 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -24,44 +24,56 @@ _multiprocess_can_split_ = True -def test_downcast_conv(): - # test downcasting +class TestPossiblyDowncast(tm.TestCase): - arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) - result = _possibly_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) + def test_downcast_conv(self): + # test downcasting - arr = np.array([8., 8., 8., 8., 8.9999999999995]) - result = _possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - arr = np.array([8., 8., 8., 8., 9.0000000000005]) - result = _possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - # conversions + arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + assert (np.array_equal(result, arr)) - expected = np.array([1, 2]) - for dtype in [np.float64, object, np.int64]: - arr = np.array([1.0, 2.0], dtype=dtype) + arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = _possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected, check_dtype=False) + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) - for dtype in [np.float64, object]: - expected = np.array([1.0, 2.0, np.nan], dtype=dtype) - arr = np.array([1.0, 2.0, np.nan], dtype=dtype) + arr = 
np.array([8., 8., 8., 8., 9.0000000000005]) result = _possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected) - - # empties - for dtype in [np.int32, np.float64, np.float32, np.bool_, - np.int64, object]: - arr = np.array([], dtype=dtype) - result = _possibly_downcast_to_dtype(arr, 'int64') - tm.assert_almost_equal(result, np.array([], dtype=np.int64)) - assert result.dtype == np.int64 + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + # conversions + + expected = np.array([1, 2]) + for dtype in [np.float64, object, np.int64]: + arr = np.array([1.0, 2.0], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected, check_dtype=False) + + for dtype in [np.float64, object]: + expected = np.array([1.0, 2.0, np.nan], dtype=dtype) + arr = np.array([1.0, 2.0, np.nan], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected) + + # empties + for dtype in [np.int32, np.float64, np.float32, np.bool_, + np.int64, object]: + arr = np.array([], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'int64') + tm.assert_almost_equal(result, np.array([], dtype=np.int64)) + assert result.dtype == np.int64 + + def test_datetimelikes_nan(self): + arr = np.array([1, 2, np.nan]) + exp = np.array([1, 2, np.datetime64('NaT')], dtype='datetime64[ns]') + res = _possibly_downcast_to_dtype(arr, 'datetime64[ns]') + tm.assert_numpy_array_equal(res, exp) + + exp = np.array([1, 2, np.timedelta64('NaT')], dtype='timedelta64[ns]') + res = _possibly_downcast_to_dtype(arr, 'timedelta64[ns]') + tm.assert_numpy_array_equal(res, exp) class TestInferDtype(tm.TestCase): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 85d8cd52e1866..49802ba640d70 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1935,6 +1935,23 @@ def test_resample_with_nat(self): assert_frame_equal(frame.resample('60s').mean(), frame_3s) + def test_resample_timedelta_values(self): + # GH 13119 + # check that timedelta dtype is preserved when NaT values are + # introduced by the resampling + + times = timedelta_range('1 day', '4 day', freq='4D') + df = DataFrame({'time': times}, index=times) + + times2 = timedelta_range('1 day', '4 day', freq='2D') + exp = Series(times2, index=times2, name='time') + exp.iloc[1] = pd.NaT + + res = df.resample('2D').first()['time'] + tm.assert_series_equal(res, exp) + res = df['time'].resample('2D').first() + tm.assert_series_equal(res, exp) + class TestPeriodIndex(Base, tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 59c939126d2a4..a79862eb195b6 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -12,7 +12,7 @@ is_timedelta64_dtype, is_dtype_equal, is_float_dtype, is_complex_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype, - is_scalar, + is_bool_dtype, is_scalar, _string_dtypes, _coerce_to_dtype, _ensure_int8, _ensure_int16, @@ -89,7 +89,7 @@ def trans(x): # noqa if issubclass(dtype.type, np.floating): return result.astype(dtype) - elif dtype == np.bool_ or issubclass(dtype.type, np.integer): + elif is_bool_dtype(dtype) or is_integer_dtype(dtype): # if we don't have any elements, just astype it if not np.prod(result.shape): From b6d3a81864a03d8b7310315067e6df464d54b9d7 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Wed, 31 Aug 2016 16:57:33 +0900 Subject: [PATCH 314/359] ENH: Sparse dtypes 
(#13849) Add better support for int64 and bool data types in sparse objects --- doc/source/sparse.rst | 55 +++++++++++ doc/source/whatsnew/v0.19.0.txt | 55 ++++++++++- pandas/core/generic.py | 13 ++- pandas/core/internals.py | 3 - pandas/formats/format.py | 3 + pandas/io/tests/test_pickle.py | 7 ++ pandas/sparse/array.py | 115 +++++++++++----------- pandas/sparse/frame.py | 21 +++- pandas/sparse/series.py | 47 +++++---- pandas/sparse/tests/test_arithmetics.py | 59 +++++++++++ pandas/sparse/tests/test_array.py | 69 ++++++++----- pandas/sparse/tests/test_format.py | 58 ++++++++++- pandas/sparse/tests/test_frame.py | 125 ++++++++++++++++++++---- pandas/sparse/tests/test_indexing.py | 15 +++ pandas/sparse/tests/test_libsparse.py | 55 +++++++++++ pandas/sparse/tests/test_series.py | 102 ++++++++++++++----- pandas/tests/series/test_subclass.py | 47 ++++++--- pandas/util/testing.py | 23 +++-- 18 files changed, 696 insertions(+), 176 deletions(-) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index db9734edde482..b6c5c15bc9081 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -132,6 +132,61 @@ keeps an arrays of all of the locations where the data are not equal to the fill value. The ``block`` format tracks only the locations and sizes of blocks of data. +.. _sparse.dtype: + +Sparse Dtypes +------------- + +Sparse data should have the same dtype as its dense representation. Currently, +``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original +dtype, ``fill_value`` default changes: + +- ``float64``: ``np.nan`` +- ``int64``: ``0`` +- ``bool``: ``False`` + +.. ipython:: python + + s = pd.Series([1, np.nan, np.nan]) + s + s.to_sparse() + + s = pd.Series([1, 0, 0]) + s + s.to_sparse() + + s = pd.Series([True, False, True]) + s + s.to_sparse() + +You can change the dtype using ``.astype()``, the result is also sparse. Note that +``.astype()`` also affects to the ``fill_value`` to keep its dense represantation. + + +.. ipython:: python + + s = pd.Series([1, 0, 0, 0, 0]) + s + ss = s.to_sparse() + ss + ss.astype(np.float64) + +It raises if any value cannot be coerced to specified dtype. + +.. code-block:: ipython + + In [1]: ss = pd.Series([1, np.nan, np.nan]).to_sparse() + 0 1.0 + 1 NaN + 2 NaN + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([1], dtype=int32) + + In [2]: ss.astype(np.int64) + ValueError: unable to coerce current fill_value nan to int64 dtype + .. _sparse.calculation: Sparse Calculation diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 1e14a2308e7e9..918a6a2361f6a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -17,6 +17,7 @@ Highlights include: - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See ref:`here ` +- Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -975,6 +976,51 @@ Sparse Changes These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. 
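As a quick illustration of the new dtype and ``fill_value`` defaults documented in the sparse.rst additions above (an editorial sketch, not part of the patch; it uses the 0.19-era top-level ``pd.SparseArray`` constructor that the whatsnew examples below also rely on):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # int64 sparse data now defaults to fill_value=0 (previously NaN)
    arr = pd.SparseArray([1, 2, 0, 0], dtype=np.int64)
    arr.fill_value   # 0

    # bool sparse data keeps bool dtype and defaults to fill_value=False
    mask = pd.SparseArray([True, False, False, False])
    mask.dtype       # dtype('bool')
    mask.fill_value  # False

    # .astype() converts the stored values and the fill_value together
    arr.astype(np.float64).fill_value  # 0.0
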
+ +``int64`` and ``bool`` support enhancements +""""""""""""""""""""""""""""""""""""""""""" + +Sparse data structures now gained enhanced support of ``int64`` and ``bool`` ``dtype`` (:issue:`667`, :issue:`13849`) + +Previously, sparse data were ``float64`` dtype by default, even if all inputs were ``int`` or ``bool`` dtype. You had to specify ``dtype`` explicitly to create sparse data with ``int64`` dtype. Also, ``fill_value`` had to be specified explicitly becuase it's default was ``np.nan`` which doesn't appear in ``int64`` or ``bool`` data. + +.. code-block:: ipython + + In [1]: pd.SparseArray([1, 2, 0, 0]) + Out[1]: + [1.0, 2.0, 0.0, 0.0] + Fill: nan + IntIndex + Indices: array([0, 1, 2, 3], dtype=int32) + + # specifying int64 dtype, but all values are stored in sp_values because + # fill_value default is np.nan + In [2]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64) + Out[2]: + [1, 2, 0, 0] + Fill: nan + IntIndex + Indices: array([0, 1, 2, 3], dtype=int32) + + In [3]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64, fill_value=0) + Out[3]: + [1, 2, 0, 0] + Fill: 0 + IntIndex + Indices: array([0, 1], dtype=int32) + +As of v0.19.0, sparse data keeps the input dtype, and assign more appropriate ``fill_value`` default (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). + +.. ipython :: python + + pd.SparseArray([1, 2, 0, 0], dtype=np.int64) + pd.SparseArray([True, False, False, False]) + +See the :ref:`docs ` for more details. + +Operators now preserve dtypes +""""""""""""""""""""""""""""" + - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) .. ipython:: python @@ -1001,6 +1047,9 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` Out[7]: ValueError: unable to coerce current fill_value nan to int64 dtype +Other sparse fixes +"""""""""""""""""" + - Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) - ``SparseArray`` with ``bool`` dtype now supports logical (bool) operators (:issue:`14000`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) @@ -1011,6 +1060,11 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) +- Bug in ``SparseSeries`` slicing changes integer dtype to float (:issue:`8292`) +- Bug in ``SparseDataFarme`` comparison ops may raise ``TypeError`` (:issue:`13001`) +- Bug in ``SparseDataFarme.isnull`` raises ``ValueError`` (:issue:`8276`) +- Bug in ``SparseSeries`` representation with ``bool`` dtype may raise ``IndexError`` (:issue:`13110`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` of ``bool`` or ``int64`` dtype may display its values like ``float64`` dtype (:issue:`13110`) - Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`) - Bug in ``SparseArray`` created from ``SparseSeries`` may lose ``dtype`` (:issue:`13999`) - Bug in ``SparseSeries`` comparison with dense returns normal ``Series`` rather than ``SparseSeries`` (:issue:`13999`) @@ -1053,7 +1107,6 @@ New behaviour: In [2]: i.get_indexer(['b', 'b', 'c']).dtype Out[2]: dtype('int64') - .. 
_whatsnew_0190.deprecations: Deprecations diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8e295174771c4..2a6f00c65c7fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3779,24 +3779,29 @@ def asof(self, where, subset=None): # ---------------------------------------------------------------------- # Action Methods - def isnull(self): - """ + _shared_docs['isnull'] = """ Return a boolean same-sized object indicating if the values are null. See Also -------- notnull : boolean inverse of isnull """ + + @Appender(_shared_docs['isnull']) + def isnull(self): return isnull(self).__finalize__(self) - def notnull(self): - """Return a boolean same-sized object indicating if the values are + _shared_docs['isnotnull'] = """ + Return a boolean same-sized object indicating if the values are not null. See Also -------- isnull : boolean inverse of notnull """ + + @Appender(_shared_docs['isnotnull']) + def notnull(self): return notnull(self).__finalize__(self) def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e11fd4086347f..bb2d1a9d1b5d3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2478,9 +2478,6 @@ def fill_value(self): @fill_value.setter def fill_value(self, v): - # we may need to upcast our fill to match our dtype - if issubclass(self.dtype.type, np.floating): - v = float(v) self.values.fill_value = v def to_dense(self): diff --git a/pandas/formats/format.py b/pandas/formats/format.py index b83e3c4e73fdb..cb8fb3a5d2e49 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -21,6 +21,7 @@ is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype) +from pandas.types.generic import ABCSparseArray from pandas.core.base import PandasObject from pandas.core.index import Index, MultiIndex, _ensure_index @@ -1966,6 +1967,8 @@ def _format(x): vals = self.values if isinstance(vals, Index): vals = vals._values + elif isinstance(vals, ABCSparseArray): + vals = vals.values is_float_type = lib.map_infer(vals, is_float) & notnull(vals) leading_space = is_float_type.any() diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 94885d90d3c4a..a49f50b1bcb9f 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -163,6 +163,13 @@ def compare_index_period(self, result, expected, typ, version): tm.assert_equal(result.freqstr, 'M') tm.assert_index_equal(result.shift(2), expected.shift(2)) + def compare_sp_frame_float(self, result, expected, typ, version): + if LooseVersion(version) <= '0.18.1': + tm.assert_sp_frame_equal(result, expected, exact_indices=False, + check_dtype=False) + else: + tm.assert_sp_frame_equal(result, expected) + def read_pickles(self, version): if not is_platform_little_endian(): raise nose.SkipTest("known failure on non-little endian") diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index ca9d5efe2fbe5..8420371d05e02 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -4,25 +4,25 @@ from __future__ import division # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import pandas as pd from pandas.core.base import PandasObject -from pandas import compat, lib +from pandas import compat from pandas.compat import range from pandas.compat.numpy import function as nv from pandas.types.generic import ABCSparseArray, ABCSparseSeries -from pandas.types.common import (is_float, is_integer, - is_integer_dtype, 
_ensure_platform_int, +from pandas.types.common import (_ensure_platform_int, + is_float, is_integer, + is_integer_dtype, is_bool_dtype, is_list_like, is_scalar, is_dtype_equal) from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, - _astype_nansafe) -from pandas.types.missing import isnull, notnull + _astype_nansafe, _find_common_type) +from pandas.types.missing import isnull, notnull, na_value_for_dtype from pandas._sparse import SparseIndex, BlockIndex, IntIndex import pandas._sparse as splib @@ -69,16 +69,6 @@ def wrapper(self, other): return wrapper -def _maybe_match_dtype(left, right): - if not hasattr(right, 'dtype'): - return left.dtype - elif left.dtype == right.dtype: - return getattr(left.dtype, '__name__', left.dtype) - else: - # ToDo: to be supported after GH 667 - raise NotImplementedError('dtypes must be identical') - - def _get_fill(arr): # coerce fill_value to arr dtype if possible # int64 SparseArray can have NaN as fill_value if there is no missing @@ -99,7 +89,15 @@ def _sparse_array_op(left, right, op, name, series=False): left = left.astype(np.float64) right = right.astype(np.float64) - dtype = _maybe_match_dtype(left, right) + # dtype used to find corresponding sparse method + if not is_dtype_equal(left.dtype, right.dtype): + dtype = _find_common_type([left.dtype, right.dtype]) + left = left.astype(dtype) + right = right.astype(dtype) + else: + dtype = left.dtype + + # dtype the result must have result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: @@ -147,11 +145,11 @@ def _sparse_array_op(left, right, op, name, series=False): def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): - # ToDo: We can remove this condition when removing - # SparseArray's dtype default when closing GH 667 dtype = np.bool - elif name == 'truediv': - dtype = np.float64 + + if is_bool_dtype(dtype): + # fill_value may be np.bool_ + fill_value = bool(fill_value) return SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype) @@ -164,7 +162,8 @@ class SparseArray(PandasObject, np.ndarray): data : {array-like (1-D), Series, SparseSeries, dict} kind : {'block', 'integer'} fill_value : float - Defaults to NaN (code for missing) + Code for missing value. Defaults depends on dtype. + 0 for int dtype, False for bool dtype, and NaN for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. 
Mainly used internally @@ -182,7 +181,7 @@ class SparseArray(PandasObject, np.ndarray): fill_value = None def __new__(cls, data, sparse_index=None, index=None, kind='integer', - fill_value=None, dtype=np.float64, copy=False): + fill_value=None, dtype=None, copy=False): if index is not None: if data is None: @@ -199,25 +198,18 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', if dtype is not None: dtype = np.dtype(dtype) - if is_sparse_array: - # temp, always inherit passed SparseArray dtype - # can be removed after GH 13849 - dtype = data.dtype - - if fill_value is None: - if is_sparse_array: - fill_value = data.fill_value - else: - fill_value = nan if is_sparse_array: sparse_index = data.sp_index - values = np.asarray(data) + values = data.sp_values + fill_value = data.fill_value else: # array-like if sparse_index is None: - values, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + if dtype is not None: + data = np.asarray(data, dtype=dtype) + res = make_sparse(data, kind=kind, fill_value=fill_value) + values, sparse_index, fill_value = res else: values = _sanitize_values(data) if len(values) != sparse_index.npoints: @@ -226,31 +218,25 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', " index".format(type(values))) # Create array, do *not* copy data by default if copy: - try: - # ToDo: Can remove this error handling when we actually - # support other dtypes - subarr = np.array(values, dtype=dtype, copy=True) - except ValueError: - subarr = np.array(values, copy=True) + subarr = np.array(values, dtype=dtype, copy=True) else: - try: - subarr = np.asarray(values, dtype=dtype) - except ValueError: - subarr = np.asarray(values) - - # if we have a bool type, make sure that we have a bool fill_value - if ((dtype is not None and issubclass(dtype.type, np.bool_)) or - (data is not None and lib.is_bool_array(subarr))): - if np.isnan(fill_value) or not fill_value: - fill_value = False - else: - fill_value = bool(fill_value) - + subarr = np.asarray(values, dtype=dtype) # Change the class of the array to be the subclass type. 
return cls._simple_new(subarr, sparse_index, fill_value) @classmethod def _simple_new(cls, data, sp_index, fill_value): + if not isinstance(sp_index, SparseIndex): + # caller must pass SparseIndex + raise ValueError('sp_index must be a SparseIndex') + + if fill_value is None: + if sp_index.ngaps > 0: + # has missing hole + fill_value = np.nan + else: + fill_value = na_value_for_dtype(data.dtype) + if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, @@ -318,7 +304,7 @@ def __array_finalize__(self, obj): def __reduce__(self): """Necessary for making this object picklable""" - object_state = list(ndarray.__reduce__(self)) + object_state = list(np.ndarray.__reduce__(self)) subclass_state = self.fill_value, self.sp_index object_state[2] = (object_state[2], subclass_state) return tuple(object_state) @@ -326,7 +312,7 @@ def __reduce__(self): def __setstate__(self, state): """Necessary for making this object picklable""" nd_state, own_state = state - ndarray.__setstate__(self, nd_state) + np.ndarray.__setstate__(self, nd_state) fill_value, sp_index = own_state[:2] self.sp_index = sp_index @@ -404,9 +390,11 @@ def __iter__(self): yield self._get_val_at(i) def __getitem__(self, key): + """ """ + if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): @@ -531,7 +519,11 @@ def astype(self, dtype=None, copy=True): dtype = np.dtype(dtype) sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy) try: - fill_value = dtype.type(self.fill_value) + if is_bool_dtype(dtype): + # to avoid np.bool_ dtype + fill_value = bool(self.fill_value) + else: + fill_value = dtype.type(self.fill_value) except ValueError: msg = 'unable to coerce current fill_value {0} to {1} dtype' raise ValueError(msg.format(self.fill_value, dtype)) @@ -726,7 +718,7 @@ def _sanitize_values(arr): return arr -def make_sparse(arr, kind='block', fill_value=nan): +def make_sparse(arr, kind='block', fill_value=None): """ Convert ndarray to sparse format @@ -746,6 +738,9 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") + if fill_value is None: + fill_value = na_value_for_dtype(arr.dtype) + if isnull(fill_value): mask = notnull(arr) else: @@ -760,7 +755,7 @@ def make_sparse(arr, kind='block', fill_value=nan): index = _make_index(length, indices, kind) sparsified_values = arr[mask] - return sparsified_values, index + return sparsified_values, index, fill_value def _make_index(length, indices, kind): diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index f382a4b869a3e..8eeff045d1fac 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -11,6 +11,7 @@ import numpy as np from pandas.types.missing import isnull, notnull +from pandas.types.cast import _maybe_upcast from pandas.types.common import _ensure_platform_int from pandas.core.common import _try_sort @@ -22,12 +23,15 @@ import pandas.core.algorithms as algos from pandas.core.internals import (BlockManager, create_block_manager_from_arrays) -from pandas.core.generic import NDFrame +import pandas.core.generic as generic from pandas.sparse.series import SparseSeries, SparseArray from pandas.util.decorators import Appender import pandas.core.ops as ops +_shared_doc_kwargs = dict(klass='SparseDataFrame') + + class SparseDataFrame(DataFrame): """ DataFrame containing sparse floating point data in the form of SparseSeries @@ -118,7 +122,7 @@ def __init__(self, data=None, index=None, columns=None, 
default_kind=None, if dtype is not None: mgr = mgr.astype(dtype) - NDFrame.__init__(self, mgr) + generic.NDFrame.__init__(self, mgr) @property def _constructor(self): @@ -509,7 +513,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): new_data, index=self.index, columns=union, default_fill_value=self.default_fill_value).__finalize__(self) - def _combine_const(self, other, func): + def _combine_const(self, other, func, raise_on_error=True): return self._apply_columns(lambda x: func(x, other)) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, @@ -542,6 +546,9 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, new = values.take(indexer) if need_mask: new = new.values + # convert integer to float if necessary. need to do a lot + # more than that, handle boolean etc also + new, fill_value = _maybe_upcast(new, fill_value=fill_value) np.putmask(new, mask, fill_value) new_series[col] = new @@ -686,6 +693,14 @@ def cumsum(self, axis=0, *args, **kwargs): return self.apply(lambda x: x.cumsum(), axis=axis) + @Appender(generic._shared_docs['isnull']) + def isnull(self): + return self._apply_columns(lambda x: x.isnull()) + + @Appender(generic._shared_docs['isnotnull']) + def isnotnull(self): + return self._apply_columns(lambda x: x.isnotnull()) + def apply(self, func, axis=0, broadcast=False, reduce=False): """ Analogous to DataFrame.apply, for SparseDataFrame diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 888dbde8ffb0f..ad9168890b8f2 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -8,7 +8,7 @@ import numpy as np import warnings -from pandas.types.missing import isnull +from pandas.types.missing import isnull, notnull from pandas.types.common import is_scalar from pandas.core.common import _values_from_object, _maybe_match_name @@ -91,7 +91,8 @@ class SparseSeries(Series): data : {array-like, Series, SparseSeries, dict} kind : {'block', 'integer'} fill_value : float - Defaults to NaN (code for missing) + Code for missing value. Defaults depends on dtype. + 0 for int dtype, False for bool dtype, and NaN for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. 
Mainly used internally @@ -125,26 +126,20 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if isinstance(data, Series) and name is None: name = data.name - is_sparse_array = isinstance(data, SparseArray) - if fill_value is None: - if is_sparse_array: - fill_value = data.fill_value - else: - fill_value = np.nan - - if is_sparse_array: - if isinstance(data, SparseSeries) and index is None: - index = data.index.view() - elif index is not None: + if isinstance(data, SparseArray): + if index is not None: assert (len(index) == len(data)) - sparse_index = data.sp_index + if fill_value is None: + fill_value = data.fill_value + data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() - + if fill_value is None: + fill_value = data.fill_value # extract the SingleBlockManager data = data._data @@ -153,14 +148,14 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', index = data.index.view() data = Series(data) - data, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + res = make_sparse(data, kind=kind, fill_value=fill_value) + data, sparse_index, fill_value = res elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: - data, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + res = make_sparse(data, kind=kind, fill_value=fill_value) + data, sparse_index, fill_value = res else: assert (len(data) == sparse_index.npoints) @@ -636,6 +631,20 @@ def cumsum(self, axis=0, *args, **kwargs): # TODO: gh-12855 - return a SparseSeries here return Series(new_array, index=self.index).__finalize__(self) + @Appender(generic._shared_docs['isnull']) + def isnull(self): + arr = SparseArray(isnull(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=isnull(self.fill_value)) + return self._constructor(arr, index=self.index).__finalize__(self) + + @Appender(generic._shared_docs['isnotnull']) + def isnotnull(self): + arr = SparseArray(notnull(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=notnull(self.fill_value)) + return self._constructor(arr, index=self.index).__finalize__(self) + def dropna(self, axis=0, inplace=False, **kwargs): """ Analogous to Series.dropna. 
If fill_value=NaN, returns a dense Series diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py index def3d15a43f0f..f24244b38c42b 100644 --- a/pandas/sparse/tests/test_arithmetics.py +++ b/pandas/sparse/tests/test_arithmetics.py @@ -357,6 +357,65 @@ def test_bool_array_logical(self): fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) + def test_mixed_array_float_int(self): + + for rdtype in ['int64']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + def test_mixed_array_comparison(self): + + # int32 NI ATM + for rdtype in ['int64']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 63e29656b66ea..dd86e9e791e5e 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -30,9 +30,13 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertEqual(arr.fill_value, 0) + arr = SparseArray([0, 1, 2, 4], dtype=np.float64) + self.assertEqual(arr.dtype, np.float64) + self.assertTrue(np.isnan(arr.fill_value)) + arr = SparseArray([0, 1, 2, 4], dtype=np.int64) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) self.assertEqual(arr.dtype, np.int64) @@ -40,7 +44,7 @@ def test_constructor_dtype(self): arr = SparseArray([0, 1, 2, 4], dtype=None) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) 
self.assertEqual(arr.dtype, np.int64) @@ -63,13 +67,13 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertTrue(np.isnan(arr.fill_value)) - arr = SparseArray(data=[0, 1, 2, 3], - sparse_index=IntIndex(4, [0, 1, 2, 3]), - dtype=np.int64) - exp = SparseArray([0, 1, 2, 3], dtype=np.int64) + arr = SparseArray(data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, fill_value=0) + exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) @@ -78,22 +82,20 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) - arr = SparseArray(data=[0, 1, 2, 3], - sparse_index=IntIndex(4, [0, 1, 2, 3]), - dtype=None) + arr = SparseArray(data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) # scalar input - arr = SparseArray(data=1, - sparse_index=IntIndex(1, [0]), - dtype=None) + arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) @@ -576,44 +578,63 @@ def test_generator_warnings(self): def test_fillna(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) res = s.fillna(-1) - exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(-1) - exp = SparseArray([1, -1, -1, 3, -1], fill_value=0) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0]) res = s.fillna(-1) - exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) res = s.fillna(-1) - exp = SparseArray([1, -1, 0, 3, 0], fill_value=0) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan]) res = s.fillna(-1) - exp = SparseArray([-1, -1, -1, -1], fill_value=-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) res = s.fillna(-1) - exp = SparseArray([-1, -1, -1, -1], fill_value=0) + exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) - s = SparseArray([0, 0, 0, 0]) + # float dtype's fill_value is np.nan, replaced by -1 + s = SparseArray([0., 0., 0., 0.]) res = s.fillna(-1) - exp = SparseArray([0, 0, 0, 0], fill_value=-1) + exp = SparseArray([0., 0., 0., 0.], fill_value=-1) tm.assert_sp_array_equal(res, exp) + # int dtype shouldn't have missing. No changes. 
+ s = SparseArray([0, 0, 0, 0]) + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + res = s.fillna(-1) + tm.assert_sp_array_equal(res, s) + s = SparseArray([0, 0, 0, 0], fill_value=0) + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=0) tm.assert_sp_array_equal(res, exp) + # fill_value can be nan if there is no missing hole. + # only fill_value will be changed + s = SparseArray([0, 0, 0, 0], fill_value=np.nan) + self.assertEqual(s.dtype, np.int64) + self.assertTrue(np.isnan(s.fill_value)) + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) # filling with existing value doesn't replace existing value with @@ -624,7 +645,7 @@ def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(3) - exp = SparseArray([1, 3, 3, 3, 3], fill_value=0) + exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py index 9bdc1fdd101ea..377eaa20565a2 100644 --- a/pandas/sparse/tests/test_format.py +++ b/pandas/sparse/tests/test_format.py @@ -13,7 +13,7 @@ use_32bit_repr = is_platform_windows() or is_platform_32bit() -class TestSeriesFormatting(tm.TestCase): +class TestSparseSeriesFormatting(tm.TestCase): _multiprocess_can_split_ = True @@ -62,3 +62,59 @@ def test_sparse_mi_max_row(self): "Block locations: array([0, 3]{0})\n" "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) + + def test_sparse_bool(self): + # GH 13110 + s = pd.SparseSeries([True, False, False, True, False, False], + fill_value=False) + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 True\n1 False\n2 False\n" + "3 True\n4 False\n5 False\n" + "dtype: bool\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + result = repr(s) + exp = ("0 True\n ... 
\n5 False\n" + "dtype: bool\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + def test_sparse_int(self): + # GH 13110 + s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False) + + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n" + "5 0\ndtype: int64\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + result = repr(s) + exp = ("0 0\n ..\n5 0\n" + "dtype: int64\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + +class TestSparseDataFrameFormatting(tm.TestCase): + + def test_sparse_frame(self): + # GH 13110 + df = pd.DataFrame({'A': [True, False, True, False, True], + 'B': [True, False, True, False, True], + 'C': [0, 0, 3, 0, 5], + 'D': [np.nan, np.nan, np.nan, 1, 2]}) + sparse = df.to_sparse() + self.assertEqual(repr(sparse), repr(df)) + + with option_context("display.max_rows", 3): + self.assertEqual(repr(sparse), repr(df)) diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 67b108c5dc648..192f6532a148d 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -25,10 +25,9 @@ class TestSparseDataFrame(tm.TestCase, SharedWithSparse): _multiprocess_can_split_ = True def setUp(self): - self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), + 'C': np.arange(10, dtype=np.float64), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) @@ -125,10 +124,12 @@ def test_constructor(self): default_fill_value=self.frame.default_fill_value, default_kind=self.frame.default_kind, copy=True) reindexed = self.frame.reindex(idx) + tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) # assert level parameter breaks reindex - self.assertRaises(TypeError, self.frame.reindex, idx, level=0) + with tm.assertRaises(TypeError): + self.frame.reindex(idx, level=0) repr(self.frame) @@ -569,18 +570,23 @@ def test_apply(self): self.frame.to_dense().apply(nanops.nansum)) def test_apply_nonuq(self): - df_orig = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) - df = df_orig.to_sparse() - rs = df.apply(lambda s: s[0], axis=1) - xp = Series([1., 4., 7.], ['a', 'a', 'c']) - tm.assert_series_equal(rs, xp) + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + # dtype must be kept + self.assertEqual(res.dtype, np.int64) + # ToDo: apply must return subclassed dtype + self.assertIsInstance(res, pd.Series) + tm.assert_series_equal(res.to_dense(), exp) # df.T breaks - df = df_orig.T.to_sparse() - rs = df.apply(lambda s: s[0], axis=0) # noqa + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) # TODO: no non-unique columns supported in sparse yet - # assert_series_equal(rs, xp) + # tm.assert_series_equal(res.to_dense(), exp) def test_applymap(self): # just test that it works @@ -596,8 +602,10 @@ def test_astype(self): self.assertEqual(sparse['B'].dtype, np.int64) res = sparse.astype(np.float64) - exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 
3., 4.]), - 'B': SparseArray([4., 5., 6., 7.])}, + exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], + fill_value=0.), + 'B': SparseArray([4., 5., 6., 7.], + fill_value=0.)}, default_fill_value=np.nan) tm.assert_sp_frame_equal(res, exp) self.assertEqual(res['A'].dtype, np.float64) @@ -612,8 +620,10 @@ def test_astype(self): self.assertEqual(sparse['B'].dtype, np.int64) res = sparse.astype(np.float64) - exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]), - 'B': SparseArray([0., 5., 0., 7.])}, + exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.], + fill_value=0.), + 'B': SparseArray([0., 5., 0., 7.], + fill_value=0.)}, default_fill_value=0.) tm.assert_sp_frame_equal(res, exp) self.assertEqual(res['A'].dtype, np.float64) @@ -813,6 +823,10 @@ def _check(frame, orig): untransposed = transposed.T tm.assert_sp_frame_equal(frame, untransposed) + tm.assert_frame_equal(frame.T.to_dense(), orig.T) + tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T) + tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False) + self._check_all(_check) def test_shift(self): @@ -821,8 +835,8 @@ def _check(frame, orig): shifted = frame.shift(0) exp = orig.shift(0) - # int is coerced to float dtype - tm.assert_frame_equal(shifted.to_dense(), exp, check_dtype=False) + tm.assert_frame_equal(shifted.to_dense(), exp) + shifted = frame.shift(1) exp = orig.shift(1) tm.assert_frame_equal(shifted, exp) @@ -932,12 +946,85 @@ def test_nan_columnname(self): nan_colname_sparse = nan_colname.to_sparse() self.assertTrue(np.isnan(nan_colname_sparse.columns[0])) + def test_isnull(self): + # GH 8276 + df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], + 'B': [0, np.nan, np.nan, 2, np.nan]}) + + res = df.isnull() + exp = pd.SparseDataFrame({'A': [True, True, False, False, True], + 'B': [False, True, True, False, True]}, + default_fill_value=True) + exp._default_fill_value = np.nan + tm.assert_sp_frame_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], + 'B': [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.) + res = df.isnull() + tm.assertIsInstance(res, pd.SparseDataFrame) + exp = pd.DataFrame({'A': [False, False, False, False, True], + 'B': [False, True, False, False, True]}) + tm.assert_frame_equal(res.to_dense(), exp) + + def test_isnotnull(self): + # GH 8276 + df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], + 'B': [0, np.nan, np.nan, 2, np.nan]}) + + res = df.isnotnull() + exp = pd.SparseDataFrame({'A': [False, False, True, True, False], + 'B': [True, False, False, True, False]}, + default_fill_value=False) + exp._default_fill_value = np.nan + tm.assert_sp_frame_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], + 'B': [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.) 
+ res = df.isnotnull() + tm.assertIsInstance(res, pd.SparseDataFrame) + exp = pd.DataFrame({'A': [True, True, True, True, False], + 'B': [True, False, True, True, False]}) + tm.assert_frame_equal(res.to_dense(), exp) + + +class TestSparseDataFrameArithmetic(tm.TestCase): + + def test_numeric_op_scalar(self): + df = pd.DataFrame({'A': [nan, nan, 0, 1, ], + 'B': [0, 1, 2, nan], + 'C': [1., 2., 3., 4.], + 'D': [nan, nan, nan, nan]}) + sparse = df.to_sparse() + + tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) + + def test_comparison_op_scalar(self): + # GH 13001 + df = pd.DataFrame({'A': [nan, nan, 0, 1, ], + 'B': [0, 1, 2, nan], + 'C': [1., 2., 3., 4.], + 'D': [nan, nan, nan, nan]}) + sparse = df.to_sparse() + + # comparison changes internal repr, compare with dense + res = sparse > 1 + tm.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), df > 1) + + res = sparse != 0 + tm.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), df != 0) + class TestSparseDataFrameAnalytics(tm.TestCase): def setUp(self): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), + 'C': np.arange(10, dtype=float), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index d176d95bb7dbf..c0d4b70c41dc4 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -49,6 +49,21 @@ def test_getitem_slice(self): tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse()) + def test_getitem_int_dtype(self): + # GH 8292 + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name='xxx') + res = s[::2] + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name='xxx') + tm.assert_sp_series_equal(res, exp) + self.assertEqual(res.dtype, np.int64) + + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name='xxx') + res = s[::2] + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], + fill_value=0, name='xxx') + tm.assert_sp_series_equal(res, exp) + self.assertEqual(res.dtype, np.int64) + def test_getitem_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 4417411403baa..c289b4a1b204f 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -243,6 +243,61 @@ class TestSparseIndexCommon(tm.TestCase): _multiprocess_can_split_ = True + def test_int_internal(self): + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.indices, + np.array([2, 3], dtype=np.int32)) + + idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.indices, + np.array([], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.indices, + np.array([0, 1, 2, 3], dtype=np.int32)) + + def test_block_internal(self): + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + self.assertIsInstance(idx, BlockIndex) + 
self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.blocs, + np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([2], dtype=np.int32)) + + idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.blocs, + np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([4], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 3) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([1, 2], dtype=np.int32)) + def test_lookup(self): for kind in ['integer', 'block']: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 95361a8899c46..9d5a1327da53f 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -98,10 +98,14 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertEqual(arr.fill_value, 0) - arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) + arr = SparseSeries([0, 1, 2, 4], dtype=np.int64, fill_value=np.nan) self.assertEqual(arr.dtype, np.int64) self.assertTrue(np.isnan(arr.fill_value)) + arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + arr = SparseSeries([0, 1, 2, 4], fill_value=0, dtype=np.int64) self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) @@ -354,7 +358,19 @@ def test_shape(self): self.assertEqual(self.ziseries2.shape, (15, )) def test_astype(self): - self.assertRaises(Exception, self.bseries.astype, np.int64) + with tm.assertRaises(ValueError): + self.bseries.astype(np.int64) + + def test_astype_all(self): + orig = pd.Series(np.array([1, 2, 3])) + s = SparseSeries(orig) + + types = [np.float64, np.float32, np.int64, + np.int32, np.int16, np.int8] + for typ in types: + res = s.astype(typ) + self.assertEqual(res.dtype, typ) + tm.assert_series_equal(res.to_dense(), orig.astype(typ)) def test_kind(self): self.assertEqual(self.bseries.kind, 'block') @@ -766,7 +782,8 @@ def _check_matches(indices, expected): data = {} for i, idx in enumerate(indices): data[i] = SparseSeries(idx.to_int_index().indices, - sparse_index=idx) + sparse_index=idx, fill_value=np.nan) + # homogenized is only valid with NaN fill values homogenized = spf.homogenize(data) for k, v in compat.iteritems(homogenized): @@ -866,9 +883,14 @@ def test_shift_nan(self): def test_shift_dtype(self): # GH 12908 orig = pd.Series([1, 2, 3, 4], dtype=np.int64) - sparse = orig.to_sparse() + sparse = orig.to_sparse() tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse()) + + sparse = orig.to_sparse(fill_value=np.nan) + tm.assert_sp_series_equal(sparse.shift(0), + orig.shift(0).to_sparse(fill_value=np.nan)) + # shift(1) or more span changes dtype to float64 tm.assert_sp_series_equal(sparse.shift(1), orig.shift(1).to_sparse()) tm.assert_sp_series_equal(sparse.shift(2), 
orig.shift(2).to_sparse()) tm.assert_sp_series_equal(sparse.shift(3), orig.shift(3).to_sparse()) @@ -881,25 +903,27 @@ def test_shift_dtype(self): def test_shift_dtype_fill_value(self): # GH 12908 orig = pd.Series([1, 0, 0, 4], dtype=np.int64) - sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.shift(0), - orig.shift(0).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(1), - orig.shift(1).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(2), - orig.shift(2).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(3), - orig.shift(3).to_sparse(fill_value=0)) - - tm.assert_sp_series_equal(sparse.shift(-1), - orig.shift(-1).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-2), - orig.shift(-2).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-3), - orig.shift(-3).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-4), - orig.shift(-4).to_sparse(fill_value=0)) + for v in [0, 1, np.nan]: + sparse = orig.to_sparse(fill_value=v) + + tm.assert_sp_series_equal(sparse.shift(0), + orig.shift(0).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(1), + orig.shift(1).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(2), + orig.shift(2).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(3), + orig.shift(3).to_sparse(fill_value=v)) + + tm.assert_sp_series_equal(sparse.shift(-1), + orig.shift(-1).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-2), + orig.shift(-2).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-3), + orig.shift(-3).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-4), + orig.shift(-4).to_sparse(fill_value=v)) def test_combine_first(self): s = self.bseries @@ -1247,6 +1271,40 @@ def test_value_counts_int(self): tm.assert_series_equal(sparse.value_counts(dropna=False), dense.value_counts(dropna=False)) + def test_isnull(self): + # GH 8276 + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + + res = s.isnull() + exp = pd.SparseSeries([True, True, False, False, True], name='xxx', + fill_value=True) + tm.assert_sp_series_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', + fill_value=0.) + res = s.isnull() + tm.assertIsInstance(res, pd.SparseSeries) + exp = pd.Series([True, False, False, False, False], name='xxx') + tm.assert_series_equal(res.to_dense(), exp) + + def test_isnotnull(self): + # GH 8276 + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + + res = s.isnotnull() + exp = pd.SparseSeries([False, False, True, True, False], name='xxx', + fill_value=False) + tm.assert_sp_series_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', + fill_value=0.) 
+ res = s.isnotnull() + tm.assertIsInstance(res, pd.SparseSeries) + exp = pd.Series([False, True, True, True, True], name='xxx') + tm.assert_series_equal(res.to_dense(), exp) + def _dense_series_compare(s, f): result = f(s) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 440e433ffd95c..cc07c7d9dd59b 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -40,16 +40,33 @@ class TestSparseSeriesSubclassing(tm.TestCase): _multiprocess_can_split_ = True def test_subclass_sparse_slice(self): + # int64 s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) - tm.assert_sp_series_equal(s.loc[1:3], - tm.SubclassedSparseSeries([2.0, 3.0, 4.0], - index=[1, 2, 3])) - tm.assert_sp_series_equal(s.iloc[1:3], - tm.SubclassedSparseSeries([2.0, 3.0], - index=[1, 2])) - tm.assert_sp_series_equal(s[1:3], - tm.SubclassedSparseSeries([2.0, 3.0], - index=[1, 2])) + exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3]) + tm.assert_sp_series_equal(s.loc[1:3], exp) + self.assertEqual(s.loc[1:3].dtype, np.int64) + + exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) + tm.assert_sp_series_equal(s.iloc[1:3], exp) + self.assertEqual(s.iloc[1:3].dtype, np.int64) + + exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) + tm.assert_sp_series_equal(s[1:3], exp) + self.assertEqual(s[1:3].dtype, np.int64) + + # float64 + s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.]) + exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3]) + tm.assert_sp_series_equal(s.loc[1:3], exp) + self.assertEqual(s.loc[1:3].dtype, np.float64) + + exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + tm.assert_sp_series_equal(s.iloc[1:3], exp) + self.assertEqual(s.iloc[1:3].dtype, np.float64) + + exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + tm.assert_sp_series_equal(s[1:3], exp) + self.assertEqual(s[1:3].dtype, np.float64) def test_subclass_sparse_addition(self): s1 = tm.SubclassedSparseSeries([1, 3, 5]) @@ -66,9 +83,17 @@ def test_subclass_sparse_to_frame(self): s = tm.SubclassedSparseSeries([1, 2], index=list('abcd'), name='xxx') res = s.to_frame() - exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block') + exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block', + fill_value=0) exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr}, - index=list('abcd')) + index=list('abcd'), + default_fill_value=0) + tm.assert_sp_frame_equal(res, exp) + + # create from int dict + res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, + index=list('abcd'), + default_fill_value=0) tm.assert_sp_frame_equal(res, exp) s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('abcd'), diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d39569ea0b826..d50a6c460ceb5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1404,9 +1404,8 @@ def assert_sp_array_equal(left, right): assert_numpy_array_equal(left.values, right.values) -def assert_sp_series_equal(left, right, exact_indices=True, - check_series_type=True, - check_names=True, +def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, + check_series_type=True, check_names=True, obj='SparseSeries'): """Check that the left and right SparseSeries are equal. @@ -1414,6 +1413,8 @@ def assert_sp_series_equal(left, right, exact_indices=True, ---------- left : SparseSeries right : SparseSeries + check_dtype : bool, default True + Whether to check the Series dtype is identical. 
exact_indices : bool, default True check_series_type : bool, default True Whether to check the SparseSeries class is identical. @@ -1436,20 +1437,22 @@ def assert_sp_series_equal(left, right, exact_indices=True, if check_names: assert_attr_equal('name', left, right) - assert_attr_equal('dtype', left, right) + if check_dtype: + assert_attr_equal('dtype', left, right) assert_numpy_array_equal(left.values, right.values) -def assert_sp_frame_equal(left, right, exact_indices=True, - check_frame_type=True, - obj='SparseDataFrame'): +def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, + check_frame_type=True, obj='SparseDataFrame'): """Check that the left and right SparseDataFrame are equal. Parameters ---------- left : SparseDataFrame right : SparseDataFrame + check_dtype : bool, default True + Whether to check the Series dtype is identical. exact_indices : bool, default True SparseSeries SparseIndex objects must be exactly the same, otherwise just compare dense representations. @@ -1475,9 +1478,11 @@ def assert_sp_frame_equal(left, right, exact_indices=True, # trade-off? if exact_indices: - assert_sp_series_equal(series, right[col]) + assert_sp_series_equal(series, right[col], + check_dtype=check_dtype) else: - assert_series_equal(series.to_dense(), right[col].to_dense()) + assert_series_equal(series.to_dense(), right[col].to_dense(), + check_dtype=check_dtype) assert_attr_equal('default_fill_value', left, right, obj=obj) From 47a8e713430707afcfe76e7ca995902628d4bccf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Aug 2016 03:07:30 -0500 Subject: [PATCH 315/359] ENH: add parameter for HTML border (#14061) --- doc/source/options.rst | 3 +++ doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/config_init.py | 11 +++++++++++ pandas/core/frame.py | 10 ++++++++-- pandas/formats/format.py | 22 +++++++++++++++++----- pandas/tests/formats/test_format.py | 17 +++++++++++++++++ 6 files changed, 57 insertions(+), 7 deletions(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index 25f03df4040a3..77cac6d495d13 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -392,6 +392,9 @@ display.width 80 Width of the display in characters. IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to correctly detect the width. +html.border 1 A ``border=value`` attribute is + inserted in the ```` tag + for the DataFrame HTML repr. io.excel.xls.writer xlwt The default Excel writer engine for 'xls' files. io.excel.xlsm.writer openpyxl The default Excel writer engine for diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 918a6a2361f6a..29971f4419ae1 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -427,6 +427,7 @@ Other enhancements df.sort_values(by='row2', axis=1) - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) +- :meth:`~DataFrame.to_html` now has a ``border`` argument to control the value in the opening ``
<table>`` tag. The default is the value of the ``html.border`` option, which defaults to 1. This also affects the notebook HTML repr, but since Jupyter's CSS includes a border-width attribute, the visual effect is the same. (:issue:`11563`). - Raise ``ImportError`` in the sql functions when ``sqlalchemy`` is not installed and a connection string is used (:issue:`11920`). - Compatibility with matplotlib 2.0. Older versions of pandas should also work with matplotlib 2.0 (:issue:`13333`) diff --git a/pandas/core/config_init.py index 5cbc968f06fa7..fe47391c9ff81 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -346,6 +346,17 @@ def mpl_style_cb(key): cf.deprecate_option('display.height', msg=pc_height_deprecation_warning, rkey='display.max_rows') +pc_html_border_doc = """ +: int + A ``border=value`` attribute is inserted in the ``<table>
    `` tag + for the DataFrame HTML repr. +""" + +with cf.config_prefix('html'): + cf.register_option('border', 1, pc_html_border_doc, + validator=is_int) + + tc_sim_interactive_doc = """ : boolean Whether to simulate interactive mode for purposes of testing diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 205af5c805877..676997ede28b4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1560,7 +1560,8 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, bold_rows=True, classes=None, escape=True, max_rows=None, max_cols=None, - show_dimensions=False, notebook=False, decimal='.'): + show_dimensions=False, notebook=False, decimal='.', + border=None): """ Render a DataFrame as an HTML table. @@ -1582,6 +1583,11 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, Character recognized as decimal separator, e.g. ',' in Europe .. versionadded:: 0.18.0 + border : int + A ``border=border`` attribute is included in the opening + `
    ` tag. Default ``pd.options.html.border``. + + .. versionadded:: 0.19.0 """ formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, @@ -1597,7 +1603,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, show_dimensions=show_dimensions, decimal=decimal) # TODO: a generic formatter wld b in DataFrameFormatter - formatter.to_html(classes=classes, notebook=notebook) + formatter.to_html(classes=classes, notebook=notebook, border=border) if buf is None: return formatter.buf.getvalue() diff --git a/pandas/formats/format.py b/pandas/formats/format.py index cb8fb3a5d2e49..dd9a852bd8713 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -671,20 +671,28 @@ def _format_col(self, i): float_format=self.float_format, na_rep=self.na_rep, space=self.col_space, decimal=self.decimal) - def to_html(self, classes=None, notebook=False): + def to_html(self, classes=None, notebook=False, border=None): """ Render a DataFrame to a html table. Parameters ---------- + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
<table>`` tag, in addition to the default "dataframe". notebook : {True, False}, optional, default False Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``<table>
    `` tag. Default ``pd.options.html.border``. - """ + .. versionadded:: 0.19.0 + """ html_renderer = HTMLFormatter(self, classes=classes, max_rows=self.max_rows, max_cols=self.max_cols, - notebook=notebook) + notebook=notebook, + border=border) if hasattr(self.buf, 'write'): html_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): @@ -910,7 +918,7 @@ class HTMLFormatter(TableFormatter): indent_delta = 2 def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, - notebook=False): + notebook=False, border=None): self.fmt = formatter self.classes = classes @@ -926,6 +934,9 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, self.is_truncated = (self.max_rows < len(self.fmt.frame) or self.max_cols < len(self.fmt.columns)) self.notebook = notebook + if border is None: + border = get_option('html.border') + self.border = border def write(self, s, indent=0): rs = pprint_thing(s) @@ -1001,7 +1012,8 @@ def write_result(self, buf): self.write(''.format(div_style)) - self.write('
<table border="1" class="%s">' % ' '.join(_classes), + self.write('<table border="%s" class="%s">
    ' % (self.border, + ' '.join(_classes)), indent) indent += self.indent_delta diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index e6147737e9a1d..0a2e63a018799 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -1650,6 +1650,23 @@ def test_to_html_truncate_multi_index_sparse_off(self): expected = expected.decode('utf-8') self.assertEqual(result, expected) + def test_to_html_border(self): + df = DataFrame({'A': [1, 2]}) + result = df.to_html() + assert 'border="1"' in result + + def test_to_html_border_option(self): + df = DataFrame({'A': [1, 2]}) + with pd.option_context('html.border', 0): + result = df.to_html() + self.assertTrue('border="0"' in result) + self.assertTrue('border="0"' in df._repr_html_()) + + def test_to_html_border_zero(self): + df = DataFrame({'A': [1, 2]}) + result = df.to_html(border=0) + self.assertTrue('border="0"' in result) + def test_nonunicode_nonascii_alignment(self): df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) rep_str = df.to_string() From f92cd7e41cfb8ad56875e1313791f010e5efb202 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Mon, 29 Aug 2016 23:05:13 -0400 Subject: [PATCH 316/359] TST: Enable Google BigQuery (pandas.io.gbq) integration testing #11089 closes #11089 closes #14111 --- .travis.yml | 7 + ci/requirements-2.7.pip | 1 + ci/travis_gbq.json.enc | Bin 0 -> 2352 bytes pandas/io/tests/test_gbq.py | 289 +++++++++++++++++++++++------------- 4 files changed, 197 insertions(+), 100 deletions(-) create mode 100644 ci/travis_gbq.json.enc diff --git a/.travis.yml b/.travis.yml index 2716fa7628d61..4d3908bc35de4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -229,6 +229,13 @@ matrix: - USE_CACHE=true before_install: + # gbq secure key + - if [ -n "$encrypted_1d9d7b1f171b_iv" ]; then + openssl aes-256-cbc -K $encrypted_1d9d7b1f171b_key + -iv $encrypted_1d9d7b1f171b_iv -in ci/travis_gbq.json.enc + -out ci/travis_gbq.json -d; + export VALID_GBQ_CREDENTIALS=True; + fi - echo "before_install" - echo $VIRTUAL_ENV - export PATH="$HOME/miniconda/bin:$PATH" diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index cc3462dbf9ed0..d16b932c8be4f 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -6,3 +6,4 @@ oauth2client==1.5.0 pathlib backports.lzma py +PyCrypto diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc new file mode 100644 index 0000000000000000000000000000000000000000..c2a33bbd6f26383bd7e8a7a504e626284efb5fd0 GIT binary patch literal 2352 zcmV-03D5QoiY_vZjh&7QCFrhKcFBG@`zj6HxkUamBtL*$SOfIYLQAnP$$?HCW-UzE zqY3S}bS_tytBr;XZgqTWlqlC0A?TtDDzJS4<-4yF+82AKZYaOSzyy z)LIN&*Phn|s>u2rH)V_1hyj-xu@)mBOg%_tj5_Sz6kyK>B5Gj0bp;~khYB=Ul|&X? 
zUFSM`<{}P#4_#PMfT#y?P!&Q=azAz#tG@DOU=aLF%RTb9pTg+mwrTZ+`_vBO5^xdb zCk{k&n*k1|x?M-4M;q$_?J$Z=GMNDL*;ETHrT|OpFalF9aJ;1NN8;rz^YfzF2c#MtNZvI;NuIJQ-M<=GHh=X9{ian$nm(H@?nOf1bgG`&RpLSr<5g9xf z2teKs?kATag6a+LsF}ejFjmcfSCRZKh(1~}uiJ(Qc@Q;)ValsMLtF!2X$O%Cb z2KMdb?&ns7GPy+RSdg<1=+QLqzgq74x1J+)2!4_{d|gtTVv9I=qfT>YNLb!NjSeg= zF|Qh88XA3rHR)>wth;QO_M(&hfA8)$QEpGgANx7DK|J`dW)T_`Xz_E!NK^R8RZg$y zc5}UIuDBt}n1#0!5GPf8Jbgag71LqHsVxL^@1qNIX|Dy=0vXV0(4^j2t$?ktEZdd5 zu_ckdLNK1WUPlJaR4^MLsqCIlhr=wrO2O}*qt8Z*MskXFh93(O!7RnBrwEDnT<`it5D0Mb#*2bx#aqC@LEJC=x_>Rx<|ygktaBRpWD z4#{MIj?XI%F|f1Z!qi;RP!vt6Ble@nmfAd}TzlXws1BJ)f5{5gri+aezIomN6ImrH zx}$i#tM@W$hzh(j)Gt+D=6S|?h}()_-~|h%S3)QyM`7f{Yf{v>p$dbYb8XdaAwacm zYIgF03~bBRJ?Q|Rm{AoSq^LSBkDa|`3tNoi02mXu+-Du+k_EUwoHMFk922)^pS;_D6#vtq~4S z0+*&E9tblkhvce%@L*}odrsPg ze1D(imA!lhnI7E+EDFG9720>Y4#l_d;0oNsr)BvjIN8`WGnc1$a?%?ycY8#Jhm$-C3s{t9ZH!5Tdr>`t41 zT)!t07R`S+w73>s@5X;v4d{Zrz<~%E?>$ry4A?zF{TOsf3y|_$p=_p^7 zyHtMEaO`#lEy8g>>v{%h!1*z-W`(rGI}x7M3P7v}4?u6$pF9q$Z>h4+;M|XMMXn-` zt;L)h+N2X->u!;3$*+|@qIVFK-FHTOWzOKyOMLi?7uHQUumZzC>x@c?*cS{IeR9pz z%j|yMgIP(6EQpB4%%ANMRmAGv^MZ8l-{UC8Un6k3C~MltE7?VC^N!9xT725P)|Gtf z&Y(8ua0ZUJO(-Sc>1rq^R0ra;Wa5&>w$UCFV36KRm<$T^2(h&JMd-wYacGQvViWbN z;Sj}nB6rj56!|*PGf00&z+`c`4W3nX4V>s9=aCW8AGAn)EiROzk#ku76;QET`eHgm z(nw)$QzY5E$?_QwzB-{3OpF_c;7(A1@_v7pYaO5JgoY(y&*&O#VUKi8dkA)N#1BEo z^s5wOm{@=f>c|t#|7>EeQqHh!uRXjICpE`%G!Z+Zt<^J-#-9iG(VG#%Nv?sI+ zbc`m4USJyzcgu?tl;%C}Ez6G@|f#&^hF+`g-yrj{hmY4yhlk+b#gV44cV?S5r%;?ge?g z#lzI?kuY1oXLg&XxdkBG8g*9plC**(x1xRs!fCuZZfAb#o*pyTq1{n<-CM+4c6lHo zqhwh;eK)Jl1X}YUP)?=oto!8X%qgNi1g>n7$x+*H3lrxcs&2-MENP(#=M;+oe_zRD zmCP_qF1Fe;UFgs(|6U79ig}b`dz4{4Eh38)&RvnO=3V=+bB@oe8weiJM6CJ5c%GQ-iz&#q=Du>_LJKa?c5%>1J4;MeQNYk^_$~ z;|WA1#Nz81yr8Jafys`4PisrSy?Jw~yQrKw#cLkq4Jq8We*d_mk#2#X^w3p=gJB>* z#!GJ%sBPy+SR&x<$od^Zj0! 
zidEfbN|w72WG4PR*<}{0X+HTW38KvQlnKe|LO@K*{nS!xOGu^})|VMf4R={d{^$ZY Wc%~RC+CiWM`BrrE1b(~# literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 4b71192c907f8..7757950592da5 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -4,6 +4,8 @@ import pytz import platform from time import sleep +import os +import logging import numpy as np @@ -21,7 +23,11 @@ PRIVATE_KEY_JSON_PATH = None PRIVATE_KEY_JSON_CONTENTS = None -DATASET_ID = 'pydata_pandas_bq_testing' +if compat.PY3: + DATASET_ID = 'pydata_pandas_bq_testing_py3' +else: + DATASET_ID = 'pydata_pandas_bq_testing_py2' + TABLE_ID = 'new_test' DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID) @@ -35,25 +41,50 @@ def _skip_if_no_project_id(): - if not PROJECT_ID: + if not _get_project_id(): raise nose.SkipTest( "Cannot run integration tests without a project id") def _skip_if_no_private_key_path(): - if not PRIVATE_KEY_JSON_PATH: + if not _get_private_key_path(): raise nose.SkipTest("Cannot run integration tests without a " "private key json file path") def _skip_if_no_private_key_contents(): - if not PRIVATE_KEY_JSON_CONTENTS: + if not _get_private_key_contents(): raise nose.SkipTest("Cannot run integration tests without a " "private key json contents") - _skip_if_no_project_id() - _skip_if_no_private_key_path() - _skip_if_no_private_key_contents() + +def _in_travis_environment(): + return 'TRAVIS_BUILD_DIR' in os.environ and \ + 'VALID_GBQ_CREDENTIALS' in os.environ + + +def _get_project_id(): + if _in_travis_environment(): + return 'pandas-travis' + else: + return PROJECT_ID + + +def _get_private_key_path(): + if _in_travis_environment(): + return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', + 'travis_gbq.json']) + else: + return PRIVATE_KEY_JSON_PATH + + +def _get_private_key_contents(): + if _in_travis_environment(): + with open(os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', + 'travis_gbq.json'])) as f: + return f.read() + else: + return PRIVATE_KEY_JSON_CONTENTS def _test_imports(): @@ -144,18 +175,22 @@ def _test_imports(): "service account support") -def test_requirements(): +def _setup_common(): try: _test_imports() except (ImportError, NotImplementedError) as import_exception: raise nose.SkipTest(import_exception) + if _in_travis_environment(): + logging.getLogger('oauth2client').setLevel(logging.ERROR) + logging.getLogger('apiclient').setLevel(logging.ERROR) + def _check_if_can_get_correct_default_credentials(): # Checks if "Application Default Credentials" can be fetched # from the environment the tests are running in. 
# See Issue #13577 - test_requirements() + import httplib2 try: from googleapiclient.discovery import build @@ -169,19 +204,20 @@ def _check_if_can_get_correct_default_credentials(): bigquery_service = build('bigquery', 'v2', http=http) jobs = bigquery_service.jobs() job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} - jobs.insert(projectId=PROJECT_ID, body=job_data).execute() + jobs.insert(projectId=_get_project_id(), body=job_data).execute() return True except: return False def clean_gbq_environment(private_key=None): - dataset = gbq._Dataset(PROJECT_ID, private_key=private_key) + dataset = gbq._Dataset(_get_project_id(), private_key=private_key) for i in range(1, 10): if DATASET_ID + str(i) in dataset.datasets(): dataset_id = DATASET_ID + str(i) - table = gbq._Table(PROJECT_ID, dataset_id, private_key=private_key) + table = gbq._Table(_get_project_id(), dataset_id, + private_key=private_key) for j in range(1, 20): if TABLE_ID + str(j) in dataset.tables(dataset_id): table.delete(TABLE_ID + str(j)) @@ -215,11 +251,11 @@ def test_generate_bq_schema_deprecated(): class TestGBQConnectorIntegration(tm.TestCase): def setUp(self): - test_requirements() - + _setup_common() _skip_if_no_project_id() - self.sut = gbq.GbqConnector(PROJECT_ID) + self.sut = gbq.GbqConnector(_get_project_id(), + private_key=_get_private_key_path()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None, @@ -259,13 +295,13 @@ def test_get_application_default_credentials_returns_credentials(self): class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase): def setUp(self): - test_requirements() + _setup_common() _skip_if_no_project_id() _skip_if_no_private_key_path() - self.sut = gbq.GbqConnector(PROJECT_ID, - private_key=PRIVATE_KEY_JSON_PATH) + self.sut = gbq.GbqConnector(_get_project_id(), + private_key=_get_private_key_path()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None, @@ -290,13 +326,13 @@ def test_should_be_able_to_get_results_from_query(self): class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase): def setUp(self): - test_requirements() + _setup_common() _skip_if_no_project_id() - _skip_if_no_private_key_contents() + _skip_if_no_private_key_path() - self.sut = gbq.GbqConnector(PROJECT_ID, - private_key=PRIVATE_KEY_JSON_CONTENTS) + self.sut = gbq.GbqConnector(_get_project_id(), + private_key=_get_private_key_path()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None, @@ -321,7 +357,7 @@ def test_should_be_able_to_get_results_from_query(self): class GBQUnitTests(tm.TestCase): def setUp(self): - test_requirements() + _setup_common() def test_import_google_api_python_client(self): if compat.PY2: @@ -396,12 +432,12 @@ def test_read_gbq_with_empty_private_key_file_should_fail(self): private_key=empty_file_path) def test_read_gbq_with_corrupted_private_key_json_should_fail(self): - _skip_if_no_private_key_contents() + _skip_if_no_private_key_path() with tm.assertRaises(gbq.InvalidPrivateKeyFormat): gbq.read_gbq( 'SELECT 1', project_id='x', - private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS)) + private_key=re.sub('[a-z]', '9', _get_private_key_path())) class TestReadGBQIntegration(tm.TestCase): @@ -414,7 +450,7 @@ def setUpClass(cls): _skip_if_no_project_id() - test_requirements() + _setup_common() def setUp(self): # - PER-TEST FIXTURES - @@ -435,87 +471,108 @@ def tearDown(self): # executed. 
pass + def test_should_read_as_user_account(self): + if _in_travis_environment(): + raise nose.SkipTest("Cannot run local auth in travis environment") + + query = 'SELECT "PI" as VALID_STRING' + df = gbq.read_gbq(query, project_id=_get_project_id()) + tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + def test_should_read_as_service_account_with_key_path(self): _skip_if_no_private_key_path() query = 'SELECT "PI" as VALID_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_PATH) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) def test_should_read_as_service_account_with_key_contents(self): _skip_if_no_private_key_contents() query = 'SELECT "PI" as VALID_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_CONTENTS) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_contents()) tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) def test_should_properly_handle_valid_strings(self): query = 'SELECT "PI" as VALID_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) def test_should_properly_handle_empty_strings(self): query = 'SELECT "" as EMPTY_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING': [""]})) def test_should_properly_handle_null_strings(self): query = 'SELECT STRING(NULL) as NULL_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_STRING': [None]})) def test_should_properly_handle_valid_integers(self): query = 'SELECT INTEGER(3) as VALID_INTEGER' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER': [3]})) def test_should_properly_handle_null_integers(self): query = 'SELECT INTEGER(NULL) as NULL_INTEGER' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER': [np.nan]})) def test_should_properly_handle_valid_floats(self): query = 'SELECT PI() as VALID_FLOAT' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( {'VALID_FLOAT': [3.141592653589793]})) def test_should_properly_handle_null_floats(self): query = 'SELECT FLOAT(NULL) as NULL_FLOAT' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT': [np.nan]})) def test_should_properly_handle_timestamp_unix_epoch(self): query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( {'UNIX_EPOCH': 
[np.datetime64('1970-01-01T00:00:00.000000Z')]})) def test_should_properly_handle_arbitrary_timestamp(self): query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({ 'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')] })) def test_should_properly_handle_null_timestamp(self): query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP': [NaT]})) def test_should_properly_handle_true_boolean(self): query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN': [True]})) def test_should_properly_handle_false_boolean(self): query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN': [False]})) def test_should_properly_handle_null_boolean(self): query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN': [None]})) def test_unicode_string_conversion_and_normalization(self): @@ -530,13 +587,15 @@ def test_unicode_string_conversion_and_normalization(self): query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string) - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, correct_test_datatype) def test_index_column(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2" - result_frame = gbq.read_gbq( - query, project_id=PROJECT_ID, index_col="STRING_1") + result_frame = gbq.read_gbq(query, project_id=_get_project_id(), + index_col="STRING_1", + private_key=_get_private_key_path()) correct_frame = DataFrame( {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1") tm.assert_equal(result_frame.index.name, correct_frame.index.name) @@ -544,8 +603,9 @@ def test_index_column(self): def test_column_order(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ['STRING_3', 'STRING_1', 'STRING_2'] - result_frame = gbq.read_gbq( - query, project_id=PROJECT_ID, col_order=col_order) + result_frame = gbq.read_gbq(query, project_id=_get_project_id(), + col_order=col_order, + private_key=_get_private_key_path()) correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [ 'b'], 'STRING_3': ['c']})[col_order] tm.assert_frame_equal(result_frame, correct_frame) @@ -553,8 +613,9 @@ def test_column_order(self): def test_column_order_plus_index(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ['STRING_3', 'STRING_2'] - result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, - index_col='STRING_1', col_order=col_order) + result_frame = gbq.read_gbq(query, project_id=_get_project_id(), + index_col='STRING_1', col_order=col_order, + private_key=_get_private_key_path()) correct_frame = 
DataFrame( {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']}) correct_frame.set_index('STRING_1', inplace=True) @@ -564,16 +625,19 @@ def test_column_order_plus_index(self): def test_malformed_query(self): with tm.assertRaises(gbq.GenericGBQException): gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) def test_bad_project_id(self): with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELECT 1", project_id='001') + gbq.read_gbq("SELECT 1", project_id='001', + private_key=_get_private_key_path()) def test_bad_table_name(self): with tm.assertRaises(gbq.GenericGBQException): gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) def test_download_dataset_larger_than_200k_rows(self): test_size = 200005 @@ -582,7 +646,8 @@ def test_download_dataset_larger_than_200k_rows(self): df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] " "GROUP EACH BY id ORDER BY id ASC LIMIT {0}" .format(test_size), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(len(df.drop_duplicates()), test_size) def test_zero_rows(self): @@ -590,7 +655,8 @@ def test_zero_rows(self): df = gbq.read_gbq("SELECT title, id " "FROM [publicdata:samples.wikipedia] " "WHERE timestamp=-9999999", - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) page_array = np.zeros( (0,), dtype=[('title', object), ('id', np.dtype(float))]) expected_result = DataFrame(page_array, columns=['title', 'id']) @@ -602,13 +668,15 @@ def test_legacy_sql(self): # Test that a legacy sql statement fails when # setting dialect='standard' with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq(legacy_sql, project_id=PROJECT_ID, - dialect='standard') + gbq.read_gbq(legacy_sql, project_id=_get_project_id(), + dialect='standard', + private_key=_get_private_key_path()) # Test that a legacy sql statement succeeds when # setting dialect='legacy' - df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID, - dialect='legacy') + df = gbq.read_gbq(legacy_sql, project_id=_get_project_id(), + dialect='legacy', + private_key=_get_private_key_path()) self.assertEqual(len(df.drop_duplicates()), 10) def test_standard_sql(self): @@ -618,12 +686,14 @@ def test_standard_sql(self): # Test that a standard sql statement fails when using # the legacy SQL dialect (default value) with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq(standard_sql, project_id=PROJECT_ID) + gbq.read_gbq(standard_sql, project_id=_get_project_id(), + private_key=_get_private_key_path()) # Test that a standard sql statement succeeds when # setting dialect='standard' - df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID, - dialect='standard') + df = gbq.read_gbq(standard_sql, project_id=_get_project_id(), + dialect='standard', + private_key=_get_private_key_path()) self.assertEqual(len(df.drop_duplicates()), 10) def test_invalid_option_for_sql_dialect(self): @@ -632,13 +702,14 @@ def test_invalid_option_for_sql_dialect(self): # Test that an invalid option for `dialect` raises ValueError with tm.assertRaises(ValueError): - gbq.read_gbq(sql_statement, project_id=PROJECT_ID, - dialect='invalid') + gbq.read_gbq(sql_statement, project_id=_get_project_id(), + dialect='invalid', + private_key=_get_private_key_path()) # Test that a correct option for dialect succeeds # to 
make sure ValueError was due to invalid dialect - gbq.read_gbq(sql_statement, project_id=PROJECT_ID, - dialect='standard') + gbq.read_gbq(sql_statement, project_id=_get_project_id(), + dialect='standard', private_key=_get_private_key_path()) class TestToGBQIntegration(tm.TestCase): @@ -656,18 +727,22 @@ def setUpClass(cls): _skip_if_no_project_id() - test_requirements() - clean_gbq_environment() + _setup_common() + clean_gbq_environment(_get_private_key_path()) - gbq._Dataset(PROJECT_ID).create(DATASET_ID + "1") + gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path() + ).create(DATASET_ID + "1") def setUp(self): # - PER-TEST FIXTURES - # put here any instruction you want to be run *BEFORE* *EVERY* test is # executed. - self.dataset = gbq._Dataset(PROJECT_ID) - self.table = gbq._Table(PROJECT_ID, DATASET_ID + "1") + self.dataset = gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path()) + self.table = gbq._Table(_get_project_id(), DATASET_ID + "1", + private_key=_get_private_key_path()) @classmethod def tearDownClass(cls): @@ -675,7 +750,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. - clean_gbq_environment() + clean_gbq_environment(_get_private_key_path()) def tearDown(self): # - PER-TEST FIXTURES - @@ -689,13 +764,15 @@ def test_upload_data(self): test_size = 20001 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" .format(destination_table), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], test_size) def test_upload_data_if_table_exists_fail(self): @@ -707,11 +784,13 @@ def test_upload_data_if_table_exists_fail(self): # Test the default value of if_exists is 'fail' with tm.assertRaises(gbq.TableCreationError): - gbq.to_gbq(df, destination_table, PROJECT_ID) + gbq.to_gbq(df, destination_table, _get_project_id(), + private_key=_get_private_key_path()) # Test the if_exists parameter with value 'fail' with tm.assertRaises(gbq.TableCreationError): - gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail') + gbq.to_gbq(df, destination_table, _get_project_id(), + if_exists='fail', private_key=_get_private_key_path()) def test_upload_data_if_table_exists_append(self): destination_table = DESTINATION_TABLE + "3" @@ -721,22 +800,26 @@ def test_upload_data_if_table_exists_append(self): df_different_schema = tm.makeMixedDataFrame() # Initialize table with sample data - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) # Test the if_exists parameter with value 'append' - gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append') + gbq.to_gbq(df, destination_table, _get_project_id(), + if_exists='append', private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! 
result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" .format(destination_table), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], test_size * 2) # Try inserting with a different schema, confirm failure with tm.assertRaises(gbq.InvalidSchema): gbq.to_gbq(df_different_schema, destination_table, - PROJECT_ID, if_exists='append') + _get_project_id(), if_exists='append', + private_key=_get_private_key_path()) def test_upload_data_if_table_exists_replace(self): destination_table = DESTINATION_TABLE + "4" @@ -746,17 +829,20 @@ def test_upload_data_if_table_exists_replace(self): df_different_schema = tm.makeMixedDataFrame() # Initialize table with sample data - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) # Test the if_exists parameter with the value 'replace'. gbq.to_gbq(df_different_schema, destination_table, - PROJECT_ID, if_exists='replace') + _get_project_id(), if_exists='replace', + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" .format(destination_table), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], 5) def test_google_upload_errors_should_raise_exception(self): @@ -769,7 +855,8 @@ def test_google_upload_errors_should_raise_exception(self): index=range(2)) with tm.assertRaises(gbq.StreamingInsertError): - gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True) + gbq.to_gbq(bad_df, destination_table, _get_project_id(), + verbose=True, private_key=_get_private_key_path()) def test_generate_schema(self): df = tm.makeMixedDataFrame() @@ -828,7 +915,9 @@ def test_list_dataset(self): def test_list_table_zero_results(self): dataset_id = DATASET_ID + "2" self.dataset.create(dataset_id) - table_list = gbq._Dataset(PROJECT_ID).tables(dataset_id) + table_list = gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path() + ).tables(dataset_id) self.assertEqual(len(table_list), 0, 'Expected gbq.list_table() to return 0') @@ -854,7 +943,7 @@ def test_dataset_exists(self): def create_table_data_dataset_does_not_exist(self): dataset_id = DATASET_ID + "6" table_id = TABLE_ID + "1" - table_with_new_dataset = gbq._Table(PROJECT_ID, dataset_id) + table_with_new_dataset = gbq._Table(_get_project_id(), dataset_id) df = make_mixed_dataframe_v2(10) table_with_new_dataset.create(table_id, gbq._generate_bq_schema(df)) self.assertTrue(self.dataset.exists(dataset_id), @@ -884,8 +973,8 @@ def setUpClass(cls): _skip_if_no_project_id() _skip_if_no_private_key_path() - test_requirements() - clean_gbq_environment(PRIVATE_KEY_JSON_PATH) + _setup_common() + clean_gbq_environment(_get_private_key_path()) def setUp(self): # - PER-TEST FIXTURES - @@ -899,7 +988,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. 
- clean_gbq_environment(PRIVATE_KEY_JSON_PATH) + clean_gbq_environment(_get_private_key_path()) def tearDown(self): # - PER-TEST FIXTURES - @@ -913,15 +1002,15 @@ def test_upload_data_as_service_account_with_key_path(self): test_size = 10 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000, - private_key=PRIVATE_KEY_JSON_PATH) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq( "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), - project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_PATH) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], test_size) @@ -940,11 +1029,11 @@ def setUpClass(cls): # put here any instruction you want to execute only *ONCE* *BEFORE* # executing *ALL* tests described below. + _setup_common() _skip_if_no_project_id() _skip_if_no_private_key_contents() - test_requirements() - clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS) + clean_gbq_environment(_get_private_key_contents()) def setUp(self): # - PER-TEST FIXTURES - @@ -958,7 +1047,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. - clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS) + clean_gbq_environment(_get_private_key_contents()) def tearDown(self): # - PER-TEST FIXTURES - @@ -972,15 +1061,15 @@ def test_upload_data_as_service_account_with_key_contents(self): test_size = 10 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000, - private_key=PRIVATE_KEY_JSON_CONTENTS) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_contents()) sleep(30) # <- Curses Google!!! 
result = gbq.read_gbq( "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), - project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_CONTENTS) + project_id=_get_project_id(), + private_key=_get_private_key_contents()) self.assertEqual(result['NUM_ROWS'][0], test_size) if __name__ == '__main__': From 8654a9ed3cc2246ef9eaf2fe8725369a2e885d35 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 31 Aug 2016 09:12:52 -0400 Subject: [PATCH 317/359] API: Expanded resample closes #13500 Author: Chris Closes #13961 from chris-b1/resample-api and squashes the following commits: b8dd114 [Chris] make _from_selection a property 10c7280 [Chris] NotImp -> ValueError e203fcf [Chris] doc updates 384026b [Chris] remove PeriodIndex workaround c7b299e [Chris] cleanup debugging 5fd97d9 [Chris] add from_selection bookkeeping 7f9add4 [Chris] more wip b55309a [Chris] wip c4db0e7 [Chris] move error handling; doc fixups def74de [Chris] API: Expanded resample --- doc/source/timeseries.rst | 24 ++++ doc/source/whatsnew/v0.19.0.txt | 14 ++ pandas/core/generic.py | 21 ++- pandas/core/groupby.py | 3 +- pandas/tseries/resample.py | 33 ++++- pandas/tseries/tests/test_resample.py | 180 +++++++++++++++++++------- 6 files changed, 221 insertions(+), 54 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 6f44ee0c87945..36e492df29983 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1473,6 +1473,30 @@ Furthermore, you can also specify multiple aggregation functions for each column r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) +If a ``DataFrame`` does not have a datetimelike index, but instead you want +to resample based on a datetimelike column in the frame, it can be passed to the +``on`` keyword. + +.. ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + +Similarly, if you instead want to resample by a datetimelike +level of a ``MultiIndex``, its name or location can be passed to the +``level`` keyword. + +.. ipython:: python + + df.resample('M', level='d').sum() + + .. _timeseries.periods: Time Span Representation diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 29971f4419ae1..9c4010f8f024a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -397,6 +397,20 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- The ``.resample()`` function now accepts an ``on=`` or ``level=`` parameter for resampling on a datetimelike column or ``MultiIndex`` level (:issue:`13500`) + + ..
ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + df.resample('M', level='d').sum() + - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2a6f00c65c7fb..5a17401ea67b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4047,10 +4047,12 @@ def between_time(self, start_time, end_time, include_start=True, def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, - limit=None, base=0): + limit=None, base=0, on=None, level=None): """ - Convenience method for frequency conversion and resampling of regular - time-series data. + Convenience method for frequency conversion and resampling of time + series. Object must have a datetime-like index (DatetimeIndex, + PeriodIndex, or TimedeltaIndex), or pass datetime-like values + to the on or level keyword. Parameters ---------- @@ -4068,7 +4070,17 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0 + on : string, optional + For a DataFrame, column to use instead of index for resampling. + Column must be datetime-like. + .. versionadded:: 0.19.0 + + level : string or int, optional + For a MultiIndex, level (name or number) to use for + resampling. Level must be datetime-like. + + .. versionadded:: 0.19.0 To learn more about the offset strings, please see `this link `__. 
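
An illustrative sketch of the two keywords documented above (it reuses the frame from the timeseries.rst example and shows the ``ValueError`` added for upsampling from a selection; the names here are only for illustration):

    # illustrative sketch, not taken from the patch itself
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5),
                       'a': np.arange(5)},
                      index=pd.MultiIndex.from_arrays(
                          [[1, 2, 3, 4, 5],
                           pd.date_range('2015-01-01', freq='W', periods=5)],
                          names=['v', 'd']))

    df.resample('M', on='date').sum()   # bin by the datetime-like column 'date'
    df.resample('M', level='d').sum()   # bin by the datetime-like MultiIndex level 'd'

    # upsampling from an on=/level= selection is rejected with a ValueError,
    # so set the index explicitly first instead
    df.set_index('date').resample('D').asfreq()
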
@@ -4173,12 +4185,11 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, """ from pandas.tseries.resample import (resample, _maybe_process_deprecations) - axis = self._get_axis_number(axis) r = resample(self, freq=rule, label=label, closed=closed, axis=axis, kind=kind, loffset=loffset, convention=convention, - base=base) + base=base, key=on, level=level) return _maybe_process_deprecations(r, how=how, fill_method=fill_method, diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9436257b88941..66e30229cd52b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -255,7 +255,8 @@ def _set_grouper(self, obj, sort=False): Parameters ---------- obj : the subject object - + sort : bool, default False + whether the resulting grouper should be sorted """ if self.key is not None and self.level is not None: diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 5c4bfe5360fac..f1a209053445a 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -112,6 +112,15 @@ def _typ(self): return 'series' return 'dataframe' + @property + def _from_selection(self): + """ is the resampling from a DataFrame column or MultiIndex level """ + # upsampling and PeriodIndex resampling do not work + # with selection, this state used to catch and raise an error + return (self.groupby is not None and + (self.groupby.key is not None or + self.groupby.level is not None)) + def _deprecated(self, op): warnings.warn(("\n.resample() is now a deferred operation\n" "You called {op}(...) on this deferred object " @@ -207,6 +216,10 @@ def _convert_obj(self, obj): Parameters ---------- obj : the object to be resampled + + Returns + ------- + obj : converted object """ obj = obj.consolidate() return obj @@ -706,6 +719,11 @@ def _upsample(self, method, limit=None): self._set_binner() if self.axis: raise AssertionError('axis must be 0') + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") ax = self.ax obj = self._selected_obj @@ -763,7 +781,15 @@ def _convert_obj(self, obj): # convert to timestamp if not (self.kind is None or self.kind == 'period'): - obj = obj.to_timestamp(how=self.convention) + if self._from_selection: + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) 
to explicitly set index") + raise NotImplementedError(msg) + else: + obj = obj.to_timestamp(how=self.convention) + return obj def aggregate(self, arg, *args, **kwargs): @@ -841,6 +867,11 @@ def _upsample(self, method, limit=None): .fillna """ + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample(method, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 49802ba640d70..2ebcdc999a797 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -371,18 +371,44 @@ def test_apply_without_aggregation(self): result = t.apply(lambda x: x) assert_series_equal(result, self.series) + def test_agg_consistency(self): + + # make sure that we are consistent across + # similar aggregations with and w/o selection list + df = DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2012', freq='S', periods=1000), + columns=['A', 'B', 'C']) + + r = df.resample('3T') + + expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) + result = r.agg({'r1': 'mean', 'r2': 'sum'}) + assert_frame_equal(result, expected) + + # TODO: once GH 14008 is fixed, move these tests into + # `Base` test class def test_agg(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) a_mean = r['A'].mean() a_std = r['A'].std() a_sum = r['A'].sum() @@ -393,12 +419,12 @@ def test_agg(self): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', 'std']]) - for t in [r, g]: + for t in cases: result = t.aggregate([np.mean, np.std]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': np.mean, 'B': np.std}) assert_frame_equal(result, expected, check_like=True) @@ -406,20 +432,20 @@ def test_agg(self): expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std']}) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ['mean', 'sum'] - for t in [r, g]: + for t in cases: result = t['A'].aggregate(['mean', 'sum']) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -428,7 +454,7 @@ def test_agg(self): 
('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, 'B': {'mean2': 'mean', 'sum2': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -438,7 +464,7 @@ def test_agg(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) @@ -450,20 +476,30 @@ def test_agg(self): ('r2', 'B', 'sum')]) def test_agg_misc(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] # passed lambda - for t in [r, g]: + for t in cases: result = t.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) @@ -480,7 +516,7 @@ def test_agg_misc(self): ('result1', 'B'), ('result2', 'A'), ('result2', 'B')]) - for t in [r, g]: + for t in cases: result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), ('result2', np.mean)])) assert_frame_equal(result, expected, check_like=True) @@ -495,19 +531,19 @@ def test_agg_misc(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.agg(OrderedDict([('A', ['sum', 'std']), ('B', ['mean', 'std'])])) assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not - for t in [r, g]: - result = g[['A', 'B']].agg({'A': ['sum', 'std'], + for t in cases: + result = t[['A', 'B']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) # series like aggs - for t in [r, g]: + for t in cases: result = t['A'].agg({'A': ['sum', 'std']}) expected = pd.concat([t['A'].sum(), t['A'].std()], @@ -528,9 +564,9 @@ def test_agg_misc(self): # errors # invalid names in the agg specification - for t in [r, g]: + for t in cases: def f(): - r[['A']].agg({'A': ['sum', 'std'], + t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) self.assertRaises(SpecificationError, f) @@ -538,22 +574,31 @@ def f(): def test_agg_nested_dicts(self): np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) - - for t in [r, g]: + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] + + for t in cases: def f(): t.aggregate({'r1': {'A': ['mean', 'sum']}, 'r2': {'B': ['mean', 'sum']}}) 
self.assertRaises(ValueError, f) - for t in [r, g]: + for t in cases: expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), t['B'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( @@ -567,19 +612,44 @@ def f(): 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) - def test_agg_consistency(self): + def test_selection_api_validation(self): + # GH 13500 + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + df_exp = pd.DataFrame({'a': np.arange(len(index), dtype=np.int64)}, + index=index) - # make sure that we are consistent across - # similar aggregations with and w/o selection list - df = DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) + # non DatetimeIndex + with tm.assertRaises(TypeError): + df.resample('2D', level='v') - r = df.resample('3T') + with tm.assertRaises(ValueError): + df.resample('2D', on='date', level='d') - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) - assert_frame_equal(result, expected) + with tm.assertRaises(TypeError): + df.resample('2D', on=['a', 'date']) + + with tm.assertRaises(KeyError): + df.resample('2D', level=['a', 'date']) + + # upsampling not allowed + with tm.assertRaises(ValueError): + df.resample('2D', level='d').asfreq() + + with tm.assertRaises(ValueError): + df.resample('2D', on='date').asfreq() + + exp = df_exp.resample('2D').sum() + exp.index.name = 'date' + assert_frame_equal(exp, df.resample('2D', on='date').sum()) + + exp.index.name = 'd' + assert_frame_equal(exp, df.resample('2D', level='d').sum()) class Base(object): @@ -2009,6 +2079,22 @@ def test_asfreq_upsample(self): result = frame.resample('1H').asfreq() assert_frame_equal(result, expected) + def test_selection(self): + index = self.create_series().index + # This is a bug, these should be implemented + # GH 14008 + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + + with tm.assertRaises(NotImplementedError): + df.resample('2D', on='date') + + with tm.assertRaises(NotImplementedError): + df.resample('2D', level='d') + def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill') From b2a73b8ee7456ec684daad1ecfb9d0fded13b7ec Mon Sep 17 00:00:00 2001 From: Tom Bird Date: Wed, 31 Aug 2016 12:03:43 -0400 Subject: [PATCH 318/359] BUG: #14095. Amend eval() resolvers kwarg to accept lists closes #14095 Author: Tom Bird Closes #14121 from theultimatecrouton/resolvers and squashes the following commits: 227d734 [Tom Bird] BUG: #14095. Amend eval() resolvers kwarg to accept lists f27963c [Tom Bird] BUG: #14095. Amend eval() resolvers kwarg to accept lists 77500da [Tom Bird] BUG: #14095. Amend eval() resolvers kwarg to accept lists ef03d59 [Tom Bird] BUG: #14095. 
Amend eval() resolvers kwarg to accept lists --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 2 +- pandas/tests/frame/test_query_eval.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9c4010f8f024a..2f34b19d11cdf 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1358,3 +1358,4 @@ Bug Fixes - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. - Bug in ``read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`) +- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 676997ede28b4..46a1d22a4114b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2272,7 +2272,7 @@ def eval(self, expr, inplace=None, **kwargs): resolvers = dict(self.iteritems()), index_resolvers if 'target' not in kwargs: kwargs['target'] = self - kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers + kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers) return _eval(expr, inplace=inplace, **kwargs) def select_dtypes(self, include=None, exclude=None): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 49b0ce66999d8..85159de64d83e 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -147,6 +147,16 @@ def test_query_non_str(self): with tm.assertRaisesRegexp(ValueError, msg): df.query(111) + def test_eval_resolvers_as_list(self): + # GH 14095 + df = DataFrame(randn(10, 2), columns=list('ab')) + dict1 = {'a': 1} + dict2 = {'b': 2} + self.assertTrue(df.eval('a + b', resolvers=[dict1, dict2]) == + dict1['a'] + dict2['b']) + self.assertTrue(pd.eval('a + b', resolvers=[dict1, dict2]) == + dict1['a'] + dict2['b']) + class TestDataFrameQueryWithMultiIndex(tm.TestCase): From 5db52f0d3000cb78322ffb148b3f94b3c883bb26 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 31 Aug 2016 12:06:27 -0400 Subject: [PATCH 319/359] API: Warn or raise for > 1 char encoded sep The system file encoding can cause a separator to be encoded as more than one character even though it may be provided as one character. Multi-char separators are not supported by the C engine, so we need to catch this case. Closes #14065.
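
For example (an illustrative sketch only, assuming Python 3 and a UTF-8 filesystem encoding so that a one-character separator such as '§' encodes to two bytes):

    # illustrative sketch; assumes sys.getfilesystemencoding() is UTF-8
    import pandas as pd
    from pandas.compat import StringIO

    data = u'1§2§3\n4§5§6'

    # no engine specified: pandas falls back to the Python engine and emits a
    # ParserWarning, because the encoded separator is longer than one byte
    df = pd.read_csv(StringIO(data), sep=u'§', header=None)

    # engine='c' requested explicitly: no fallback is possible, so a
    # ValueError is raised instead
    # pd.read_csv(StringIO(data), sep=u'§', engine='c', header=None)
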
Author: gfyoung Closes #14120 from gfyoung/multi-char-encoded and squashes the following commits: 152b685 [gfyoung] API: Warn or raise for > 1 char encoded sep --- doc/source/whatsnew/v0.19.0.txt | 17 +++++++++-------- pandas/io/parsers.py | 10 ++++++++++ pandas/io/tests/parser/test_unsupported.py | 2 ++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2f34b19d11cdf..86897f51fd9d4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -414,12 +414,12 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) -- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846`) @@ -473,6 +473,7 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) - ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) @@ -1211,10 +1212,6 @@ Bug Fixes - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. 
(:issue:`13994`) -- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) -- Bug in ``pd.read_csv()``, which caused errors to be raised when a dictionary containing scalars is passed in for ``na_values`` (:issue:`12224`) -- Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`) - Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) @@ -1225,7 +1222,6 @@ Bug Fixes - Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) -- Bug in ``pd.read_csv()`` where the index columns were being incorrectly parsed when parsed as dates with a ``thousands`` parameter (:issue:`14066`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) @@ -1267,6 +1263,11 @@ Bug Fixes - Bug in ``MultiIndex.from_arrays`` which didn't check for input array lengths matching (:issue:`13599`) +- Bug in ``pd.read_csv()`` which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()`` which caused errors to be raised when a dictionary containing scalars is passed in for ``na_values`` (:issue:`12224`) +- Bug in ``pd.read_csv()`` which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`) +- Bug in ``pd.read_csv()`` where the index columns were being incorrectly parsed when parsed as dates with a ``thousands`` parameter (:issue:`14066`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) - Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) @@ -1277,6 +1278,8 @@ Bug Fixes - Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`) - Bug in ``pd.read_csv()`` with ``engine='c'`` in which NULL ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) +- Bug in ``pd.read_csv()`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) +- Bug in ``pd.read_csv()``, where aliases for utf-xx (e.g. 
UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`) - Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`) - Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`) @@ -1351,11 +1354,9 @@ Bug Fixes - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) - Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) -- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. -- Bug in ``read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`) - Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue`14095`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 62f2ad1419d92..3bd8579d456d3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,6 +5,7 @@ from collections import defaultdict import re import csv +import sys import warnings import datetime @@ -782,6 +783,7 @@ def _clean_options(self, options, engine): " skipfooter" engine = 'python' + encoding = sys.getfilesystemencoding() or 'utf-8' if sep is None and not delim_whitespace: if engine == 'c': fallback_reason = "the 'c' engine does not support"\ @@ -798,6 +800,14 @@ def _clean_options(self, options, engine): " different from '\s+' are"\ " interpreted as regex)" engine = 'python' + + elif len(sep.encode(encoding)) > 1: + if engine not in ('python', 'python-fwf'): + fallback_reason = "the separator encoded in {encoding}"\ + " is > 1 char long, and the 'c' engine"\ + " does not support such separators".format( + encoding=encoding) + engine = 'python' elif delim_whitespace: if 'python' in engine: result['delimiter'] = '\s+' diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index ef8f7967193ff..e575843a7fc22 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -60,6 +60,8 @@ def test_c_engine(self): sep=None, delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), engine='c', sep='§') with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', skipfooter=1) From 8fdfa518105cde9ae16449578b3d75c13c8c19a5 Mon Sep 17 00:00:00 2001 From: Giacomo Ferroni Date: Wed, 31 Aug 2016 17:17:14 +0100 Subject: [PATCH 320/359] Test for segfault in factorize (gh12666) (#14112) --- pandas/tests/test_algos.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9543d9bba2a3a..092e02ee261a0 100644 --- a/pandas/tests/test_algos.py +++ 
b/pandas/tests/test_algos.py @@ -299,6 +299,16 @@ def _test_vector_resize(htable, uniques, dtype, nvals): _test_vector_resize(tbl(), vect(), dtype, 0) _test_vector_resize(tbl(), vect(), dtype, 10) + def test_complex_sorting(self): + # gh 12666 - check no segfault + # Test not valid numpy versions older than 1.11 + if pd._np_version_under1p11: + self.skipTest("Test valid only for numpy 1.11+") + + x17 = np.array([complex(i) for i in range(17)], dtype=object) + + self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True) + class TestUnique(tm.TestCase): _multiprocess_can_split_ = True From 70bb179bf843ebf2676e322b8abccc0ae0044c92 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Thu, 1 Sep 2016 16:27:38 +0900 Subject: [PATCH 321/359] CLN: Datetimelike._can_hold_na (#13983) --- pandas/tests/indexes/common.py | 13 +++- pandas/tests/test_base.py | 100 ++++++++++++++++-------------- pandas/tseries/base.py | 7 +-- pandas/tseries/period.py | 9 +++ pandas/tseries/tests/test_base.py | 59 ++++++++++++++++-- 5 files changed, 128 insertions(+), 60 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 687782172693a..2c8031898c78e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -8,6 +8,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, notnull) +from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -319,13 +320,21 @@ def test_get_unique_index(self): if not ind._can_hold_na: continue - vals = ind.values[[0] * 5] - vals[0] = np.nan + if needs_i8_conversion(ind): + vals = ind.asi8[[0] * 5] + vals[0] = pd.tslib.iNaT + else: + vals = ind.values[[0] * 5] + vals[0] = np.nan + vals_unique = vals[:2] idx_nan = ind._shallow_copy(vals) idx_unique_nan = ind._shallow_copy(vals_unique) self.assertTrue(idx_unique_nan.is_unique) + self.assertEqual(idx_nan.dtype, ind.dtype) + self.assertEqual(idx_unique_nan.dtype, ind.dtype) + for dropna, expected in zip([False, True], [idx_unique_nan, idx_unique]): for i in [idx_nan, idx_unique_nan]: diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 83b1cd141a61b..eaa316bfd8157 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -9,7 +9,8 @@ import pandas as pd import pandas.compat as compat -from pandas.types.common import is_object_dtype, is_datetimetz +from pandas.types.common import (is_object_dtype, is_datetimetz, + needs_i8_conversion) import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta) @@ -17,7 +18,6 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base import (FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin) -from pandas.types.common import is_datetime64_dtype from pandas.tseries.base import DatetimeIndexOpsMixin @@ -450,7 +450,6 @@ def test_nanops(self): def test_value_counts_unique_nunique(self): for orig in self.objs: - o = orig.copy() klass = type(o) values = o._values @@ -504,9 +503,10 @@ def test_value_counts_unique_nunique(self): def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: - for o in self.objs: + for orig in self.objs: + o = orig.copy() klass = type(o) - values = o.values + values = o._values if not self._allow_na_ops(o): continue @@ -522,34 +522,43 @@ def test_value_counts_unique_nunique_null(self): o[0:2] = 
pd.tslib.iNaT values = o._values - elif is_datetime64_dtype(o) or isinstance(o, PeriodIndex): + elif needs_i8_conversion(o): values[0:2] = pd.tslib.iNaT + values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original + self.assertEqual(values.dtype, o.dtype) # create repeated values, 'n'th element is repeated by n+1 # times - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq - # ambiguous + if isinstance(o, (DatetimeIndex, PeriodIndex)): + expected_index = o.copy() + expected_index.name = None - # resets name from Index - expected_index = pd.Index(o, name=None) # attach name to klass - o = klass(np.repeat(values, range(1, len(o) + 1)), - freq=o.freq, name='a') - elif isinstance(o, Index): - expected_index = pd.Index(values, name=None) - o = klass( - np.repeat(values, range(1, len(o) + 1)), name='a') + o = klass(values.repeat(range(1, len(o) + 1))) + o.name = 'a' else: - expected_index = pd.Index(values, name=None) - idx = np.repeat(o.index.values, range(1, len(o) + 1)) - o = klass( - np.repeat(values, range( - 1, len(o) + 1)), index=idx, name='a') + if is_datetimetz(o): + expected_index = orig._values._shallow_copy(values) + else: + expected_index = pd.Index(values) + expected_index.name = None + o = o.repeat(range(1, len(o) + 1)) + o.name = 'a' + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) + # check values correctly have NaN + nanloc = np.zeros(len(o), dtype=np.bool) + nanloc[:3] = True + if isinstance(o, Index): + self.assert_numpy_array_equal(pd.isnull(o), nanloc) + else: + exp = pd.Series(nanloc, o.index, name='a') + self.assert_series_equal(pd.isnull(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], @@ -578,7 +587,9 @@ def test_value_counts_unique_nunique_null(self): self.assertIs(result[0], pd.NaT) else: tm.assert_numpy_array_equal(result[1:], values[2:]) + self.assertTrue(pd.isnull(result[0])) + self.assertEqual(result.dtype, orig.dtype) self.assertEqual(o.nunique(), 8) self.assertEqual(o.nunique(dropna=False), 9) @@ -942,18 +953,14 @@ def test_fillna(self): # # GH 11343 # though Index.fillna and Series.fillna has separate impl, # test here to confirm these works as the same - def get_fill_value(obj): - if isinstance(obj, pd.tseries.base.DatetimeIndexOpsMixin): - return obj.asobject.values[0] - else: - return obj.values[0] - for o in self.objs: - klass = type(o) + for orig in self.objs: + + o = orig.copy() values = o.values # values will not be changed - result = o.fillna(get_fill_value(o)) + result = o.fillna(o.astype(object).values[0]) if isinstance(o, Index): self.assert_index_equal(o, result) else: @@ -962,33 +969,30 @@ def get_fill_value(obj): self.assertFalse(o is result) for null_obj in [np.nan, None]: - for o in self.objs: + for orig in self.objs: + o = orig.copy() klass = type(o) - values = o.values.copy() if not self._allow_na_ops(o): continue - # value for filling - fill_value = get_fill_value(o) + if needs_i8_conversion(o): - # special assign to the numpy array - if o.values.dtype == 'datetime64[ns]' or isinstance( - o, PeriodIndex): - values[0:2] = pd.tslib.iNaT + values = o.astype(object).values + fill_value = values[0] + values[0:2] = pd.NaT else: + values = o.values.copy() + fill_value = o.values[0] values[0:2] = null_obj - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq - # ambiguous - expected = [fill_value.ordinal] * 2 + list(values[2:]) - expected = 
klass(ordinal=expected, freq=o.freq) - o = klass(ordinal=values, freq=o.freq) - else: - expected = [fill_value] * 2 + list(values[2:]) - expected = klass(expected) - o = klass(values) + expected = [fill_value] * 2 + list(values[2:]) + + expected = klass(expected) + o = klass(values) + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) result = o.fillna(fill_value) if isinstance(o, Index): diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index c08bb53238e5c..f0c6e334925c4 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -362,6 +362,8 @@ def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values) + _can_hold_na = True + _na_value = tslib.NaT """The expected NA value to use with this index.""" @@ -370,11 +372,6 @@ def _isnan(self): """ return if each value is nan""" return (self.asi8 == tslib.iNaT) - @cache_readonly - def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - return self._isnan.any() - @property def asobject(self): """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8bce01b0759fc..4b50a8c0c088b 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -777,6 +777,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return Index.get_indexer(self._int64index, target, method, limit, tolerance) + def _get_unique_index(self, dropna=False): + """ + wrap Index._get_unique_index to handle NaT + """ + res = super(PeriodIndex, self)._get_unique_index(dropna=dropna) + if dropna: + res = res.dropna() + return res + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 26e77d3ad79f3..aa13591a4ff30 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -555,8 +555,8 @@ def test_nonunique_contains(self): def test_order(self): # with freq - idx1 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D', name='idx') + idx1 = DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], freq='D', name='idx') idx2 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo', name='tzidx') @@ -798,10 +798,27 @@ def test_shift(self): '2011-01-01 09:00'], name='xxx', tz=tz) tm.assert_index_equal(idx.shift(-3, freq='H'), exp) - def test_na_value(self): + def test_nat(self): self.assertIs(pd.DatetimeIndex._na_value, pd.NaT) self.assertIs(pd.DatetimeIndex([])._na_value, pd.NaT) + for tz in [None, 'US/Eastern', 'UTC']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + class TestTimedeltaIndexOps(Ops): def setUp(self): @@ -1645,10 +1662,26 @@ def test_repeat(self): tm.assert_index_equal(res, exp) self.assertIsNone(res.freq) - def test_na_value(self): + def test_nat(self): self.assertIs(pd.TimedeltaIndex._na_value, pd.NaT) self.assertIs(pd.TimedeltaIndex([])._na_value, pd.NaT) + idx = 
pd.TimedeltaIndex(['1 days', '2 days']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.TimedeltaIndex(['1 days', 'NaT']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + class TestPeriodIndexOps(Ops): def setUp(self): @@ -2593,10 +2626,26 @@ def test_repeat(self): for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) - def test_na_value(self): + def test_nat(self): self.assertIs(pd.PeriodIndex._na_value, pd.NaT) self.assertIs(pd.PeriodIndex([], freq='M')._na_value, pd.NaT) + idx = pd.PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.PeriodIndex(['2011-01-01', 'NaT'], freq='D') + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + if __name__ == '__main__': import nose From 5f5acc3f361f93b9c337286a32e304ac04a51288 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Sep 2016 09:28:06 +0200 Subject: [PATCH 322/359] TST: confirming tests for some fixed issues (#14117) * TST: assert indexing with list of Periods works (GH7710) * TST: assert median for timedelta with NaT works (GH8617) * TST: assert no conversion to float when resampling datetime64 values (13119) --- pandas/tseries/tests/test_period.py | 7 +++++++ pandas/tseries/tests/test_resample.py | 17 +++++++++++++++++ pandas/tseries/tests/test_timeseries.py | 8 ++++++++ 3 files changed, 32 insertions(+) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 1ddcc11c15a59..468c3d5bfc37c 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -2180,6 +2180,13 @@ def test_getitem_nat(self): pd.Period('2011-01', freq='M')) self.assertIs(s[pd.NaT], tslib.NaT) + def test_getitem_list_periods(self): + # GH 7710 + rng = period_range(start='2012-01-01', periods=10, freq='D') + ts = Series(lrange(len(rng)), index=rng) + exp = ts.iloc[[1]] + tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) + def test_slice_with_negative_step(self): ts = Series(np.arange(20), period_range('2014-01', periods=20, freq='M')) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 2ebcdc999a797..204808dd510a0 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -2022,6 +2022,23 @@ def test_resample_timedelta_values(self): res = df['time'].resample('2D').first() tm.assert_series_equal(res, exp) + def test_resample_datetime_values(self): + # GH 13119 + # check that datetime dtype is preserved when NaT values are + # introduced by the resampling + + dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)] + df = DataFrame({'timestamp': dates}, index=dates) + + exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], + index=date_range('2016-01-15', periods=3, freq='2D'), + name='timestamp') + + res = 
df.resample('2D').first()['timestamp'] + tm.assert_series_equal(res, exp) + res = df['timestamp'].resample('2D').first() + tm.assert_series_equal(res, exp) + class TestPeriodIndex(Base, tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 5eb46684d1860..2355d663ed7d5 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4400,6 +4400,14 @@ def test_intercept_astype_object(self): result = df.values.squeeze() self.assertTrue((result[:, 0] == expected.values).all()) + def test_nat_operations(self): + # GH 8617 + s = Series([0, pd.NaT], dtype='m8[ns]') + exp = s[0] + self.assertEqual(s.median(), exp) + self.assertEqual(s.min(), exp) + self.assertEqual(s.max(), exp) + class TestTimestamp(tm.TestCase): def test_class_ops_pytz(self): From 64831800b976c4a12636eb47e736a93e358990b9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Sep 2016 09:28:53 +0200 Subject: [PATCH 323/359] Make show_versions not to reload modules (GH13684) (#14126) --- pandas/util/print_versions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index 70df1df336704..3747e2ff6ca8f 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -101,7 +101,10 @@ def show_versions(as_json=False): deps_blob = list() for (modname, ver_f) in deps: try: - mod = importlib.import_module(modname) + if modname in sys.modules: + mod = sys.modules[modname] + else: + mod = importlib.import_module(modname) ver = ver_f(mod) deps_blob.append((modname, ver)) except: From 306e6472e2f786683e4b8fb30cce60456e7b5f03 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 1 Sep 2016 04:03:39 -0400 Subject: [PATCH 324/359] DEPR: Deprecated PeriodIndex.to_datetime (#14113) Deprecation is in favour of PeriodIndex.to_timestamp. 
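A minimal sketch of the migration path, mirroring the updated test below (the daily `period_range` is just an illustrative input):

```python
import pandas as pd

pidx = pd.period_range('2012-01-01', periods=4, freq='D')

# Deprecated: PeriodIndex.to_datetime now emits a FutureWarning and simply
# forwards to to_timestamp().
# dtidx = pidx.to_datetime()

# Preferred: convert the PeriodIndex to a DatetimeIndex directly.
dtidx = pidx.to_timestamp()
assert dtidx[0] == pd.Timestamp('2012-01-01')
```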
--- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tests/frame/test_to_csv.py | 14 ++++++++++---- pandas/tseries/period.py | 8 ++++++++ pandas/tseries/tests/test_period.py | 12 ++++++++++-- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 86897f51fd9d4..8ef2498eeef33 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1130,6 +1130,7 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``PeriodIndex.to_datetime`` has been deprecated in favour of ``PeriodIndex.to_timestamp`` (:issue:`8254`) - ``Timestamp.to_datetime`` has been deprecated in favour of ``Timestamp.to_pydatetime`` (:issue:`8254`) - ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favour of ``pd.to_datetime`` (:issue:`8254`) - ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 43c8d6f25ab01..54bcb670caaef 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -291,10 +291,10 @@ def _to_uni(x): elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array( - list(map(Timestamp, recons.index.to_datetime())), + list(map(Timestamp, to_datetime(recons.index))), dtype=r_dtype) df.index = np.array( - list(map(Timestamp, df.index.to_datetime())), + list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) @@ -316,10 +316,10 @@ def _to_uni(x): elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array( - lmap(Timestamp, recons.columns.to_datetime()), + lmap(Timestamp, to_datetime(recons.columns)), dtype=c_dtype) df.columns = np.array( - lmap(Timestamp, df.columns.to_datetime()), + lmap(Timestamp, df.columns.to_timestamp()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) @@ -1157,3 +1157,9 @@ def test_to_csv_quoting(self): df = df.set_index(['a', 'b']) expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 4b50a8c0c088b..7fb0f19b04486 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,6 +1,7 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta import numpy as np +import warnings from pandas.core import common as com @@ -550,6 +551,13 @@ def asfreq(self, freq=None, how='E'): return self._simple_new(new_data, self.name, freq=freq) def to_datetime(self, dayfirst=False): + """ + DEPRECATED: use :meth:`to_timestamp` instead. + + Cast to DatetimeIndex. + """ + warnings.warn("to_datetime is deprecated. 
Use self.to_timestamp(...)", + FutureWarning, stacklevel=2) return self.to_timestamp() year = _field_accessor('year', 0, "The year of the period") diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 468c3d5bfc37c..a492abce01086 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -3567,12 +3567,20 @@ def test_with_multi_index(self): tm.assertIsInstance(s.index.values[0][0], Period) - def test_to_datetime_1703(self): + def test_to_timestamp_1703(self): index = period_range('1/1/2012', periods=4, freq='D') - result = index.to_datetime() + result = index.to_timestamp() self.assertEqual(result[0], Timestamp('1/1/2012')) + def test_to_datetime_depr(self): + index = period_range('1/1/2012', periods=4, freq='D') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = index.to_datetime() + self.assertEqual(result[0], Timestamp('1/1/2012')) + def test_get_loc_msg(self): idx = period_range('2000-1-1', freq='A', periods=10) bad_period = Period('2012', 'A') From 58199c5da263491a26fa6cb26d2ff9d38c4b5dac Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 1 Sep 2016 04:19:14 -0400 Subject: [PATCH 325/359] API: Raise FileNotFoundError for nonexistent files (#14116) For a nonexistent file, raise the more specific FileNotFoundError for Python >= 3.3 in read_csv, read_table, and read_hdf. This error is backported to Python 2.x as IOError. Closes gh-14086. --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/compat/__init__.py | 6 ++++++ pandas/io/pytables.py | 3 ++- pandas/io/tests/parser/common.py | 5 +++-- pandas/io/tests/test_pytables.py | 3 ++- pandas/parser.pyx | 4 ++-- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 8ef2498eeef33..4365c66237752 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -455,6 +455,7 @@ API changes - ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds (:issue:`14101`) - ``Panel.to_sparse`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) - ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) +- ``pd.read_csv()``, ``pd.read_table()``, and ``pd.read_hdf()`` raise the builtin ``FileNotFoundError`` exception for Python 3.x when called on a nonexistent file, and this is back-ported as IOError in Python 2.x (:issue:`14086`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9f07e8ce5e0c6..990018f2f7f3b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -107,6 +107,10 @@ def signature(f): long = int unichr = chr + # This was introduced in Python 3.3, but we don't support + # Python 3.x < 3.4, so checking PY3 is safe. 
+ FileNotFoundError = FileNotFoundError + # list-producing versions of the major Python iterating functions def lrange(*args, **kwargs): return list(range(*args, **kwargs)) @@ -125,6 +129,8 @@ def lfilter(*args, **kwargs): import re _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$") + FileNotFoundError = IOError + def isidentifier(s, dotted=False): return bool(_name_re.match(s)) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f77076e54f34d..ccc3fe081acde 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -322,7 +322,8 @@ def read_hdf(path_or_buf, key=None, **kwargs): exists = False if not exists: - raise IOError('File %s does not exist' % path_or_buf) + raise compat.FileNotFoundError( + 'File %s does not exist' % path_or_buf) # can't auto open/close if we are using an iterator # so delegate to the iterator diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index b90fc304e125e..0b59b695e1dca 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -654,9 +654,10 @@ def test_file(self): tm.assert_frame_equal(url_table, local_table) def test_nonexistent_path(self): - # don't segfault pls #2428 + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError path = '%s.csv' % tm.rands(10) - self.assertRaises(IOError, self.read_csv, path) + self.assertRaises(compat.FileNotFoundError, self.read_csv, path) def test_missing_trailing_delimiters(self): data = """A,B,C,D diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index f821714b54a76..44ff9f8a5a1dd 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -339,7 +339,8 @@ def test_api(self): # File path doesn't exist path = "" - self.assertRaises(IOError, read_hdf, path, 'df') + self.assertRaises(compat.FileNotFoundError, + read_hdf, path, 'df') def test_api_default_format(self): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 5af82be5b741b..5d8ab7213a7b6 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -34,6 +34,7 @@ import numpy as np cimport util import pandas.lib as lib +import pandas.compat as compat from pandas.types.common import (is_categorical_dtype, CategoricalDtype, is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, @@ -631,7 +632,6 @@ cdef class TextReader: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) elif self.compression == 'xz': - from pandas import compat lzma = compat.import_lzma() if isinstance(source, basestring): @@ -663,7 +663,7 @@ cdef class TextReader: if ptr == NULL: if not os.path.exists(source): - raise IOError('File %s does not exist' % source) + raise compat.FileNotFoundError('File %s does not exist' % source) raise IOError('Initializing from file failed') self.parser.source = ptr From 362a56153f4c9df4447c8407899237c9e6da6b70 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 2 Sep 2016 07:16:51 -0400 Subject: [PATCH 326/359] BUG: Don't print stray newline with MultiIndex Title is self-explanatory. Closes #6618. 
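A small sketch of the corrected output, mirroring the new test added below (the frame is just an illustrative example):

```python
import pandas as pd

df = pd.DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

# Previously a stray blank line was written between the column-header rows
# and the data row; with this change the output is:
assert df.to_csv() == ",1\n,2\n0,1\n"
assert df.to_csv(index=False) == "1\n2\n1\n"
```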
Author: gfyoung Closes #14132 from gfyoung/to-csv-newline and squashes the following commits: d1a600f [gfyoung] BUG: Don't print stray newline with MultiIndex --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/formats/format.py | 18 +++++++++--------- pandas/tests/formats/test_format.py | 27 +++++++++++++++++++++++++++ pandas/tests/frame/test_to_csv.py | 18 +++--------------- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 4365c66237752..a02e6ac200e42 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1355,6 +1355,7 @@ Bug Fixes - Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) - Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) +- Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index dd9a852bd8713..4740dd25c419d 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1524,9 +1524,9 @@ def _save_header(self): if not has_mi_columns: encoded_labels += list(write_cols) - - # write out the mi - if has_mi_columns: + writer.writerow(encoded_labels) + else: + # write out the mi columns = obj.columns # write out the names for each level, then ALL of the values for @@ -1547,12 +1547,12 @@ def _save_header(self): writer.writerow(col_line) - # add blanks for the columns, so that we - # have consistent seps - encoded_labels.extend([''] * len(columns)) - - # write out the index label line - writer.writerow(encoded_labels) + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. 
+ if encoded_labels and set(encoded_labels) != set(['']): + encoded_labels.extend([''] * len(columns)) + writer.writerow(encoded_labels) def _save(self): diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 0a2e63a018799..7e55c04fec7cc 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3327,6 +3327,33 @@ def test_to_csv_date_format(self): self.assertEqual(df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d'), expected_ymd_sec) + def test_to_csv_multi_index(self): + # see gh-6618 + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1],[2]])) + + exp = ",1\n,2\n0,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "1\n2\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1],[2]]), + index=pd.MultiIndex.from_arrays([[1],[2]])) + + exp = ",,1\n,,2\n1,2,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "1\n2\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([['foo'],['bar']])) + + exp = ",foo\n,bar\n0,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "foo\nbar\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + def test_period(self): # GH 12615 df = pd.DataFrame({'A': pd.period_range('2013-01', diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 54bcb670caaef..6d09378ca864e 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -587,21 +587,9 @@ def _make_frame(names=None): df = _make_frame(True) df.to_csv(path, tupleize_cols=False) - # catch invalid headers - with assertRaisesRegexp(CParserError, - 'Passed header=\[0,1,2\] are too many ' - 'rows for this multi_index of columns'): - read_csv(path, tupleize_cols=False, - header=lrange(3), index_col=0) - - with assertRaisesRegexp(CParserError, - 'Passed header=\[0,1,2,3,4,5,6\], len of ' - '7, but only 6 lines in file'): - read_csv(path, tupleize_cols=False, - header=lrange(7), index_col=0) - - for i in [4, 5, 6]: - with tm.assertRaises(CParserError): + for i in [5, 6, 7]: + msg = 'len of {i}, but only 5 lines in file'.format(i=i) + with assertRaisesRegexp(CParserError, msg): read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) From ccec504e31ce74f8016952ac75add1cc4bec7080 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Sep 2016 07:28:29 -0400 Subject: [PATCH 327/359] BUG: int dtype for get_dummies closes #8725 Changes `get_dummies` return columns with `uint8` dtypes instead of coercing to floats if they were alongside other float columns. Author: Tom Augspurger Closes #13796 from TomAugspurger/get_dummies_dtype and squashes the following commits: cace0f7 [Tom Augspurger] BUG: int dtype for get_dummies --- doc/source/whatsnew/v0.19.0.txt | 29 ++++- pandas/core/reshape.py | 11 +- pandas/stats/tests/test_ols.py | 2 + pandas/tests/test_panel.py | 8 +- pandas/tests/test_reshape.py | 223 ++++++++++++++++++++------------ 5 files changed, 179 insertions(+), 94 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a02e6ac200e42..38a90ac371b16 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -371,6 +371,32 @@ Previous versions of pandas would permanently silence numpy's ufunc error handli After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. 
These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. +.. _whatsnew_0190.get_dummies_dtypes: + +get_dummies dtypes +^^^^^^^^^^^^^^^^^^ + +The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`) + +Previous behaviour: + +.. code-block:: ipython + + In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + + Out[1]: + a float64 + b float64 + c float64 + dtype: object + +New Behavior: + +.. ipython:: python + + pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + + .. _whatsnew_0190.enhancements.other: Other enhancements @@ -479,7 +505,6 @@ API changes - ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) - .. _whatsnew_0190.api.tolist: ``Series.tolist()`` will now return Python types @@ -1355,7 +1380,7 @@ Bug Fixes - Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) - Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) -- Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`) +- Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 4f601a2d377a6..b451f49fce78c 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1161,14 +1161,17 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs)), - sparse_index=IntIndex(N, ixs), fill_value=0) + sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), + sparse_index=IntIndex(N, ixs), fill_value=0, + dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) - return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) + out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, + dtype=np.uint8) + return out else: - dummy_mat = np.eye(number_of_cols).take(codes, axis=0) + dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index bac824f0b4840..770f7b35a02ca 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -645,6 +645,7 @@ def testWithXEffects(self): exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]], columns=['x1_30', 'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) + exp_x[['x1_30', 'x1_9']] = exp_x[['x1_30', 'x1_9']].astype(np.uint8) assert_frame_equal(res, exp_x.reindex(columns=res.columns)) def testWithXEffectsAndDroppedDummies(self): @@ -659,6 +660,7 @@ def testWithXEffectsAndDroppedDummies(self): exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]], columns=['x1_6', 
'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) + exp_x[['x1_6', 'x1_9']] = exp_x[['x1_6', 'x1_9']].astype(np.uint8) assert_frame_equal(res, exp_x.reindex(columns=res.columns)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 10a6693525590..0b266d799cf8c 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2429,18 +2429,18 @@ def test_truncate(self): def test_axis_dummies(self): from pandas.core.reshape import make_axis_dummies - minor_dummies = make_axis_dummies(self.panel, 'minor') + minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) self.assertEqual(len(minor_dummies.columns), len(self.panel.index.levels[1])) - major_dummies = make_axis_dummies(self.panel, 'major') + major_dummies = make_axis_dummies(self.panel, 'major').astype(np.uint8) self.assertEqual(len(major_dummies.columns), len(self.panel.index.levels[0])) mapping = {'A': 'one', 'B': 'one', 'C': 'two', 'D': 'two'} transformed = make_axis_dummies(self.panel, 'minor', - transform=mapping.get) + transform=mapping.get).astype(np.uint8) self.assertEqual(len(transformed.columns), 2) self.assert_index_equal(transformed.columns, Index(['one', 'two'])) @@ -2450,7 +2450,7 @@ def test_get_dummies(self): from pandas.core.reshape import get_dummies, make_axis_dummies self.panel['Label'] = self.panel.index.labels[1] - minor_dummies = make_axis_dummies(self.panel, 'minor') + minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) dummies = get_dummies(self.panel['Label']) self.assert_numpy_array_equal(dummies.values, minor_dummies.values) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 7136d7effc1fc..8bfd6350adc06 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -174,15 +174,15 @@ def test_basic(self): s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - 'c': {0: 0.0, - 1: 0.0, - 2: 1.0}}) + expected = DataFrame({'a': {0: 1, + 1: 0, + 2: 0}, + 'b': {0: 0, + 1: 1, + 2: 0}, + 'c': {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -200,7 +200,7 @@ def test_basic_types(self): if not self.sparse: exp_df_type = DataFrame - exp_blk_type = pd.core.internals.FloatBlock + exp_blk_type = pd.core.internals.IntBlock else: exp_df_type = SparseDataFrame exp_blk_type = pd.core.internals.SparseBlock @@ -239,22 +239,24 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, - 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, + 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, - 'a': {0: 1.0, 1: 0.0, 2: 0.0}, - 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, + 'a': {0: 1, 1: 0, 2: 0}, + 'b': {0: 0, 1: 1, 2: 0}}, + dtype=np.uint8) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) - exp_just_na = DataFrame(Series(1.0, 
index=[0]), columns=[nan]) + exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], + dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self @@ -264,31 +266,34 @@ def test_unicode(self eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] res = get_dummies(s, prefix='letter', sparse=self.sparse) - exp = DataFrame({'letter_e': {0: 1.0, - 1: 0.0, - 2: 0.0}, - u('letter_%s') % eacute: {0: 0.0, - 1: 1.0, - 2: 1.0}}) + exp = DataFrame({'letter_e': {0: 1, + 1: 0, + 2: 0}, + u('letter_%s') % eacute: {0: 0, + 1: 1, + 2: 1}}, + dtype=np.uint8) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse) - expected = DataFrame({'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1]}) + expected = DataFrame({'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1]}) + 'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1]}) + cols = ['A_a', 'A_b', 'B_b', 'B_c'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -299,10 +304,12 @@ def test_dataframe_dummies_prefix_list(self): 'C': [1, 2, 3]}) result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], - 'from_B_b': [1., 1, 0], - 'from_B_c': [0., 0, 1]}) + 'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], + 'from_B_b': [1, 1, 0], + 'from_B_c': [0, 0, 1]}) + cols = expected.columns[1:] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -311,31 +318,37 @@ def test_dataframe_dummies_prefix_str(self): # not that you should do this... 
df = self.df result = get_dummies(df, prefix='bad', sparse=self.sparse) - expected = DataFrame([[1, 1., 0., 1., 0.], - [2, 0., 1., 1., 0.], - [3, 1., 0., 0., 1.]], - columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']) + expected = DataFrame([[1, 1, 0, 1, 0], + [2, 0, 1, 1, 0], + [3, 1, 0, 0, 1]], + columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], + dtype=np.uint8) + expected = expected.astype({"C": np.int}) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self): df = self.df result = get_dummies(df, prefix=['from_A'], columns=['A'], sparse=self.sparse) - expected = DataFrame({'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], + expected = DataFrame({'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) + cols = ['from_A_a', 'from_A_b'] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): df = self.df result = get_dummies(df, prefix_sep='..', sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A..a': [1., 0, 1], - 'A..b': [0., 1, 0], - 'B..b': [1., 1, 0], - 'B..c': [0., 0, 1]}) + 'A..a': [1, 0, 1], + 'A..b': [0, 1, 0], + 'B..b': [1, 1, 0], + 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + cols = expected.columns[1:] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ -360,11 +373,13 @@ def test_dataframe_dummies_prefix_dict(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) result = get_dummies(df, prefix=prefixes, sparse=self.sparse) - expected = DataFrame({'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], - 'from_B_b': [1., 1, 0], - 'from_B_c': [0., 0, 1], + expected = DataFrame({'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], + 'from_B_b': [1, 1, 0], + 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) + cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): @@ -372,12 +387,14 @@ def test_dataframe_dummies_with_na(self): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_a': [1., 0, 1, 0], - 'A_b': [0., 1, 0, 0], - 'A_nan': [0., 0, 0, 1], - 'B_b': [1., 1, 0, 0], - 'B_c': [0., 0, 1, 0], - 'B_nan': [0., 0, 0, 1]}) + 'A_a': [1, 0, 1, 0], + 'A_b': [0, 1, 0, 0], + 'A_nan': [0, 0, 0, 1], + 'B_b': [1, 1, 0, 0], + 'B_c': [0, 0, 1, 0], + 'B_nan': [0, 0, 0, 1]}) + cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -391,12 +408,14 @@ def test_dataframe_dummies_with_categorical(self): df['cat'] = pd.Categorical(['x', 'y', 'y']) result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1], - 'cat_x': [1., 0, 0], - 'cat_y': [0., 1, 1]}) + 'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1], + 'cat_x': [1, 0, 0], + 'cat_y': [0, 1, 1]}) + cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -408,12 +427,12 @@ def test_basic_drop_first(self): s_series = 
Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - 'c': {0: 0.0, - 1: 0.0, - 2: 1.0}}) + expected = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}, + 'c': {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -449,19 +468,19 @@ def test_basic_drop_first_NA(self): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) - exp = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}) + exp = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, drop_first=True) - exp_na = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - nan: {0: 0.0, - 1: 0.0, - 2: 1.0}}).reindex_axis( + exp_na = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}, + nan: {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) @@ -473,8 +492,8 @@ def test_basic_drop_first_NA(self): def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) - expected = DataFrame({'A_b': [0., 1, 0], - 'B_c': [0., 0, 1]}) + expected = DataFrame({'A_b': [0, 1, 0], + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -482,9 +501,11 @@ def test_dataframe_dummies_drop_first_with_categorical(self): df['cat'] = pd.Categorical(['x', 'y', 'y']) result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = DataFrame({'C': [1, 2, 3], - 'A_b': [0., 1, 0], - 'B_c': [0., 0, 1], - 'cat_y': [0., 1, 1]}) + 'A_b': [0, 1, 0], + 'B_c': [0, 0, 1], + 'cat_y': [0, 1, 1]}) + cols = ['A_b', 'B_c', 'cat_y'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) @@ -494,10 +515,13 @@ def test_dataframe_dummies_drop_first_with_na(self): result = get_dummies(df, dummy_na=True, sparse=self.sparse, drop_first=True) expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_b': [0., 1, 0, 0], - 'A_nan': [0., 0, 0, 1], - 'B_c': [0., 0, 1, 0], - 'B_nan': [0., 0, 0, 1]}) + 'A_b': [0, 1, 0, 0], + 'A_nan': [0, 0, 0, 1], + 'B_c': [0, 0, 1, 0], + 'B_nan': [0, 0, 0, 1]}) + cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.uint8) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -506,6 +530,37 @@ def test_dataframe_dummies_drop_first_with_na(self): expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) + def test_int_int(self): + data = Series([1, 2, 1]) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], + dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + data = Series(pd.Categorical(['a', 'b', 'a'])) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=['a', 'b'], + dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + def test_int_df(self): + data = DataFrame( + {'A': [1, 2, 1], + 'B': pd.Categorical(['a', 'b', 'a']), + 'C': [1, 2, 1], + 'D': [1., 2., 1.] 
+ } + ) + columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b'] + expected = DataFrame([ + [1, 1., 1, 0, 1, 0], + [2, 2., 0, 1, 0, 1], + [1, 1., 1, 0, 1, 0] + ], columns=columns) + expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) + result = pd.get_dummies(data, columns=['A', 'B']) + tm.assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True From d26363b96481ba2df3978925e18ca567c79901dd Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Sat, 30 Jul 2016 18:31:39 -0400 Subject: [PATCH 328/359] BUG/DEPR: Categorical: keep dtype in MultiIndex (#13743), deprecate .from_array Now, categorical dtype is preserved also in `groupby`, `set_index`, `stack`, `get_dummies`, and `make_axis_dummies`. closes #13743 closes #13854 --- doc/source/whatsnew/v0.19.0.txt | 76 ++++++++++++++++++++++++++- pandas/core/categorical.py | 51 +++++++++++++++++- pandas/core/panel.py | 9 +--- pandas/core/reshape.py | 24 ++++----- pandas/indexes/multi.py | 19 +++---- pandas/io/pytables.py | 13 ++--- pandas/tests/frame/test_alter_axes.py | 15 ++++++ pandas/tests/frame/test_reshape.py | 16 ++++++ pandas/tests/indexes/test_multi.py | 49 +++++++++++++++++ pandas/tests/test_categorical.py | 69 +++++++++++++----------- pandas/tests/test_groupby.py | 67 +++++++++++++++++++---- pandas/tests/test_reshape.py | 39 +++++++++++++- pandas/tests/types/test_dtypes.py | 3 +- pandas/tools/merge.py | 11 ++-- 14 files changed, 370 insertions(+), 91 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 38a90ac371b16..777bc01e71833 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -977,6 +977,79 @@ New Behavior: pd.Index([1, 2, 3]).unique() pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() +.. _whatsnew_0190.api.multiindex: + +``MultiIndex`` constructors preserve categorical dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``MultiIndex.from_arrays`` and ``MultiIndex.from_product`` will now preserve categorical dtype +in ``MultiIndex`` levels. (:issue:`13743`, :issue:`13854`) + +.. ipython:: python + + cat = pd.Categorical(['a', 'b'], categories=list("bac")) + lvl1 = ['foo', 'bar'] + midx = pd.MultiIndex.from_arrays([cat, lvl1]) + midx + +Previous Behavior: + +.. code-block:: ipython + + In [4]: midx.levels[0] + Out[4]: Index(['b', 'a', 'c'], dtype='object') + + In [5]: midx.get_level_values[0] + Out[5]: Index(['a', 'b'], dtype='object') + +New Behavior: + +.. ipython:: python + + midx.levels[0] + midx.get_level_values(0) + +An analogous change has been made to ``MultiIndex.from_product``. +As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes in indexes + +.. ipython:: python + + df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat}) + df_grouped = df.groupby(by=['A', 'C']).first() + df_set_idx = df.set_index(['A', 'C']) + +Previous Behavior: + +.. code-block:: ipython + + In [11]: df_grouped.index.levels[1] + Out[11]: Index(['b', 'a', 'c'], dtype='object', name='C') + In [12]: df_grouped.reset_index().dtypes + Out[12]: + A int64 + C object + B float64 + dtype: object + + In [13]: df_set_idx.index.levels[1] + Out[13]: Index(['b', 'a', 'c'], dtype='object', name='C') + In [14]: df_set_idx.reset_index().dtypes + Out[14]: + A int64 + C object + B int64 + dtype: object + +New Behavior: + +.. ipython:: python + + df_grouped.index.levels[1] + df_grouped.reset_index().dtypes + + df_set_idx.index.levels[1] + df_set_idx.reset_index().dtypes + .. 
_whatsnew_0190.api.autogenerated_chunksize_index: ``read_csv`` will progressively enumerate chunks @@ -1173,7 +1246,7 @@ Deprecations - ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. (:issue:`13564`) - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`) - +- ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`) .. _whatsnew_0190.prior_deprecations: @@ -1388,3 +1461,4 @@ Bug Fixes - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. - Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue`14095`) +- Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2c89e4c05c633..48054c5bd34fa 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -5,7 +5,7 @@ import types from pandas import compat, lib -from pandas.compat import u +from pandas.compat import u, lzip from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex from pandas.types.missing import isnull, notnull @@ -17,6 +17,7 @@ _ensure_platform_int, is_dtype_equal, is_datetimelike, + is_categorical, is_categorical_dtype, is_integer_dtype, is_bool, is_list_like, is_sequence, @@ -411,6 +412,8 @@ def base(self): @classmethod def from_array(cls, data, **kwargs): """ + DEPRECATED: Use ``Categorical`` instead. + Make a Categorical type from a single array-like object. For internal compatibility with numpy arrays. @@ -421,6 +424,8 @@ def from_array(cls, data, **kwargs): Can be an Index or array-like. The categories are assumed to be the unique values of `data`. """ + warn("Categorical.from_array is deprecated, use Categorical instead", + FutureWarning, stacklevel=2) return cls(data, **kwargs) @classmethod @@ -1959,3 +1964,47 @@ def _convert_to_list_like(list_like): else: # is this reached? return [list_like] + + +def _factorize_from_iterable(values): + """ + Factorize an input `values` into `categories` and `codes`. Preserves + categorical dtype in `categories`. + + *This is an internal function* + + Parameters + ---------- + values : list-like + + Returns + ------- + codes : np.array + categories : Index + If `values` has a categorical dtype, then `categories` is + a CategoricalIndex keeping the categories and order of `values`. + """ + from pandas.indexes.category import CategoricalIndex + + if is_categorical(values): + if isinstance(values, (ABCCategoricalIndex, ABCSeries)): + values = values._values + categories = CategoricalIndex(values.categories, + categories=values.categories, + ordered=values.ordered) + codes = values.codes + else: + cat = Categorical(values, ordered=True) + categories = cat.categories + codes = cat.codes + return codes, categories + + +def _factorize_from_iterables(iterables): + """ + A higher-level wrapper over `_factorize_from_iterable`. + See `_factorize_from_iterable` for more info. 
+ + *This is an internal function* + """ + return lzip(*[_factorize_from_iterable(it) for it in iterables]) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b2f318d825db6..f708774dd84ff 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -21,7 +21,6 @@ from pandas import compat from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict) from pandas.compat.numpy import function as nv -from pandas.core.categorical import Categorical from pandas.core.common import PandasError, _try_sort, _default_index from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs @@ -103,13 +102,7 @@ def panel_index(time, panels, names=None): if names is None: names = ['time', 'panel'] time, panels = _ensure_like_indices(time, panels) - time_factor = Categorical.from_array(time, ordered=True) - panel_factor = Categorical.from_array(panels, ordered=True) - - labels = [time_factor.codes, panel_factor.codes] - levels = [time_factor.categories, panel_factor.categories] - return MultiIndex(levels, labels, sortorder=None, names=names, - verify_integrity=False) + return MultiIndex.from_arrays([time, panels], sortorder=None, names=names) class Panel(NDFrame): diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b451f49fce78c..4dec8b4106126 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -18,7 +18,7 @@ from pandas.sparse.array import SparseArray from pandas._sparse import IntIndex -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _factorize_from_iterable from pandas.core.groupby import get_group_index, _compress_group_index import pandas.core.algorithms as algos @@ -166,9 +166,8 @@ def get_result(self): if self.is_categorical is not None: categories = self.is_categorical.categories ordered = self.is_categorical.ordered - values = [Categorical.from_array(values[:, i], - categories=categories, - ordered=ordered) + values = [Categorical(values[:, i], categories=categories, + ordered=ordered) for i in range(values.shape[-1])] return DataFrame(values, index=index, columns=columns) @@ -471,8 +470,8 @@ def stack(frame, level=-1, dropna=True): def factorize(index): if index.is_unique: return index, np.arange(len(index)) - cat = Categorical(index, ordered=True) - return cat.categories, cat.codes + codes, categories = _factorize_from_iterable(index) + return categories, codes N, K = frame.shape if isinstance(frame.columns, MultiIndex): @@ -1107,8 +1106,7 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling - cat = Categorical.from_array(Series(data), ordered=True) - levels = cat.categories + codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): @@ -1124,10 +1122,10 @@ def get_empty_Frame(data, sparse): if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) - codes = cat.codes.copy() + codes = codes.copy() if dummy_na: - codes[codes == -1] = len(cat.categories) - levels = np.append(cat.categories, np.nan) + codes[codes == -1] = len(levels) + levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. 
drop_first will drop it again if drop_first and len(levels) == 1: @@ -1212,9 +1210,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): labels = frame.index.labels[num] if transform is not None: mapped_items = items.map(transform) - cat = Categorical.from_array(mapped_items.take(labels), ordered=True) - labels = cat.codes - items = cat.categories + labels, items = _factorize_from_iterable(mapped_items.take(labels)) values = np.eye(len(items), dtype=float) values = values.take(labels, axis=0) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index cc279076f7a5e..618bc319c3f74 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -852,8 +852,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ - from pandas.core.categorical import Categorical - if len(arrays) == 1: name = None if names is None else names[0] return Index(arrays[0], name=name) @@ -864,9 +862,9 @@ def from_arrays(cls, arrays, sortorder=None, names=None): if len(arrays[i]) != len(arrays[i - 1]): raise ValueError('all arrays must be same length') - cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] - levels = [c.categories for c in cats] - labels = [c.codes for c in cats] + from pandas.core.categorical import _factorize_from_iterables + + labels, levels = _factorize_from_iterables(arrays) if names is None: names = [getattr(arr, "name", None) for arr in arrays] @@ -952,15 +950,14 @@ def from_product(cls, iterables, sortorder=None, names=None): MultiIndex.from_arrays : Convert list of arrays to MultiIndex MultiIndex.from_tuples : Convert list of tuples to MultiIndex """ - from pandas.core.categorical import Categorical + from pandas.core.categorical import _factorize_from_iterables from pandas.tools.util import cartesian_product - categoricals = [Categorical.from_array(it, ordered=True) - for it in iterables] - labels = cartesian_product([c.codes for c in categoricals]) + labels, levels = _factorize_from_iterables(iterables) + labels = cartesian_product(labels) - return MultiIndex(levels=[c.categories for c in categoricals], - labels=labels, sortorder=sortorder, names=names) + return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + names=names) @property def nlevels(self): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ccc3fe081acde..b8c2b146b6259 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -37,7 +37,7 @@ from pandas.formats.printing import adjoin, pprint_thing from pandas.core.common import _asarray_tuplesafe, PerformanceWarning from pandas.core.algorithms import match, unique -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _factorize_from_iterables from pandas.core.internals import (BlockManager, make_block, _block2d_to_blocknd, _factor_indexer, _block_shape) @@ -3736,11 +3736,12 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - factors = [Categorical.from_array( - a.values, ordered=True) for a in self.index_axes] - levels = [f.categories for f in factors] - N = [len(f.categories) for f in factors] - labels = [f.codes for f in factors] + lst_vals = [a.values for a in self.index_axes] + labels, levels = _factorize_from_iterables(lst_vals) + # labels and levels are tuples but lists are expected + labels = list(labels) + levels = list(levels) + N = [len(lvl) for lvl in levels] # compute the key key = 
_factor_indexer(N[1:], labels) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 66b14995e6d3c..46f0fff7bb4b8 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -664,3 +664,18 @@ def test_assign_columns(self): frame.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] assert_series_equal(self.frame['C'], frame['baz'], check_names=False) assert_series_equal(self.frame['hi'], frame['foo2'], check_names=False) + + def test_set_index_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': pd.Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': pd.Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 066485e966a42..8b1b1130dc2fc 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -707,3 +707,19 @@ def _test_stack_with_multiindex(multiindex): columns=Index(['B', 'C'], name='Upper'), dtype=df.dtypes[0]) assert_frame_equal(result, expected) + + def test_stack_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + for labels in [list("yxz"), list("yxy")]: + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), + ordered=ordered) + df = DataFrame([[10, 11, 12]], columns=cidx) + result = df.stack() + + # `MutliIndex.from_product` preserves categorical dtype - + # it's tested elsewhere. + midx = pd.MultiIndex.from_product([df.index, cidx]) + expected = Series([10, 11, 12], index=midx) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c72cab32d198b..675193e1538b2 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -314,6 +314,22 @@ def test_set_levels_labels_names_bad_input(self): with tm.assertRaisesRegexp(TypeError, 'string'): self.index.set_names(names, level=0) + def test_set_levels_categorical(self): + # GH13854 + index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]]) + for ordered in [False, True]: + cidx = CategoricalIndex(list("bac"), ordered=ordered) + result = index.set_levels(cidx, 0) + expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], + labels=index.labels) + tm.assert_index_equal(result, expected) + + result_lvl = result.get_level_values(0) + expected_lvl = CategoricalIndex(list("bacb"), + categories=cidx.categories, + ordered=cidx.ordered) + tm.assert_index_equal(result_lvl, expected_lvl) + def test_metadata_immutable(self): levels, labels = self.index.levels, self.index.labels # shouldn't be able to set at either the top level or base level @@ -656,6 +672,25 @@ def test_from_arrays_index_datetimelike_mixed(self): tm.assert_index_equal(result, result2) + def test_from_arrays_index_series_categorical(self): + # GH13743 + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=True) + + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = 
pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + result3 = pd.MultiIndex.from_arrays([idx1.values, idx2.values]) + tm.assert_index_equal(result3.get_level_values(0), idx1) + tm.assert_index_equal(result3.get_level_values(1), idx2) + def test_from_arrays_different_lengths(self): # GH13599 idx1 = [1, 2, 3] @@ -696,6 +731,20 @@ def test_from_product_datetimeindex(self): '2000-01-01')), (2, pd.Timestamp('2000-01-02'))]) tm.assert_numpy_array_equal(mi.values, etalon) + def test_from_product_index_series_categorical(self): + # GH13743 + first = ['foo', 'bar'] + for ordered in [False, True]: + idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=ordered) + expected = pd.CategoricalIndex(list("abcaab") + list("abcaab"), + categories=list("bac"), + ordered=ordered) + + for arr in [idx, pd.Series(idx), idx.values]: + result = pd.MultiIndex.from_product([first, arr]) + tm.assert_index_equal(result.get_level_values(1), expected) + def test_values_boxed(self): tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT), (3, pd.Timestamp('2000-01-03')), diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 781c9b786328d..eabd118de671d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -29,9 +29,8 @@ class TestCategorical(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.factor = Categorical.from_array(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], - ordered=True) + self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + ordered=True) def test_getitem(self): self.assertEqual(self.factor[0], 'a') @@ -70,8 +69,8 @@ def test_setitem(self): indexer[0] = True indexer[-1] = True c[indexer] = 'c' - expected = Categorical.from_array(['c', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], ordered=True) + expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + ordered=True) self.assert_categorical_equal(c, expected) @@ -94,12 +93,12 @@ def test_constructor_unsortable(self): # it works! 
arr = np.array([1, 2, 3, datetime.now()], dtype='O') - factor = Categorical.from_array(arr, ordered=False) + factor = Categorical(arr, ordered=False) self.assertFalse(factor.ordered) # this however will raise as cannot be sorted self.assertRaises( - TypeError, lambda: Categorical.from_array(arr, ordered=True)) + TypeError, lambda: Categorical(arr, ordered=True)) def test_is_equal_dtype(self): @@ -341,26 +340,26 @@ def test_constructor_with_datetimelike(self): def test_constructor_from_index_series_datetimetz(self): idx = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_timedelta(self): idx = pd.timedelta_range('1 days', freq='D', periods=3) - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_period(self): idx = pd.period_range('2015-01-01', freq='D', periods=3) - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_from_codes(self): @@ -409,9 +408,6 @@ def test_validate_ordered(self): with tm.assertRaisesRegexp(exp_err, exp_msg): Categorical([1, 2, 3], ordered=ordered) - with tm.assertRaisesRegexp(exp_err, exp_msg): - Categorical.from_array([1, 2, 3], ordered=ordered) - with tm.assertRaisesRegexp(exp_err, exp_msg): Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], ordered=ordered) @@ -724,7 +720,7 @@ def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') - cat1 = Categorical.from_array(idx1) + cat1 = Categorical(idx1) str(cat1) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -733,7 +729,7 @@ def test_periodindex(self): idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') - cat2 = Categorical.from_array(idx2, ordered=True) + cat2 = Categorical(idx2, ordered=True) str(cat2) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -742,7 +738,7 @@ def test_periodindex(self): idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') - cat3 = Categorical.from_array(idx3, ordered=True) + cat3 = Categorical(idx3, ordered=True) exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') @@ -1590,6 +1586,11 @@ def test_deprecated_labels(self): res = cat.labels self.assert_numpy_array_equal(res, exp) + def test_deprecated_from_array(self): + # GH13854, `.from_array` is deprecated + with tm.assert_produces_warning(FutureWarning): + Categorical.from_array([0, 1]) + def test_removed_names_produces_warning(self): # 10482 @@ -1654,8 +1655,7 @@ class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True 
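# [Illustrative aside, not part of the patch] The hunks around this point swap
# the deprecated ``Categorical.from_array`` classmethod for the plain
# ``Categorical`` constructor (GH13854). A minimal sketch of the migration,
# assuming the pandas build produced by this series; the sample values below
# are hypothetical:
import pandas as pd

values = ['a', 'b', 'b', 'a', 'c']
factor = pd.Categorical(values, ordered=True)       # preferred spelling
# pd.Categorical.from_array(values, ordered=True)   # now emits a FutureWarning
print(factor.codes, list(factor.categories))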
def setUp(self): - self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', - 'c', 'c']) + self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) df = DataFrame({'value': np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] @@ -3001,9 +3001,10 @@ def test_groupby(self): # multiple groupers gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product([['a', 'b', 'z'], - ['c', 'd', 'y']], - names=['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]}, index=exp_index) @@ -3014,10 +3015,13 @@ def test_groupby(self): df = df.copy() df['C'] = ['foo', 'bar'] * 2 gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True), + ['foo', 'bar']], + names=['A', 'B', 'C']) expected = DataFrame({'values': Series( - np.nan, index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y'], ['foo', 'bar'] - ], names=['A', 'B', 'C']))}).sortlevel() + np.nan, index=exp_index)}).sortlevel() expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] result = gb.sum() tm.assert_frame_equal(result, expected) @@ -3096,11 +3100,12 @@ def test_pivot_table(self): df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) expected = Series([1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y']], - names=['A', 'B']), - name='values') + index=exp_index, name='values') tm.assert_series_equal(result, expected) def test_count(self): @@ -4184,7 +4189,7 @@ def test_astype_to_other(self): cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) tm.assert_series_equal(cat.astype('str'), exp) - s2 = Series(Categorical.from_array(['1', '2', '3', '4'])) + s2 = Series(Categorical(['1', '2', '3', '4'])) exp2 = Series([1, 2, 3, 4]).astype(int) tm.assert_series_equal(s2.astype('int'), exp2) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6b33fa747d8ba..9d8873d843642 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3108,8 +3108,10 @@ def test_apply_categorical_data(self): grouped = df.groupby(['missing', 'dense']) # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']], - names=['missing', 'dense']) + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], index=idx, columns=['values']) @@ -6389,7 +6391,8 @@ def test_groupby_categorical_two_columns(self): groups_double_key = test.groupby(["cat", "ints"]) res = groups_double_key.agg('mean') exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": ["a", "a", "b", "b", "c", "c"], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" ]) tm.assert_frame_equal(res, exp) @@ -6409,9 +6412,10 
@@ def test_groupby_categorical_two_columns(self): res = groups_double_key.agg('mean') nan = np.nan - idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"], - [1, 2, 3, 4]], - names=["cat", "C2"]) + idx = MultiIndex.from_product( + [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, nan, nan, nan, nan, 4, 5], "C3": [nan, nan, nan, nan, 10, 100, @@ -6424,7 +6428,7 @@ def test_groupby_multi_categorical_as_index(self): 'A': [10, 11, 11], 'B': [101, 102, 103]}) result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10, 11, 10, 11, 10, 11], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) @@ -6433,7 +6437,7 @@ def test_groupby_multi_categorical_as_index(self): # function grouper f = lambda r: df.loc[r, 'A'] result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10.0, nan, nan, 22.0, nan, nan], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) @@ -6442,14 +6446,14 @@ def test_groupby_multi_categorical_as_index(self): # another not in-axis grouper (conflicting names in index) s = Series(['a', 'b', 'b'], name='cat') result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10.0, nan, nan, 22.0, nan, nan], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # is original index dropped? - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10, 11, 10, 11, 10, 11], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) @@ -6459,6 +6463,49 @@ def test_groupby_multi_categorical_as_index(self): result = df.groupby(['cat', 'A'], as_index=False).sum() tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), + ordered=True)}) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, 
expected) + tm.assert_frame_equal(result2, expected) + def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 8bfd6350adc06..413724d1a6177 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -539,7 +539,8 @@ def test_int_int(self): data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=['a', 'b'], + expected = DataFrame([[1, 0], [0, 1], [1, 0]], + columns=pd.Categorical(['a', 'b']), dtype=np.uint8) tm.assert_frame_equal(result, expected) @@ -561,11 +562,47 @@ def test_int_df(self): result = pd.get_dummies(data, columns=['A', 'B']) tm.assert_frame_equal(result, expected) + def test_dataframe_dummies_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + cat = pd.Categorical(list("xy"), categories=list("xyz"), + ordered=ordered) + result = get_dummies(cat) + + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + cols = pd.CategoricalIndex(cat.categories, + categories=cat.categories, + ordered=ordered) + expected = DataFrame(data, columns=cols) + + tm.assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True +class TestMakeAxisDummies(tm.TestCase): + + def test_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered) + midx = pd.MultiIndex(levels=[['a'], cidx], + labels=[[0, 0], [0, 1]]) + df = DataFrame([[10, 11]], index=midx) + + expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], + index=midx, columns=cidx) + + from pandas.core.reshape import make_axis_dummies + result = make_axis_dummies(df) + tm.assert_frame_equal(result, expected) + + result = make_axis_dummies(df, transform=lambda x: x) + tm.assert_frame_equal(result, expected) + + class TestLreshape(tm.TestCase): def test_pairs(self): diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index dd1a8dbd5c53a..a2b0a9ebfa6cc 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -80,8 +80,7 @@ def test_basic(self): self.assertTrue(is_categorical_dtype(self.dtype)) - factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c' - ]) + factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) s = Series(factor, name='A') diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index ca7288b048427..7a29918c55658 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -11,6 +11,8 @@ from pandas import (Categorical, DataFrame, Series, Index, MultiIndex, Timedelta) +from pandas.core.categorical import (_factorize_from_iterable, + _factorize_from_iterables) from pandas.core.frame import _merge_doc from pandas.types.generic import ABCSeries from pandas.types.common import (is_datetime64tz_dtype, @@ -1632,8 +1634,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array( - zp, ordered=True).categories for zp in zipped] + _, levels = _factorize_from_iterables(zipped) else: levels = [_ensure_index(x) for x in levels] else: @@ -1671,9 +1672,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: - factor = Categorical.from_array(concat_index, ordered=True) - 
levels.append(factor.categories) - label_list.append(factor.codes) + codes, categories = _factorize_from_iterable(concat_index) + levels.append(categories) + label_list.append(codes) if len(names) == len(levels): names = list(names) From 752ba9ae0eb8732c15caaea18ea86db04004d3b0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 2 Sep 2016 19:52:39 -0400 Subject: [PATCH 329/359] TST: fix blosc version (#14142) --- ci/requirements-2.7.pip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index d16b932c8be4f..44e1695bf1a7f 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -1,4 +1,4 @@ -blosc +blosc==1.4.1 httplib2 google-api-python-client==1.2 python-gflags==2.0 From 59524af1e90d3dda2d885e711f7258704a020b6d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 2 Sep 2016 20:00:37 -0400 Subject: [PATCH 330/359] TST: sparse / dummy array comparisons on windows, xref #14140 (#14141) --- pandas/sparse/tests/test_list.py | 5 ++++- pandas/tests/test_reshape.py | 2 +- pandas/util/testing.py | 21 +++++++++++++++++---- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py index 0b933b4f9c6f2..b117685b6e968 100644 --- a/pandas/sparse/tests/test_list.py +++ b/pandas/sparse/tests/test_list.py @@ -60,8 +60,11 @@ def test_append_zero(self): splist.append(arr[5]) splist.append(arr[6:]) + # list always produces int64, but SA constructor + # is platform dtype aware sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0)) + exp = SparseArray(arr, fill_value=0) + tm.assert_sp_array_equal(sparr, exp, check_dtype=False) def test_consolidate(self): with tm.assert_produces_warning(FutureWarning, diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 413724d1a6177..80d1f5f76e5a9 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -323,7 +323,7 @@ def test_dataframe_dummies_prefix_str(self): [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], dtype=np.uint8) - expected = expected.astype({"C": np.int}) + expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d50a6c460ceb5..f5a93d1f17d00 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1385,11 +1385,22 @@ def assert_panelnd_equal(left, right, # Sparse -def assert_sp_array_equal(left, right): +def assert_sp_array_equal(left, right, check_dtype=True): + """Check that the left and right SparseArray are equal. + + Parameters + ---------- + left : SparseArray + right : SparseArray + check_dtype : bool, default True + Whether to check the data dtype is identical. 
+ """ + assertIsInstance(left, pd.SparseArray, '[SparseArray]') assertIsInstance(right, pd.SparseArray, '[SparseArray]') - assert_numpy_array_equal(left.sp_values, right.sp_values) + assert_numpy_array_equal(left.sp_values, right.sp_values, + check_dtype=check_dtype) # SparseIndex comparison assertIsInstance(left.sp_index, pd._sparse.SparseIndex, '[SparseIndex]') @@ -1400,8 +1411,10 @@ def assert_sp_array_equal(left, right): left.sp_index, right.sp_index) assert_attr_equal('fill_value', left, right) - assert_attr_equal('dtype', left, right) - assert_numpy_array_equal(left.values, right.values) + if check_dtype: + assert_attr_equal('dtype', left, right) + assert_numpy_array_equal(left.values, right.values, + check_dtype=check_dtype) def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, From 0323336d3c3c919e4773df20826a01d71c31cae7 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Sat, 3 Sep 2016 18:20:28 +0900 Subject: [PATCH 331/359] BUG: concat/append misc fixes (#13660) closes #13626 closes #7795 --- doc/source/whatsnew/v0.19.0.txt | 7 + pandas/core/frame.py | 12 +- pandas/core/series.py | 6 +- pandas/indexes/base.py | 75 ++--- pandas/indexes/category.py | 21 +- pandas/tests/indexes/test_category.py | 4 +- pandas/tests/indexes/test_multi.py | 38 ++- pandas/tests/types/test_concat.py | 86 +++++ pandas/tools/pivot.py | 6 + pandas/tools/tests/test_concat.py | 426 ++++++++++++++++++++++++- pandas/tools/tests/test_pivot.py | 5 +- pandas/tseries/base.py | 18 ++ pandas/tseries/index.py | 83 ----- pandas/tseries/period.py | 39 --- pandas/tseries/tdi.py | 28 -- pandas/tseries/tests/test_timezones.py | 46 ++- pandas/types/concat.py | 86 +++-- 17 files changed, 734 insertions(+), 252 deletions(-) create mode 100644 pandas/tests/types/test_concat.py diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 777bc01e71833..ca5f3dfc2a8f2 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1356,6 +1356,13 @@ Bug Fixes - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) - Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) +- Bug in ``pd.concat`` and ``.append`` may coerces ``datetime64`` and ``timedelta`` to ``object`` dtype containing python built-in ``datetime`` or ``timedelta`` rather than ``Timestamp`` or ``Timedelta`` (:issue:`13626`) +- Bug in ``PeriodIndex.append`` may raises ``AttributeError`` when the result is ``object`` dtype (:issue:`13221`) +- Bug in ``CategoricalIndex.append`` may accept normal ``list`` (:issue:`13626`) +- Bug in ``pd.concat`` and ``.append`` with the same timezone get reset to UTC (:issue:`7795`) +- Bug in ``Series`` and ``DataFrame`` ``.append`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13626`) + + - Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) - Bug in ``DataFrame.describe()`` raising ``ValueError`` with only boolean columns (:issue:`13898`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 46a1d22a4114b..ac3e5d2aabef7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4384,14 +4384,20 @@ def append(self, other, ignore_index=False, verify_integrity=False): raise TypeError('Can only append 
a Series if ignore_index=True' ' or if the Series has a name') - index = None if other.name is None else [other.name] + if other.name is None: + index = None + else: + # other must have the same index name as self, otherwise + # index name will be reset + index = Index([other.name], name=self.index.name) + combined_columns = self.columns.tolist() + self.columns.union( other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), - index=index, columns=combined_columns) + index=index, + columns=combined_columns) other = other._convert(datetime=True, timedelta=True) - if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) elif isinstance(other, list) and not isinstance(other[0], DataFrame): diff --git a/pandas/core/series.py b/pandas/core/series.py index 01d6f6f078d17..8379c8bcdcae8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -289,7 +289,6 @@ def _set_axis(self, axis, labels, fastpath=False): is_all_dates = labels.is_all_dates if is_all_dates: - if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): try: @@ -297,8 +296,11 @@ def _set_axis(self, axis, labels, fastpath=False): # need to set here becuase we changed the index if fastpath: self._data.set_axis(axis, labels) - except tslib.OutOfBoundsDatetime: + except (tslib.OutOfBoundsDatetime, ValueError): + # labels may exceeds datetime bounds, + # or not be a DatetimeIndex pass + self._set_subtyp(is_all_dates) object.__setattr__(self, '_index', labels) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 15cd2064624d9..d6b6d01b1e444 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1392,15 +1392,19 @@ def __getitem__(self, key): else: return result - def _ensure_compat_append(self, other): + def append(self, other): """ - prepare the append + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices Returns ------- - list of to_concat, name of result Index + appended : Index """ - name = self.name + to_concat = [self] if isinstance(other, (list, tuple)): @@ -1409,46 +1413,29 @@ def _ensure_compat_append(self, other): to_concat.append(other) for obj in to_concat: - if (isinstance(obj, Index) and obj.name != name and - obj.name is not None): - name = None - break + if not isinstance(obj, Index): + raise TypeError('all inputs must be Index') - to_concat = self._ensure_compat_concat(to_concat) - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] - return to_concat, name + names = set([obj.name for obj in to_concat]) + name = None if len(names) > 1 else self.name - def append(self, other): - """ - Append a collection of Index options together + typs = _concat.get_dtype_kinds(to_concat) - Parameters - ---------- - other : Index or list/tuple of indices + if 'category' in typs: + # if any of the to_concat is category + from pandas.indexes.category import CategoricalIndex + return CategoricalIndex._append_same_dtype(self, to_concat, name) - Returns - ------- - appended : Index - """ - to_concat, name = self._ensure_compat_append(other) - attribs = self._get_attributes_dict() - attribs['name'] = name - return self._shallow_copy_with_infer( - np.concatenate(to_concat), **attribs) - - @staticmethod - def _ensure_compat_concat(indexes): - from pandas.tseries.api import (DatetimeIndex, PeriodIndex, - TimedeltaIndex) - klasses = DatetimeIndex, PeriodIndex, TimedeltaIndex - - 
is_ts = [isinstance(idx, klasses) for idx in indexes] + if len(typs) == 1: + return self._append_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) - if any(is_ts) and not all(is_ts): - return [_maybe_box(idx) for idx in indexes] - - return indexes + def _append_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class + """ + # must be overrided in specific classes + return _concat._concat_index_asobject(to_concat, name) _index_shared_docs['take'] = """ return a new %(klass)s of the values selected by the indices @@ -3634,16 +3621,6 @@ def _ensure_has_len(seq): return seq -def _maybe_box(idx): - from pandas.tseries.api import DatetimeIndex, PeriodIndex, TimedeltaIndex - klasses = DatetimeIndex, PeriodIndex, TimedeltaIndex - - if isinstance(idx, klasses): - return idx.asobject - - return idx - - def _trim_front(strings): """ Trims zeros and decimal points diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 251886ebdd974..1666d8f7bc078 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -569,26 +569,17 @@ def insert(self, loc, item): codes = np.concatenate((codes[:loc], code, codes[loc:])) return self._create_from_codes(codes) - def append(self, other): + def _append_same_dtype(self, to_concat, name): """ - Append a collection of CategoricalIndex options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - - Raises - ------ + Concatenate to_concat which has the same class ValueError if other is not in the categories """ - to_concat, name = self._ensure_compat_append(other) to_concat = [self._is_dtype_compat(c) for c in to_concat] codes = np.concatenate([c.codes for c in to_concat]) - return self._create_from_codes(codes, name=name) + result = self._create_from_codes(codes, name=name) + # if name is None, _create_from_codes sets self.name + result.name = name + return result @classmethod def _add_comparison_methods(cls): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index b0e50491b8e9d..cb8452479f616 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -271,12 +271,12 @@ def test_append(self): lambda: ci.append(ci.values.reorder_categories(list('abc')))) # with objects - result = ci.append(['c', 'a']) + result = ci.append(Index(['c', 'a'])) expected = CategoricalIndex(list('aabbcaca'), categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid objects - self.assertRaises(TypeError, lambda: ci.append(['a', 'd'])) + self.assertRaises(TypeError, lambda: ci.append(Index(['a', 'd']))) def test_insert(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 675193e1538b2..d49ac40631d37 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -6,8 +6,8 @@ import re import warnings -from pandas import (DataFrame, date_range, MultiIndex, Index, CategoricalIndex, - compat) +from pandas import (DataFrame, date_range, period_range, MultiIndex, Index, + CategoricalIndex, compat) from pandas.core.common import PerformanceWarning from pandas.indexes.base import InvalidIndexError from pandas.compat import range, lrange, u, PY3, long, lzip @@ -769,6 +769,40 @@ def test_append(self): result = self.index.append([]) self.assertTrue(result.equals(self.index)) + def test_append_mixed_dtypes(self): + # GH 13660 + dti = 
date_range('2011-01-01', freq='M', periods=3,) + dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern') + pi = period_range('2011-01', freq='M', periods=3) + + mi = MultiIndex.from_arrays([[1, 2, 3], + [1.1, np.nan, 3.3], + ['a', 'b', 'c'], + dti, dti_tz, pi]) + self.assertEqual(mi.nlevels, 6) + + res = mi.append(mi) + exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3], + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ['a', 'b', 'c', 'a', 'b', 'c'], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi)]) + tm.assert_index_equal(res, exp) + + other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'], + ['x', 'y', 'z'], ['x', 'y', 'z'], + ['x', 'y', 'z'], ['x', 'y', 'z']]) + + res = mi.append(other) + exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'], + [1.1, np.nan, 3.3, 'x', 'y', 'z'], + ['a', 'b', 'c', 'x', 'y', 'z'], + dti.append(pd.Index(['x', 'y', 'z'])), + dti_tz.append(pd.Index(['x', 'y', 'z'])), + pi.append(pd.Index(['x', 'y', 'z']))]) + tm.assert_index_equal(res, exp) + def test_get_level_values(self): result = self.index.get_level_values(0) expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'], diff --git a/pandas/tests/types/test_concat.py b/pandas/tests/types/test_concat.py new file mode 100644 index 0000000000000..6403dcb5a5350 --- /dev/null +++ b/pandas/tests/types/test_concat.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +import nose +import pandas as pd +import pandas.types.concat as _concat +import pandas.util.testing as tm + + +class TestConcatCompat(tm.TestCase): + + _multiprocess_can_split_ = True + + def check_concat(self, to_concat, exp): + for klass in [pd.Index, pd.Series]: + to_concat_klass = [klass(c) for c in to_concat] + res = _concat.get_dtype_kinds(to_concat_klass) + self.assertEqual(res, set(exp)) + + def test_get_dtype_kinds(self): + to_concat = [['a'], [1, 2]] + self.check_concat(to_concat, ['i', 'object']) + + to_concat = [[3, 4], [1, 2]] + self.check_concat(to_concat, ['i']) + + to_concat = [[3, 4], [1, 2.1]] + self.check_concat(to_concat, ['i', 'f']) + + def test_get_dtype_kinds_datetimelike(self): + to_concat = [pd.DatetimeIndex(['2011-01-01']), + pd.DatetimeIndex(['2011-01-02'])] + self.check_concat(to_concat, ['datetime']) + + to_concat = [pd.TimedeltaIndex(['1 days']), + pd.TimedeltaIndex(['2 days'])] + self.check_concat(to_concat, ['timedelta']) + + def test_get_dtype_kinds_datetimelike_object(self): + to_concat = [pd.DatetimeIndex(['2011-01-01']), + pd.DatetimeIndex(['2011-01-02'], tz='US/Eastern')] + self.check_concat(to_concat, + ['datetime', 'datetime64[ns, US/Eastern]']) + + to_concat = [pd.DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), + pd.DatetimeIndex(['2011-01-02'], tz='US/Eastern')] + self.check_concat(to_concat, + ['datetime64[ns, Asia/Tokyo]', + 'datetime64[ns, US/Eastern]']) + + # timedelta has single type + to_concat = [pd.TimedeltaIndex(['1 days']), + pd.TimedeltaIndex(['2 hours'])] + self.check_concat(to_concat, ['timedelta']) + + to_concat = [pd.DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), + pd.TimedeltaIndex(['1 days'])] + self.check_concat(to_concat, + ['datetime64[ns, Asia/Tokyo]', 'timedelta']) + + def test_get_dtype_kinds_period(self): + # because we don't have Period dtype (yet), + # Series results in object dtype + to_concat = [pd.PeriodIndex(['2011-01'], freq='M'), + pd.PeriodIndex(['2011-01'], freq='M')] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['period[M]'])) + + to_concat = [pd.Series([pd.Period('2011-01', freq='M')]), + pd.Series([pd.Period('2011-02', 
freq='M')])] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['object'])) + + to_concat = [pd.PeriodIndex(['2011-01'], freq='M'), + pd.PeriodIndex(['2011-01'], freq='D')] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['period[M]', 'period[D]'])) + + to_concat = [pd.Series([pd.Period('2011-01', freq='M')]), + pd.Series([pd.Period('2011-02', freq='D')])] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['object'])) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 3e2b7c3af460e..94b464f6fca6c 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -523,6 +523,9 @@ def _normalize(table, normalize, margins): column_margin = table.loc[:, 'All'].drop('All') index_margin = table.loc['All', :].drop('All') table = table.drop('All', axis=1).drop('All') + # to keep index and columns names + table_index_names = table.index.names + table_columns_names = table.columns.names # Normalize core table = _normalize(table, normalize=normalize, margins=False) @@ -550,6 +553,9 @@ def _normalize(table, normalize, margins): else: raise ValueError("Not a valid normalize argument") + table.index.names = table_index_names + table.columns.names = table_columns_names + else: raise ValueError("Not a valid margins argument") diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 17ccfb27d4b42..102f21bcdc535 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -4,7 +4,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO +from pandas.compat import StringIO, iteritems import pandas as pd from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, @@ -27,6 +27,430 @@ def setUp(self): self.mixed_frame['foo'] = 'bar' +class TestConcatAppendCommon(ConcatenateBase): + + """ + Test common dtype coercion rules between concat and append. 
+ """ + + def setUp(self): + + dt_data = [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03')] + tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03', tz='US/Eastern')] + + td_data = [pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days')] + + period_data = [pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M')] + + self.data = {'bool': [True, False, True], + 'int64': [1, 2, 3], + 'float64': [1.1, np.nan, 3.3], + 'category': pd.Categorical(['X', 'Y', 'Z']), + 'object': ['a', 'b', 'c'], + 'datetime64[ns]': dt_data, + 'datetime64[ns, US/Eastern]': tz_data, + 'timedelta64[ns]': td_data, + 'period[M]': period_data} + + def _check_expected_dtype(self, obj, label): + """ + Check whether obj has expected dtype depending on label + considering not-supported dtypes + """ + if isinstance(obj, pd.Index): + if label == 'bool': + self.assertEqual(obj.dtype, 'object') + else: + self.assertEqual(obj.dtype, label) + elif isinstance(obj, pd.Series): + if label.startswith('period'): + self.assertEqual(obj.dtype, 'object') + else: + self.assertEqual(obj.dtype, label) + else: + raise ValueError + + def test_dtypes(self): + # to confirm test case covers intended dtypes + for typ, vals in iteritems(self.data): + self._check_expected_dtype(pd.Index(vals), typ) + self._check_expected_dtype(pd.Series(vals), typ) + + def test_concatlike_same_dtypes(self): + # GH 13660 + for typ1, vals1 in iteritems(self.data): + + vals2 = vals1 + vals3 = vals1 + + if typ1 == 'category': + exp_data = pd.Categorical(list(vals1) + list(vals2)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = pd.Index(vals1).append(pd.Index(vals2)) + exp = pd.Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) + exp = pd.Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = pd.Index(vals1, name='x') + i2 = pd.Index(vals2, name='y') + res = i1.append(i2) + exp = pd.Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = pd.Index(vals1, name='x') + i2 = pd.Index(vals2, name='x') + res = i1.append(i2) + exp = pd.Index(exp_data, name='x') + tm.assert_index_equal(res, exp) + + # cannot append non-index + with tm.assertRaisesRegexp(TypeError, 'all inputs must be Index'): + pd.Index(vals1).append(vals2) + + with tm.assertRaisesRegexp(TypeError, 'all inputs must be Index'): + pd.Index(vals1).append([pd.Index(vals2), vals3]) + + # ----- Series ----- # + + # series.append + res = pd.Series(vals1).append(pd.Series(vals2), + ignore_index=True) + exp = pd.Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], + ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = pd.Series(vals1).append([pd.Series(vals2), pd.Series(vals3)], + ignore_index=True) + exp = pd.Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat([pd.Series(vals1), pd.Series(vals2), + pd.Series(vals3)], ignore_index=True) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = pd.Series(vals1, name='x') + s2 = pd.Series(vals2, name='y') + res = s1.append(s2, ignore_index=True) 
+ exp = pd.Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = pd.Series(vals1, name='x') + s2 = pd.Series(vals2, name='x') + res = s1.append(s2, ignore_index=True) + exp = pd.Series(exp_data, name='x') + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = "cannot concatenate a non-NDFrame object" + with tm.assertRaisesRegexp(TypeError, msg): + pd.Series(vals1).append(vals2) + + with tm.assertRaisesRegexp(TypeError, msg): + pd.Series(vals1).append([pd.Series(vals2), vals3]) + + with tm.assertRaisesRegexp(TypeError, msg): + pd.concat([pd.Series(vals1), vals2]) + + with tm.assertRaisesRegexp(TypeError, msg): + pd.concat([pd.Series(vals1), pd.Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self): + # GH 13660 + for typ1, vals1 in iteritems(self.data): + for typ2, vals2 in iteritems(self.data): + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + continue + elif typ1 == 'category' or typ2 == 'category': + # ToDo: suspicious + continue + + # specify expected dtype + if typ1 == 'bool' and typ2 in ('int64', 'float64'): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == 'bool' and typ1 in ('int64', 'float64'): + exp_series_dtype = typ1 + elif (typ1 == 'datetime64[ns, US/Eastern]' or + typ2 == 'datetime64[ns, US/Eastern]' or + typ1 == 'timedelta64[ns]' or + typ2 == 'timedelta64[ns]'): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = pd.Index(vals1).append(pd.Index(vals2)) + exp = pd.Index(exp_data, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # 3 elements + res = pd.Index(vals1).append([pd.Index(vals2), + pd.Index(vals3)]) + exp = pd.Index(exp_data3, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = pd.Series(vals1).append(pd.Series(vals2), + ignore_index=True) + exp = pd.Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], + ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = pd.Series(vals1).append([pd.Series(vals2), + pd.Series(vals3)], + ignore_index=True) + exp = pd.Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat([pd.Series(vals1), pd.Series(vals2), + pd.Series(vals3)], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concatlike_common_coerce_to_pandas_object(self): + # GH 13626 + # result must be Timestamp/Timedelta, not datetime.datetime/timedelta + dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02']) + tdi = pd.TimedeltaIndex(['1 days', '2 days']) + + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + + res = dti.append(tdi) + tm.assert_index_equal(res, exp) + tm.assertIsInstance(res[0], pd.Timestamp) + tm.assertIsInstance(res[-1], pd.Timedelta) + + dts = pd.Series(dti) + tds 
= pd.Series(tdi) + res = dts.append(tds) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + tm.assertIsInstance(res.iloc[0], pd.Timestamp) + tm.assertIsInstance(res.iloc[-1], pd.Timedelta) + + res = pd.concat([dts, tds]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + tm.assertIsInstance(res.iloc[0], pd.Timestamp) + tm.assertIsInstance(res.iloc[-1], pd.Timedelta) + + def test_concatlike_datetimetz(self): + # GH 7795 + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: + dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz) + + exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2012-01-01', '2012-01-02'], tz=tz) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts2 = pd.Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_datetimetz_short(self): + # GH 7795 + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT']: + + ix1 = pd.DatetimeIndex(start='2014-07-15', end='2014-07-17', + freq='D', tz=tz) + ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz) + df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B']) + df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B']) + + exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16', + '2014-07-17', '2014-07-11', + '2014-07-21'], tz=tz) + exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B']) + + tm.assert_frame_equal(df1.append(df2), exp) + tm.assert_frame_equal(pd.concat([df1, df2]), exp) + + def test_concatlike_datetimetz_to_object(self): + # GH 13660 + + # different tz coerces to object + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: + dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02']) + + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-02', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02')], dtype=object) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts2 = pd.Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + # different tz + dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], + tz='US/Pacific') + + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-02', tz=tz), + pd.Timestamp('2012-01-01', tz='US/Pacific'), + pd.Timestamp('2012-01-02', tz='US/Pacific')], + dtype=object) + + res = dti1.append(dti3) + # tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts3 = pd.Series(dti3) + res = dts1.append(dts3) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts3]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period(self): + # GH 13660 + pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M') + + exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01', + '2012-02'], freq='M') + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + ps2 = pd.Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, 
pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_diff_freq_to_object(self): + # GH 13221 + pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D') + + exp = pd.Index([pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M'), + pd.Period('2012-01-01', freq='D'), + pd.Period('2012-02-01', freq='D')], dtype=object) + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + ps2 = pd.Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_mixed_dt_to_object(self): + # GH 13221 + # different datetimelike + pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + tdi = pd.TimedeltaIndex(['1 days', '2 days']) + exp = pd.Index([pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M'), + pd.Timedelta('1 days'), + pd.Timedelta('2 days')], dtype=object) + + res = pi1.append(tdi) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + tds = pd.Series(tdi) + res = ps1.append(tds) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, tds]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + # inverse + exp = pd.Index([pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M')], dtype=object) + + res = tdi.append(pi1) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + tds = pd.Series(tdi) + res = tds.append(ps1) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([tds, ps1]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + class TestAppend(ConcatenateBase): def test_append(self): diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index cda2343fbb842..75c6db23b4bc7 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -895,7 +895,9 @@ def test_crosstab_margins(self): all_cols = result['All', ''] exp_cols = df.groupby(['a']).size().astype('i8') - exp_cols = exp_cols.append(Series([len(df)], index=['All'])) + # to keep index.name + exp_margin = Series([len(df)], index=Index(['All'], name='a')) + exp_cols = exp_cols.append(exp_margin) exp_cols.name = ('All', '') tm.assert_series_equal(all_cols, exp_cols) @@ -1084,7 +1086,6 @@ def test_crosstab_normalize(self): dtype='object'), columns=pd.Index([3, 4, 'All'], name='b')) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', margins=True), row_normal_margins) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index f0c6e334925c4..45e2a2d6c0720 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -26,6 +26,7 @@ from pandas.core.index import Index from pandas.indexes.base import _index_shared_docs from pandas.util.decorators import Appender, cache_readonly +import pandas.types.concat as _concat import pandas.tseries.frequencies as frequencies import pandas.algos as _algos @@ -795,6 +796,23 @@ def summary(self, name=None): result = result.replace("'", "") return result + def _append_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class + """ + attribs = self._get_attributes_dict() + attribs['name'] = name + + if not isinstance(self, ABCPeriodIndex): + # reset freq + 
attribs['freq'] = None + + if getattr(self, 'tz', None) is not None: + return _concat._concat_datetimetz(to_concat, name) + else: + new_data = np.concatenate([c.asi8 for c in to_concat]) + return self._simple_new(new_data, **attribs) + def _ensure_datetimelike_to_i8(other): """ helper for coercing an input scalar or array to i8 """ diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f78574521ffeb..ee0e88b993f55 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1008,36 +1008,6 @@ def union_many(self, others): this.offset = to_offset(this.inferred_freq) return this - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - name = self.name - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - to_concat, factory = _process_concat_data(to_concat, name) - - return factory(to_concat) - def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join @@ -2180,56 +2150,3 @@ def _use_cached_range(offset, _normalized, start, end): def _time_to_micros(time): seconds = time.hour * 60 * 60 + 60 * time.minute + time.second return 1000000 * seconds + time.microsecond - - -def _process_concat_data(to_concat, name): - klass = Index - kwargs = {} - concat = np.concatenate - - all_dti = True - need_utc_convert = False - has_naive = False - tz = None - - for x in to_concat: - if not isinstance(x, DatetimeIndex): - all_dti = False - else: - if tz is None: - tz = x.tz - - if x.tz is None: - has_naive = True - - if x.tz != tz: - need_utc_convert = True - tz = 'UTC' - - if all_dti: - need_obj_convert = False - if has_naive and tz is not None: - need_obj_convert = True - - if need_obj_convert: - to_concat = [x.asobject.values for x in to_concat] - - else: - if need_utc_convert: - to_concat = [x.tz_convert('UTC').values for x in to_concat] - else: - to_concat = [x.values for x in to_concat] - - # well, technically not a "class" anymore...oh well - klass = DatetimeIndex._simple_new - kwargs = {'tz': tz} - concat = _concat._concat_compat - else: - for i, x in enumerate(to_concat): - if isinstance(x, DatetimeIndex): - to_concat[i] = x.asobject.values - elif isinstance(x, Index): - to_concat[i] = x.values - - factory_func = lambda x: klass(concat(x), name=name, **kwargs) - return to_concat, factory_func diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 7fb0f19b04486..363f2419889d1 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -974,45 +974,6 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=None, values = np.array([formatter(dt) for dt in values]) return values - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - name = self.name - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - - if isinstance(to_concat[0], PeriodIndex): - if len(set([x.freq for x in 
to_concat])) > 1: - # box - to_concat = [x.asobject.values for x in to_concat] - else: - cat_values = np.concatenate([x._values for x in to_concat]) - return PeriodIndex(cat_values, freq=self.freq, name=name) - - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] - return Index(com._concat_compat(to_concat), name=name) - def __setstate__(self, state): """Necessary for making this object picklable""" diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index a17eda3ac4288..7c7cac83aef53 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -505,34 +505,6 @@ def union(self, other): result.freq = to_offset(result.inferred_freq) return result - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - name = self.name - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - return Index(_concat._concat_compat(to_concat), name=name) - def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 7ec0d09c20841..a7a015f273320 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -1296,27 +1296,59 @@ def test_append_aware(self): tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts1 = Series([1], index=rng1) + ts2 = Series([2], index=rng2) ts_result = ts1.append(ts2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='US/Eastern') + exp = Series([1, 2], index=exp_index) + self.assert_series_equal(ts_result, exp) self.assertEqual(ts_result.index.tz, rng1.tz) rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts1 = Series([1], index=rng1) + ts2 = Series([2], index=rng2) ts_result = ts1.append(ts2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='UTC') + exp = Series([1, 2], index=exp_index) + self.assert_series_equal(ts_result, exp) utc = rng1.tz self.assertEqual(utc, ts_result.index.tz) + # GH 7795 + # different tz coerces to object dtype, not UTC rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Central') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts1 = Series([1], index=rng1) + ts2 = Series([2], index=rng2) ts_result = ts1.append(ts2) - self.assertEqual(utc, ts_result.index.tz) + exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), + Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp = Series([1, 2], index=exp_index) + self.assert_series_equal(ts_result, exp) + + def test_append_dst(self): + rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + ts1 = Series([1, 
2, 3], index=rng1) + ts2 = Series([10, 11, 12], index=rng2) + ts_result = ts1.append(ts2) + + exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', + '2016-01-01 03:00', '2016-08-01 01:00', + '2016-08-01 02:00', '2016-08-01 03:00'], + tz='US/Eastern') + exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) + tm.assert_series_equal(ts_result, exp) + self.assertEqual(ts_result.index.tz, rng1.tz) def test_append_aware_naive(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') diff --git a/pandas/types/concat.py b/pandas/types/concat.py index a7fd692cfb9cf..29a0fe7d9f8d0 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -12,11 +12,14 @@ is_datetimetz, is_datetime64_dtype, is_timedelta64_dtype, + is_period_dtype, is_object_dtype, is_bool_dtype, is_dtype_equal, _NS_DTYPE, _TD_DTYPE) +from pandas.types.generic import (ABCDatetimeIndex, ABCTimedeltaIndex, + ABCPeriodIndex) def get_dtype_kinds(l): @@ -39,7 +42,9 @@ def get_dtype_kinds(l): elif is_sparse(arr): typ = 'sparse' elif is_datetimetz(arr): - typ = 'datetimetz' + # if to_concat contains different tz, + # the result must be object dtype + typ = str(arr.dtype) elif is_datetime64_dtype(dtype): typ = 'datetime' elif is_timedelta64_dtype(dtype): @@ -48,6 +53,8 @@ def get_dtype_kinds(l): typ = 'object' elif is_bool_dtype(dtype): typ = 'bool' + elif is_period_dtype(dtype): + typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) @@ -127,7 +134,10 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) # these are mandated to handle empties as well - if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs: + _contains_datetime = any(typ.startswith('datetime') for typ in typs) + _contains_period = any(typ.startswith('period') for typ in typs) + + if _contains_datetime or 'timedelta' in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) elif 'sparse' in typs: @@ -319,12 +329,13 @@ def convert_to_pydatetime(x, axis): x = x.asobject.values else: shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), + box=True) x = x.reshape(shape) elif x.dtype == _TD_DTYPE: shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) x = x.reshape(shape) if axis == 1: @@ -336,34 +347,71 @@ def convert_to_pydatetime(x, axis): # must be single dtype if len(typs) == 1: + _contains_datetime = any(typ.startswith('datetime') for typ in typs) + _contains_period = any(typ.startswith('period') for typ in typs) - if 'datetimetz' in typs: - # datetime with no tz should be stored as "datetime" in typs, - # thus no need to care - - # we require ALL of the same tz for datetimetz - tzs = set([str(x.tz) for x in to_concat]) - if len(tzs) == 1: - from pandas.tseries.index import DatetimeIndex - new_values = np.concatenate([x.tz_localize(None).asi8 - for x in to_concat]) - return DatetimeIndex(new_values, tz=list(tzs)[0]) + if _contains_datetime: - elif 'datetime' in typs: - new_values = np.concatenate([x.view(np.int64) for x in to_concat], - axis=axis) - return new_values.view(_NS_DTYPE) + if 'datetime' in typs: + new_values = np.concatenate([x.view(np.int64) for x in + to_concat], axis=axis) + return new_values.view(_NS_DTYPE) + else: + # when to_concat has different tz, len(typs) > 1. 
+ # thus no need to care + return _concat_datetimetz(to_concat) elif 'timedelta' in typs: new_values = np.concatenate([x.view(np.int64) for x in to_concat], axis=axis) return new_values.view(_TD_DTYPE) + elif _contains_period: + # PeriodIndex must be handled by PeriodIndex, + # Thus can't meet this condition ATM + # Must be changed when we adding PeriodDtype + raise NotImplementedError + # need to coerce to object to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] return np.concatenate(to_concat, axis=axis) +def _concat_datetimetz(to_concat, name=None): + """ + concat DatetimeIndex with the same tz + all inputs must be DatetimeIndex + it is used in DatetimeIndex.append also + """ + # do not pass tz to set because tzlocal cannot be hashed + if len(set([str(x.dtype) for x in to_concat])) != 1: + raise ValueError('to_concat must have the same tz') + tz = to_concat[0].tz + # no need to localize because internal repr will not be changed + new_values = np.concatenate([x.asi8 for x in to_concat]) + return to_concat[0]._simple_new(new_values, tz=tz, name=name) + + +def _concat_index_asobject(to_concat, name=None): + """ + concat all inputs as object. DatetimeIndex, TimedeltaIndex and + PeriodIndex are converted to object dtype before concatenation + """ + + klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex + to_concat = [x.asobject if isinstance(x, klasses) else x + for x in to_concat] + + from pandas import Index + self = to_concat[0] + attribs = self._get_attributes_dict() + attribs['name'] = name + + to_concat = [x._values if isinstance(x, Index) else x + for x in to_concat] + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) + + def _concat_sparse(to_concat, axis=0, typs=None): """ provide concatenation of an sparse/dense array of arrays each of which is a From e9c5c2d2c550a5f8ae47b0c4348fb359c93ab8f1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 3 Sep 2016 10:06:36 -0400 Subject: [PATCH 332/359] Revert "TST: fix blosc version (#14142)" This reverts commit 752ba9ae0eb8732c15caaea18ea86db04004d3b0. 
closes #14143 --- ci/requirements-2.7.pip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 44e1695bf1a7f..d16b932c8be4f 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -1,4 +1,4 @@ -blosc==1.4.1 +blosc httplib2 google-api-python-client==1.2 python-gflags==2.0 From 4488f18076226894c0f07be67ee700663e32623e Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 3 Sep 2016 11:29:16 -0400 Subject: [PATCH 333/359] BUG/CLN: datetimelike Index.equals may return True with non-Index closes #13107 Author: sinhrks Closes #13986 from sinhrks/dti_equals and squashes the following commits: 580151a [sinhrks] BUG/CLN: move .equals to DatetimeOpsMixin --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/indexes/base.py | 11 +++- pandas/indexes/category.py | 3 + pandas/indexes/multi.py | 4 ++ pandas/indexes/numeric.py | 15 ++--- pandas/tests/indexes/common.py | 14 +++++ pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_category.py | 37 +++++++---- pandas/tests/indexes/test_multi.py | 2 +- pandas/tests/indexes/test_numeric.py | 2 +- pandas/tests/indexes/test_range.py | 2 +- pandas/tseries/base.py | 30 ++++++++- pandas/tseries/index.py | 20 ------ pandas/tseries/period.py | 15 ----- pandas/tseries/tdi.py | 16 ----- pandas/tseries/tests/test_base.py | 83 +++++++++++++++++++++++++ pandas/tseries/tests/test_timedeltas.py | 2 +- pandas/tseries/tests/test_timeseries.py | 2 +- 18 files changed, 177 insertions(+), 84 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index ca5f3dfc2a8f2..a422e667e32a7 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1461,6 +1461,7 @@ Bug Fixes - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) - Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) - Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`) +- Bug in ``DatetimeIndex``, ``TimedeltaIndex`` and ``PeriodIndex.equals()`` may return ``True`` when input isn't ``Index`` but contains the same values (:issue:`13107`) - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index d6b6d01b1e444..dac0e650cb923 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1605,8 +1605,15 @@ def equals(self, other): if not isinstance(other, Index): return False - return array_equivalent(_values_from_object(self), - _values_from_object(other)) + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) + + try: + return array_equivalent(_values_from_object(self), + _values_from_object(other)) + except: + return False def identical(self, other): """Similar to equals, but check that other comparable attributes are diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 1666d8f7bc078..d4fc746c652ca 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -196,6 +196,9 @@ def equals(self, other): if self.is_(other): return True + if not isinstance(other, Index): + return False + try: other = self._is_dtype_compat(other) return array_equivalent(self._data, other) diff --git a/pandas/indexes/multi.py 
b/pandas/indexes/multi.py index 618bc319c3f74..f42410fcdf098 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1436,6 +1436,7 @@ def reindex(self, target, method=None, level=None, limit=None, return_indexers=True, keep_order=False) else: + target = _ensure_index(target) if self.equals(target): indexer = None else: @@ -1984,6 +1985,9 @@ def equals(self, other): if self.is_(other): return True + if not isinstance(other, Index): + return False + if not isinstance(other, MultiIndex): return array_equivalent(self._values, _values_from_object(_ensure_index(other))) diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index e1ac0939812f6..b9625f3aaff92 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -7,7 +7,7 @@ from pandas.types.common import (is_dtype_equal, pandas_dtype, is_float_dtype, is_object_dtype, is_integer_dtype, is_scalar) -from pandas.types.missing import array_equivalent, isnull +from pandas.types.missing import isnull from pandas.core.common import _values_from_object from pandas import compat @@ -160,16 +160,6 @@ def _convert_scalar_indexer(self, key, kind=None): return (super(Int64Index, self) ._convert_scalar_indexer(key, kind=kind)) - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True - - return array_equivalent(_values_from_object(self), - _values_from_object(other)) - def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None return Int64Index(joined, name=name) @@ -306,6 +296,9 @@ def equals(self, other): if self is other: return True + if not isinstance(other, Index): + return False + # need to compare nans locations and make sure that they are the same # since nans don't compare equal this is a bit tricky try: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 2c8031898c78e..773f20532e4ff 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -650,6 +650,20 @@ def test_delete_base(self): # either depending on numpy version result = idx.delete(len(idx)) + def test_equals(self): + + for name, idx in compat.iteritems(self.indices): + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.astype(object))) + + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(np.array(idx))) + + if idx.nlevels == 1: + # do not test MultiIndex + self.assertFalse(idx.equals(pd.Series(idx))) + def test_equals_op(self): # GH9947, GH10637 index_a = self.create_index() diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 66a5a155dd7a5..0ef7e6bf3be97 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -400,7 +400,7 @@ def test_astype(self): casted = self.intIndex.astype('i8') self.assertEqual(casted.name, 'foobar') - def test_equals(self): + def test_equals_object(self): # same self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index cb8452479f616..9f8405bcc2e1e 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -522,7 +522,7 @@ def test_ensure_copied_data(self): result = CategoricalIndex(index.values, copy=False) self.assertIs(_base(index.values), _base(result.values)) - def test_equals(self): + def test_equals_categorical(self): ci1 = CategoricalIndex(['a', 
'b'], categories=['a', 'b'], ordered=True) ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], @@ -556,19 +556,30 @@ def test_equals(self): # tests # make sure that we are testing for category inclusion properly - self.assertTrue(CategoricalIndex( - list('aabca'), categories=['c', 'a', 'b']).equals(list('aabca'))) + ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b']) + self.assertFalse(ci.equals(list('aabca'))) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) + self.assertTrue(ci.equals(ci.copy())) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + ci = CategoricalIndex(list('aabca'), + categories=['c', 'a', 'b', np.nan]) + self.assertFalse(ci.equals(list('aabca'))) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(CategoricalIndex( - list('aabca'), categories=['c', 'a', 'b', np.nan]).equals(list( - 'aabca'))) - - self.assertFalse(CategoricalIndex( - list('aabca') + [np.nan], categories=['c', 'a', 'b']).equals(list( - 'aabca'))) - self.assertTrue(CategoricalIndex( - list('aabca') + [np.nan], categories=['c', 'a', 'b']).equals(list( - 'aabca') + [np.nan])) + self.assertTrue(ci.equals(ci.copy())) + + ci = CategoricalIndex(list('aabca') + [np.nan], + categories=['c', 'a', 'b']) + self.assertFalse(ci.equals(list('aabca'))) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) + self.assertTrue(ci.equals(ci.copy())) + + ci = CategoricalIndex(list('aabca') + [np.nan], + categories=['c', 'a', 'b']) + self.assertFalse(ci.equals(list('aabca') + [np.nan])) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca') + [np.nan]))) + self.assertTrue(ci.equals(ci.copy())) def test_string_categorical_index_repr(self): # short diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index d49ac40631d37..25de6c5091853 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1266,7 +1266,7 @@ def test_to_hierarchical(self): def test_bounds(self): self.index._bounds - def test_equals(self): + def test_equals_multi(self): self.assertTrue(self.index.equals(self.index)) self.assertTrue(self.index.equal_levels(self.index)) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index f0af43e3513bb..d3a89b301ae46 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -265,7 +265,7 @@ def test_astype(self): i = Float64Index([0, 1.1, np.NAN]) self.assertRaises(ValueError, lambda: i.astype(dtype)) - def test_equals(self): + def test_equals_numeric(self): i = Float64Index([1.0, 2.0]) self.assertTrue(i.equals(i)) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 168ef7fc8d100..b0b8864521666 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -337,7 +337,7 @@ def test_is_monotonic(self): self.assertTrue(index.is_monotonic_increasing) self.assertTrue(index.is_monotonic_decreasing) - def test_equals(self): + def test_equals_range(self): equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), (RangeIndex(0), RangeIndex(1, -1, 3)), (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 45e2a2d6c0720..1690a9b229db2 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -11,7 +11,7 @@ import numpy as np from pandas.types.common import (is_integer, is_float, 
is_bool_dtype, _ensure_int64, - is_scalar, + is_scalar, is_dtype_equal, is_list_like) from pandas.types.generic import (ABCIndex, ABCSeries, ABCPeriodIndex, ABCIndexClass) @@ -109,6 +109,34 @@ def ceil(self, freq): class DatetimeIndexOpsMixin(object): """ common ops mixin to support a unified inteface datetimelike Index """ + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if not isinstance(other, ABCIndexClass): + return False + elif not isinstance(other, type(self)): + try: + other = type(self)(other) + except: + return False + + if not is_dtype_equal(self.dtype, other.dtype): + # have different timezone + return False + + # ToDo: Remove this when PeriodDtype is added + elif isinstance(self, ABCPeriodIndex): + if not isinstance(other, ABCPeriodIndex): + return False + if self.freq != other.freq: + return False + + return np.array_equal(self.asi8, other.asi8) + def __iter__(self): return (self._box_func(v) for v in self.asi8) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index ee0e88b993f55..351edf1b38352 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1625,26 +1625,6 @@ def is_normalized(self): def _resolution(self): return period.resolution(self.asi8, self.tz) - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True - - if (not hasattr(other, 'inferred_type') or - other.inferred_type != 'datetime64'): - if self.offset is not None: - return False - try: - other = DatetimeIndex(other) - except: - return False - - if self._has_same_tz(other): - return np.array_equal(self.asi8, other.asi8) - return False - def insert(self, loc, item): """ Make new Index inserting new item at location diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 363f2419889d1..d5d89c8dc2614 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -596,21 +596,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.asobject.values - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True - - if not isinstance(other, PeriodIndex): - try: - other = PeriodIndex(other) - except: - return False - - return np.array_equal(self.asi8, other.asi8) - def to_timestamp(self, freq=None, how='start'): """ Cast to DatetimeIndex diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 7c7cac83aef53..c527bbad555f9 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -806,22 +806,6 @@ def dtype(self): def is_all_dates(self): return True - def equals(self, other): - """ - Determines if two Index objects contain the same elements. 
- """ - if self.is_(other): - return True - - if (not hasattr(other, 'inferred_type') or - other.inferred_type != 'timedelta64'): - try: - other = TimedeltaIndex(other) - except: - return False - - return np.array_equal(self.asi8, other.asi8) - def insert(self, loc, item): """ Make new Index inserting new item at location diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index aa13591a4ff30..96ff74c819624 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -819,6 +819,37 @@ def test_nat(self): tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.int64)) + def test_equals(self): + # GH 13107 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], + tz='US/Pacific') + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + + # same internal, different tz + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + self.assertFalse(idx.equals(idx3)) + self.assertFalse(idx.equals(idx3.copy())) + self.assertFalse(idx.equals(idx3.asobject)) + self.assertFalse(idx.asobject.equals(idx3)) + self.assertFalse(idx.equals(list(idx3))) + self.assertFalse(idx.equals(pd.Series(idx3))) + class TestTimedeltaIndexOps(Ops): def setUp(self): @@ -1682,6 +1713,26 @@ def test_nat(self): tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.int64)) + def test_equals(self): + # GH 13107 + idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.asobject.equals(idx2.asobject)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + class TestPeriodIndexOps(Ops): def setUp(self): @@ -2646,6 +2697,38 @@ def test_nat(self): tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.int64)) + def test_equals(self): + # GH 13107 + for freq in ['D', 'M']: + idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], + freq=freq) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], + freq='H') + self.assertFalse(idx.equals(idx2)) + 
self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + + # same internal, different tz + idx3 = pd.PeriodIndex._simple_new(idx.asi8, freq='H') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + self.assertFalse(idx.equals(idx3)) + self.assertFalse(idx.equals(idx3.copy())) + self.assertFalse(idx.equals(idx3.asobject)) + self.assertFalse(idx.asobject.equals(idx3)) + self.assertFalse(idx.equals(list(idx3))) + self.assertFalse(idx.equals(pd.Series(idx3))) + if __name__ == '__main__': import nose diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 77e0216c5c79a..ab413af897215 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1523,7 +1523,7 @@ def test_misc_coverage(self): tm.assertIsInstance(list(result.values())[0][0], Timedelta) idx = TimedeltaIndex(['3d', '1d', '2d']) - self.assertTrue(idx.equals(list(idx))) + self.assertFalse(idx.equals(list(idx))) non_td = Index(list('abc')) self.assertFalse(idx.equals(list(non_td))) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 2355d663ed7d5..5ce0bdffe7ad4 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3032,7 +3032,7 @@ def test_misc_coverage(self): tm.assertIsInstance(list(result.values())[0][0], Timestamp) idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) - self.assertTrue(idx.equals(list(idx))) + self.assertFalse(idx.equals(list(idx))) non_datetime = Index(list('abc')) self.assertFalse(idx.equals(list(non_datetime))) From da7d4734e3851130af773c81b80c55151e65195e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 Sep 2016 05:21:01 -0500 Subject: [PATCH 334/359] DEPR: Change boxplot return_type kwarg (#12216) * DEPR: Change boxplot return_type kwarg Part of https://github.com/pydata/pandas/issues/6581 Deprecation started in https://github.com/pydata/pandas/pull/7096 Changes the default value of `return_type` in DataFrame.boxplot and DataFrame.plot.box from None to 'axes'. * API: Change faceted boxplot return_type Aligns behavior of `Groupby.boxplot` and DataFrame.boxplot(by=.) to return a Series. --- doc/source/visualization.rst | 35 +++++++-------- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/tests/plotting/common.py | 7 +-- pandas/tests/plotting/test_boxplot_method.py | 28 ++++++------ pandas/tests/plotting/test_frame.py | 5 ++- pandas/tools/plotting.py | 45 +++++++++++--------- pandas/util/testing.py | 12 +++--- 7 files changed, 73 insertions(+), 62 deletions(-) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 16ef76638ec5b..6e05c3ff0457a 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -456,28 +456,29 @@ columns: .. _visualization.box.return: -Basically, plot functions return :class:`matplotlib Axes ` as a return value. -In ``boxplot``, the return type can be changed by argument ``return_type``, and whether the subplots is enabled (``subplots=True`` in ``plot`` or ``by`` is specified in ``boxplot``). +.. warning:: -When ``subplots=False`` / ``by`` is ``None``: + The default changed from ``'dict'`` to ``'axes'`` in version 0.19.0. -* if ``return_type`` is ``'dict'``, a dictionary containing the :class:`matplotlib Lines ` is returned. 
The keys are "boxes", "caps", "fliers", "medians", and "whiskers". - This is the default of ``boxplot`` in historical reason. - Note that ``plot.box()`` returns ``Axes`` by default same as other plots. -* if ``return_type`` is ``'axes'``, a :class:`matplotlib Axes ` containing the boxplot is returned. -* if ``return_type`` is ``'both'`` a namedtuple containing the :class:`matplotlib Axes ` - and :class:`matplotlib Lines ` is returned +In ``boxplot``, the return type can be controlled by the ``return_type``, keyword. The valid choices are ``{"axes", "dict", "both", None}``. +Faceting, created by ``DataFrame.boxplot`` with the ``by`` +keyword, will affect the output type as well: -When ``subplots=True`` / ``by`` is some column of the DataFrame: +================ ======= ========================== +``return_type=`` Faceted Output type +---------------- ------- -------------------------- -* A dict of ``return_type`` is returned, where the keys are the columns - of the DataFrame. The plot has a facet for each column of - the DataFrame, with a separate box for each value of ``by``. +``None`` No axes +``None`` Yes 2-D ndarray of axes +``'axes'`` No axes +``'axes'`` Yes Series of axes +``'dict'`` No dict of artists +``'dict'`` Yes Series of dicts of artists +``'both'`` No namedtuple +``'both'`` Yes Series of namedtuples +================ ======= ========================== -Finally, when calling boxplot on a :class:`Groupby` object, a dict of ``return_type`` -is returned, where the keys are the same as the Groupby object. The plot has a -facet for each key, with each facet containing a box for each column of the -DataFrame. +``Groupby.boxplot`` always returns a Series of ``return_type``. .. ipython:: python :okwarning: diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a422e667e32a7..f02367a49d44d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -494,6 +494,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) +- Faceted boxplots from ``DataFrame.boxplot(by=col)`` now return a ``Series`` when ``return_type`` is not None. Previously these returned an ``OrderedDict``. Note that when ``return_type=None``, the default, these still return a 2-D NumPy array. (:issue:`12216`, :issue:`7096`) - More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) @@ -1282,9 +1283,9 @@ Removal of prior version deprecations/changes Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` +- The default value for the ``return_type`` parameter for ``DataFrame.plot.box`` and ``DataFrame.boxplot`` changed from ``None`` to ``"axes"``. These methods will now return a matplotlib axes by default instead of a dictionary of artists. 
See :ref:`here ` (:issue:`6581`). - The ``tquery`` and ``uquery`` functions in the ``pandas.io.sql`` module are removed (:issue:`5950`). - .. _whatsnew_0190.performance: Performance Improvements diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 7dcc3d6e5734f..9fe1d7cacd38f 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -5,8 +5,8 @@ import os import warnings -from pandas import DataFrame -from pandas.compat import zip, iteritems, OrderedDict +from pandas import DataFrame, Series +from pandas.compat import zip, iteritems from pandas.util.decorators import cache_readonly from pandas.types.api import is_list_like import pandas.util.testing as tm @@ -445,7 +445,8 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None, self.assertIsInstance(r, Axes) return - self.assertTrue(isinstance(returned, OrderedDict)) + self.assertTrue(isinstance(returned, Series)) + self.assertEqual(sorted(returned.keys()), sorted(expected_keys)) for key, value in iteritems(returned): self.assertTrue(isinstance(value, types[return_type])) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index d499540827ab0..333792c5ffdb2 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -92,6 +92,12 @@ def test_boxplot_legacy(self): lines = list(itertools.chain.from_iterable(d.values())) self.assertEqual(len(ax.get_lines()), len(lines)) + @slow + def test_boxplot_return_type_none(self): + # GH 12216; return_type=None & by=None -> axes + result = self.hist_df.boxplot() + self.assertTrue(isinstance(result, self.plt.Axes)) + @slow def test_boxplot_return_type_legacy(self): # API change in https://github.com/pydata/pandas/pull/7096 @@ -103,10 +109,8 @@ def test_boxplot_return_type_legacy(self): with tm.assertRaises(ValueError): df.boxplot(return_type='NOTATYPE') - with tm.assert_produces_warning(FutureWarning): - result = df.boxplot() - # change to Axes in future - self._check_box_return_type(result, 'dict') + result = df.boxplot() + self._check_box_return_type(result, 'axes') with tm.assert_produces_warning(False): result = df.boxplot(return_type='dict') @@ -140,6 +144,7 @@ def _check_ax_limits(col, ax): p = df.boxplot(['height', 'weight', 'age'], by='category') height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] dummy_ax = p[1, 1] + _check_ax_limits(df['height'], height_ax) _check_ax_limits(df['weight'], weight_ax) _check_ax_limits(df['age'], age_ax) @@ -163,8 +168,7 @@ def test_boxplot_legacy(self): grouped = self.hist_df.groupby(by='gender') with tm.assert_produces_warning(UserWarning): axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2)) - + self._check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2)) axes = _check_plot_works(grouped.boxplot, subplots=False, return_type='axes') self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @@ -175,7 +179,7 @@ def test_boxplot_legacy(self): grouped = df.groupby(level=1) with tm.assert_produces_warning(UserWarning): axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3)) + self._check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3)) axes = _check_plot_works(grouped.boxplot, subplots=False, return_type='axes') @@ -184,8 +188,7 @@ def test_boxplot_legacy(self): grouped = 
df.unstack(level=1).groupby(level=0, axis=1) with tm.assert_produces_warning(UserWarning): axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2)) - + self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2)) axes = _check_plot_works(grouped.boxplot, subplots=False, return_type='axes') self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @@ -226,8 +229,7 @@ def test_grouped_box_return_type(self): expected_keys=['height', 'weight', 'category']) # now for groupby - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.groupby('gender').boxplot() + result = df.groupby('gender').boxplot(return_type='dict') self._check_box_return_type( result, 'dict', expected_keys=['Male', 'Female']) @@ -347,7 +349,7 @@ def test_grouped_box_multiple_axes(self): with tm.assert_produces_warning(UserWarning): returned = df.boxplot(column=['height', 'weight', 'category'], by='gender', return_type='axes', ax=axes[0]) - returned = np.array(list(returned.values())) + returned = np.array(list(returned.values)) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) self.assert_numpy_array_equal(returned, axes[0]) self.assertIs(returned[0].figure, fig) @@ -357,7 +359,7 @@ def test_grouped_box_multiple_axes(self): returned = df.groupby('classroom').boxplot( column=['height', 'weight', 'category'], return_type='axes', ax=axes[1]) - returned = np.array(list(returned.values())) + returned = np.array(list(returned.values)) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) self.assert_numpy_array_equal(returned, axes[1]) self.assertIs(returned[0].figure, fig) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 91be0a7a73e35..4d0c1e9213b17 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1221,6 +1221,9 @@ def test_boxplot_return_type(self): result = df.plot.box(return_type='axes') self._check_box_return_type(result, 'axes') + result = df.plot.box() # default axes + self._check_box_return_type(result, 'axes') + result = df.plot.box(return_type='both') self._check_box_return_type(result, 'both') @@ -1230,7 +1233,7 @@ def test_boxplot_subplots_return_type(self): # normal style: return_type=None result = df.plot.box(subplots=True) - self.assertIsInstance(result, np.ndarray) + self.assertIsInstance(result, Series) self._check_box_return_type(result, None, expected_keys=[ 'height', 'weight', 'category']) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 1abd11017dbfe..7fd0b1044f9d7 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2247,7 +2247,7 @@ class BoxPlot(LinePlot): # namedtuple to hold results BP = namedtuple("Boxplot", ['ax', 'lines']) - def __init__(self, data, return_type=None, **kwargs): + def __init__(self, data, return_type='axes', **kwargs): # Do not call LinePlot.__init__ which may fill nan if return_type not in self._valid_return_types: raise ValueError( @@ -2266,7 +2266,7 @@ def _args_adjust(self): self.sharey = False @classmethod - def _plot(cls, ax, y, column_num=None, return_type=None, **kwds): + def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): if y.ndim == 2: y = [remove_na(v) for v in y] # Boxplot fails with empty arrays, so need to add a NaN @@ -2339,7 +2339,7 @@ def maybe_color_bp(self, bp): def _make_plot(self): if self.subplots: - self._return_obj = compat.OrderedDict() + self._return_obj = Series() for i, (label, y) in 
enumerate(self._iter_data()): ax = self._get_ax(i) @@ -2691,14 +2691,17 @@ def plot_series(data, kind='line', ax=None, # Series unique grid : Setting this to True will show the grid layout : tuple (optional) (rows, columns) for the layout of the plot - return_type : {'axes', 'dict', 'both'}, default 'dict' - The kind of object to return. 'dict' returns a dictionary - whose values are the matplotlib Lines of the boxplot; + return_type : {None, 'axes', 'dict', 'both'}, default None + The kind of object to return. The default is ``axes`` 'axes' returns the matplotlib axes the boxplot is drawn on; + 'dict' returns a dictionary whose values are the matplotlib + Lines of the boxplot; 'both' returns a namedtuple with the axes and dict. - When grouping with ``by``, a dict mapping columns to ``return_type`` - is returned. + When grouping with ``by``, a Series mapping columns to ``return_type`` + is returned, unless ``return_type`` is None, in which case a NumPy + array of axes is returned with the same shape as ``layout``. + See the prose documentation for more. kwds : other plotting keyword arguments to be passed to matplotlib boxplot function @@ -2724,7 +2727,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, # validate return_type: if return_type not in BoxPlot._valid_return_types: - raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + raise ValueError("return_type must be {'axes', 'dict', 'both'}") from pandas import Series, DataFrame if isinstance(data, Series): @@ -2769,23 +2772,19 @@ def plot_group(keys, values, ax): columns = [column] if by is not None: + # Prefer array return type for 2-D plots to match the subplot layout + # https://github.com/pydata/pandas/pull/12216#issuecomment-241175580 result = _grouped_plot_by_column(plot_group, data, columns=columns, by=by, grid=grid, figsize=figsize, ax=ax, layout=layout, return_type=return_type) else: + if return_type is None: + return_type = 'axes' if layout is not None: raise ValueError("The 'layout' keyword is not supported when " "'by' is None") - if return_type is None: - msg = ("\nThe default value for 'return_type' will change to " - "'axes' in a future release.\n To use the future behavior " - "now, set return_type='axes'.\n To keep the previous " - "behavior and silence this warning, set " - "return_type='dict'.") - warnings.warn(msg, FutureWarning, stacklevel=3) - return_type = 'dict' if ax is None: ax = _gca() data = data._get_numeric_data() @@ -3104,12 +3103,12 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, figsize=figsize, layout=layout) axes = _flatten(axes) - ret = compat.OrderedDict() + ret = Series() for (key, group), ax in zip(grouped, axes): d = group.boxplot(ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds) ax.set_title(pprint_thing(key)) - ret[key] = d + ret.loc[key] = d fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: @@ -3175,7 +3174,9 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, _axes = _flatten(axes) - result = compat.OrderedDict() + result = Series() + ax_values = [] + for i, col in enumerate(columns): ax = _axes[i] gp_col = grouped[col] @@ -3183,9 +3184,11 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, re_plotf = plotf(keys, values, ax, **kwargs) ax.set_title(col) ax.set_xlabel(pprint_thing(by)) - result[col] = re_plotf + ax_values.append(re_plotf) ax.grid(grid) + result = Series(ax_values, index=columns) + # Return axes in multiplot case, maybe 
revisit later # 985 if return_type is None: result = axes diff --git a/pandas/util/testing.py b/pandas/util/testing.py index f5a93d1f17d00..57bb01e5e0406 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -880,12 +880,12 @@ def assert_attr_equal(attr, left, right, obj='Attributes'): def assert_is_valid_plot_return_object(objs): import matplotlib.pyplot as plt - if isinstance(objs, np.ndarray): - for el in objs.flat: - assert isinstance(el, plt.Axes), ('one of \'objs\' is not a ' - 'matplotlib Axes instance, ' - 'type encountered {0!r}' - ''.format(el.__class__.__name__)) + if isinstance(objs, (pd.Series, np.ndarray)): + for el in objs.ravel(): + msg = ('one of \'objs\' is not a matplotlib Axes instance, ' + 'type encountered {0!r}') + assert isinstance(el, (plt.Axes, dict)), msg.format( + el.__class__.__name__) else: assert isinstance(objs, (plt.Artist, tuple, dict)), \ ('objs is neither an ndarray of Artist instances nor a ' From 900ae6bfd4a940f7a165ba2635eb590952f0ce84 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 4 Sep 2016 12:45:49 -0400 Subject: [PATCH 335/359] DOC: typo/corrections in whatsnew --- doc/source/whatsnew/v0.19.0.txt | 85 ++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f02367a49d44d..1c12a145caf72 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -16,7 +16,7 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` -- ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See ref:`here ` +- ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here ` - Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` .. contents:: What's new in v0.19.0 @@ -228,8 +228,8 @@ Previous behaviour: 0 2 1 2 1 5 4 5 -The first 'a' column contains the same data as the second 'a' column, when it should have -contained the array ``[0, 3]``. +The first ``a`` column contains the same data as the second ``a`` column, when it should have +contained the values ``[0, 3]``. New behaviour: @@ -319,7 +319,7 @@ Using the anchoring suffix, you can also specify the day of month to use instead New Index methods ^^^^^^^^^^^^^^^^^ -Following methods and options are added to ``Index`` to be more consistent with ``Series`` and ``DataFrame``. +The following methods and options are added to ``Index``, to be more consistent with the ``Series`` and ``DataFrame`` API. ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) @@ -329,7 +329,7 @@ Following methods and options are added to ``Index`` to be more consistent with idx.where([True, False, True]) -``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`) +``Index`` now supports ``.dropna()`` to exclude missing values (:issue:`6194`) .. ipython:: python @@ -347,7 +347,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. 
Speci midx.dropna() midx.dropna(how='all') -``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here ` (:issue:`10008`, :issue:`13156`) +``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see the :ref:`docs here ` (:issue:`10008`, :issue:`13156`) .. ipython:: python @@ -360,7 +360,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). +The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). .. _whatsnew_0190.errstate: @@ -376,7 +376,7 @@ After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from get_dummies dtypes ^^^^^^^^^^^^^^^^^^ -The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`) +The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`). This should provide an improved memory footprint. Previous behaviour: @@ -402,7 +402,7 @@ New Behavior: Other enhancements ^^^^^^^^^^^^^^^^^^ -- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch [the application default credentials](https://developers.google.com/identity/protocols/application-default-credentials). See the :ref:`docs ` for more details (:issue:`13577`). +- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) @@ -448,7 +448,7 @@ Other enhancements - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846`) +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, :issue:`13846`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). 
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) @@ -464,6 +464,7 @@ Other enhancements df = pd.DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, index=['row1', 'row2']) + df df.sort_values(by='row2', axis=1) - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) @@ -478,32 +479,32 @@ API changes ~~~~~~~~~~~ -- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds (:issue:`14101`) -- ``Panel.to_sparse`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) -- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) -- ``pd.read_csv()``, ``pd.read_table()``, and ``pd.read_hdf()`` raise the builtin ``FileNotFoundError`` exception for Python 3.x when called on a nonexistent file, and this is back-ported as IOError in Python 2.x (:issue:`14086`) +- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout. (:issue:`14101`) - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) +- ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) +- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) +- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) +- ``Panel.to_sparse()`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) +- ``Index.reshape()`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) - ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) - ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) - ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) -- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) -- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) -- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. 
Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) +- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - Faceted boxplots from ``DataFrame.boxplot(by=col)`` now return a ``Series`` when ``return_type`` is not None. Previously these returned an ``OrderedDict``. Note that when ``return_type=None``, the default, these still return a 2-D NumPy array. (:issue:`12216`, :issue:`7096`) -- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) -- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) -- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``pd.read_csv()``, ``pd.read_table()``, and ``pd.read_hdf()`` raise the builtin ``FileNotFoundError`` exception for Python 3.x when called on a nonexistent file; this is back-ported as ``IOError`` in Python 2.x (:issue:`14086`) +- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) - ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) -- ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) .. _whatsnew_0190.api.tolist: @@ -545,8 +546,8 @@ including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`) - ``Series`` logical operators align both ``index``. .. warning:: - Until 0.18.1, comparing ``Series`` with the same length has been succeeded even if - these ``index`` are different (the result ignores ``index``). As of 0.19.0, it raises ``ValueError`` to be more strict. This section also describes how to keep previous behaviour or align different indexes using flexible comparison methods like ``.eq``. + Until 0.18.1, comparing ``Series`` with the same length, would succeed even if + the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behaviour or align different indexes, using the flexible comparison methods like ``.eq``. As a result, ``Series`` and ``DataFrame`` operators behave as below: @@ -569,11 +570,11 @@ Arithmetic operators align both ``index`` (no changes). 
Comparison operators """""""""""""""""""" -Comparison operators raise ``ValueError`` when ``index`` are different. +Comparison operators raise ``ValueError`` when ``.index`` are different. Previous Behavior (``Series``): -``Series`` compares values ignoring ``index`` as long as both lengthes are the same. +``Series`` compares values ignoring ``.index`` as long as both lengthes are the same. .. code-block:: ipython @@ -593,13 +594,13 @@ New Behavior (``Series``): ValueError: Can only compare identically-labeled Series objects .. note:: - To achieve the same result as previous versions (compare values based on locations ignoring ``index``), compare both ``.values``. + To achieve the same result as previous versions (compare values based on locations ignoring ``.index``), compare both ``.values``. .. ipython:: python s1.values == s2.values - If you want to compare ``Series`` aligning its ``index``, see flexible comparison methods section below. + If you want to compare ``Series`` aligning its ``.index``, see flexible comparison methods section below. Current Behavior (``DataFrame``, no change): @@ -612,11 +613,9 @@ Current Behavior (``DataFrame``, no change): Logical operators """"""""""""""""" -Logical operators align both ``index``. +Logical operators align both ``.index``. -Previous Behavior (``Series``): - -Only left hand side ``index`` is kept. +Previous Behavior (``Series``), only left hand side ``index`` is kept: .. code-block:: ipython @@ -638,10 +637,10 @@ New Behavior (``Series``): s1 & s2 .. note:: - ``Series`` logical operators fill ``NaN`` result with ``False``. + ``Series`` logical operators fill a ``NaN`` result with ``False``. .. note:: - To achieve the same result as previous versions (compare values based on locations ignoring ``index``), compare both ``.values``. + To achieve the same result as previous versions (compare values based on locations ignoring ``.index``), compare both ``.values``. .. ipython:: python @@ -668,7 +667,7 @@ which has the different ``index``. s1.eq(s2) s1.ge(s2) -Previously, it worked as the same as comparison operators (see above). +Previously, this worked the same as comparison operators (see above). .. _whatsnew_0190.api.promote: @@ -720,10 +719,12 @@ This will now convert integers/floats with the default unit of ``ns``. pd.to_datetime([1, 'foo'], errors='coerce') +Bug fixes related to ``.to_datetime()``: + - Bug in ``pd.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) - Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) -- Bug in ``pd.to_datetime()`` raise ``AttributeError`` with NaN and the other string is not valid when errors='ignore' (:issue:`12424`) +- Bug in ``pd.to_datetime()`` raise ``AttributeError`` with ``NaN`` and the other string is not valid when ``errors='ignore'`` (:issue:`12424`) - Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`) .. _whatsnew_0190.api.merging: @@ -910,11 +911,16 @@ New Behavior: ``.values`` is changed to return array of ``Period`` object, rather than array of ``int64`` (:issue:`13988`) +Previous Behavior: + .. code-block:: ipython + In [6]: pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') In [7]: pi.values array([492, 493]) +New Behavior: + .. 
ipython:: python pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') @@ -966,6 +972,7 @@ Previous Behavior: In [1]: pd.Index([1, 2, 3]).unique() Out[1]: array([1, 2, 3]) + In [2]: pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() Out[2]: DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00', '2011-01-03 00:00:00+09:00'], @@ -1175,7 +1182,7 @@ Other sparse fixes - Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) -- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) +- Bug in single row slicing on multi-type ``SparseDataFrame`` s, types were previously forced to float (:issue:`13917`) - Bug in ``SparseSeries`` slicing changes integer dtype to float (:issue:`8292`) - Bug in ``SparseDataFarme`` comparison ops may raise ``TypeError`` (:issue:`13001`) - Bug in ``SparseDataFarme.isnull`` raises ``ValueError`` (:issue:`8276`) @@ -1316,7 +1323,7 @@ Bug Fixes - Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) -- Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (issue:`9161`, :issue:`13544`) +- Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (:issue:`9161`, :issue:`13544`) - Bug in ``DataFrame`` assignment with an object-dtyped ``Index`` where the resultant column is mutable to the original object. (:issue:`13522`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) @@ -1436,7 +1443,7 @@ Bug Fixes - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) - Bug in ``Index`` may ignore specified ``datetime64`` or ``timedelta64`` passed as ``dtype`` (:issue:`13981`) - Bug in ``RangeIndex`` can be created without no arguments rather than raises ``TypeError`` (:issue:`13793`) -- Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) +- Bug in ``.value_counts()`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) - Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`) - Bug in ``resample`` with timedelta data where data was casted to float (:issue:`13119`). 
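Before the remaining bug-fix hunks, a short illustration of the flexible comparison methods (``.eq`` and friends) that the operator-alignment discussion above points readers to. This is only a hedged sketch; the Series names and values here are invented for the example:

.. code-block:: python

   import pandas as pd

   s1 = pd.Series([1, 2, 3], index=['A', 'B', 'C'])
   s2 = pd.Series([2, 2, 2], index=['A', 'B', 'D'])

   # unlike ``==``, which now raises ValueError for differently-labeled
   # Series, the flexible methods align on the union of the indexes and
   # treat labels missing on one side as unequal
   s1.eq(s2)   # A False, B True, C False, D False
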
@@ -1469,5 +1476,5 @@ Bug Fixes - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. -- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue`14095`) +- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`) - Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`) From 33f9c870a01e37c417f0359259df41ff1f3594c8 Mon Sep 17 00:00:00 2001 From: c123w Date: Mon, 5 Sep 2016 15:16:31 +0100 Subject: [PATCH 336/359] Fix typo (change 'n' to 'k' in get_dummies documentation). (#14153) --- pandas/core/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 4dec8b4106126..fa5d16bd85e98 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -984,7 +984,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, .. versionadded:: 0.16.1 drop_first : bool, default False - Whether to get k-1 dummies out of n categorical levels by removing the + Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 From f7506c613fd54304b583fa0dc1498273be2989ba Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 5 Sep 2016 18:02:41 -0400 Subject: [PATCH 337/359] DOC: issue typo in v0.19.0 --- doc/source/whatsnew/v0.19.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 1c12a145caf72..7f471904acf30 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -725,7 +725,7 @@ Bug fixes related to ``.to_datetime()``: - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) - Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) - Bug in ``pd.to_datetime()`` raise ``AttributeError`` with ``NaN`` and the other string is not valid when ``errors='ignore'`` (:issue:`12424`) -- Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`) +- Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13834`) .. _whatsnew_0190.api.merging: From 3110a72b5388f60de2cd287359e020f7c135b5e5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 5 Sep 2016 18:19:53 -0400 Subject: [PATCH 338/359] BLD: add in build conflict resolution to appeveyor.yml --- appveyor.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 503e154e2b8f9..84c34b34626b9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -49,6 +49,12 @@ init: - "ECHO %PYTHON_VERSION% %PYTHON%" install: + # cancel older builds for the same PR + - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` + https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` + Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` + throw "There are newer queued builds for this pull request, failing early." 
} + # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit) # updates conda & installs: conda-build jinja2 anaconda-client - powershell .\ci\install.ps1 @@ -69,6 +75,7 @@ install: # https://github.com/conda/conda-build/issues/1001 # disabling 3.4 as windows complains upon compiling byte # code + - cmd: conda install conda-build=1.21.7 - cmd: conda config --set ssl_verify false From 1a8273cfa9d835fcc377a020c16bc6d55a9ad98b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Sep 2016 06:04:02 -0400 Subject: [PATCH 339/359] TST: skipping xref #14120, locale separator in parser tests of unsupported engines --- pandas/io/tests/parser/test_unsupported.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index e575843a7fc22..0bfb8b17349cf 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -60,8 +60,10 @@ def test_c_engine(self): sep=None, delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', sep='\s') - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), engine='c', sep='§') + + # GH 14120, skipping as failing when locale is set + # with tm.assertRaisesRegexp(ValueError, msg): + # read_table(StringIO(data), engine='c', sep='§') with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', skipfooter=1) From e54d4dbd96db243d041ce14f305c538fd0eb7104 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 6 Sep 2016 06:06:17 -0400 Subject: [PATCH 340/359] MAINT: flake8 *.pyx files closes #12995 flake8-ed *.pyx files and fixed errors. Removed the E226 check because that inhibits pointers (e.g. char*). Author: gfyoung Closes #14147 from gfyoung/pyx-flake8 and squashes the following commits: 386ed58 [gfyoung] MAINT: flake8 *.pyx files --- ci/lint.sh | 10 +- pandas/algos.pyx | 44 +- pandas/hashtable.pyx | 4 +- pandas/index.pyx | 37 +- pandas/io/sas/saslib.pyx | 87 ++-- pandas/lib.pyx | 111 +++-- pandas/msgpack/_packer.pyx | 61 +-- pandas/msgpack/_unpacker.pyx | 77 +-- pandas/parser.pyx | 240 +++++----- pandas/src/inference.pyx | 99 ++-- pandas/src/offsets.pyx | 6 +- pandas/src/period.pyx | 82 ++-- pandas/src/reduce.pyx | 82 ++-- pandas/src/skiplist.pyx | 1 - pandas/src/sparse.pyx | 25 +- pandas/src/testing.pyx | 30 +- pandas/tslib.pyx | 892 +++++++++++++++++++++-------------- 17 files changed, 1114 insertions(+), 774 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 61d74ae28377e..a866b04445f96 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -20,15 +20,7 @@ if [ "$LINT" ]; then echo "Linting *.py DONE" echo "Linting *.pyx" - for path in 'window.pyx' "src/join.pyx" - do - echo "linting -> pandas/$path" - flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126 - if [ $? 
-ne "0" ]; then - RET=1 - fi - - done + flake8 pandas --filename '*.pyx' --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126 echo "Linting *.pyx DONE" echo "Linting *.pxi.in" diff --git a/pandas/algos.pyx b/pandas/algos.pyx index d3e68ad2a5eee..de5c5fc661d4d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -59,11 +59,11 @@ cdef: int TIEBREAK_DENSE = 5 tiebreakers = { - 'average' : TIEBREAK_AVERAGE, - 'min' : TIEBREAK_MIN, - 'max' : TIEBREAK_MAX, - 'first' : TIEBREAK_FIRST, - 'dense' : TIEBREAK_DENSE, + 'average': TIEBREAK_AVERAGE, + 'min': TIEBREAK_MIN, + 'max': TIEBREAK_MAX, + 'first': TIEBREAK_FIRST, + 'dense': TIEBREAK_DENSE, } @@ -489,7 +489,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', bint keep_na = 0 float count = 0.0 - tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -578,6 +577,7 @@ class Infinity(object): __gt__ = lambda self, other: self is not other __ge__ = lambda self, other: True + class NegInfinity(object): """ provide a negative Infinity comparision method for ranking """ @@ -705,7 +705,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', # return result - cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1: cdef numeric t @@ -747,11 +746,11 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k): cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): cdef: - Py_ssize_t i,j,l,m + Py_ssize_t i, j, l, m double_t x, t l = 0 - m = n-1 + m = n -1 while (l malloc(nlevels * sizeof(int64_t*)) for i from 0 <= i < nlevels: - # vecs[i] = ( list_of_arrays[i]).data - arr = list_of_arrays[i] - vecs[i] = arr.data - # assume uniqueness?? + vecs[i] = arr.data + # Assume uniqueness?? for i from 1 <= i < n: for k from 0 <= k < nlevels: cur = vecs[k][i] - pre = vecs[k][i-1] + pre = vecs[k][i -1] if cur == pre: continue elif cur > pre: @@ -988,7 +989,8 @@ def is_lexsorted(list list_of_arrays): @cython.boundscheck(False) -def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts): +def groupby_indices(dict ids, ndarray[int64_t] labels, + ndarray[int64_t] counts): """ turn group_labels output into a combined indexer maping the labels to indexers @@ -1020,7 +1022,7 @@ def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts): for i from 0 <= i < len(counts): arr = np.empty(counts[i], dtype=np.int64) result[ids[i]] = arr - vecs[i] = arr.data + vecs[i] = arr.data for i from 0 <= i < n: k = labels[i] @@ -1036,6 +1038,7 @@ def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts): free(vecs) return result + @cython.wraparound(False) @cython.boundscheck(False) def group_labels(ndarray[object] values): @@ -1116,6 +1119,7 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): #---------------------------------------------------------------------- # first, nth, last + @cython.boundscheck(False) @cython.wraparound(False) def group_nth_object(ndarray[object, ndim=2] out, @@ -1160,6 +1164,7 @@ def group_nth_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_nth_bin_object(ndarray[object, ndim=2] out, @@ -1210,6 +1215,7 @@ def group_nth_bin_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_last_object(ndarray[object, ndim=2] out, @@ -1252,6 +1258,7 @@ def group_last_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) 
@cython.wraparound(False) def group_last_bin_object(ndarray[object, ndim=2] out, @@ -1326,7 +1333,6 @@ cdef inline float64_t _median_linear(float64_t* a, int n): a = tmp n -= na_count - if n % 2: result = kth_smallest_c( a, n / 2, n) else: diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index af694c276b5b7..3bda3f49cb054 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -192,7 +192,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): kh_destroy_pymap(table) - return modes[:j+1] + return modes[:j + 1] @cython.wraparound(False) @@ -227,7 +227,7 @@ def mode_int64(int64_t[:] values): kh_destroy_int64(table) - return modes[:j+1] + return modes[:j + 1] @cython.wraparound(False) diff --git a/pandas/index.pyx b/pandas/index.pyx index bc985100692fc..2935560a05b6b 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -54,7 +54,8 @@ cdef inline is_definitely_invalid_key(object val): # we have a _data, means we are a NDFrame return (PySlice_Check(val) or cnp.PyArray_Check(val) - or PyList_Check(val) or hasattr(val,'_data')) + or PyList_Check(val) or hasattr(val, '_data')) + def get_value_at(ndarray arr, object loc): if arr.descr.type_num == NPY_DATETIME: @@ -63,6 +64,7 @@ def get_value_at(ndarray arr, object loc): return Timedelta(util.get_value_at(arr, loc)) return util.get_value_at(arr, loc) + def set_value_at(ndarray arr, object loc, object val): return util.set_value_at(arr, loc, val) @@ -302,7 +304,7 @@ cdef class IndexEngine: else: n_alloc = n - result = np.empty(n_alloc, dtype=np.int64) + result = np.empty(n_alloc, dtype=np.int64) missing = np.empty(n_t, dtype=np.int64) # form the set of the results (like ismember) @@ -311,7 +313,7 @@ cdef class IndexEngine: val = util.get_value_1d(values, i) if val in stargets: if val not in d: - d[val] = [] + d[val] = [] d[val].append(i) for i in range(n_t): @@ -322,20 +324,20 @@ cdef class IndexEngine: if val in d: for j in d[val]: - # realloc if needed - if count >= n_alloc: - n_alloc += 10000 - result = np.resize(result, n_alloc) + # realloc if needed + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) - result[count] = j - count += 1 + result[count] = j + count += 1 # value not found else: if count >= n_alloc: - n_alloc += 10000 - result = np.resize(result, n_alloc) + n_alloc += 10000 + result = np.resize(result, n_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -479,9 +481,9 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: return mid + 1 _pad_functions = { - 'object' : algos.pad_object, - 'int64' : algos.pad_int64, - 'float64' : algos.pad_float64 + 'object': algos.pad_object, + 'int64': algos.pad_int64, + 'float64': algos.pad_float64 } _backfill_functions = { @@ -606,7 +608,7 @@ cdef class TimedeltaEngine(DatetimeEngine): cpdef convert_scalar(ndarray arr, object value): if arr.descr.type_num == NPY_DATETIME: - if isinstance(value,np.ndarray): + if isinstance(value, np.ndarray): pass elif isinstance(value, Timestamp): return value.value @@ -615,7 +617,7 @@ cpdef convert_scalar(ndarray arr, object value): else: return Timestamp(value).value elif arr.descr.type_num == NPY_TIMEDELTA: - if isinstance(value,np.ndarray): + if isinstance(value, np.ndarray): pass elif isinstance(value, Timedelta): return value.value @@ -639,7 +641,8 @@ cdef inline _to_i8(object val): return get_datetime64_value(val) elif PyDateTime_Check(val): tzinfo = getattr(val, 'tzinfo', None) - ival = _pydatetime_to_dts(val, &dts) # Save the original date value so we can 
get the utcoffset from it. + # Save the original date value so we can get the utcoffset from it. + ival = _pydatetime_to_dts(val, &dts) if tzinfo is not None and not _is_utc(tzinfo): offset = tslib._get_utcoffset(tzinfo, val) ival -= tslib._delta_to_nanoseconds(offset) diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx index ac73ae37ca70e..a66d62ea41581 100644 --- a/pandas/io/sas/saslib.pyx +++ b/pandas/io/sas/saslib.pyx @@ -10,12 +10,14 @@ import sas_constants as const # algorithm. It is partially documented here: # # https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf -cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): +cdef np.ndarray[uint8_t, ndim=1] rle_decompress( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff): cdef: uint8_t control_byte, x uint8_t [:] result = np.zeros(result_length, np.uint8) - int rpos = 0, ipos = 0, i, nbytes, end_of_first_byte, length = len(inbuff) + int rpos = 0, ipos = 0, length = len(inbuff) + int i, nbytes, end_of_first_byte while ipos < length: control_byte = inbuff[ipos] & 0xF0 @@ -41,13 +43,13 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui rpos += 1 ipos += 1 elif control_byte == 0x60: - nbytes = end_of_first_byte*256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 for i in range(nbytes): result[rpos] = 0x20 rpos += 1 elif control_byte == 0x70: - nbytes = end_of_first_byte*256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 for i in range(nbytes): result[rpos] = 0x00 @@ -109,8 +111,9 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui # rdc_decompress decompresses data using the Ross Data Compression algorithm: # -# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm -cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): +# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +cdef np.ndarray[uint8_t, ndim=1] rdc_decompress( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff): cdef: uint8_t cmd @@ -124,7 +127,8 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui ii += 1 ctrl_mask = ctrl_mask >> 1 if ctrl_mask == 0: - ctrl_bits = (inbuff[ipos] << 8) + inbuff[ipos + 1] + ctrl_bits = ((inbuff[ipos] << 8) + + inbuff[ipos + 1]) ipos += 2 ctrl_mask = 0x8000 @@ -219,7 +223,8 @@ cdef class Parser(object): int subheader_pointer_length int current_page_type bint is_little_endian - np.ndarray[uint8_t, ndim=1] (*decompress)(int result_length, np.ndarray[uint8_t, ndim=1] inbuff) + np.ndarray[uint8_t, ndim=1] (*decompress)( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff) object parser def __init__(self, object parser): @@ -252,7 +257,8 @@ cdef class Parser(object): elif column_types[j] == b's': self.column_types[j] = column_type_string else: - raise ValueError("unknown column type: %s" % self.parser.columns[j].ctype) + raise ValueError("unknown column type: " + "%s" % self.parser.columns[j].ctype) # compression if parser.compression == const.rle_compression: @@ -279,7 +285,8 @@ cdef class Parser(object): # update the parser self.parser._current_row_on_page_index = self.current_row_on_page_index - self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index + self.parser._current_row_in_chunk_index =\ + 
self.current_row_in_chunk_index self.parser._current_row_in_file_index = self.current_row_in_file_index cdef bint read_next_page(self): @@ -299,13 +306,16 @@ cdef class Parser(object): self.current_row_on_page_index = 0 self.current_page_type = self.parser._current_page_type self.current_page_block_count = self.parser._current_page_block_count - self.current_page_data_subheader_pointers_len = len(self.parser._current_page_data_subheader_pointers) - self.current_page_subheaders_count = self.parser._current_page_subheaders_count + self.current_page_data_subheader_pointers_len = len( + self.parser._current_page_data_subheader_pointers) + self.current_page_subheaders_count =\ + self.parser._current_page_subheaders_count cdef bint readline(self): cdef: - int offset, bit_offset, align_correction, subheader_pointer_length, mn + int offset, bit_offset, align_correction + int subheader_pointer_length, mn bint done, flag bit_offset = self.bit_offset @@ -321,7 +331,8 @@ cdef class Parser(object): # Loop until a data row is read while True: if self.current_page_type == page_meta_type: - flag = self.current_row_on_page_index >= self.current_page_data_subheader_pointers_len + flag = self.current_row_on_page_index >=\ + self.current_page_data_subheader_pointers_len if flag: done = self.read_next_page() if done: @@ -330,10 +341,12 @@ cdef class Parser(object): current_subheader_pointer = ( self.parser._current_page_data_subheader_pointers[ self.current_row_on_page_index]) - self.process_byte_array_with_data(current_subheader_pointer.offset, - current_subheader_pointer.length) + self.process_byte_array_with_data( + current_subheader_pointer.offset, + current_subheader_pointer.length) return False - elif self.current_page_type == page_mix_types_0 or self.current_page_type == page_mix_types_1: + elif (self.current_page_type == page_mix_types_0 or + self.current_page_type == page_mix_types_1): align_correction = (bit_offset + subheader_pointers_offset + self.current_page_subheaders_count * subheader_pointer_length) @@ -345,18 +358,18 @@ cdef class Parser(object): offset += self.current_row_on_page_index * self.row_length self.process_byte_array_with_data(offset, self.row_length) - mn = min(self.parser.row_count, self.parser._mix_page_row_count) + mn = min(self.parser.row_count, + self.parser._mix_page_row_count) if self.current_row_on_page_index == mn: done = self.read_next_page() if done: return True return False elif self.current_page_type == page_data_type: - self.process_byte_array_with_data(bit_offset + - subheader_pointers_offset + - self.current_row_on_page_index * - self.row_length, - self.row_length) + self.process_byte_array_with_data( + bit_offset + subheader_pointers_offset + + self.current_row_on_page_index * self.row_length, + self.row_length) flag = (self.current_row_on_page_index == self.current_page_block_count) if flag: @@ -371,17 +384,18 @@ cdef class Parser(object): cdef void process_byte_array_with_data(self, int offset, int length): cdef: - Py_ssize_t j - int s, k, m, jb, js, current_row - int64_t lngt, start, ct - np.ndarray[uint8_t, ndim=1] source - int64_t[:] column_types - int64_t[:] lengths - int64_t[:] offsets - uint8_t[:, :] byte_chunk - object[:, :] string_chunk - - source = np.frombuffer(self.cached_page[offset:offset+length], dtype=np.uint8) + Py_ssize_t j + int s, k, m, jb, js, current_row + int64_t lngt, start, ct + np.ndarray[uint8_t, ndim=1] source + int64_t[:] column_types + int64_t[:] lengths + int64_t[:] offsets + uint8_t[:, :] byte_chunk + object[:, :] string_chunk + + 
source = np.frombuffer( + self.cached_page[offset:offset + length], dtype=np.uint8) if self.decompress != NULL and (length < self.row_length): source = self.decompress(self.row_length, source) @@ -408,11 +422,12 @@ cdef class Parser(object): else: m = s for k in range(lngt): - byte_chunk[jb, m + k] = source[start + k] + byte_chunk[jb, m + k] = source[start + k] jb += 1 elif column_types[j] == column_type_string: # string - string_chunk[js, current_row] = source[start:(start+lngt)].tostring().rstrip() + string_chunk[js, current_row] = source[start:( + start + lngt)].tostring().rstrip() js += 1 self.current_row_on_page_index += 1 diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 0473ae79adce5..e7672de5c835e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -84,6 +84,7 @@ PyDateTime_IMPORT import_array() import_ufunc() + def values_from_object(object o): """ return my values or the object if we are say an ndarray """ cdef f @@ -159,6 +160,7 @@ def ismember(ndarray arr, set values): return result.view(np.bool_) + def ismember_int64(ndarray[int64_t] arr, set values): """ Checks whether @@ -184,6 +186,7 @@ def ismember_int64(ndarray[int64_t] arr, set values): return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(ndarray[object, ndim=1] arr): @@ -217,12 +220,15 @@ cdef inline int64_t gmtime(object date): days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 + cpdef object to_datetime(int64_t timestamp): return pydatetime.utcfromtimestamp(timestamp / 1000.0) + cpdef object to_timestamp(object dt): return gmtime(dt) + def array_to_timestamp(ndarray[object, ndim=1] arr): cdef int i, n cdef ndarray[int64_t, ndim=1] result @@ -235,6 +241,7 @@ def array_to_timestamp(ndarray[object, ndim=1] arr): return result + def time64_to_datetime(ndarray[int64_t, ndim=1] arr): cdef int i, n cdef ndarray[object, ndim=1] result @@ -254,6 +261,7 @@ def time64_to_datetime(ndarray[int64_t, ndim=1] arr): cdef double INF = np.inf cdef double NEGINF = -INF + cpdef checknull(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val # and val != INF and val != NEGINF @@ -268,6 +276,7 @@ cpdef checknull(object val): else: return _checknull(val) + cpdef checknull_old(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val or val == INF or val == NEGINF @@ -282,18 +291,21 @@ cpdef checknull_old(object val): else: return util._checknull(val) + cpdef isposinf_scalar(object val): if util.is_float_object(val) and val == INF: return True else: return False + cpdef isneginf_scalar(object val): if util.is_float_object(val) and val == NEGINF: return True else: return False + def isscalar(object val): """ Return True if given value is scalar. 
@@ -356,6 +368,7 @@ def isnullobj(ndarray arr): result[i] = _check_all_nulls(val) return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def isnullobj_old(ndarray arr): @@ -372,6 +385,7 @@ def isnullobj_old(ndarray arr): result[i] = val is NaT or util._checknull_old(val) return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def isnullobj2d(ndarray arr): @@ -390,6 +404,7 @@ def isnullobj2d(ndarray arr): result[i, j] = 1 return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def isnullobj2d_old(ndarray arr): @@ -413,8 +428,8 @@ def isnullobj2d_old(ndarray arr): @cython.boundscheck(False) cpdef ndarray[object] list_to_object_array(list obj): """ - Convert list to object ndarray. Seriously can\'t believe I had to write this - function + Convert list to object ndarray. Seriously can\'t believe + I had to write this function. """ cdef: Py_ssize_t i, n = len(obj) @@ -447,6 +462,7 @@ def fast_unique(ndarray[object] values): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple(list arrays): @@ -473,6 +489,7 @@ def fast_unique_multiple(list arrays): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list(list lists): @@ -499,6 +516,7 @@ def fast_unique_multiple_list(list lists): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list_gen(object gen, bint sort=True): @@ -538,6 +556,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def dicts_to_array(list dicts, list columns): @@ -563,6 +582,7 @@ def dicts_to_array(list dicts, list columns): return result + def fast_zip(list ndarrays): """ For zipping multiple ndarrays into an ndarray of tuples @@ -604,6 +624,7 @@ def fast_zip(list ndarrays): return result + def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): """ Reverse indexing operation. 
@@ -645,6 +666,7 @@ def has_infs_f4(ndarray[float32_t] arr): return True return False + def has_infs_f8(ndarray[float64_t] arr): cdef: Py_ssize_t i, n = len(arr) @@ -659,6 +681,7 @@ def has_infs_f8(ndarray[float64_t] arr): return True return False + def convert_timestamps(ndarray values): cdef: object val, f, result @@ -911,6 +934,7 @@ def scalar_binop(ndarray[object] values, object val, object op): return maybe_convert_bool(result) + @cython.wraparound(False) @cython.boundscheck(False) def vec_binop(ndarray[object] left, ndarray[object] right, object op): @@ -948,18 +972,19 @@ def astype_intsafe(ndarray[object] arr, new_dtype): ndarray result # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird - is_datelike = new_dtype in ['M8[ns]','m8[ns]'] + is_datelike = new_dtype in ['M8[ns]', 'm8[ns]'] result = np.empty(n, dtype=new_dtype) for i in range(n): v = arr[i] if is_datelike and checknull(v): - result[i] = NPY_NAT + result[i] = NPY_NAT else: - util.set_value_at(result, i, v) + util.set_value_at(result, i, v) return result + cpdef ndarray[object] astype_unicode(ndarray arr): cdef: Py_ssize_t i, n = arr.size @@ -970,6 +995,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr): return result + cpdef ndarray[object] astype_str(ndarray arr): cdef: Py_ssize_t i, n = arr.size @@ -980,6 +1006,7 @@ cpdef ndarray[object] astype_str(ndarray arr): return result + def clean_index_list(list obj): """ Utility used in pandas.core.index._ensure_index @@ -992,7 +1019,7 @@ def clean_index_list(list obj): for i in range(n): v = obj[i] - if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data')): + if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data')): all_arrays = 0 break @@ -1002,7 +1029,7 @@ def clean_index_list(list obj): converted = np.empty(n, dtype=object) for i in range(n): v = obj[i] - if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data'): + if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data'): converted[i] = tuple(v) else: converted[i] = v @@ -1038,10 +1065,16 @@ cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): return m + @cython.boundscheck(False) @cython.wraparound(False) -def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): - """ replace the values in the array with replacement if they are nan_rep; return the same array """ +def string_array_replace_from_nan_rep( + ndarray[object, ndim=1] arr, object nan_rep, + object replace=None): + """ + Replace the values in the array with 'replacement' if + they are 'nan_rep'. Return the same array. 
+ """ cdef int length = arr.shape[0], i = 0 if replace is None: @@ -1053,9 +1086,11 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re return arr + @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer): +def write_csv_rows(list data, ndarray data_index, + int nlevels, ndarray cols, object writer): cdef int N, j, i, ncols cdef list rows @@ -1066,7 +1101,7 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj # pre-allocate rows ncols = len(cols) - rows = [[None]*(nlevels+ncols) for x in range(N)] + rows = [[None] * (nlevels + ncols) for x in range(N)] j = -1 if nlevels == 1: @@ -1074,18 +1109,18 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj row = rows[j % N] row[0] = data_index[j] for i in range(ncols): - row[1+i] = data[i][j] + row[1 + i] = data[i][j] - if j >= N-1 and j % N == N-1: + if j >= N - 1 and j % N == N - 1: writer.writerows(rows) elif nlevels > 1: for j in range(len(data_index)): row = rows[j % N] row[:nlevels] = list(data_index[j]) for i in range(ncols): - row[nlevels+i] = data[i][j] + row[nlevels + i] = data[i][j] - if j >= N-1 and j % N == N-1: + if j >= N - 1 and j % N == N - 1: writer.writerows(rows) else: for j in range(len(data_index)): @@ -1093,15 +1128,15 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj for i in range(ncols): row[i] = data[i][j] - if j >= N-1 and j % N == N-1: + if j >= N - 1 and j % N == N - 1: writer.writerows(rows) - if j >= 0 and (j < N-1 or (j % N) != N-1 ): - writer.writerows(rows[:((j+1) % N)]) + if j >= 0 and (j < N - 1 or (j % N) != N - 1): + writer.writerows(rows[:((j + 1) % N)]) -#------------------------------------------------------------------------------- -# Groupby-related functions +#------------------------------------------------------------------------------ +# Groupby-related functions @cython.boundscheck(False) def arrmap(ndarray[object] index, object func): cdef int length = index.shape[0] @@ -1114,6 +1149,7 @@ def arrmap(ndarray[object] index, object func): return result + @cython.wraparound(False) @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): @@ -1128,16 +1164,14 @@ def is_lexsorted(list list_of_arrays): cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) for i from 0 <= i < nlevels: - # vecs[i] = ( list_of_arrays[i]).data - arr = list_of_arrays[i] vecs[i] = arr.data - # assume uniqueness?? + # Assume uniqueness?? for i from 1 <= i < n: for k from 0 <= k < nlevels: cur = vecs[k][i] - pre = vecs[k][i-1] + pre = vecs[k][i - 1] if cur == pre: continue elif cur > pre: @@ -1148,11 +1182,9 @@ def is_lexsorted(list list_of_arrays): return True - # TODO: could do even better if we know something about the data. eg, index has # 1-min data, binner has 5-min data, then bins are just strides in index. This # is a general, O(max(len(values), len(binner))) method. 
- @cython.boundscheck(False) @cython.wraparound(False) def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, @@ -1182,18 +1214,18 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, if values[0] < binner[0]: raise ValueError("Values falls before first bin") - if values[lenidx-1] > binner[lenbin-1]: + if values[lenidx - 1] > binner[lenbin - 1]: raise ValueError("Values falls after last bin") bins = np.empty(lenbin - 1, dtype=np.int64) - j = 0 # index into values + j = 0 # index into values bc = 0 # bin count # linear scan if right_closed: for i in range(0, lenbin - 1): - r_bin = binner[i+1] + r_bin = binner[i + 1] # count values in current bin, advance to next bin while j < lenidx and values[j] <= r_bin: j += 1 @@ -1201,7 +1233,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, bc += 1 else: for i in range(0, lenbin - 1): - r_bin = binner[i+1] + r_bin = binner[i + 1] # count values in current bin, advance to next bin while j < lenidx and values[j] < r_bin: j += 1 @@ -1216,8 +1248,6 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, return bins - - @cython.boundscheck(False) @cython.wraparound(False) def row_bool_subset(ndarray[float64_t, ndim=2] values, @@ -1239,6 +1269,7 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, return out + @cython.boundscheck(False) @cython.wraparound(False) def row_bool_subset_object(ndarray[object, ndim=2] values, @@ -1260,6 +1291,7 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, return out + @cython.boundscheck(False) @cython.wraparound(False) def get_level_sorter(ndarray[int64_t, ndim=1] label, @@ -1282,6 +1314,7 @@ def get_level_sorter(ndarray[int64_t, ndim=1] label, return out + def group_count(ndarray[int64_t] values, Py_ssize_t size): cdef: Py_ssize_t i, n = len(values) @@ -1292,6 +1325,7 @@ def group_count(ndarray[int64_t] values, Py_ssize_t size): counts[values[i]] += 1 return counts + def lookup_values(ndarray[object] values, dict mapping): cdef: Py_ssize_t i, n = len(values) @@ -1331,6 +1365,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts + cdef class _PandasNull: def __richcmp__(_PandasNull self, object other, int op): @@ -1346,6 +1381,7 @@ cdef class _PandasNull: pandas_null = _PandasNull() + def fast_zip_fillna(list ndarrays, fill_value=pandas_null): """ For zipping multiple ndarrays into an ndarray of tuples @@ -1445,7 +1481,7 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, tup = PyTuple_New(k) for j in range(k): val = util.get_value_at(keys[j], - sorted_labels[j][i-1]) + sorted_labels[j][i - 1]) PyTuple_SET_ITEM(tup, j, val) Py_INCREF(val) @@ -1574,7 +1610,7 @@ cpdef slice indexer_as_slice(int64_t[:] vals): return None for i in range(2, n): - if vals[i] < 0 or vals[i] - vals[i-1] != d: + if vals[i] < 0 or vals[i] - vals[i - 1] != d: return None start = vals[0] @@ -1645,12 +1681,13 @@ cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - PySlice_GetIndicesEx(slc, objlen, + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return start, stop, step, length -cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: +cpdef Py_ssize_t slice_len( + slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: """ Get length of a bounded slice. 
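As an aside between hunks: the linear-scan binning that ``generate_bins_dt64`` (a few hunks above) performs can be paraphrased in plain Python. The snippet below is only an illustrative sketch of that idea, not code from the patch; the function name and the toy inputs are invented:

    def generate_bins(values, binner, right_closed=True):
        # Both inputs are assumed sorted. For each bin
        # (binner[i], binner[i+1]] (or the half-open variant when
        # right_closed is False) record the end position into `values`,
        # advancing a single cursor: an O(len(values) + len(binner)) scan.
        bins = []
        j = 0
        for i in range(len(binner) - 1):
            right_edge = binner[i + 1]
            while j < len(values) and (
                    values[j] <= right_edge if right_closed
                    else values[j] < right_edge):
                j += 1
            bins.append(j)
        return bins

    generate_bins([1, 2, 3, 7, 8], [0, 3, 10])   # [3, 5]
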
@@ -1668,7 +1705,7 @@ cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except - if slc is None: raise TypeError("slc must be slice") - PySlice_GetIndicesEx(slc, objlen, + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return length diff --git a/pandas/msgpack/_packer.pyx b/pandas/msgpack/_packer.pyx index 5004b9e8e7262..008dbe5541d50 100644 --- a/pandas/msgpack/_packer.pyx +++ b/pandas/msgpack/_packer.pyx @@ -23,7 +23,8 @@ cdef extern from "../src/msgpack/pack.h": int msgpack_pack_false(msgpack_packer* pk) int msgpack_pack_long(msgpack_packer* pk, long d) int msgpack_pack_long_long(msgpack_packer* pk, long long d) - int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d) + int msgpack_pack_unsigned_long_long(msgpack_packer* pk, + unsigned long long d) int msgpack_pack_float(msgpack_packer* pk, float d) int msgpack_pack_double(msgpack_packer* pk, double d) int msgpack_pack_array(msgpack_packer* pk, size_t l) @@ -58,8 +59,10 @@ cdef class Packer(object): :param bool use_single_float: Use single precision float type for float. (default: False) :param bool autoreset: - Reset buffer after each pack and return it's content as `bytes`. (default: True). - If set this to false, use `bytes()` to get content and `.reset()` to clear buffer. + Reset buffer after each pack and return it's + content as `bytes`. (default: True). + If set this to false, use `bytes()` to get + content and `.reset()` to clear buffer. :param bool use_bin_type: Use bin type introduced in msgpack spec 2.0 for bytes. It also enable str8 type for unicode. @@ -74,15 +77,16 @@ cdef class Packer(object): cdef bint autoreset def __cinit__(self): - cdef int buf_size = 1024*1024 - self.pk.buf = malloc(buf_size); + cdef int buf_size = 1024 * 1024 + self.pk.buf = malloc(buf_size) if self.pk.buf == NULL: raise MemoryError("Unable to allocate internal buffer.") self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', - use_single_float=False, bint autoreset=1, bint use_bin_type=0): + def __init__(self, default=None, encoding='utf-8', + unicode_errors='strict', use_single_float=False, + bint autoreset=1, bint use_bin_type=0): """ """ self.use_float = use_single_float @@ -110,7 +114,8 @@ cdef class Packer(object): def __dealloc__(self): free(self.pk.buf); - cdef int _pack(self, object o, int nest_limit=DEFAULT_RECURSE_LIMIT) except -1: + cdef int _pack(self, object o, + int nest_limit=DEFAULT_RECURSE_LIMIT) except -1: cdef long long llval cdef unsigned long long ullval cdef long longval @@ -147,14 +152,14 @@ cdef class Packer(object): ret = msgpack_pack_long(&self.pk, longval) elif PyFloat_Check(o): if self.use_float: - fval = o - ret = msgpack_pack_float(&self.pk, fval) + fval = o + ret = msgpack_pack_float(&self.pk, fval) else: - dval = o - ret = msgpack_pack_double(&self.pk, dval) + dval = o + ret = msgpack_pack_double(&self.pk, dval) elif PyBytes_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("bytes is too large") rawval = o ret = msgpack_pack_bin(&self.pk, L) @@ -162,10 +167,12 @@ cdef class Packer(object): ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyUnicode_Check(o): if not self.encoding: - raise TypeError("Can't encode unicode string: no encoding is specified") - o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) + raise TypeError("Can't encode unicode string: " + "no encoding is specified") + o = PyUnicode_AsEncodedString(o, self.encoding, + 
self.unicode_errors) L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) @@ -174,43 +181,43 @@ cdef class Packer(object): elif PyDict_CheckExact(o): d = o L = len(d) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: for k, v in d.iteritems(): - ret = self._pack(k, nest_limit-1) + ret = self._pack(k, nest_limit - 1) if ret != 0: break - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif PyDict_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: for k, v in o.items(): - ret = self._pack(k, nest_limit-1) + ret = self._pack(k, nest_limit - 1) if ret != 0: break - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif isinstance(o, ExtType): # This should be before Tuple because ExtType is namedtuple. longval = o.code rawval = o.data L = len(o.data) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("EXT data is too large") ret = msgpack_pack_ext(&self.pk, longval, L) ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyTuple_Check(o) or PyList_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("list is too large") ret = msgpack_pack_array(&self.pk, L) if ret == 0: for v in o: - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif not default_used and self._default: o = self._default(o) @@ -237,7 +244,7 @@ cdef class Packer(object): msgpack_pack_raw_body(&self.pk, data, len(data)) def pack_array_header(self, size_t size): - if size > (2**32-1): + if size > (2**32) - 1: raise ValueError cdef int ret = msgpack_pack_array(&self.pk, size) if ret == -1: @@ -250,7 +257,7 @@ cdef class Packer(object): return buf def pack_map_header(self, size_t size): - if size > (2**32-1): + if size > (2**32) - 1: raise ValueError cdef int ret = msgpack_pack_map(&self.pk, size) if ret == -1: diff --git a/pandas/msgpack/_unpacker.pyx b/pandas/msgpack/_unpacker.pyx index f68bf3369427c..6f23a24adde6c 100644 --- a/pandas/msgpack/_unpacker.pyx +++ b/pandas/msgpack/_unpacker.pyx @@ -4,18 +4,15 @@ from cpython cimport * cdef extern from "Python.h": ctypedef struct PyObject - cdef int PyObject_AsReadBuffer(object o, const void** buff, Py_ssize_t* buf_len) except -1 + cdef int PyObject_AsReadBuffer(object o, const void** buff, + Py_ssize_t* buf_len) except -1 from libc.stdlib cimport * from libc.string cimport * from libc.limits cimport * -from pandas.msgpack.exceptions import ( - BufferFull, - OutOfData, - UnpackValueError, - ExtraData, - ) +from pandas.msgpack.exceptions import (BufferFull, OutOfData, + UnpackValueError, ExtraData) from pandas.msgpack import ExtType @@ -65,7 +62,8 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.max_ext_len = max_ext_len if object_hook is not None and object_pairs_hook is not None: - raise TypeError("object_pairs_hook and object_hook are mutually exclusive.") + raise TypeError("object_pairs_hook and object_hook " + "are mutually exclusive.") if object_hook is not None: if not PyCallable_Check(object_hook): @@ -93,8 +91,11 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.encoding = encoding ctx.user.unicode_errors = unicode_errors + def default_read_extended_type(typecode, data): - raise NotImplementedError("Cannot decode extended type with typecode=%d" 
% typecode) + raise NotImplementedError("Cannot decode extended type " + "with typecode=%d" % typecode) + def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=1, encoding=None, unicode_errors="strict", @@ -139,7 +140,8 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, if ret == 1: obj = unpack_data(&ctx) if off < buf_len: - raise ExtraData(obj, PyBytes_FromStringAndSize(buf+off, buf_len-off)) + raise ExtraData(obj, PyBytes_FromStringAndSize( + buf + off, buf_len - off)) return obj else: raise UnpackValueError("Unpack failed: error = %d" % (ret,)) @@ -157,9 +159,9 @@ def unpack(object stream, object object_hook=None, object list_hook=None, See :class:`Unpacker` for options. """ return unpackb(stream.read(), use_list=use_list, - object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook, - encoding=encoding, unicode_errors=unicode_errors, - ) + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, list_hook=list_hook, + encoding=encoding, unicode_errors=unicode_errors) cdef class Unpacker(object): @@ -169,10 +171,12 @@ cdef class Unpacker(object): :param file_like: File-like object having `.read(n)` method. - If specified, unpacker reads serialized data from it and :meth:`feed()` is not usable. + If specified, unpacker reads serialized data from it and + :meth:`feed()` is not usable. :param int read_size: - Used as `file_like.read(read_size)`. (default: `min(1024**2, max_buffer_size)`) + Used as `file_like.read(read_size)`. (default: + `min(1024**2, max_buffer_size)`) :param bool use_list: If true, unpack msgpack array to Python list. @@ -184,9 +188,8 @@ cdef class Unpacker(object): (See also simplejson) :param callable object_pairs_hook: - When specified, it should be callable. - Unpacker calls it with a list of key-value pairs after unpacking msgpack map. - (See also simplejson) + When specified, it should be callable. Unpacker calls it with a list + of key-value pairs after unpacking msgpack map. (See also simplejson) :param str encoding: Encoding used for decoding msgpack raw. @@ -197,9 +200,10 @@ cdef class Unpacker(object): (default: `'strict'`) :param int max_buffer_size: - Limits size of data waiting unpacked. 0 means system's INT_MAX (default). - Raises `BufferFull` exception when it is insufficient. - You shoud set this parameter when unpacking data from untrasted source. + Limits size of data waiting unpacked. 0 means system's + INT_MAX (default). Raises `BufferFull` exception when it + is insufficient. You shoud set this parameter when unpacking + data from untrasted source. :param int max_str_len: Limits max length of str. 
(default: 2**31-1) @@ -250,9 +254,9 @@ cdef class Unpacker(object): self.buf = NULL def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, - object object_hook=None, object object_pairs_hook=None, object list_hook=None, - encoding=None, unicode_errors='strict', int max_buffer_size=0, - object ext_hook=ExtType, + object object_hook=None, object object_pairs_hook=None, + object list_hook=None, encoding=None, unicode_errors='strict', + int max_buffer_size=0, object ext_hook=ExtType, Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, Py_ssize_t max_array_len=2147483647, @@ -274,7 +278,8 @@ cdef class Unpacker(object): if not max_buffer_size: max_buffer_size = INT_MAX if read_size > max_buffer_size: - raise ValueError("read_size should be less or equal to max_buffer_size") + raise ValueError("read_size should be less or " + "equal to max_buffer_size") if not read_size: read_size = min(max_buffer_size, 1024**2) self.max_buffer_size = max_buffer_size @@ -313,8 +318,8 @@ cdef class Unpacker(object): """Append `next_bytes` to internal buffer.""" cdef Py_buffer pybuff if self.file_like is not None: - raise AssertionError( - "unpacker.feed() is not be able to use with `file_like`.") + raise AssertionError("unpacker.feed() is not be able " + "to use with `file_like`.") PyObject_GetBuffer(next_bytes, &pybuff, PyBUF_SIMPLE) try: self.append_buffer(pybuff.buf, pybuff.len) @@ -338,10 +343,10 @@ cdef class Unpacker(object): head = 0 else: # expand buffer. - new_size = (tail-head) + _buf_len + new_size = (tail - head) + _buf_len if new_size > self.max_buffer_size: raise BufferFull - new_size = min(new_size*2, self.max_buffer_size) + new_size = min(new_size * 2, self.max_buffer_size) new_buf = malloc(new_size) if new_buf == NULL: # self.buf still holds old buffer and will be freed during @@ -363,15 +368,16 @@ cdef class Unpacker(object): cdef read_from_file(self): next_bytes = self.file_like_read( - min(self.read_size, - self.max_buffer_size - (self.buf_tail - self.buf_head) - )) + min(self.read_size, + self.max_buffer_size - (self.buf_tail - self.buf_head))) if next_bytes: - self.append_buffer(PyBytes_AsString(next_bytes), PyBytes_Size(next_bytes)) + self.append_buffer(PyBytes_AsString(next_bytes), + PyBytes_Size(next_bytes)) else: self.file_like = None - cdef object _unpack(self, execute_fn execute, object write_bytes, bint iter=0): + cdef object _unpack(self, execute_fn execute, + object write_bytes, bint iter=0): cdef int ret cdef object obj cdef size_t prev_head @@ -389,7 +395,8 @@ cdef class Unpacker(object): ret = execute(&self.ctx, self.buf, self.buf_tail, &self.buf_head) if write_bytes is not None: - write_bytes(PyBytes_FromStringAndSize(self.buf + prev_head, self.buf_head - prev_head)) + write_bytes(PyBytes_FromStringAndSize( + self.buf + prev_head, self.buf_head - prev_head)) if ret == 1: obj = unpack_data(&self.ctx) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 5d8ab7213a7b6..12525c7a9c587 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -106,7 +106,7 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status) + int *status) ctypedef int (*io_cleanup)(void *src) ctypedef struct parser_t: @@ -410,7 +410,6 @@ cdef class TextReader: self._set_quoting(quotechar, quoting) - dtype_order = ['int64', 'float64', 'bool', 'object'] if quoting == QUOTE_NONNUMERIC: # consistent with csv module semantics, cast all to float @@ -517,7 +516,7 @@ cdef 
class TextReader: # need to artifically skip the final line # which is still a header line header = list(header) - header.append(header[-1]+1) + header.append(header[-1] + 1) self.parser.header_start = header[0] self.parser.header_end = header[-1] @@ -663,7 +662,8 @@ cdef class TextReader: if ptr == NULL: if not os.path.exists(source): - raise compat.FileNotFoundError('File %s does not exist' % source) + raise compat.FileNotFoundError( + 'File %s does not exist' % source) raise IOError('Initializing from file failed') self.parser.source = ptr @@ -689,7 +689,7 @@ cdef class TextReader: # header is now a list of lists, so field_count should use header[0] cdef: - size_t i, start, data_line, field_count, passed_count, hr, unnamed_count + size_t i, start, data_line, field_count, passed_count, hr, unnamed_count # noqa char *word object name int status @@ -716,10 +716,12 @@ cdef class TextReader: # e.g., if header=3 and file only has 2 lines elif self.parser.lines < hr + 1: msg = self.orig_header - if isinstance(msg,list): - msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) - raise CParserError('Passed header=%s but only %d lines in file' - % (msg, self.parser.lines)) + if isinstance(msg, list): + msg = "[%s], len of %d," % ( + ','.join([ str(m) for m in msg ]), len(msg)) + raise CParserError( + 'Passed header=%s but only %d lines in file' + % (msg, self.parser.lines)) else: field_count = self.parser.line_fields[hr] @@ -740,13 +742,14 @@ cdef class TextReader: if name == '': if self.has_mi_columns: - name = 'Unnamed: %d_level_%d' % (i,level) + name = 'Unnamed: %d_level_%d' % (i, level) else: name = 'Unnamed: %d' % i unnamed_count += 1 count = counts.get(name, 0) - if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: + if (count > 0 and self.mangle_dupe_cols + and not self.has_mi_columns): this_header.append('%s.%d' % (name, count)) else: this_header.append(name) @@ -754,12 +757,13 @@ cdef class TextReader: if self.has_mi_columns: - # if we have grabbed an extra line, but its not in our format - # so save in the buffer, and create an blank extra line for the rest of the - # parsing code + # If we have grabbed an extra line, but it's not in our + # format, save in the buffer, and create an blank extra + # line for the rest of the parsing code. 
if hr == self.header[-1]: lc = len(this_header) - ic = len(self.index_col) if self.index_col is not None else 0 + ic = (len(self.index_col) if self.index_col + is not None else 0) if lc != unnamed_count and lc - ic > unnamed_count: hr -= 1 self.parser_start -= 1 @@ -993,20 +997,15 @@ cdef class TextReader: # if footer > 0: # end -= footer - #print >> sys.stderr, self.table_width - #print >> sys.stderr, self.leading_cols - #print >> sys.stderr, self.parser.lines - #print >> sys.stderr, start - #print >> sys.stderr, end - #print >> sys.stderr, self.header - #print >> sys.stderr, "index" num_cols = -1 for i in range(self.parser.lines): - num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\ + num_cols = (num_cols < self.parser.line_fields[i]) * \ + self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise CParserError("Too many columns specified: expected %s and found %s" % + raise CParserError( + "Too many columns specified: expected %s and found %s" % (self.table_width - self.leading_cols, num_cols)) results = {} @@ -1045,8 +1044,8 @@ cdef class TextReader: continue # Should return as the desired dtype (inferred or specified) - col_res, na_count = self._convert_tokens(i, start, end, name, - na_filter, na_hashset, na_flist) + col_res, na_count = self._convert_tokens( + i, start, end, name, na_filter, na_hashset, na_flist) if na_filter: self._free_na_set(na_hashset) @@ -1054,8 +1053,10 @@ cdef class TextReader: if upcast_na and na_count > 0: col_res = _maybe_upcast(col_res) - if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned) + if issubclass(col_res.dtype.type, + np.integer) and self.compact_ints: + col_res = lib.downcast_int64(col_res, na_values, + self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1087,10 +1088,12 @@ cdef class TextReader: col_dtype = self.dtype if col_dtype is not None: - col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, - na_filter, 1, na_hashset, na_flist) + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, na_hashset, na_flist) - # fallback on the parse (e.g. we requested int dtype, but its actually a float) + # Fallback on the parse (e.g. we requested int dtype, + # but its actually a float). if col_res is not None: return col_res, na_count @@ -1104,7 +1107,8 @@ cdef class TextReader: dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist) + np.dtype('object'), i, start, end, na_filter, + 0, na_hashset, na_flist) if col_res is not None: break @@ -1113,7 +1117,7 @@ cdef class TextReader: # only allow safe casts, eg. 
with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: try: - col_res = col_res.astype(col_dtype,casting='safe') + col_res = col_res.astype(col_dtype, casting='safe') except TypeError: # float -> int conversions can fail the above @@ -1121,12 +1125,13 @@ cdef class TextReader: col_res_orig = col_res col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): - raise ValueError("cannot safely convert passed user dtype of " - "{col_dtype} for {col_res} dtyped data in " - "column {column}".format(col_dtype=col_dtype, - col_res=col_res_orig.dtype.name, - column=i)) - + raise ValueError( + "cannot safely convert passed user dtype of " + "{col_dtype} for {col_res} dtyped data in " + "column {column}".format( + col_dtype=col_dtype, + col_res=col_res_orig.dtype.name, + column=i)) return col_res, na_count @@ -1137,8 +1142,8 @@ cdef class TextReader: kh_str_t *na_hashset, object na_flist): if is_integer_dtype(dtype): - result, na_count = _try_int64(self.parser, i, start, end, na_filter, - na_hashset) + result, na_count = _try_int64(self.parser, i, start, + end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " @@ -1175,15 +1180,16 @@ cdef class TextReader: elif dtype.kind == 'U': width = dtype.itemsize if width > 0: - raise TypeError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not " + "supported for parsing" % dtype) # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) elif is_categorical_dtype(dtype): - codes, cats, na_count = _categorical_convert(self.parser, i, start, - end, na_filter, na_hashset, - self.c_encoding) + codes, cats, na_count = _categorical_convert( + self.parser, i, start, end, na_filter, + na_hashset, self.c_encoding) # sort categories and recode if necessary cats = Index(cats) if not cats.is_monotonic_increasing: @@ -1198,10 +1204,12 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_datetime64_dtype(dtype): - raise TypeError("the dtype %s is not supported for parsing, " - "pass this column using parse_dates instead" % dtype) + raise TypeError("the dtype %s is not supported " + "for parsing, pass this column " + "using parse_dates instead" % dtype) else: - raise TypeError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not " + "supported for parsing" % dtype) cdef _string_convert(self, Py_ssize_t i, int start, int end, bint na_filter, kh_str_t *na_hashset): @@ -1218,7 +1226,6 @@ cdef class TextReader: return _string_box_factorize(self.parser, i, start, end, na_filter, na_hashset) - def _get_converter(self, i, name): if self.converters is None: return None @@ -1330,9 +1337,9 @@ def _maybe_upcast(arr): return arr cdef enum StringPath: - CSTRING - UTF8 - ENCODED + CSTRING + UTF8 + ENCODED # factored out logic to pick string converter cdef inline StringPath _string_path(char *encoding): @@ -1445,7 +1452,7 @@ cdef _string_box_utf8(parser_t *parser, int col, pyval = PyUnicode_FromString(word) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1503,7 +1510,7 @@ cdef _string_box_decode(parser_t *parser, int col, pyval = PyUnicode_Decode(word, size, encoding, errors) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1511,6 +1518,7 @@ cdef _string_box_decode(parser_t *parser, int 
col, return result, na_count + @cython.boundscheck(False) cdef _categorical_convert(parser_t *parser, int col, int line_start, int line_end, @@ -1570,7 +1578,8 @@ cdef _categorical_convert(parser_t *parser, int col, for k in range(table.n_buckets): if kh_exist_str(table, k): size = strlen(table.keys[k]) - result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + result[table.vals[k]] = PyUnicode_Decode( + table.keys[k], size, encoding, errors) elif path == UTF8: for k in range(table.n_buckets): if kh_exist_str(table, k): @@ -1600,8 +1609,9 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start, - int line_end, size_t width, char *data) nogil: +cdef inline void _to_fw_string_nogil(parser_t *parser, int col, + int line_start, int line_end, + size_t width, char *data) nogil: cdef: Py_ssize_t i coliter_t it @@ -1639,17 +1649,20 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + na_filter, na_hashset, use_na_flist, + na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count -cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, +cdef inline int _try_double_nogil(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + bint use_na_flist, const kh_float64_t *na_flist, - double NA, - double *data, int *na_count) nogil: + double NA, double *data, + int *na_count) nogil: cdef: int error, size_t i @@ -1674,15 +1687,17 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int na_count[0] += 1 data[0] = NA else: - data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, - parser.thousands, 1) + data[0] = parser.converter(word, &p_end, parser.decimal, + parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: + if (strcasecmp(word, cinf) == 0 or + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since the errno is never consumed. + # Just return a non-zero value since + # the errno is never consumed. return 1 if use_na_flist: k64 = kh_get_float64(na_flist, data[0]) @@ -1693,15 +1708,17 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int else: for i in range(lines): COLITER_NEXT(it, word) - data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, - parser.thousands, 1) + data[0] = parser.converter(word, &p_end, parser.decimal, + parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: + if (strcasecmp(word, cinf) == 0 or + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since the errno is never consumed. + # Just return a non-zero value since + # the errno is never consumed. 
return 1 data += 1 @@ -1724,7 +1741,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, data = result.data coliter_setup(&it, parser, col, line_start) with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_int64_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, NA, data, &na_count) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1733,9 +1751,10 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count) nogil: +cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, int64_t NA, + int64_t *data, int *na_count) nogil: cdef: int error size_t i @@ -1785,14 +1804,18 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, data = result.data with nogil: - error = _try_bool_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_bool_nogil(parser, col, line_start, + line_end, na_filter, + na_hashset, NA, data, + &na_count) if error != 0: return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, - int *na_count) nogil: +cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, uint8_t NA, + uint8_t *data, int *na_count) nogil: cdef: int error size_t lines = line_end - line_start @@ -1832,7 +1855,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset): + const kh_str_t *true_hashset, + const kh_str_t *false_hashset): cdef: int error, na_count = 0 size_t i, lines @@ -1848,16 +1872,20 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, - true_hashset, false_hashset, NA, data, &na_count) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, true_hashset, + false_hashset, NA, data, &na_count) if error != 0: return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count) nogil: +cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, + const kh_str_t *true_hashset, + const kh_str_t *false_hashset, + uint8_t NA, uint8_t *data, + int *na_count) nogil: cdef: int error = 0 size_t i @@ -2016,14 +2044,15 @@ def _concatenate_chunks(list chunks): if warning_columns: warning_names = ','.join(warning_columns) - warning_message = " ".join(["Columns (%s) have mixed types." 
% warning_names, + warning_message = " ".join([ + "Columns (%s) have mixed types." % warning_names, "Specify dtype option on import or set low_memory=False." ]) warnings.warn(warning_message, DtypeWarning, stacklevel=8) return result -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # NA values def _compute_na_values(): int64info = np.iinfo(np.int64) @@ -2035,17 +2064,17 @@ def _compute_na_values(): uint16info = np.iinfo(np.uint16) uint8info = np.iinfo(np.uint8) na_values = { - np.float64 : np.nan, - np.int64 : int64info.min, - np.int32 : int32info.min, - np.int16 : int16info.min, - np.int8 : int8info.min, - np.uint64 : uint64info.max, - np.uint32 : uint32info.max, - np.uint16 : uint16info.max, - np.uint8 : uint8info.max, - np.bool_ : uint8info.max, - np.object_ : np.nan # oof + np.float64: np.nan, + np.int64: int64info.min, + np.int32: int32info.min, + np.int16: int16info.min, + np.int8: int8info.min, + np.uint64: uint64info.max, + np.uint32: uint32info.max, + np.uint16: uint16info.max, + np.uint8: uint8info.max, + np.bool_: uint8info.max, + np.object_: np.nan # oof } return na_values @@ -2128,22 +2157,13 @@ def _to_structured_array(dict columns, object names, object usecols): stride = dt.itemsize - # start = time.time() - - # we own the data + # We own the data. buf = malloc(length * stride) recs = util.sarr_from_data(dt, length, buf) assert(recs.flags.owndata) - # buf = recs.data - # end = time.time() - # print 'took %.4f' % (end - start) - for i in range(nfields): - # start = time.clock() - # name = names[i] - # XXX field_type = fields[fnames[i]] @@ -2156,9 +2176,6 @@ def _to_structured_array(dict columns, object names, object usecols): elsize, stride, length, field_type[0] == np.object_) - # print 'Transfer of %s took %.4f' % (str(field_type), - # time.clock() - start) - return recs cdef _fill_structured_column(char *dst, char* src, int elsize, @@ -2175,7 +2192,6 @@ cdef _fill_structured_column(char *dst, char* src, int elsize, src += elsize - def _maybe_encode(values): if values is None: return [] diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 62555dc7f178c..4fa730eac0fd1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -21,15 +21,20 @@ cdef extern from "headers/stdint.h": enum: INT64_MIN # core.common import for fast inference checks + + def is_float(object obj): return util.is_float_object(obj) + def is_integer(object obj): return util.is_integer_object(obj) + def is_bool(object obj): return util.is_bool_object(obj) + def is_complex(object obj): return util.is_complex_object(obj) @@ -38,33 +43,33 @@ cpdef bint is_period(object val): return util.is_period_object(val) _TYPE_MAP = { - 'categorical' : 'categorical', - 'category' : 'categorical', + 'categorical': 'categorical', + 'category': 'categorical', 'int8': 'integer', 'int16': 'integer', 'int32': 'integer', 'int64': 'integer', - 'i' : 'integer', + 'i': 'integer', 'uint8': 'integer', 'uint16': 'integer', 'uint32': 'integer', 'uint64': 'integer', - 'u' : 'integer', + 'u': 'integer', 'float32': 'floating', 'float64': 'floating', - 'f' : 'floating', + 'f': 'floating', 'complex128': 'complex', - 'c' : 'complex', + 'c': 'complex', 'string': 'string' if PY2 else 'bytes', - 'S' : 'string' if PY2 else 'bytes', + 'S': 'string' if PY2 else 'bytes', 'unicode': 'unicode' if PY2 else 'string', - 'U' : 'unicode' if PY2 else 'string', + 'U': 'unicode' if PY2 else 'string', 'bool': 'boolean', - 'b' : 
'boolean', - 'datetime64[ns]' : 'datetime64', - 'M' : 'datetime64', - 'timedelta64[ns]' : 'timedelta64', - 'm' : 'timedelta64', + 'b': 'boolean', + 'datetime64[ns]': 'datetime64', + 'M': 'datetime64', + 'timedelta64[ns]': 'timedelta64', + 'm': 'timedelta64', } # types only exist on certain platform @@ -88,12 +93,13 @@ cdef _try_infer_map(v): """ if its in our map, just return the dtype """ cdef: object attr, val - for attr in ['name','kind','base']: - val = getattr(v.dtype,attr) + for attr in ['name', 'kind', 'base']: + val = getattr(v.dtype, attr) if val in _TYPE_MAP: return _TYPE_MAP[val] return None + def infer_dtype(object _values): """ we are coercing to an ndarray here @@ -107,12 +113,13 @@ def infer_dtype(object _values): if isinstance(_values, np.ndarray): values = _values - elif hasattr(_values,'dtype'): + elif hasattr(_values, 'dtype'): # this will handle ndarray-like # e.g. categoricals try: - values = getattr(_values, '_values', getattr(_values, 'values', _values)) + values = getattr(_values, '_values', getattr( + _values, 'values', _values)) except: val = _try_infer_map(_values) if val is not None: @@ -242,20 +249,21 @@ def is_possible_datetimelike_array(object arr): for i in range(n): v = arr[i] if util.is_string_object(v): - continue + continue elif util._checknull(v): - continue + continue elif is_datetime(v): - seen_datetime=1 + seen_datetime=1 elif is_timedelta(v): - seen_timedelta=1 + seen_timedelta=1 else: - return False + return False return seen_datetime or seen_timedelta cdef inline bint is_null_datetimelike(v): - # determine if we have a null for a timedelta/datetime (or integer versions)x + # determine if we have a null for a timedelta/datetime (or integer + # versions)x if util._checknull(v): return True elif v is NaT: @@ -315,6 +323,7 @@ cdef inline bint is_time(object o): cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) + def is_bool_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -335,9 +344,11 @@ def is_bool_array(ndarray values): else: return False + def is_integer(object o): return util.is_integer_object(o) + def is_integer_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -358,6 +369,7 @@ def is_integer_array(ndarray values): else: return False + def is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -380,6 +392,7 @@ def is_integer_float_array(ndarray values): else: return False + def is_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -400,6 +413,7 @@ def is_float_array(ndarray values): else: return False + def is_string_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -421,6 +435,7 @@ def is_string_array(ndarray values): else: return False + def is_unicode_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -475,11 +490,12 @@ def is_datetime_array(ndarray[object] values): if is_null_datetime64(v): # we are a regular null if util._checknull(v): - null_count += 1 + null_count += 1 elif not is_datetime(v): return False return null_count != n + def is_datetime64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -619,6 +635,7 @@ cdef extern from "parse_helper.h": cdef int64_t iINT64_MAX = INT64_MAX cdef int64_t iINT64_MIN = INT64_MIN + def maybe_convert_numeric(object[:] values, set na_values, bint convert_empty=True, bint coerce_numeric=False): """ @@ -772,7 +789,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_float = 1 elif 
util.is_datetime64_object(val): if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value seen_datetime = 1 else: seen_object = 1 @@ -807,7 +825,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break else: seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value else: seen_object = 1 break @@ -857,7 +876,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return floats elif seen_int: return ints - elif not seen_datetime and not seen_numeric and not seen_timedelta: + elif (not seen_datetime and not seen_numeric + and not seen_timedelta): return bools.view(np.bool_) else: @@ -887,7 +907,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return floats elif seen_int: return ints - elif not seen_datetime and not seen_numeric and not seen_timedelta: + elif (not seen_datetime and not seen_numeric + and not seen_timedelta): return bools.view(np.bool_) return objects @@ -896,8 +917,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, def convert_sql_column(x): return maybe_convert_objects(x, try_float=1) + def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False,default=None): + dayfirst=False, default=None): cdef: Py_ssize_t i, n ndarray[object] result @@ -907,12 +929,12 @@ def try_parse_dates(ndarray[object] values, parser=None, if parser is None: if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year,date.month,1) + date=datetime.now() + default=datetime(date.year, date.month, 1) try: from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst,default=default) + parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) except ImportError: # pragma: no cover def parse_date(s): try: @@ -944,9 +966,10 @@ def try_parse_dates(ndarray[object] values, parser=None, return result + def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, date_parser=None, time_parser=None, - dayfirst=False,default=None): + dayfirst=False, default=None): cdef: Py_ssize_t i, n ndarray[object] result @@ -960,8 +983,8 @@ def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, if date_parser is None: if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year,date.month,1) + date=datetime.now() + default=datetime(date.year, date.month, 1) try: from dateutil.parser import parse @@ -1016,6 +1039,7 @@ def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, return result + def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, ndarray[object] days, @@ -1052,6 +1076,7 @@ def try_parse_datetime_components(ndarray[object] years, return result + def sanitize_objects(ndarray[object] values, set na_values, convert_empty=True): cdef: @@ -1075,6 +1100,7 @@ def sanitize_objects(ndarray[object] values, set na_values, return na_count + def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None): cdef: @@ -1166,6 +1192,7 @@ def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, return result + def map_infer(ndarray arr, object f, bint convert=1): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -1246,6 +1273,7 @@ def to_object_array(list rows, int min_width=0): return result + def tuples_to_object_array(ndarray[object] tuples): 
cdef: Py_ssize_t i, j, n, k, tmp @@ -1262,6 +1290,7 @@ def tuples_to_object_array(ndarray[object] tuples): return result + def to_object_array_tuples(list rows): cdef: Py_ssize_t i, j, n, k, tmp diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx index 096198c8a05fa..c963e256d0aa5 100644 --- a/pandas/src/offsets.pyx +++ b/pandas/src/offsets.pyx @@ -162,7 +162,7 @@ cdef class YearOffset(_Offset): cpdef prev(self): cdef int64_t days - days = 365 + is_leapyear(self.y - (1-self.ly)) + days = 365 + is_leapyear(self.y - (1 - self.ly)) self.t -= days * us_in_day self.y -= 1 @@ -204,8 +204,8 @@ cdef class MonthOffset(_Offset): self.t = ts.value + (self.dayoffset * us_in_day) # for day counting - self.m = ts.dts.month - 1 - self.y = ts.dts.year + self.m = ts.dts.month - 1 + self.y = ts.dts.year self.ly = is_leapyear(self.y) if self.biz != 0: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index bb0108fcb141c..5565f25937394 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -80,17 +80,21 @@ cdef extern from "period_helper.h": ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) void initialize_daytime_conversion_factor_matrix() - int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN + int64_t asfreq(int64_t dtordinal, int freq1, int freq2, + char relation) except INT32_MIN freq_conv_func get_asfreq_func(int fromFreq, int toFreq) void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) int64_t get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, int microseconds, int picoseconds, - int freq) nogil except INT32_MIN + int hour, int minute, int second, + int microseconds, int picoseconds, + int freq) nogil except INT32_MIN - int64_t get_python_ordinal(int64_t period_ordinal, int freq) except INT32_MIN + int64_t get_python_ordinal(int64_t period_ordinal, + int freq) except INT32_MIN - int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil except INT32_MIN + int get_date_info(int64_t ordinal, int freq, + date_info *dinfo) nogil except INT32_MIN double getAbsTime(int, int64_t, int64_t) int pyear(int64_t ordinal, int freq) except INT32_MIN @@ -134,6 +138,7 @@ cdef inline int64_t remove_mult(int64_t period_ord_w_mult, int64_t mult): return period_ord_w_mult * mult + 1; + @cython.wraparound(False) @cython.boundscheck(False) def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): @@ -158,11 +163,13 @@ def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): continue pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) out[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: out = localize_dt64arr_to_period(dtarr, freq, tz) return out + @cython.wraparound(False) @cython.boundscheck(False) def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): @@ -212,6 +219,7 @@ cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, return retval + def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ Convert int64-array of period ordinals from one frequency to another, and @@ -254,7 +262,9 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): return result -def period_ordinal(int y, int m, int d, int h, int min, int s, int us, int ps, int freq): + +def period_ordinal(int y, int m, int d, int h, int min, + int s, int us, int ps, int freq): cdef: int64_t ordinal @@ 
-284,6 +294,7 @@ cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + def period_format(int64_t value, int freq, object fmt=None): cdef: int freq_group @@ -332,7 +343,8 @@ cdef list extra_fmts = [(b"%q", b"^`AB`^"), (b"%u", b"^`IJ`^"), (b"%n", b"^`KL`^")] -cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", "^`GH`^", "^`IJ`^", "^`KL`^"] +cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", + "^`GH`^", "^`IJ`^", "^`KL`^"] cdef object _period_strftime(int64_t value, int freq, object fmt): import sys @@ -390,6 +402,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN + def get_period_field(int code, int64_t value, int freq): cdef accessor f = _get_accessor_func(code) if f is NULL: @@ -398,6 +411,7 @@ def get_period_field(int code, int64_t value, int freq): return np.nan return f(value, freq) + def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz @@ -420,7 +434,6 @@ def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): return out - cdef accessor _get_accessor_func(int code): if code == 0: return &pyear @@ -571,7 +584,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: continue @@ -613,7 +626,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, continue pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) elif _is_tzlocal(tz): for i in range(n): @@ -628,7 +642,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = _get_dst_info(tz) @@ -639,7 +654,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -647,7 +662,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + deltas[0], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: for i in range(n): if stamps[i] == NPY_NAT: @@ -656,13 +672,15 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + deltas[pos[i]], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) return result _DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})" -_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})" +_DIFFERENT_FREQ_INDEX = ("Input has different freq={1} " + "from PeriodIndex(freq={0})") class 
IncompatibleFrequency(ValueError): @@ -675,7 +693,7 @@ cdef class _Period(object): int64_t ordinal object freq - _comparables = ['name','freqstr'] + _comparables = ['name', 'freqstr'] _typ = 'period' @classmethod @@ -695,7 +713,9 @@ cdef class _Period(object): @classmethod def _from_ordinal(cls, ordinal, freq): - """ fast creation from an ordinal and freq that are already validated! """ + """ + Fast creation from an ordinal and freq that are already validated! + """ if ordinal == tslib.iNaT: return tslib.NaT else: @@ -727,7 +747,8 @@ cdef class _Period(object): return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): + if isinstance(other, (timedelta, np.timedelta64, + offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -752,7 +773,8 @@ cdef class _Period(object): def __add__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, Timedelta)): + offsets.Tick, offsets.DateOffset, + Timedelta)): return self._add_delta(other) elif other is tslib.NaT: return tslib.NaT @@ -769,7 +791,8 @@ cdef class _Period(object): def __sub__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, Timedelta)): + offsets.Tick, offsets.DateOffset, + Timedelta)): neg_other = -other return self + neg_other elif lib.is_integer(other): @@ -1138,8 +1161,9 @@ class Period(_Period): raise ValueError('Must supply freq for ordinal value') elif value is None: - if (year is None and month is None and quarter is None and - day is None and hour is None and minute is None and second is None): + if (year is None and month is None and + quarter is None and day is None and + hour is None and minute is None and second is None): ordinal = tslib.iNaT else: if freq is None: @@ -1157,7 +1181,8 @@ class Period(_Period): elif isinstance(value, Period): other = value - if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): + if freq is None or frequencies.get_freq_code( + freq) == frequencies.get_freq_code(other.freq): ordinal = other.ordinal freq = other.freq else: @@ -1177,7 +1202,8 @@ class Period(_Period): try: freq = frequencies.Resolution.get_freq(reso) except KeyError: - raise ValueError("Invalid frequency or could not infer: %s" % reso) + raise ValueError( + "Invalid frequency or could not infer: %s" % reso) elif isinstance(value, datetime): dt = value @@ -1210,7 +1236,8 @@ def _ordinal_from_fields(year, month, quarter, day, if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) - return get_period_ordinal(year, month, day, hour, minute, second, 0, 0, base) + return get_period_ordinal(year, month, day, hour, + minute, second, 0, 0, base) def _quarter_to_myear(year, quarter, freq): @@ -1218,7 +1245,8 @@ def _quarter_to_myear(year, quarter, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 4') - mnum = frequencies._month_numbers[frequencies._get_rule_month(freq)] + 1 + mnum = frequencies._month_numbers[ + frequencies._get_rule_month(freq)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index c3f8bdfbfd0a6..1cd3e53494a72 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -46,11 +46,11 @@ cdef class 
Reducer: self.chunksize = k self.increment = k * arr.dtype.itemsize - self.f = f self.arr = arr self.labels = labels - self.dummy, self.typ, self.index, self.ityp = self._check_dummy(dummy=dummy) + self.dummy, self.typ, self.index, self.ityp = self._check_dummy( + dummy=dummy) def _check_dummy(self, dummy=None): cdef object index=None, typ=None, ityp=None @@ -65,16 +65,17 @@ cdef class Reducer: else: # we passed a series-like - if hasattr(dummy,'values'): + if hasattr(dummy, 'values'): typ = type(dummy) - index = getattr(dummy,'index',None) + index = getattr(dummy, 'index', None) dummy = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if len(dummy) != self.chunksize: - raise ValueError('Dummy array must be length %d' % self.chunksize) + raise ValueError('Dummy array must be length %d' % + self.chunksize) return dummy, typ, index, ityp @@ -111,15 +112,16 @@ cdef class Reducer: if self.typ is not None: - # recreate with the index if supplied - if has_index: + # recreate with the index if supplied + if has_index: - cached_typ = self.typ(chunk, index=self.index, name=name) + cached_typ = self.typ( + chunk, index=self.index, name=name) - else: + else: - # use the passsed typ, sans index - cached_typ = self.typ(chunk, name=name) + # use the passsed typ, sans index + cached_typ = self.typ(chunk, name=name) # use the cached_typ if possible if cached_typ is not None: @@ -127,13 +129,15 @@ cdef class Reducer: if has_index: object.__setattr__(cached_typ, 'index', self.index) - object.__setattr__(cached_typ._data._block, 'values', chunk) + object.__setattr__( + cached_typ._data._block, 'values', chunk) object.__setattr__(cached_typ, 'name', name) res = self.f(cached_typ) else: res = self.f(chunk) - if hasattr(res,'values') and isinstance(res.values, np.ndarray): + if hasattr(res, 'values') and isinstance( + res.values, np.ndarray): res = res.values if i == 0: result = _get_result_array(res, @@ -167,7 +171,8 @@ cdef class SeriesBinGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, values, f, bins, typ, ityp, name + object arr, index, dummy_arr, dummy_index + object values, f, bins, typ, ityp, name def __init__(self, object series, object f, object bins, object dummy): n = len(series) @@ -182,7 +187,7 @@ cdef class SeriesBinGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series,'name',None) + self.name = getattr(series, 'name', None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.passed_dummy = dummy is not None @@ -205,7 +210,7 @@ cdef class SeriesBinGrouper: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index.values + index = dummy.index.values if not index.flags.contiguous: index = index.copy() @@ -227,9 +232,9 @@ cdef class SeriesBinGrouper: counts[0] = self.bins[0] for i in range(1, self.ngroups): if i == self.ngroups - 1: - counts[i] = len(self.arr) - self.bins[i-1] + counts[i] = len(self.arr) - self.bins[i - 1] else: - counts[i] = self.bins[i] - self.bins[i-1] + counts[i] = self.bins[i] - self.bins[i - 1] group_size = 0 n = len(self.arr) @@ -252,7 +257,8 @@ cdef class SeriesBinGrouper: else: object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__( + cached_typ._data._block, 'values', vslider.buf) 
object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) @@ -293,7 +299,8 @@ cdef class SeriesGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, f, labels, values, typ, ityp, name + object arr, index, dummy_arr, dummy_index + object f, labels, values, typ, ityp, name def __init__(self, object series, object f, object labels, Py_ssize_t ngroups, object dummy): @@ -309,7 +316,7 @@ cdef class SeriesGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series,'name',None) + self.name = getattr(series, 'name', None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.passed_dummy = dummy is not None @@ -320,14 +327,14 @@ cdef class SeriesGrouper: if dummy is None: values = np.empty(0, dtype=self.arr.dtype) - index = None + index = None else: values = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index.values + index = dummy.index.values if not index.flags.contiguous: index = index.copy() @@ -375,7 +382,8 @@ cdef class SeriesGrouper: else: object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__( + cached_typ._data._block, 'values', vslider.buf) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) @@ -411,14 +419,14 @@ cdef class SeriesGrouper: cdef inline _extract_result(object res): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ - if hasattr(res,'values'): - res = res.values + if hasattr(res, 'values'): + res = res.values if not np.isscalar(res): - if isinstance(res, np.ndarray): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: - res = res[0] + if isinstance(res, np.ndarray): + if res.ndim == 0: + res = res.item() + elif res.ndim == 1 and len(res) == 1: + res = res[0] return res cdef class Slider: @@ -467,9 +475,11 @@ cdef class Slider: self.buf.data = self.orig_data self.buf.strides[0] = self.orig_stride + class InvalidApply(Exception): pass + def apply_frame_axis0(object frame, object f, object names, ndarray[int64_t] starts, ndarray[int64_t] ends): cdef: @@ -482,7 +492,6 @@ def apply_frame_axis0(object frame, object f, object names, if frame.index._has_complex_internals: raise InvalidApply('Cannot modify frame index internals') - results = [] # Need to infer if our low-level mucking is going to cause a segfault @@ -496,7 +505,6 @@ def apply_frame_axis0(object frame, object f, object names, except: raise InvalidApply('Let this error raise above us') - slider = BlockSlider(frame) mutated = False @@ -550,7 +558,8 @@ cdef class BlockSlider: util.set_array_not_contiguous(x) self.nblocks = len(self.blocks) - self.idx_slider = Slider(self.frame.index.values, self.dummy.index.values) + self.idx_slider = Slider( + self.frame.index.values, self.dummy.index.values) self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) for i, block in enumerate(self.blocks): @@ -574,7 +583,7 @@ cdef class BlockSlider: # move and set the index self.idx_slider.move(start, end) - object.__setattr__(self.index,'_data',self.idx_slider.buf) + object.__setattr__(self.index, '_data', self.idx_slider.buf) self.index._engine.clear_mapping() cdef reset(self): @@ -589,6 +598,7 @@ cdef class 
BlockSlider: arr.data = self.base_ptrs[i] arr.shape[1] = 0 + def reduce(arr, f, axis=0, dummy=None, labels=None): """ @@ -606,7 +616,7 @@ def reduce(arr, f, axis=0, dummy=None, labels=None): raise Exception('Cannot use shortcut') # pass as an ndarray - if hasattr(labels,'values'): + if hasattr(labels, 'values'): labels = labels.values reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) diff --git a/pandas/src/skiplist.pyx b/pandas/src/skiplist.pyx index e7db7bd5a4a02..3017931e25115 100644 --- a/pandas/src/skiplist.pyx +++ b/pandas/src/skiplist.pyx @@ -75,7 +75,6 @@ cdef class IndexableSkiplist: i -= node.width[level] node = node.next[level] - return node.value cpdef insert(self, double value): diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 88eb4cf13815b..7ab29414499fc 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -20,7 +20,7 @@ _np_version_under1p11 = LooseVersion(_np_version) < '1.11' np.import_array() np.import_ufunc() -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Preamble stuff cdef float64_t NaN = np.NaN @@ -29,7 +29,7 @@ cdef float64_t INF = np.inf cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- cdef class SparseIndex: @@ -112,7 +112,8 @@ cdef class IntIndex(SparseIndex): xindices = self.indices yindices = y.indices - new_indices = np.empty(min(len(xindices), len(yindices)), dtype=np.int32) + new_indices = np.empty(min( + len(xindices), len(yindices)), dtype=np.int32) for xi from 0 <= xi < self.npoints: xind = xindices[xi] @@ -171,7 +172,8 @@ cdef class IntIndex(SparseIndex): return -1 @cython.wraparound(False) - cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + cpdef ndarray[int32_t] lookup_array(self, ndarray[ + int32_t, ndim=1] indexer): """ Vectorized lookup, returns ndarray[int32_t] """ @@ -279,7 +281,7 @@ cpdef get_blocks(ndarray[int32_t, ndim=1] indices): lens = lens[:result_indexer] return locs, lens -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # BlockIndex cdef class BlockIndex(SparseIndex): @@ -350,7 +352,7 @@ cdef class BlockIndex(SparseIndex): for i from 0 <= i < self.nblocks: if i > 0: - if blocs[i] <= blocs[i-1]: + if blocs[i] <= blocs[i - 1]: raise ValueError('Locations not in ascending order') if i < self.nblocks - 1: @@ -524,7 +526,8 @@ cdef class BlockIndex(SparseIndex): return -1 @cython.wraparound(False) - cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + cpdef ndarray[int32_t] lookup_array(self, ndarray[ + int32_t, ndim=1] indexer): """ Vectorized lookup, returns ndarray[int32_t] """ @@ -642,7 +645,8 @@ cdef class BlockUnion(BlockMerge): cdef _make_merged_blocks(self): cdef: - ndarray[int32_t, ndim=1] xstart, xend, ystart, yend, out_bloc, out_blen + ndarray[int32_t, ndim=1] xstart, xend, ystart + ndarray[int32_t, ndim=1] yend, out_bloc, out_blen int32_t nstart, nend, diff Py_ssize_t max_len, result_indexer = 0 @@ -752,14 +756,13 @@ cdef class BlockUnion(BlockMerge): return self._find_next_block_end(1 - mode) 
-#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Sparse arithmetic include "sparse_op_helper.pxi" - -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Indexing operations def get_reindexer(ndarray[object, ndim=1] values, dict index_map): diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index e9563d9168206..cda21ba9c4ce1 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -68,13 +68,14 @@ cpdef assert_almost_equal(a, b, b : object check_less_precise : bool or int, default False Specify comparison precision. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If an integer, then this will be the number of decimal points to compare + 5 digits (False) or 3 digits (True) after decimal points are + compared. If an integer, then this will be the number of decimal + points to compare check_dtype: bool, default True check dtype if both a and b are np.ndarray obj : str, default None - Specify object name being compared, internally used to show appropriate - assertion message + Specify object name being compared, internally used to show + appropriate assertion message lobj : str, default None Specify left object name being compared, internally used to show appropriate assertion message @@ -129,8 +130,9 @@ cpdef assert_almost_equal(a, b, na, nb = a.size, b.size if a.shape != b.shape: from pandas.util.testing import raise_assert_detail - raise_assert_detail(obj, '{0} shapes are different'.format(obj), - a.shape, b.shape) + raise_assert_detail( + obj, '{0} shapes are different'.format(obj), + a.shape, b.shape) if check_dtype and not is_dtype_equal(a, b): from pandas.util.testing import assert_attr_equal @@ -148,7 +150,7 @@ cpdef assert_almost_equal(a, b, from pandas.util.testing import raise_assert_detail # if we have a small diff set, print it - if abs(na-nb) < 10: + if abs(na - nb) < 10: r = list(set(a) ^ set(b)) else: r = None @@ -158,14 +160,16 @@ cpdef assert_almost_equal(a, b, for i in xrange(len(a)): try: - assert_almost_equal(a[i], b[i], check_less_precise=check_less_precise) + assert_almost_equal(a[i], b[i], + check_less_precise=check_less_precise) except AssertionError: is_unequal = True diff += 1 if is_unequal: from pandas.util.testing import raise_assert_detail - msg = '{0} values are different ({1} %)'.format(obj, np.round(diff * 100.0 / na, 5)) + msg = '{0} values are different ({1} %)'.format( + obj, np.round(diff * 100.0 / na, 5)) raise_assert_detail(obj, msg, lobj, robj) return True @@ -198,12 +202,12 @@ cpdef assert_almost_equal(a, b, # case for zero if abs(fa) < 1e-5: if not decimal_almost_equal(fa, fb, decimal): - assert False, ( - '(very low values) expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) - ) + assert False, ('(very low values) expected %.5f but ' + 'got %.5f, with decimal %d' % (fb, fa, decimal)) else: if not decimal_almost_equal(1, fb / fa, decimal): - assert False, 'expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) + assert False, ('expected %.5f but got %.5f, ' + 'with decimal %d' % (fb, fa, decimal)) return True raise AssertionError("{0} != {1}".format(a, b)) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c9e85c5741410..9073ad0abd535 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -36,7 +36,8 @@ cdef extern from "datetime_helper.h": from datetime 
cimport cmp_pandas_datetimestruct from libc.stdlib cimport free -from util cimport is_integer_object, is_float_object, is_datetime64_object, is_timedelta64_object +from util cimport (is_integer_object, is_float_object, is_datetime64_object, + is_timedelta64_object) cimport util from datetime cimport * @@ -49,8 +50,10 @@ from datetime import time as datetime_time import re # dateutil compat -from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, tzfile as _dateutil_tzfile, - tzutc as _dateutil_tzutc, tzstr as _dateutil_tzstr) +from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, + tzfile as _dateutil_tzfile, + tzutc as _dateutil_tzutc, + tzstr as _dateutil_tzstr) from pandas.compat import is_platform_windows if is_platform_windows(): @@ -61,7 +64,8 @@ from dateutil.relativedelta import relativedelta from dateutil.parser import DEFAULTPARSER from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo -from pandas.compat import parse_date, string_types, iteritems, StringIO, callable +from pandas.compat import (parse_date, string_types, iteritems, + StringIO, callable) import operator import collections @@ -89,8 +93,10 @@ try: except NameError: # py3 basestring = str -cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, - object tz, object freq): + +cdef inline object create_timestamp_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): cdef _Timestamp ts_base ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, dts.day, dts.hour, dts.min, @@ -101,13 +107,17 @@ cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct return ts_base -cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, - object tz, object freq): + +cdef inline object create_datetime_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) + def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): - # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == True) + # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == + # True) cdef: Py_ssize_t i, n = len(arr) @@ -133,7 +143,8 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): if value == NPY_NAT: result[i] = NaT else: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + value, PANDAS_FR_ns, &dts) result[i] = func_create(value, dts, tz, freq) elif _is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): @@ -141,7 +152,8 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): if value == NPY_NAT: result[i] = NaT else: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + value, PANDAS_FR_ns, &dts) dt = create_datetime_from_ts(value, dts, tz, freq) dt = dt + tz.utcoffset(dt) if box: @@ -163,10 +175,12 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos]] else: - # no zone-name change for dateutil tzs - dst etc represented in single object. + # no zone-name change for dateutil tzs - dst etc + # represented in single object. 
new_tz = tz - pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + value + deltas[pos], PANDAS_FR_ns, &dts) result[i] = func_create(value, dts, new_tz, freq) else: for i in range(n): @@ -180,8 +194,10 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): return result + def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): - # convert an i8 repr to an ndarray of timedelta or Timedelta (if box == True) + # convert an i8 repr to an ndarray of timedelta or Timedelta (if box == + # True) cdef: Py_ssize_t i, n = len(arr) @@ -197,7 +213,7 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): if box: result[i] = Timedelta(value) else: - result[i] = timedelta(microseconds=int(value)/1000) + result[i] = timedelta(microseconds=int(value) / 1000) return result @@ -205,6 +221,7 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): cdef inline bint _is_tzlocal(object tz): return isinstance(tz, _dateutil_tzlocal) + cdef inline bint _is_fixed_offset(object tz): if _treat_tz_as_dateutil(tz): if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0: @@ -212,7 +229,8 @@ cdef inline bint _is_fixed_offset(object tz): else: return 0 elif _treat_tz_as_pytz(tz): - if len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0: + if (len(tz._transition_info) == 0 + and len(tz._utc_transition_times) == 0): return 1 else: return 0 @@ -223,6 +241,8 @@ _no_input = object() # Python front end to C extension type _Timestamp # This serves as the box for datetime64 + + class Timestamp(_Timestamp): """TimeStamp is the pandas equivalent of python's Datetime and is interchangable with it in most cases. It's the type used @@ -281,7 +301,8 @@ class Timestamp(_Timestamp): offset : str, DateOffset Deprecated, use freq """ - return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz, offset=offset) + return cls(datetime.fromordinal(ordinal), + freq=freq, tz=tz, offset=offset) @classmethod def now(cls, tz=None): @@ -370,13 +391,16 @@ class Timestamp(_Timestamp): if ts_input is _no_input: # User passed keyword arguments. 
return Timestamp(datetime(year, month, day, hour or 0, - minute or 0, second or 0, microsecond or 0, tzinfo), - tz=tzinfo) + minute or 0, second or 0, + microsecond or 0, tzinfo), + tz=tzinfo) elif is_integer_object(freq): # User passed positional arguments: - # Timestamp(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]]) + # Timestamp(year, month, day[, hour[, minute[, second[, + # microsecond[, tzinfo]]]]]) return Timestamp(datetime(ts_input, freq, tz, unit or 0, - year or 0, month or 0, day or 0, hour), tz=hour) + year or 0, month or 0, day or 0, + hour), tz=hour) ts = convert_to_tsobject(ts_input, tz, unit, 0, 0) @@ -399,7 +423,6 @@ class Timestamp(_Timestamp): return ts_base - def _round(self, freq, rounder): cdef int64_t unit @@ -411,7 +434,7 @@ class Timestamp(_Timestamp): value = self.tz_localize(None).value else: value = self.value - result = Timestamp(unit*rounder(value/float(unit)),unit='ns') + result = Timestamp(unit * rounder(value / float(unit)), unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) return result @@ -493,7 +516,8 @@ class Timestamp(_Timestamp): @property def weekday_name(self): - out = get_date_name_field(np.array([self.value], dtype=np.int64), 'weekday_name') + out = get_date_name_field( + np.array([self.value], dtype=np.int64), 'weekday_name') return out[0] @property @@ -592,8 +616,8 @@ class Timestamp(_Timestamp): # tz naive, localize tz = maybe_get_tz(tz) if not isinstance(ambiguous, basestring): - ambiguous = [ambiguous] - value = tz_localize_to_utc(np.array([self.value],dtype='i8'), tz, + ambiguous = [ambiguous] + value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, ambiguous=ambiguous, errors=errors)[0] return Timestamp(value, tz=tz) else: @@ -605,7 +629,6 @@ class Timestamp(_Timestamp): raise TypeError('Cannot localize tz-aware Timestamp, use ' 'tz_convert for conversions') - def tz_convert(self, tz): """ Convert tz-aware Timestamp to another time zone. @@ -677,25 +700,26 @@ class Timestamp(_Timestamp): year -= 1 month += 12 return (day + - np.fix((153*month - 457)/5) + - 365*year + + np.fix((153 * month - 457) / 5) + + 365 * year + np.floor(year / 4) - np.floor(year / 100) + np.floor(year / 400) + 1721118.5 + (self.hour + - self.minute/60.0 + - self.second/3600.0 + - self.microsecond/3600.0/1e+6 + - self.nanosecond/3600.0/1e+9 - )/24.0) + self.minute / 60.0 + + self.second / 3600.0 + + self.microsecond / 3600.0 / 1e+6 + + self.nanosecond / 3600.0 / 1e+9 + ) / 24.0) def normalize(self): """ Normalize Timestamp to midnight, preserving tz information. 
""" - normalized_value = date_normalize(np.array([self.value], dtype='i8'), tz=self.tz)[0] + normalized_value = date_normalize( + np.array([self.value], dtype='i8'), tz=self.tz)[0] return Timestamp(normalized_value).tz_localize(self.tz) def __radd__(self, other): @@ -704,7 +728,9 @@ class Timestamp(_Timestamp): return self + other -_nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) +_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + + class NaTType(_NaT): """(N)ot-(A)-(T)ime, the time equivalent of NaN""" @@ -762,7 +788,6 @@ class NaTType(_NaT): return NotImplemented - fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', @@ -771,20 +796,23 @@ for field in fields: prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) -# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or return NaT -# create functions that raise, for binding to NaTType + +# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or +# return NaT create functions that raise, for binding to NaTType def _make_error_func(func_name): def f(*args, **kwargs): raise ValueError("NaTType does not support " + func_name) f.__name__ = func_name return f + def _make_nat_func(func_name): def f(*args, **kwargs): return NaT f.__name__ = func_name return f + def _make_nan_func(func_name): def f(*args, **kwargs): return np.nan @@ -813,7 +841,9 @@ for _maybe_method_name in dir(NaTType): if (callable(_maybe_method) and not _maybe_method_name.startswith("_") and _maybe_method_name not in _implemented_methods): - setattr(NaTType, _maybe_method_name, _make_error_func(_maybe_method_name)) + setattr(NaTType, _maybe_method_name, + _make_error_func(_maybe_method_name)) + def __nat_unpickle(*args): # return constant defined in the module @@ -1028,9 +1058,11 @@ cdef class _Timestamp(datetime): pass tz = ", tz='{0}'".format(zone) if zone is not None else "" - freq = ", freq='{0}'".format(self.freq.freqstr) if self.freq is not None else "" + freq = ", freq='{0}'".format( + self.freq.freqstr) if self.freq is not None else "" - return "Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp, tz=tz, freq=freq) + return "Timestamp('{stamp}'{tz}{freq})".format( + stamp=stamp, tz=tz, freq=freq) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -1101,7 +1133,8 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') - return Timestamp(self.value + other_int, tz=self.tzinfo, freq=self.freq) + return Timestamp(self.value + other_int, + tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): if self is NaT: @@ -1114,7 +1147,8 @@ cdef class _Timestamp(datetime): elif isinstance(other, timedelta) or hasattr(other, 'delta'): nanos = _delta_to_nanoseconds(other) - result = Timestamp(self.value + nanos, tz=self.tzinfo, freq=self.freq) + result = Timestamp(self.value + nanos, + tz=self.tzinfo, freq=self.freq) if getattr(other, 'normalize', False): result = Timestamp(normalize_date(result)) return result @@ -1148,21 +1182,27 @@ cdef class _Timestamp(datetime): return NaT # coerce if necessary if we are a Timestamp-like - if isinstance(self, datetime) and (isinstance(other, datetime) or is_datetime64_object(other)): + if (isinstance(self, datetime) + and (isinstance(other, datetime) + or is_datetime64_object(other))): self = Timestamp(self) other = Timestamp(other) # 
validate tz's if get_timezone(self.tzinfo) != get_timezone(other.tzinfo): - raise TypeError("Timestamp subtraction must have the same timezones or no timezones") + raise TypeError( + "Timestamp subtraction must have the " + "same timezones or no timezones") - # scalar Timestamp/datetime - Timestamp/datetime -> yields a Timedelta + # scalar Timestamp/datetime - Timestamp/datetime -> yields a + # Timedelta try: - return Timedelta(self.value-other.value) + return Timedelta(self.value -other.value) except (OverflowError, OutOfBoundsDatetime): pass - # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with same timezone if specified) + # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with + # same timezone if specified) return datetime.__sub__(self, other) cpdef _get_field(self, field): @@ -1170,9 +1210,12 @@ cdef class _Timestamp(datetime): return int(out[0]) cpdef _get_start_end_field(self, field): - month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12 + month_kw = self.freq.kwds.get( + 'startingMonth', self.freq.kwds.get( + 'month', 12)) if self.freq else 12 freqstr = self.freqstr if self.freq else None - out = get_start_end_field(np.array([self.value], dtype=np.int64), field, freqstr, month_kw) + out = get_start_end_field( + np.array([self.value], dtype=np.int64), field, freqstr, month_kw) return out[0] property _repr_base: @@ -1361,19 +1404,20 @@ cdef convert_to_tsobject(object ts, object tz, object unit, obj.value = NPY_NAT else: obj.value = _get_datetime64_nanos(ts) - pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + pandas_datetime_to_datetimestruct( + obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): if ts == NPY_NAT: obj.value = NPY_NAT else: - ts = ts * cast_from_unit(None,unit) + ts = ts * cast_from_unit(None, unit) obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_float_object(ts): if ts != ts or ts == NPY_NAT: obj.value = NPY_NAT else: - ts = cast_from_unit(ts,unit) + ts = cast_from_unit(ts, unit) obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif PyDateTime_Check(ts): @@ -1424,7 +1468,9 @@ cdef convert_to_tsobject(object ts, object tz, object unit, ts = datetime.combine(ts, datetime_time()) return convert_to_tsobject(ts, tz, None, 0, 0) elif getattr(ts, '_typ', None) == 'period': - raise ValueError("Cannot convert Period to Timestamp unambiguously. Use to_timestamp") + raise ValueError( + "Cannot convert Period to Timestamp " + "unambiguously. 
Use to_timestamp") else: raise TypeError('Cannot convert input to Timestamp') @@ -1465,7 +1511,8 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, else: try: _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) - obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) + obj.value = pandas_datetimestruct_to_datetime( + PANDAS_FR_ns, &obj.dts) _check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) @@ -1483,12 +1530,14 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, ts = tz_convert_single(ts, tz, 'UTC') except ValueError: try: - ts = parse_datetime_string(ts, dayfirst=dayfirst, yearfirst=yearfirst) + ts = parse_datetime_string( + ts, dayfirst=dayfirst, yearfirst=yearfirst) except Exception: raise ValueError("could not convert string to Timestamp") return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + def _test_parse_iso8601(object ts): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used @@ -1534,7 +1583,6 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pos = trans.searchsorted(obj.value, side='right') - 1 - # static/pytz/dateutil specific code if _is_fixed_offset(tz): # statictzinfo @@ -1542,7 +1590,8 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pandas_datetime_to_datetimestruct(obj.value + deltas[0], PANDAS_FR_ns, &obj.dts) else: - pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + pandas_datetime_to_datetimestruct( + obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz elif _treat_tz_as_pytz(tz): inf = tz._transition_info[pos] @@ -1591,21 +1640,29 @@ cdef inline bint _is_utc(object tz): cdef inline object _get_zone(object tz): """ We need to do several things here: - 1/ Distinguish between pytz and dateutil timezones - 2/ Not be over-specific (e.g. US/Eastern with/without DST is same *zone* but a different tz object) - 3/ Provide something to serialize when we're storing a datetime object in pytables. - - We return a string prefaced with dateutil if it's a dateutil tz, else just the tz name. It needs to be a - string so that we can serialize it with UJSON/pytables. maybe_get_tz (below) is the inverse of this process. + 1) Distinguish between pytz and dateutil timezones + 2) Not be over-specific (e.g. US/Eastern with/without DST is same *zone* + but a different tz object) + 3) Provide something to serialize when we're storing a datetime object + in pytables. + + We return a string prefaced with dateutil if it's a dateutil tz, else just + the tz name. It needs to be a string so that we can serialize it with + UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ if _is_utc(tz): return 'UTC' else: if _treat_tz_as_dateutil(tz): if '.tar.gz' in tz._filename: - raise ValueError('Bad tz filename. Dateutil on python 3 on windows has a bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones implicitly by passing a string like "dateutil/Europe/London" ' - 'when you construct your pandas objects instead of passing a timezone object. See https://github.com/pydata/pandas/pull/7362') + raise ValueError( + 'Bad tz filename. Dateutil on python 3 on windows has a ' + 'bug which causes tzfile._filename to be the same for all ' + 'timezone files. Please construct dateutil timezones ' + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + 'of passing a timezone object. 
See ' + 'https://github.com/pydata/pandas/pull/7362') return 'dateutil/' + tz._filename else: # tz is a pytz timezone or unknown. @@ -1620,8 +1677,8 @@ cdef inline object _get_zone(object tz): cpdef inline object maybe_get_tz(object tz): """ - (Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object. - Otherwise, just return tz. + (Maybe) Construct a timezone object from a string. If tz is a string, use + it to construct a timezone object. Otherwise, just return tz. """ if isinstance(tz, string_types): if tz == 'tzlocal()': @@ -1639,7 +1696,6 @@ cpdef inline object maybe_get_tz(object tz): return tz - class OutOfBoundsDatetime(ValueError): pass @@ -1659,7 +1715,8 @@ cdef inline _check_dts_bounds(pandas_datetimestruct *dts): dts.day, dts.hour, dts.min, dts.sec) - raise OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt) + raise OutOfBoundsDatetime( + 'Out of bounds nanosecond timestamp: %s' % fmt) def datetime_to_datetime64(ndarray[object] values): @@ -1689,7 +1746,8 @@ def datetime_to_datetime64(ndarray[object] values): _check_dts_bounds(&_ts.dts) else: if inferred_tz is not None: - raise ValueError('Cannot mix tz-aware with tz-naive values') + raise ValueError( + 'Cannot mix tz-aware with tz-naive values') iresult[i] = _pydatetime_to_dts(val, &dts) _check_dts_bounds(&dts) else: @@ -1698,7 +1756,7 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz cdef: - set _not_datelike_strings = set(['a','A','m','M','p','P','t','T']) + set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) cpdef bint _does_string_look_like_datetime(object date_string): if date_string.startswith('0'): @@ -1742,7 +1800,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, pandas_datetimestruct dts if na_rep is None: - na_rep = 'NaT' + na_rep = 'NaT' # if we don't have a format nor tz, then choose # a format based on precision @@ -1780,7 +1838,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, elif show_us: res += '.%.6d' % dts.us elif show_ms: - res += '.%.3d' % (dts.us/1000) + res += '.%.3d' % (dts.us /1000) result[i] = res @@ -1810,7 +1868,6 @@ cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') def parse_datetime_string(object date_string, object freq=None, dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. Also cares special handling matching time patterns. 
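    (A time-only string such as "10:15" is, in effect, combined with the
    current date rather than with the module's 1/1/1 default.)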
@@ -1913,23 +1970,27 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, i = date_string.index('Q', 1, 6) if i == 1: quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 and date_string[i + 1] == '-'): + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): # r'(\d)Q-?(\d\d)') year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 and date_string[i + 1] == '-'): + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): # r'(\d)Q-?(\d\d\d\d)') year = int(date_string[-4:]) else: raise ValueError elif i == 2 or i == 3: # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 and date_string[i - 1] == '-'): + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): quarter = int(date_string[-1]) year = 2000 + int(date_string[:2]) else: raise ValueError elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 and date_string[i - 1] == '-'): + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): # r'(\d\d\d\d)-?Q(\d)' quarter = int(date_string[-1]) year = int(date_string[:4]) @@ -1937,7 +1998,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError if not (1 <= quarter <= 4): - msg = 'Incorrect quarterly string is given, quarter must be between 1 and 4: {0}' + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') raise DateParseError(msg.format(date_string)) if freq is not None: @@ -1945,7 +2007,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, try: mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 except (KeyError, ValueError): - msg = 'Unable to retrieve month information from given freq: {0}'.format(freq) + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) raise DateParseError(msg) month = (mnum + (quarter - 1) * 3) % 12 + 1 @@ -1962,7 +2025,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, except ValueError: pass - if date_len == 6 and (freq == 'M' or getattr(freq, 'rule_code', None) == 'M'): + if date_len == 6 and (freq == 'M' or getattr( + freq, 'rule_code', None) == 'M'): year = int(date_string[:4]) month = int(date_string[4:6]) try: @@ -2048,7 +2112,8 @@ def dateutil_parse(object timestr, object default, ignoretz=False, # const for parsers -_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, microsecond=0) +_DEFAULT_DATETIME = datetime(1, 1, 1).replace( + hour=0, minute=0, second=0, microsecond=0) _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] _MONTH_NUMBERS = dict((k, i) for i, k in enumerate(_MONTHS)) @@ -2092,7 +2157,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): int64_t m ndarray[float64_t] fvalues ndarray mask - bint is_ignore=errors=='ignore', is_coerce=errors=='coerce', is_raise=errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' + bint is_raise = errors=='raise' bint need_to_iterate=True ndarray[int64_t] iresult ndarray[object] oresult @@ -2123,9 +2190,11 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): # check the bounds if not need_to_iterate: - if (fvalues < _NS_LOWER_BOUND).any() or (fvalues > _NS_UPPER_BOUND).any(): - raise OutOfBoundsDatetime("cannot convert input with unit '{0}'".format(unit)) - result = (iresult*m).astype('M8[ns]') + if ((fvalues < _NS_LOWER_BOUND).any() + or (fvalues > _NS_UPPER_BOUND).any()): + 
raise OutOfBoundsDatetime( + "cannot convert input with unit '{0}'".format(unit)) + result = (iresult *m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = iNaT return result @@ -2149,10 +2218,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): iresult[i] = cast_from_unit(val, unit) except OverflowError: if is_raise: - raise OutOfBoundsDatetime("cannot convert input {0}" - "with the unit '{1}'".format( - val, - unit)) + raise OutOfBoundsDatetime( + "cannot convert input {0} with the unit " + "'{1}'".format(val, unit)) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -2166,19 +2234,17 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): iresult[i] = cast_from_unit(float(val), unit) except ValueError: if is_raise: - raise ValueError("non convertible value {0}" - "with the unit '{1}'".format( - val, - unit)) + raise ValueError( + "non convertible value {0} with the unit " + "'{1}'".format(val, unit)) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT except: if is_raise: - raise OutOfBoundsDatetime("cannot convert input {0}" - "with the unit '{1}'".format( - val, - unit)) + raise OutOfBoundsDatetime( + "cannot convert input {0} with the unit " + "'{1}'".format(val, unit)) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -2240,8 +2306,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', ndarray[int64_t] iresult ndarray[object] oresult pandas_datetimestruct dts - bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0 - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + bint utc_convert = bool(utc) + bint seen_integer = 0 + bint seen_string = 0 + bint seen_datetime = 0 + bint is_raise = errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' _TSObject _ts int out_local=0, out_tzoffset=0 @@ -2340,7 +2411,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', seen_string=1 _string_to_dts(val, &dts, &out_local, &out_tzoffset) - value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + value = pandas_datetimestruct_to_datetime( + PANDAS_FR_ns, &dts) if out_local == 1: tz = pytz.FixedOffset(out_tzoffset) value = tz_convert_single(value, tz, 'UTC') @@ -2353,8 +2425,9 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError("time data %r doesn't match format specified" % - (val,)) + raise ValueError( + "time data %r doesn't match format " + "specified" % (val,)) else: return values @@ -2398,7 +2471,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', if is_integer_object(val) or is_float_object(val): result[i] = NPY_NAT elif is_raise: - raise ValueError("mixed datetimes and integers in passed array") + raise ValueError( + "mixed datetimes and integers in passed array") else: raise TypeError @@ -2440,7 +2514,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst) + yearfirst=yearfirst) _pydatetime_to_dts(oresult[i], &dts) _check_dts_bounds(&dts) except Exception: @@ -2456,11 +2530,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -# Similar to Timestamp/datetime, this is a construction requirement for timedeltas -# we need to do object instantiation in python -# This will serve as a C extension type that -# shadows the python class, where we do any heavy lifting. 
- +# Similar to Timestamp/datetime, this is a construction requirement for +# timedeltas that we need to do object instantiation in python. This will +# serve as a C extension type that shadows the Python class, where we do any +# heavy lifting. cdef class _Timedelta(timedelta): cdef readonly: @@ -2526,14 +2599,14 @@ cdef class _Timedelta(timedelta): return # put frac in seconds - frac = ivalue/(1000*1000*1000) + frac = ivalue /(1000 *1000 *1000) if frac < 0: self._sign = -1 # even fraction if (-frac % 86400) != 0: - self._d = -frac/86400 + 1 - frac += 86400*self._d + self._d = -frac /86400 + 1 + frac += 86400 *self._d else: frac = -frac else: @@ -2542,37 +2615,38 @@ cdef class _Timedelta(timedelta): if frac >= 86400: self._d += frac / 86400 - frac -= self._d * 86400 + frac -= self._d * 86400 if frac >= 3600: - self._h = frac / 3600 - frac -= self._h * 3600 + self._h = frac / 3600 + frac -= self._h * 3600 else: self._h = 0 if frac >= 60: self._m = frac / 60 - frac -= self._m * 60 + frac -= self._m * 60 else: self._m = 0 if frac >= 0: self._s = frac - frac -= self._s + frac -= self._s else: self._s = 0 - sfrac = (self._h*3600 + self._m*60 + self._s)*(1000*1000*1000) + sfrac = (self._h * 3600 + self._m * 60 + + self._s) * (1000 * 1000 * 1000) if self._sign < 0: - ifrac = ivalue + self._d*DAY_NS - sfrac + ifrac = ivalue + self._d *DAY_NS - sfrac else: - ifrac = ivalue - (self._d*DAY_NS + sfrac) + ifrac = ivalue - (self._d *DAY_NS + sfrac) if ifrac != 0: - self._ms = ifrac/(1000*1000) - ifrac -= self._ms*1000*1000 - self._us = ifrac/1000 - ifrac -= self._us*1000 + self._ms = ifrac /(1000 *1000) + ifrac -= self._ms *1000 *1000 + self._us = ifrac /1000 + ifrac -= self._us *1000 self._ns = ifrac else: self._ms = 0 @@ -2586,16 +2660,20 @@ cdef class _Timedelta(timedelta): return an actual datetime.timedelta object note: we lose nanosecond resolution if any """ - return timedelta(microseconds=int(self.value)/1000) + return timedelta(microseconds=int(self.value) /1000) cpdef bint _has_ns(self): return self.value % 1000 != 0 # components named tuple -Components = collections.namedtuple('Components',['days','hours','minutes','seconds','milliseconds','microseconds','nanoseconds']) +Components = collections.namedtuple('Components', [ + 'days', 'hours', 'minutes', 'seconds', + 'milliseconds', 'microseconds', 'nanoseconds']) # Python front end to C extension type _Timedelta # This serves as the box for timedelta64 + + class Timedelta(_Timedelta): """ Represents a duration, the difference between two dates or times. @@ -2608,7 +2686,8 @@ class Timedelta(_Timedelta): value : Timedelta, timedelta, np.timedelta64, string, or integer unit : string, [D,h,m,s,ms,us,ns] Denote the unit of the input, if input is an integer. Default 'ns'. - days, seconds, microseconds, milliseconds, minutes, hours, weeks : numeric, optional + days, seconds, microseconds, + milliseconds, minutes, hours, weeks : numeric, optional Values for construction in compat with datetime.timedelta. np ints and floats will be coereced to python ints and floats. @@ -2623,43 +2702,52 @@ class Timedelta(_Timedelta): if value is _no_input: if not len(kwargs): - raise ValueError("cannot construct a Timedelta without a value/unit or descriptive keywords (days,seconds....)") + raise ValueError( + "cannot construct a Timedelta without a value/unit or " + "descriptive keywords (days,seconds....)") def _to_py_int_float(v): if is_integer_object(v): return int(v) elif is_float_object(v): return float(v) - raise TypeError("Invalid type {0}. 
Must be int or float.".format(type(v))) + raise TypeError( + "Invalid type {0}. Must be int or float.".format(type(v))) - kwargs = dict([ (k, _to_py_int_float(v)) for k, v in iteritems(kwargs) ]) + kwargs = dict([ (k, _to_py_int_float(v)) + for k, v in iteritems(kwargs) ]) try: - nano = kwargs.pop('nanoseconds',0) - value = convert_to_timedelta64(timedelta(**kwargs),'ns') + nano + nano = kwargs.pop('nanoseconds', 0) + value = convert_to_timedelta64( + timedelta(**kwargs), 'ns') + nano except TypeError as e: - raise ValueError("cannot construct a Timedelta from the passed arguments, allowed keywords are " - "[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]") + raise ValueError("cannot construct a Timedelta from the " + "passed arguments, allowed keywords are " + "[weeks, days, hours, minutes, seconds, " + "milliseconds, microseconds, nanoseconds]") if isinstance(value, Timedelta): value = value.value elif util.is_string_object(value): value = np.timedelta64(parse_timedelta_string(value)) elif isinstance(value, timedelta): - value = convert_to_timedelta64(value,'ns') + value = convert_to_timedelta64(value, 'ns') elif isinstance(value, np.timedelta64): if unit is not None: value = value.astype('timedelta64[{0}]'.format(unit)) value = value.astype('timedelta64[ns]') - elif hasattr(value,'delta'): - value = np.timedelta64(_delta_to_nanoseconds(value.delta),'ns') + elif hasattr(value, 'delta'): + value = np.timedelta64(_delta_to_nanoseconds(value.delta), 'ns') elif is_integer_object(value) or util.is_float_object(value): # unit=None is de-facto 'ns' - value = convert_to_timedelta64(value,unit) + value = convert_to_timedelta64(value, unit) elif _checknull_with_nat(value): return NaT else: - raise ValueError("Value must be Timedelta, string, integer, float, timedelta or convertible") + raise ValueError( + "Value must be Timedelta, string, integer, " + "float, timedelta or convertible") if isinstance(value, np.timedelta64): value = value.view('i8') @@ -2669,7 +2757,7 @@ class Timedelta(_Timedelta): return NaT # make timedelta happy - td_base = _Timedelta.__new__(cls, microseconds=int(value)/1000) + td_base = _Timedelta.__new__(cls, microseconds=int(value) /1000) td_base.value = value td_base.is_populated = 0 return td_base @@ -2690,19 +2778,19 @@ class Timedelta(_Timedelta): self._ensure_components() if self._ns: - return "N" + return "N" elif self._us: - return "U" + return "U" elif self._ms: - return "L" + return "L" elif self._s: - return "S" + return "S" elif self._m: - return "T" + return "T" elif self._h: - return "H" + return "H" else: - return "D" + return "D" def _round(self, freq, rounder): @@ -2710,8 +2798,8 @@ class Timedelta(_Timedelta): from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos - result = unit*rounder(self.value/float(unit)) - return Timedelta(result,unit='ns') + result = unit *rounder(self.value /float(unit)) + return Timedelta(result, unit='ns') def round(self, freq): """ @@ -2768,43 +2856,49 @@ class Timedelta(_Timedelta): self._ensure_components() if self._sign < 0: - sign_pretty = "-" - sign2_pretty = " +" + sign_pretty = "-" + sign2_pretty = " +" else: - sign_pretty = "" - sign2_pretty = " " + sign_pretty = "" + sign2_pretty = " " # show everything if format == 'all': - seconds_pretty = "%02d.%03d%03d%03d" % (self._s, self._ms, self._us, self._ns) - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, sign2_pretty, self._h, self._m, seconds_pretty) + seconds_pretty = "%02d.%03d%03d%03d" % ( + self._s, 
self._ms, self._us, self._ns) + return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) # by default not showing nano if self._ms or self._us or self._ns: - seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us) + seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us) else: - seconds_pretty = "%02d" % self._s + seconds_pretty = "%02d" % self._s # if we have a partial day - subs = self._h or self._m or self._s or self._ms or self._us or self._ns + subs = (self._h or self._m or self._s or + self._ms or self._us or self._ns) if format == 'even_day': - if not subs: - return "%s%d days" % (sign_pretty, self._d) + if not subs: + return "%s%d days" % (sign_pretty, self._d) elif format == 'sub_day': - if not self._d: + if not self._d: - # degenerate, don't need the extra space - if self._sign > 0: - sign2_pretty = "" - return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, self._h, self._m, seconds_pretty) + # degenerate, don't need the extra space + if self._sign > 0: + sign2_pretty = "" + return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, + self._h, self._m, seconds_pretty) if subs or format=='long': - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, sign2_pretty, self._h, self._m, seconds_pretty) + return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) return "%s%d days" % (sign_pretty, self._d) - def __repr__(self): return "Timedelta('{0}')".format(self._repr_base(format='long')) def __str__(self): @@ -2815,10 +2909,12 @@ class Timedelta(_Timedelta): """ Return a Components NamedTuple-like """ self._ensure_components() if self._sign < 0: - return Components(-self._d,self._h,self._m,self._s,self._ms,self._us,self._ns) + return Components(-self._d, self._h, self._m, self._s, + self._ms, self._us, self._ns) # return the named tuple - return Components(self._d,self._h,self._m,self._s,self._ms,self._us,self._ns) + return Components(self._d, self._h, self._m, self._s, + self._ms, self._us, self._ns) @property def days(self): @@ -2829,7 +2925,7 @@ class Timedelta(_Timedelta): """ self._ensure_components() if self._sign < 0: - return -1*self._d + return -1 *self._d return self._d @property @@ -2840,7 +2936,7 @@ class Timedelta(_Timedelta): .components will return the shown components """ self._ensure_components() - return self._h*3600 + self._m*60 + self._s + return self._h *3600 + self._m *60 + self._s @property def microseconds(self): @@ -2850,7 +2946,7 @@ class Timedelta(_Timedelta): .components will return the shown components """ self._ensure_components() - return self._ms*1000 + self._us + return self._ms *1000 + self._us @property def nanoseconds(self): @@ -2866,7 +2962,7 @@ class Timedelta(_Timedelta): """ Total duration of timedelta in seconds (to ns precision) """ - return 1e-9*self.value + return 1e-9 *self.value def __setstate__(self, state): (value) = state @@ -2887,13 +2983,13 @@ class Timedelta(_Timedelta): def _validate_ops_compat(self, other): # return True if we are compat with operating if _checknull_with_nat(other): - return True + return True elif isinstance(other, (Timedelta, timedelta, np.timedelta64)): - return True + return True elif util.is_string_object(other): - return True - elif hasattr(other,'delta'): - return True + return True + elif hasattr(other, 'delta'): + return True return False # higher than np.ndarray and np.matrix @@ -2952,9 +3048,9 @@ class Timedelta(_Timedelta): # only integers and floats allowed if not 
(is_integer_object(other) or is_float_object(other)): - return NotImplemented + return NotImplemented - return Timedelta(other*self.value, unit='ns') + return Timedelta(other *self.value, unit='ns') __rmul__ = __mul__ @@ -2965,7 +3061,7 @@ class Timedelta(_Timedelta): # integers or floats if is_integer_object(other) or is_float_object(other): - return Timedelta(self.value/other, unit='ns') + return Timedelta(self.value /other, unit='ns') if not self._validate_ops_compat(other): return NotImplemented @@ -2973,7 +3069,7 @@ class Timedelta(_Timedelta): other = Timedelta(other) if other is NaT: return np.nan - return self.value/float(other.value) + return self.value /float(other.value) def __rtruediv__(self, other): if hasattr(other, 'dtype'): @@ -2988,13 +3084,13 @@ class Timedelta(_Timedelta): return float(other.value) / self.value if not PY3: - __div__ = __truediv__ - __rdiv__ = __rtruediv__ + __div__ = __truediv__ + __rdiv__ = __rtruediv__ def _not_implemented(self, *args, **kwargs): return NotImplemented - __floordiv__ = _not_implemented + __floordiv__ = _not_implemented __rfloordiv__ = _not_implemented def _op_unary_method(func, name): @@ -3010,14 +3106,16 @@ class Timedelta(_Timedelta): __abs__ = _op_unary_method(lambda x: abs(x), '__abs__') # resolution in ns -Timedelta.min = Timedelta(np.iinfo(np.int64).min+1) +Timedelta.min = Timedelta(np.iinfo(np.int64).min +1) Timedelta.max = Timedelta(np.iinfo(np.int64).max) cdef PyTypeObject* td_type = Timedelta + cdef inline bint is_timedelta(object o): return Py_TYPE(o) == td_type # isinstance(o, Timedelta) + cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', @@ -3054,37 +3152,37 @@ cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): return iresult -cdef dict timedelta_abbrevs = { 'D' : 'd', - 'd' : 'd', - 'days' : 'd', - 'day' : 'd', - 'hours' : 'h', - 'hour' : 'h', - 'hr' : 'h', - 'h' : 'h', - 'm' : 'm', - 'minute' : 'm', - 'min' : 'm', - 'minutes' : 'm', - 's' : 's', - 'seconds' : 's', - 'sec' : 's', - 'second' : 's', - 'ms' : 'ms', - 'milliseconds' : 'ms', - 'millisecond' : 'ms', - 'milli' : 'ms', - 'millis' : 'ms', - 'us' : 'us', - 'microseconds' : 'us', - 'microsecond' : 'us', - 'micro' : 'us', - 'micros' : 'us', - 'ns' : 'ns', - 'nanoseconds' : 'ns', - 'nano' : 'ns', - 'nanos' : 'ns', - 'nanosecond' : 'ns', +cdef dict timedelta_abbrevs = { 'D': 'd', + 'd': 'd', + 'days': 'd', + 'day': 'd', + 'hours': 'h', + 'hour': 'h', + 'hr': 'h', + 'h': 'h', + 'm': 'm', + 'minute': 'm', + 'min': 'm', + 'minutes': 'm', + 's': 's', + 'seconds': 's', + 'sec': 's', + 'second': 's', + 'ms': 'ms', + 'milliseconds': 'ms', + 'millisecond': 'ms', + 'milli': 'ms', + 'millis': 'ms', + 'us': 'us', + 'microseconds': 'us', + 'microsecond': 'us', + 'micro': 'us', + 'micros': 'us', + 'ns': 'ns', + 'nanoseconds': 'ns', + 'nano': 'ns', + 'nanos': 'ns', + 'nanosecond': 'ns', } timedelta_abbrevs_map = timedelta_abbrevs @@ -3134,7 +3232,8 @@ cdef inline parse_timedelta_string(object ts): list number=[], frac=[], unit=[] # neg : tracks if we have a leading negative for the value - # have_dot : tracks if we are processing a dot (either post hhmmss or inside an expression) + # have_dot : tracks if we are processing a dot (either post hhmmss or + # inside an expression) # have_value : track if we have at least 1 leading unit # have_hhmmss : tracks if we have a regular format hh:mm:ss @@ -3250,11 +3349,11 @@ cdef inline parse_timedelta_string(object 
ts): raise ValueError("no units specified") if len(frac) > 0 and len(frac) <= 3: - m = 10**(3-len(frac)) * 1000L * 1000L + m = 10**(3 -len(frac)) * 1000L * 1000L elif len(frac) > 3 and len(frac) <= 6: - m = 10**(6-len(frac)) * 1000L + m = 10**(6 -len(frac)) * 1000L else: - m = 10**(9-len(frac)) + m = 10**(9 -len(frac)) r = int(''.join(frac)) * m result += timedelta_as_neg(r, neg) @@ -3320,7 +3419,7 @@ cpdef convert_to_timedelta64(object ts, object unit): else: if util.is_array(ts): ts = ts.astype('int64').item() - if unit in ['Y','M','W']: + if unit in ['Y', 'M', 'W']: ts = np.timedelta64(ts, unit) else: ts = cast_from_unit(ts, unit) @@ -3328,15 +3427,15 @@ cpdef convert_to_timedelta64(object ts, object unit): elif is_float_object(ts): if util.is_array(ts): ts = ts.astype('int64').item() - if unit in ['Y','M','W']: + if unit in ['Y', 'M', 'W']: ts = np.timedelta64(int(ts), unit) else: ts = cast_from_unit(ts, unit) ts = np.timedelta64(ts) elif util.is_string_object(ts): ts = np.timedelta64(parse_timedelta_string(ts)) - elif hasattr(ts,'delta'): - ts = np.timedelta64(_delta_to_nanoseconds(ts),'ns') + elif hasattr(ts, 'delta'): + ts = np.timedelta64(_delta_to_nanoseconds(ts), 'ns') if isinstance(ts, timedelta): ts = np.timedelta64(ts) @@ -3345,7 +3444,9 @@ cpdef convert_to_timedelta64(object ts, object unit): "scalar: %s" % type(ts)) return ts.astype('timedelta64[ns]') -def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): + +def array_strptime(ndarray[object] values, object fmt, + bint exact=True, errors='raise'): """ Parameters ---------- @@ -3364,7 +3465,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' int64_t us, ns object val, group_key, ampm, found dict found_key - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + bint is_raise = errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' assert is_raise or is_ignore or is_coerce @@ -3442,8 +3545,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format %r (match)" % - (values[i], fmt)) + raise ValueError("time data %r does not match " + "format %r (match)" % (values[i], fmt)) if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT @@ -3458,8 +3561,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format %r (search)" % - (values[i], fmt)) + raise ValueError("time data %r does not match format " + "%r (search)" % (values[i], fmt)) year = 1900 month = day = 1 @@ -3563,7 +3666,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' # same and yet time.daylight is true; too ambiguous to # be able to tell what timezone has daylight savings if (time.tzname[0] == time.tzname[1] and - time.daylight and found_zone not in ("utc", "gmt")): + time.daylight and found_zone not in ( + "utc", "gmt")): break else: tz = value @@ -3579,9 +3683,10 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' # calculation. try: if julian == -1: - # Need to add 1 to result since first day of the year is 1, not 0. + # Need to add 1 to result since first day of the year is 1, not + # 0. 
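+                # (e.g. for Jan 1 the toordinal difference is 0, so the
+                #  trailing "+ 1" gives day-of-year 1)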
julian = datetime_date(year, month, day).toordinal() - \ - datetime_date(year, 1, 1).toordinal() + 1 + datetime_date(year, 1, 1).toordinal() + 1 else: # Assume that if they bothered to include Julian day it will # be accurate. datetime_result = datetime_date.fromordinal( @@ -3590,10 +3695,10 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' month = datetime_result.month day = datetime_result.day except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + if is_coerce: + iresult[i] = NPY_NAT + continue + raise if weekday == -1: weekday = datetime_date(year, month, day).weekday() @@ -3672,10 +3777,11 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int base = ts - frac = ts-base + frac = ts -base if p: - frac = round(frac,p) - return (base*m) + (frac*m) + frac = round(frac, p) + return (base *m) + (frac *m) + def cast_to_nanoseconds(ndarray arr): cdef: @@ -3721,6 +3827,7 @@ def pydt_to_i8(object pydt): return ts.value + def i8_to_pydt(int64_t i8, object tzinfo = None): """ Inverse of pydt_to_i8 @@ -3737,6 +3844,7 @@ try: except: have_pytz = False + def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): cdef: ndarray[int64_t] utc_dates, tt, result, trans, deltas @@ -3803,7 +3911,8 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 + delta = int(total_seconds( + _get_utcoffset(tz2, dt))) * 1000000000 result[i] = v + delta return result @@ -3836,6 +3945,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): result[i] = v + offset return result + def tz_convert_single(int64_t val, object tz1, object tz2): cdef: ndarray[int64_t] trans, deltas @@ -3889,7 +3999,8 @@ def tz_convert_single(int64_t val, object tz1, object tz2): dst_cache = {} cdef inline bint _treat_tz_as_pytz(object tz): - return hasattr(tz, '_utc_transition_times') and hasattr(tz, '_transition_info') + return hasattr(tz, '_utc_transition_times') and hasattr( + tz, '_transition_info') cdef inline bint _treat_tz_as_dateutil(object tz): return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') @@ -3902,24 +4013,32 @@ def _p_tz_cache_key(tz): cdef inline object _tz_cache_key(object tz): """ - Return the key in the cache for the timezone info object or None if unknown. + Return the key in the cache for the timezone info object or None + if unknown. - The key is currently the tz string for pytz timezones, the filename for dateutil timezones. + The key is currently the tz string for pytz timezones, the filename for + dateutil timezones. Notes ===== - This cannot just be the hash of a timezone object. Unfortunately, the hashes of two dateutil tz objects - which represent the same timezone are not equal (even though the tz objects will compare equal and - represent the same tz file). - Also, pytz objects are not always hashable so we use str(tz) instead. + This cannot just be the hash of a timezone object. Unfortunately, the + hashes of two dateutil tz objects which represent the same timezone are + not equal (even though the tz objects will compare equal and represent + the same tz file). Also, pytz objects are not always hashable so we use + str(tz) instead. 
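+    For example (a sketch): a pytz zone such as US/Eastern keys as
+    'US/Eastern', a dateutil tzfile keys as 'dateutil' plus its filename,
+    and anything unrecognised keys as None.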
""" if isinstance(tz, _pytz_BaseTzInfo): return tz.zone elif isinstance(tz, _dateutil_tzfile): if '.tar.gz' in tz._filename: - raise ValueError('Bad tz filename. Dateutil on python 3 on windows has a bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones implicitly by passing a string like "dateutil/Europe/London" ' - 'when you construct your pandas objects instead of passing a timezone object. See https://github.com/pydata/pandas/pull/7362') + raise ValueError('Bad tz filename. Dateutil on python 3 on ' + 'windows has a bug which causes tzfile._filename ' + 'to be the same for all timezone files. Please ' + 'construct dateutil timezones implicitly by ' + 'passing a string like "dateutil/Europe/London" ' + 'when you construct your pandas objects instead ' + 'of passing a timezone object. See ' + 'https://github.com/pydata/pandas/pull/7362') return 'dateutil' + tz._filename else: return None @@ -3956,26 +4075,29 @@ cdef object _get_dst_info(object tz): if len(tz._trans_list): # get utc trans times trans_list = _get_utc_trans_times_from_dateutil_tz(tz) - trans = np.hstack([np.array([0], dtype='M8[s]'), # place holder for first item - np.array(trans_list, dtype='M8[s]')]).astype('M8[ns]') # all trans listed + trans = np.hstack([ + np.array([0], dtype='M8[s]'), # place holder for first item + np.array(trans_list, dtype='M8[s]')]).astype( + 'M8[ns]') # all trans listed trans = trans.view('i8') trans[0] = NPY_NAT + 1 # deltas - deltas = np.array([v.offset for v in (tz._ttinfo_before,) + tz._trans_idx], dtype='i8') # + (tz._ttinfo_std,) + deltas = np.array([v.offset for v in ( + tz._ttinfo_before,) + tz._trans_idx], dtype='i8') deltas *= 1000000000 typ = 'dateutil' elif _is_fixed_offset(tz): trans = np.array([NPY_NAT + 1], dtype=np.int64) - deltas = np.array([tz._ttinfo_std.offset], dtype='i8') * 1000000000 + deltas = np.array([tz._ttinfo_std.offset], + dtype='i8') * 1000000000 typ = 'fixed' else: trans = np.array([], dtype='M8[ns]') deltas = np.array([], dtype='i8') typ = None - else: # static tzinfo trans = np.array([NPY_NAT + 1], dtype=np.int64) @@ -3989,8 +4111,9 @@ cdef object _get_dst_info(object tz): cdef object _get_utc_trans_times_from_dateutil_tz(object tz): """ - Transition times in dateutil timezones are stored in local non-dst time. This code - converts them to UTC. It's the reverse of the code in dateutil.tz.tzfile.__init__. + Transition times in dateutil timezones are stored in local non-dst + time. This code converts them to UTC. It's the reverse of the code + in dateutil.tz.tzfile.__init__. 
""" new_trans = list(tz._trans_list) last_std_offset = 0 @@ -4000,6 +4123,7 @@ cdef object _get_utc_trans_times_from_dateutil_tz(object tz): new_trans[i] = trans - last_std_offset return new_trans + def tot_seconds(td): return total_seconds(td) @@ -4069,7 +4193,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, elif hasattr(ambiguous, '__iter__'): is_dst = True if len(ambiguous) != len(vals): - raise ValueError("Length of ambiguous bool-array must be the same size as vals") + raise ValueError( + "Length of ambiguous bool-array must be the same size as vals") trans, deltas, typ = _get_dst_info(tz) @@ -4082,7 +4207,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_b.fill(NPY_NAT) # left side - idx_shifted = (np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted = (np.maximum(0, trans.searchsorted( + vals - DAY_NS, side='right') - 1)).astype(np.int64) for i in range(n): v = vals[i] - deltas[idx_shifted[i]] @@ -4093,7 +4219,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_a[i] = v # right side - idx_shifted = (np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted = (np.maximum(0, trans.searchsorted( + vals + DAY_NS, side='right') - 1)).astype(np.int64) for i in range(n): v = vals[i] - deltas[idx_shifted[i]] @@ -4110,36 +4237,39 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, # Get the ambiguous hours (given the above, these are the hours # where result_a != result_b and neither of them are NAT) both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT) - both_eq = result_a == result_b + both_eq = result_a == result_b trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq))) if trans_idx.size == 1: stamp = Timestamp(vals[trans_idx]) - raise pytz.AmbiguousTimeError("Cannot infer dst time from %s as" - "there are no repeated times" % stamp) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %s as there " + "are no repeated times" % stamp) # Split the array into contiguous chunks (where the difference between - # indices is 1). These are effectively dst transitions in different years - # which is useful for checking that there is not an ambiguous transition - # in an individual year. + # indices is 1). These are effectively dst transitions in different + # years which is useful for checking that there is not an ambiguous + # transition in an individual year. 
if trans_idx.size > 0: - one_diff = np.where(np.diff(trans_idx)!=1)[0]+1 + one_diff = np.where(np.diff(trans_idx) != 1)[0] +1 trans_grp = np.array_split(trans_idx, one_diff) - # Iterate through each day, if there are no hours where the delta is negative - # (indicates a repeat of hour) the switch cannot be inferred + # Iterate through each day, if there are no hours where the + # delta is negative (indicates a repeat of hour) the switch + # cannot be inferred for grp in trans_grp: delta = np.diff(result_a[grp]) - if grp.size == 1 or np.all(delta>0): + if grp.size == 1 or np.all(delta > 0): stamp = Timestamp(vals[grp[0]]) raise pytz.AmbiguousTimeError(stamp) - # Find the index for the switch and pull from a for dst and b for standard - switch_idx = (delta<=0).nonzero()[0] + # Find the index for the switch and pull from a for dst and b + # for standard + switch_idx = (delta <= 0).nonzero()[0] if switch_idx.size > 1: - raise pytz.AmbiguousTimeError("There are %i dst switches " - "when there should only be 1." - % switch_idx.size) - switch_idx = switch_idx[0]+1 # Pull the only index and adjust + raise pytz.AmbiguousTimeError( + "There are %i dst switches when " + "there should only be 1." % switch_idx.size) + switch_idx = switch_idx[0] + 1 # Pull the only index and adjust a_idx = grp[:switch_idx] b_idx = grp[switch_idx:] dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) @@ -4164,9 +4294,9 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result[i] = NPY_NAT else: stamp = Timestamp(vals[i]) - raise pytz.AmbiguousTimeError("Cannot infer dst time from %r, "\ - "try using the 'ambiguous' argument" - % stamp) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %r, try using the " + "'ambiguous' argument" % stamp) elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: @@ -4246,6 +4376,7 @@ def build_field_sarray(ndarray[int64_t] dtindex): return out + def get_time_micros(ndarray[int64_t] dtindex): """ Datetime as int64 representation to a structured array of fields @@ -4284,7 +4415,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): _month_offset = np.array( [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], - dtype=np.int32 ) + dtype=np.int32 ) count = len(dtindex) out = np.empty(count, dtype='i4') @@ -4294,7 +4425,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.year return out @@ -4303,7 +4435,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.month return out @@ -4312,7 +4445,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.day return out @@ -4321,7 +4455,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], 
PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.hour return out @@ -4330,7 +4465,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.min return out @@ -4339,7 +4475,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.sec return out @@ -4348,7 +4485,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.us return out @@ -4357,7 +4495,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.ps / 1000 return out elif field == 'doy': @@ -4365,9 +4504,10 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) - out[i] = _month_offset[isleap, dts.month-1] + dts.day + out[i] = _month_offset[isleap, dts.month -1] + dts.day return out elif field == 'dow': @@ -4375,7 +4515,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dayofweek(dts.year, dts.month, dts.day) return out @@ -4384,7 +4525,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) isleap_prev = is_leapyear(dts.year - 1) mo_off = _month_offset[isleap, dts.month - 1] @@ -4414,7 +4556,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.month out[i] = ((out[i] - 1) / 3) + 1 return out @@ -4424,7 +4567,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = days_in_month(dts) return out elif field == 'is_leap_year': @@ -4434,7 +4578,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): @cython.wraparound(False) -def get_start_end_field(ndarray[int64_t] dtindex, object field, object 
freqstr=None, int month_kw=12): +def get_start_end_field(ndarray[int64_t] dtindex, object field, + object freqstr=None, int month_kw=12): """ Given an int64-based datetime index return array of indicators of whether timestamps are at the start/end of the month/quarter/year @@ -4456,21 +4601,24 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N _month_offset = np.array( [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], - dtype=np.int32 ) + dtype=np.int32 ) count = len(dtindex) out = np.zeros(count, dtype='int8') if freqstr: if freqstr == 'C': - raise ValueError("Custom business days is not supported by %s" % field) + raise ValueError( + "Custom business days is not supported by %s" % field) is_business = freqstr[0] == 'B' - # YearBegin(), BYearBegin() use month = starting month of year - # QuarterBegin(), BQuarterBegin() use startingMonth = starting month of year - # other offests use month, startingMonth as ending month of year. + # YearBegin(), BYearBegin() use month = starting month of year. + # QuarterBegin(), BQuarterBegin() use startingMonth = starting + # month of year. Other offests use month, startingMonth as ending + # month of year. - if (freqstr[0:2] in ['MS', 'QS', 'AS']) or (freqstr[1:3] in ['MS', 'QS', 'AS']): + if (freqstr[0:2] in ['MS', 'QS', 'AS']) or ( + freqstr[1:3] in ['MS', 'QS', 'AS']): end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw else: @@ -4485,7 +4633,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) @@ -4497,7 +4646,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if dom == 1: @@ -4509,7 +4659,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4518,14 +4669,16 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N ldom = _month_offset[isleap, dts.month] dow = ts_dayofweek(ts) - if (ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2)): + if (ldom == doy and dow < 5) or ( + dow == 4 and (ldom - doy <= 2)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day @@ -4541,19 +4694,22 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - 
pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) - if ((dts.month - start_month) % 3 == 0) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + if ((dts.month - start_month) % 3 == 0) and ( + (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if ((dts.month - start_month) % 3 == 0) and dom == 1: @@ -4565,7 +4721,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4574,14 +4731,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N ldom = _month_offset[isleap, dts.month] dow = ts_dayofweek(ts) - if ((dts.month - end_month) % 3 == 0) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))): + if ((dts.month - end_month) % 3 == 0) and ( + (ldom == doy and dow < 5) or ( + dow == 4 and (ldom - doy <= 2))): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day @@ -4597,19 +4757,22 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) - if (dts.month == start_month) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + if (dts.month == start_month) and ( + (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if (dts.month == start_month) and dom == 1: @@ -4621,7 +4784,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) dom = dts.day @@ -4630,14 +4794,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N dow = ts_dayofweek(ts) ldom = _month_offset[isleap, dts.month] - if (dts.month == end_month) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))): + if (dts.month == end_month) and ( + (ldom == doy and 
dow < 5) or ( + dow == 4 and (ldom - doy <= 2))): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4651,6 +4818,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N raise ValueError("Field %s not supported" % field) + @cython.wraparound(False) @cython.boundscheck(False) def get_date_name_field(ndarray[int64_t] dtindex, object field): @@ -4666,8 +4834,9 @@ def get_date_name_field(ndarray[int64_t] dtindex, object field): int dow _dayname = np.array( - ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], - dtype=np.object_ ) + ['Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday', 'Sunday'], + dtype=np.object_ ) count = len(dtindex) out = np.empty(count, dtype=object) @@ -4710,11 +4879,13 @@ def date_normalize(ndarray[int64_t] stamps, tz=None): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + stamps[i], PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) return result + @cython.wraparound(False) @cython.boundscheck(False) cdef _normalize_local(ndarray[int64_t] stamps, object tz): @@ -4730,15 +4901,15 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + stamps[i], PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) elif _is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, - &dts) + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 @@ -4755,7 +4926,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -4840,7 +5011,7 @@ def monthrange(int64_t year, int64_t month): if month < 1 or month > 12: raise ValueError("bad month number 0; must be 1-12") - days = days_per_month_table[is_leapyear(year)][month-1] + days = days_per_month_table[is_leapyear(year)][month -1] return (dayofweek(year, month, 1), days) @@ -4848,7 +5019,7 @@ cdef inline int64_t ts_dayofweek(_TSObject ts): return dayofweek(ts.dts.year, ts.dts.month, ts.dts.day) cdef inline int days_in_month(pandas_datetimestruct dts) nogil: - return days_per_month_table[is_leapyear(dts.year)][dts.month-1] + return days_per_month_table[is_leapyear(dts.year)][dts.month -1] cpdef normalize_date(object dt): """ @@ -4874,10 +5045,14 @@ cdef inline int _year_add_months(pandas_datetimestruct dts, cdef inline int _month_add_months(pandas_datetimestruct dts, int months) nogil: - """new month number after shifting pandas_datetimestruct number of months""" + """ + New month number after shifting pandas_datetimestruct + number of months. 
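+    For example, month 11 shifted by 3 months gives month 2, while month 10
+    shifted by 2 months wraps to 12 rather than 0.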
+ """ cdef int new_month = (dts.month + months) % 12 return 12 if new_month == 0 else new_month + @cython.wraparound(False) @cython.boundscheck(False) def shift_months(int64_t[:] dtindex, int months, object day=None): @@ -4902,7 +5077,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dts.year = _year_add_months(dts, months) dts.month = _month_add_months(dts, months) @@ -4916,7 +5092,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) months_to_roll = months # offset semantics - if on the anchor point and going backwards @@ -4937,7 +5114,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) months_to_roll = months # similar semantics - when adding shift forward by one @@ -4992,10 +5170,12 @@ except: __all__ = [] + def _getlang(): # Figure out what the current language is set to. return locale.getlocale(locale.LC_TIME) + class LocaleTime(object): """Stores and handles locale-specific information related to time. @@ -5075,8 +5255,9 @@ class LocaleTime(object): # magical; just happened to have used it everywhere else where a # static date was needed. am_pm = [] - for hour in (01,22): - time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0)) + for hour in (01, 22): + time_tuple = time.struct_time( + (1999, 3, 17, hour, 44, 55, 2, 76, 0)) am_pm.append(time.strftime("%p", time_tuple).lower()) self.am_pm = am_pm @@ -5088,22 +5269,23 @@ class LocaleTime(object): # overloaded numbers is minimized. The order in which searches for # values within the format string is very important; it eliminates # possible ambiguity for what something represents. - time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0)) + time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) date_time = [None, None, None] date_time[0] = time.strftime("%c", time_tuple).lower() date_time[1] = time.strftime("%x", time_tuple).lower() date_time[2] = time.strftime("%X", time_tuple).lower() replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), - (self.f_month[3], '%B'), (self.a_weekday[2], '%a'), - (self.a_month[3], '%b'), (self.am_pm[1], '%p'), - ('1999', '%Y'), ('99', '%y'), ('22', '%H'), - ('44', '%M'), ('55', '%S'), ('76', '%j'), - ('17', '%d'), ('03', '%m'), ('3', '%m'), - # '3' needed for when no leading zero. - ('2', '%w'), ('10', '%I')] + (self.f_month[3], + '%B'), (self.a_weekday[2], '%a'), + (self.a_month[3], '%b'), (self.am_pm[1], '%p'), + ('1999', '%Y'), ('99', '%y'), ('22', '%H'), + ('44', '%M'), ('55', '%S'), ('76', '%j'), + ('17', '%d'), ('03', '%m'), ('3', '%m'), + # '3' needed for when no leading zero. 
+ ('2', '%w'), ('10', '%I')] replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone for tz in tz_values]) - for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')): + for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): current_format = date_time[offset] for old, new in replacement_pairs: # Must deal with possible lack of locale info @@ -5115,7 +5297,7 @@ class LocaleTime(object): # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since # 2005-01-03 occurs before the first Monday of the year. Otherwise # %U is used. - time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0)) + time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0)) if '00' in time.strftime(directive, time_tuple): U_W = '%W' else: @@ -5161,7 +5343,8 @@ class TimeRE(dict): 'f': r"(?P[0-9]{1,9})", 'H': r"(?P2[0-3]|[0-1]\d|\d)", 'I': r"(?P1[0-2]|0[1-9]|[1-9])", - 'j': r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])", + 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" + r"[1-9]\d|0[1-9]|[1-9])"), 'm': r"(?P1[0-2]|0[1-9]|[1-9])", 'M': r"(?P[0-5]\d|\d)", 'S': r"(?P6[0-1]|[0-5]\d|\d)", @@ -5221,11 +5404,11 @@ class TimeRE(dict): whitespace_replacement = re_compile(r'\s+') format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: - directive_index = format.index('%')+1 + directive_index = format.index('%') +1 processed_format = "%s%s%s" % (processed_format, - format[:directive_index-1], + format[:directive_index -1], self[format[directive_index]]) - format = format[directive_index+1:] + format = format[directive_index +1:] return "%s%s" % (processed_format, format) def compile(self, format): @@ -5239,7 +5422,8 @@ _TimeRE_cache = TimeRE() _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache _regex_cache = {} -cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): +cdef _calc_julian_from_U_or_W(int year, int week_of_year, + int day_of_week, int week_starts_Mon): """Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0).""" From 80230292e19e377f5b11409e112500b6a180e18a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 6 Sep 2016 16:30:57 +0200 Subject: [PATCH 341/359] API/DEPR: Remove +/- as setops for Index (GH8227) (#14127) --- doc/source/whatsnew/v0.19.0.txt | 37 ++++++++++++++++++++++++++++++ pandas/indexes/base.py | 25 ++++++-------------- pandas/indexes/category.py | 2 +- pandas/indexes/multi.py | 1 + pandas/tests/indexes/test_base.py | 24 +++++++++++-------- pandas/tests/indexes/test_multi.py | 17 ++++++++------ 6 files changed, 70 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 7f471904acf30..2d93652ca91db 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -926,6 +926,43 @@ New Behavior: pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') pi.values + +.. _whatsnew_0190.api.setops: + +Index ``+`` / ``-`` no longer used for set operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Addition and subtraction of the base Index type (not the numeric subclasses) +previously performed set operations (set union and difference). This +behaviour was already deprecated since 0.15.0 (in favor using the specific +``.union()`` and ``.difference()`` methods), and is now disabled. 
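+Use the explicit set operations instead; a rough sketch of the replacement:
+
+.. code-block:: python
+
+   idx1 = pd.Index(['a', 'b'])
+   idx2 = pd.Index(['a', 'c'])
+
+   idx1 | idx2              # set union (previously idx1 + idx2)
+   idx1.difference(idx2)    # set difference (previously idx1 - idx2)
+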
When +possible, ``+`` and ``-`` are now used for element-wise operations, for +example for concatenating strings (:issue:`8227`, :issue:`14127`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + FutureWarning: using '+' to provide set union with Indexes is deprecated, use '|' or .union() + Out[1]: Index(['a', 'b', 'c'], dtype='object') + +The same operation will now perform element-wise addition: + +.. ipython:: python + + pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + +Note that numeric Index objects already performed element-wise operations. +For example, the behaviour of adding two integer Indexes: + +.. ipython:: python + + pd.Index([1, 2, 3]) + pd.Index([2, 3, 4]) + +is unchanged. The base ``Index`` is now made consistent with this behaviour. + + .. _whatsnew_0190.api.difference: ``Index.difference`` and ``.symmetric_difference`` changes diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index dac0e650cb923..d4ca18a6713b5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1739,28 +1739,16 @@ def argsort(self, *args, **kwargs): return result.argsort(*args, **kwargs) def __add__(self, other): - if is_list_like(other): - warnings.warn("using '+' to provide set union with Indexes is " - "deprecated, use '|' or .union()", FutureWarning, - stacklevel=2) - if isinstance(other, Index): - return self.union(other) return Index(np.array(self) + other) def __radd__(self, other): - if is_list_like(other): - warnings.warn("using '+' to provide set union with Indexes is " - "deprecated, use '|' or .union()", FutureWarning, - stacklevel=2) return Index(other + np.array(self)) __iadd__ = __add__ def __sub__(self, other): - warnings.warn("using '-' to provide set differences with Indexes is " - "deprecated, use .difference()", FutureWarning, - stacklevel=2) - return self.difference(other) + raise TypeError("cannot perform __sub__ with this index type: " + "{typ}".format(typ=type(self))) def __and__(self, other): return self.intersection(other) @@ -1990,7 +1978,8 @@ def symmetric_difference(self, other, result_name=None): ----- ``symmetric_difference`` contains elements that appear in either ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by - ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. 
Examples -------- @@ -3333,8 +3322,8 @@ def _evaluate_compare(self, other): cls.__ge__ = _make_compare(operator.ge) @classmethod - def _add_numericlike_set_methods_disabled(cls): - """ add in the numeric set-like methods to disable """ + def _add_numeric_methods_add_sub_disabled(cls): + """ add in the numeric add/sub methods to disable """ def _make_invalid_op(name): def invalid_op(self, other=None): @@ -3349,7 +3338,7 @@ def invalid_op(self, other=None): @classmethod def _add_numeric_methods_disabled(cls): - """ add in numeric methods to disable """ + """ add in numeric methods to disable other than add/sub """ def _make_invalid_op(name): def invalid_op(self, other=None): diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index d4fc746c652ca..c1f5d47e1e04f 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -649,7 +649,7 @@ def _add_accessors(cls): typ='method', overwrite=True) -CategoricalIndex._add_numericlike_set_methods_disabled() +CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() CategoricalIndex._add_logical_methods_disabled() CategoricalIndex._add_comparison_methods() diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index f42410fcdf098..09c755b2c9792 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -2219,6 +2219,7 @@ def isin(self, values, level=None): MultiIndex._add_numeric_methods_disabled() +MultiIndex._add_numeric_methods_add_sub_disabled() MultiIndex._add_logical_methods_disabled() diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0ef7e6bf3be97..7f68318d4d7d3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -730,16 +730,6 @@ def test_union(self): expected = Index(list('ab'), name='A') tm.assert_index_equal(union, expected) - def test_add(self): - - # - API change GH 8226 - with tm.assert_produces_warning(): - self.strIndex + self.strIndex - with tm.assert_produces_warning(): - self.strIndex + self.strIndex.tolist() - with tm.assert_produces_warning(): - self.strIndex.tolist() + self.strIndex - with tm.assert_produces_warning(RuntimeWarning): firstCat = self.strIndex.union(self.dateIndex) secondCat = self.strIndex.union(self.strIndex) @@ -755,6 +745,13 @@ def test_add(self): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) + def test_add(self): + idx = self.strIndex + expected = Index(self.strIndex.values * 2) + self.assert_index_equal(idx + idx, expected) + self.assert_index_equal(idx + idx.tolist(), expected) + self.assert_index_equal(idx.tolist() + idx, expected) + # test add and radd idx = Index(list('abc')) expected = Index(['a1', 'b1', 'c1']) @@ -762,6 +759,13 @@ def test_add(self): expected = Index(['1a', '1b', '1c']) self.assert_index_equal('1' + idx, expected) + def test_sub(self): + idx = self.strIndex + self.assertRaises(TypeError, lambda: idx - 'a') + self.assertRaises(TypeError, lambda: idx - idx) + self.assertRaises(TypeError, lambda: idx - idx.tolist()) + self.assertRaises(TypeError, lambda: idx.tolist() - idx) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 25de6c5091853..5248f0775d22f 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1408,21 +1408,24 @@ def test_intersection(self): # result = self.index & tuples # 
self.assertTrue(result.equals(tuples)) - def test_difference(self): + def test_sub(self): first = self.index - result = first.difference(self.index[-3:]) - # - API change GH 8226 - with tm.assert_produces_warning(): + # - now raises (previously was set op difference) + with tm.assertRaises(TypeError): first - self.index[-3:] - with tm.assert_produces_warning(): + with tm.assertRaises(TypeError): self.index[-3:] - first - with tm.assert_produces_warning(): + with tm.assertRaises(TypeError): self.index[-3:] - first.tolist() + with tm.assertRaises(TypeError): + first.tolist() - self.index[-3:] - self.assertRaises(TypeError, lambda: first.tolist() - self.index[-3:]) + def test_difference(self): + first = self.index + result = first.difference(self.index[-3:]) expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), sortorder=0, names=self.index.names) From 844d5fba7d0878971ec2f3841cdd033e1ae7d425 Mon Sep 17 00:00:00 2001 From: Iulius Curt Date: Wed, 7 Sep 2016 13:49:24 +0300 Subject: [PATCH 342/359] Fix trivial typo in comment (#14174) --- pandas/tseries/offsets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f12ba8083f545..051cc8aa4d018 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -814,7 +814,7 @@ def apply(self, other): if bd != 0: skip_bd = BusinessDay(n=bd) - # midnight busienss hour may not on BusinessDay + # midnight business hour may not on BusinessDay if not self.next_bday.onOffset(other): remain = other - self._prev_opening_time(other) other = self._next_opening_time(other + skip_bd) + remain From e88ad28c97457c8c1c8a83bf5252257c1eb802bf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Sep 2016 15:11:03 +0200 Subject: [PATCH 343/359] API/DEPR: Remove +/- as setops for DatetimeIndex/PeriodIndex (GH9630) (#14164) API/DEPR: Remove +/- as setops for DatetimeIndex/PeriodIndex (GH9630) xref #13777, deprecations put in place in #9630 --- doc/source/whatsnew/v0.19.0.txt | 25 ++++- pandas/tseries/base.py | 20 ++-- pandas/tseries/index.py | 44 ++++++-- pandas/tseries/tests/test_base.py | 171 +++++++++++++++++------------- 4 files changed, 165 insertions(+), 95 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2d93652ca91db..9345f11aca341 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -932,14 +932,16 @@ New Behavior: Index ``+`` / ``-`` no longer used for set operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Addition and subtraction of the base Index type (not the numeric subclasses) +Addition and subtraction of the base Index type and of DatetimeIndex +(not the numeric index types) previously performed set operations (set union and difference). This behaviour was already deprecated since 0.15.0 (in favor using the specific ``.union()`` and ``.difference()`` methods), and is now disabled. When possible, ``+`` and ``-`` are now used for element-wise operations, for -example for concatenating strings (:issue:`8227`, :issue:`14127`). +example for concatenating strings or subtracting datetimes +(:issue:`8227`, :issue:`14127`). -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -962,6 +964,23 @@ For example, the behaviour of adding two integer Indexes: is unchanged. The base ``Index`` is now made consistent with this behaviour. 
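For code that relied on the old set-operation behaviour of ``+`` and ``-`` on the base
``Index``, a minimal sketch of the explicit replacements (mirroring the tests added in this
patch; the outputs in the comments are illustrative) is:

.. code-block:: python

   import pandas as pd

   left = pd.Index(['a', 'b'])
   right = pd.Index(['a', 'c'])

   # set operations now have to be spelled explicitly
   left.union(right)        # Index(['a', 'b', 'c'], dtype='object')
   left | right             # same as .union()
   left.difference(right)   # Index(['b'], dtype='object')

   # '-' between two base Index objects now raises instead of taking a set difference
   try:
       left - right
   except TypeError:
       pass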
+Further, because of this change, it is now possible to subtract two +DatetimeIndex objects resulting in a TimedeltaIndex: + +Previous behavior: + +.. code-block:: ipython + + In [1]: pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - pd.DatetimeIndex(['2016-01-02', '2016-01-03']) + FutureWarning: using '-' to provide set differences with datetimelike Indexes is deprecated, use .difference() + Out[1]: DatetimeIndex(['2016-01-01'], dtype='datetime64[ns]', freq=None) + +New behavior: + +.. ipython:: python + + pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - pd.DatetimeIndex(['2016-01-02', '2016-01-03']) + .. _whatsnew_0190.api.difference: diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 1690a9b229db2..3b676b894d355 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -2,7 +2,6 @@ Base and utility classes for tseries type pandas objects. """ -import warnings from datetime import datetime, timedelta from pandas import compat @@ -628,10 +627,9 @@ def __add__(self, other): raise TypeError("cannot add TimedeltaIndex and {typ}" .format(typ=type(other))) elif isinstance(other, Index): - warnings.warn("using '+' to provide set union with " - "datetimelike Indexes is deprecated, " - "use .union()", FutureWarning, stacklevel=2) - return self.union(other) + raise TypeError("cannot add {typ1} and {typ2}" + .format(typ1=type(self).__name__, + typ2=type(other).__name__)) elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(other) @@ -646,6 +644,7 @@ def __add__(self, other): def __sub__(self, other): from pandas.core.index import Index + from pandas.tseries.index import DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.offsets import DateOffset if isinstance(other, TimedeltaIndex): @@ -653,13 +652,14 @@ def __sub__(self, other): elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if not isinstance(other, TimedeltaIndex): raise TypeError("cannot subtract TimedeltaIndex and {typ}" - .format(typ=type(other))) + .format(typ=type(other).__name__)) return self._add_delta(-other) + elif isinstance(other, DatetimeIndex): + return self._sub_datelike(other) elif isinstance(other, Index): - warnings.warn("using '-' to provide set differences with " - "datetimelike Indexes is deprecated, " - "use .difference()", FutureWarning, stacklevel=2) - return self.difference(other) + raise TypeError("cannot subtract {typ1} and {typ2}" + .format(typ1=type(self).__name__, + typ2=type(other).__name__)) elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(-other) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 351edf1b38352..e26a0548fdc78 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -731,19 +731,43 @@ def _add_datelike(self, other): def _sub_datelike(self, other): # subtract a datetime from myself, yielding a TimedeltaIndex from pandas import TimedeltaIndex - other = Timestamp(other) - if other is tslib.NaT: - result = self._nat_new(box=False) - # require tz compat - elif not self._has_same_tz(other): - raise TypeError("Timestamp subtraction must have the same " - "timezones or no timezones") + if isinstance(other, DatetimeIndex): + # require tz compat + if not self._has_same_tz(other): + raise TypeError("DatetimeIndex subtraction must have the same " + "timezones or no timezones") + result = self._sub_datelike_dti(other) + elif isinstance(other, (tslib.Timestamp, datetime)): + other = Timestamp(other) + if 
other is tslib.NaT: + result = self._nat_new(box=False) + # require tz compat + elif not self._has_same_tz(other): + raise TypeError("Timestamp subtraction must have the same " + "timezones or no timezones") + else: + i8 = self.asi8 + result = i8 - other.value + result = self._maybe_mask_results(result, + fill_value=tslib.iNaT) else: - i8 = self.asi8 - result = i8 - other.value - result = self._maybe_mask_results(result, fill_value=tslib.iNaT) + raise TypeError("cannot subtract DatetimeIndex and {typ}" + .format(typ=type(other).__name__)) return TimedeltaIndex(result, name=self.name, copy=False) + def _sub_datelike_dti(self, other): + """subtraction of two DatetimeIndexes""" + if not len(self) == len(other): + raise ValueError("cannot add indices of unequal length") + + self_i8 = self.asi8 + other_i8 = other.asi8 + new_values = self_i8 - other_i8 + if self.hasnans or other.hasnans: + mask = (self._isnan) | (other._isnan) + new_values[mask] = tslib.iNaT + return new_values.view('i8') + def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. freq) depending on op """ freq = attrs.get('freq', None) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 96ff74c819624..8a86fcba32ecb 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -360,7 +360,7 @@ def test_resolution(self): tz=tz) self.assertEqual(idx.resolution, expected) - def test_add_iadd(self): + def test_union(self): for tz in self.tz: # union rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) @@ -378,17 +378,12 @@ def test_add_iadd(self): for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), (rng3, other3, expected3)]: - # GH9094 - with tm.assert_produces_warning(FutureWarning): - result_add = rng + other - result_union = rng.union(other) - tm.assert_index_equal(result_add, expected) + result_union = rng.union(other) tm.assert_index_equal(result_union, expected) - # GH9094 - with tm.assert_produces_warning(FutureWarning): - rng += other - tm.assert_index_equal(rng, expected) + + def test_add_iadd(self): + for tz in self.tz: # offset offsets = [pd.offsets.Hour(2), timedelta(hours=2), @@ -421,7 +416,26 @@ def test_add_iadd(self): with tm.assertRaisesRegexp(TypeError, msg): Timestamp('2011-01-01') + idx - def test_sub_isub(self): + def test_add_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now raises + # TypeError (GH14164) + + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + + with tm.assertRaises(TypeError): + dti + dti + + with tm.assertRaises(TypeError): + dti_tz + dti_tz + + with tm.assertRaises(TypeError): + dti_tz + dti + + with tm.assertRaises(TypeError): + dti + dti_tz + + def test_difference(self): for tz in self.tz: # diff rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) @@ -439,9 +453,11 @@ def test_sub_isub(self): for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), (rng3, other3, expected3)]: - result_union = rng.difference(other) + result_diff = rng.difference(other) + tm.assert_index_equal(result_diff, expected) - tm.assert_index_equal(result_union, expected) + def test_sub_isub(self): + for tz in self.tz: # offset offsets = [pd.offsets.Hour(2), timedelta(hours=2), @@ -449,9 +465,10 @@ def test_sub_isub(self): for delta in offsets: rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - result = rng - delta expected = pd.date_range('1999-12-31 22:00', '2000-01-31 
22:00', tz=tz) + + result = rng - delta tm.assert_index_equal(result, expected) rng -= delta tm.assert_index_equal(rng, expected) @@ -466,6 +483,47 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_sub_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now changed to + # return subtraction -> TimeDeltaIndex (GH ...) + + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') + expected = TimedeltaIndex([0, 0, 0]) + + result = dti - dti + tm.assert_index_equal(result, expected) + + result = dti_tz - dti_tz + tm.assert_index_equal(result, expected) + + with tm.assertRaises(TypeError): + dti_tz - dti + + with tm.assertRaises(TypeError): + dti - dti_tz + + with tm.assertRaises(TypeError): + dti_tz - dti_tz2 + + # isub + dti -= dti + tm.assert_index_equal(dti, expected) + + # different length raises ValueError + dti1 = date_range('20130101', periods=3) + dti2 = date_range('20130101', periods=4) + with tm.assertRaises(ValueError): + dti1 - dti2 + + # NaN propagation + dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) + dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) + expected = TimedeltaIndex(['1 days', np.nan, np.nan]) + result = dti2 - dti1 + tm.assert_index_equal(result, expected) + def test_sub_period(self): # GH 13078 # not supported, check TypeError @@ -1239,50 +1297,6 @@ def _check(result, expected): ['20121231', '20130101', '20130102'], tz='US/Eastern') tm.assert_index_equal(result, expected) - def test_dti_dti_deprecated_ops(self): - - # deprecated in 0.16.0 (GH9094) - # change to return subtraction -> TimeDeltaIndex in 0.17.0 - # shoudl move to the appropriate sections above - - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - - with tm.assert_produces_warning(FutureWarning): - result = dti - dti - expected = Index([]) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti + dti - expected = dti - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti_tz - dti_tz - expected = Index([]) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti_tz + dti_tz - expected = dti_tz - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti_tz - dti - expected = dti_tz - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti - dti_tz - expected = dti - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - self.assertRaises(TypeError, lambda: dti_tz + dti) - with tm.assert_produces_warning(FutureWarning): - self.assertRaises(TypeError, lambda: dti + dti_tz) - def test_dti_tdi_numeric_ops(self): # These are normally union/diff set-like ops @@ -2005,7 +2019,7 @@ def test_resolution(self): idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) self.assertEqual(idx.resolution, expected) - def test_add_iadd(self): + def test_union(self): # union rng1 = pd.period_range('1/1/2000', freq='D', periods=5) other1 = pd.period_range('1/6/2000', freq='D', periods=5) @@ -2031,7 +2045,8 @@ def test_add_iadd(self): rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', '2000-01-01 09:05'], freq='T') other5 = pd.PeriodIndex(['2000-01-01 09:01', 
'2000-01-01 09:05' - '2000-01-01 09:08'], freq='T') + '2000-01-01 09:08'], + freq='T') expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', '2000-01-01 09:05', '2000-01-01 09:08'], freq='T') @@ -2052,20 +2067,19 @@ def test_add_iadd(self): expected6), (rng7, other7, expected7)]: - # GH9094 - with tm.assert_produces_warning(FutureWarning): - result_add = rng + other - result_union = rng.union(other) - - tm.assert_index_equal(result_add, expected) tm.assert_index_equal(result_union, expected) - # GH 6527 - # GH9094 - with tm.assert_produces_warning(FutureWarning): - rng += other - tm.assert_index_equal(rng, expected) + def test_add_iadd(self): + rng = pd.period_range('1/1/2000', freq='D', periods=5) + other = pd.period_range('1/6/2000', freq='D', periods=5) + + # previously performed setop union, now raises TypeError (GH14164) + with tm.assertRaises(TypeError): + rng + other + + with tm.assertRaises(TypeError): + rng += other # offset # DateOffset @@ -2152,7 +2166,7 @@ def test_add_iadd(self): rng += 1 tm.assert_index_equal(rng, expected) - def test_sub_isub(self): + def test_difference(self): # diff rng1 = pd.period_range('1/1/2000', freq='D', periods=5) other1 = pd.period_range('1/6/2000', freq='D', periods=5) @@ -2194,6 +2208,19 @@ def test_sub_isub(self): result_union = rng.difference(other) tm.assert_index_equal(result_union, expected) + def test_sub_isub(self): + + # previously performed setop, now raises TypeError (GH14164) + # TODO needs to wait on #13077 for decision on result type + rng = pd.period_range('1/1/2000', freq='D', periods=5) + other = pd.period_range('1/6/2000', freq='D', periods=5) + + with tm.assertRaises(TypeError): + rng - other + + with tm.assertRaises(TypeError): + rng -= other + # offset # DateOffset rng = pd.period_range('2014', '2024', freq='A') From 3f3839b6cb00a7fbabfa8c0899be69c6ea088b3a Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 7 Sep 2016 09:16:02 -0400 Subject: [PATCH 344/359] DEPR: Deprecate pandas.core.datetools (#14105) * MAINT: Replace datetools import in tests * MAINT: Replace datetools import internally * DOC: Replace datetools import in docs * MAINT: Remove datetool imports from scripts * DEPR: Deprecate pandas.core.datetools Closes gh-14094. 
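A minimal sketch of what this deprecation means for user code, based on the replacements made
in the diff below: the old ``pd.datetools`` spellings keep working for now but each access
emits a ``FutureWarning``, and the direct ``pandas.tseries`` imports are the supported path.

.. code-block:: python

   import pandas as pd
   from pandas.tseries.offsets import BDay, MonthEnd

   # deprecated spellings: still available, but warn on access
   pd.datetools.to_datetime('2016-01-01')   # use pd.to_datetime(...) instead
   pd.datetools.monthEnd                    # use MonthEnd() instead

   # replacement spellings used throughout this patch
   ts = pd.Series(range(3), index=pd.date_range('2016-01-01', periods=3))
   ts.shift(5, freq=BDay())                 # instead of freq=datetools.bday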
--- doc/source/timeseries.rst | 6 +- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/api/tests/test_api.py | 21 ++- pandas/core/api.py | 14 +- pandas/core/datetools.py | 6 + pandas/core/generic.py | 10 +- pandas/io/tests/test_sql.py | 2 +- pandas/sparse/tests/test_frame.py | 6 +- pandas/sparse/tests/test_series.py | 5 +- pandas/stats/tests/test_ols.py | 12 +- pandas/tests/frame/test_indexing.py | 11 +- pandas/tests/frame/test_timeseries.py | 18 ++- pandas/tests/series/test_indexing.py | 13 +- pandas/tests/series/test_timeseries.py | 25 +-- pandas/tests/test_panel.py | 13 +- pandas/tests/test_panel4d.py | 4 +- pandas/tests/test_window.py | 4 +- pandas/tseries/tests/test_daterange.py | 100 ++++++------ pandas/tseries/tests/test_offsets.py | 150 ++++++++++-------- pandas/tseries/tests/test_period.py | 7 +- pandas/tseries/tests/test_timeseries.py | 19 ++- .../tseries/tests/test_timeseries_legacy.py | 30 ++-- pandas/tseries/tests/test_timezones.py | 21 ++- pandas/util/depr_module.py | 79 +++++++++ scripts/bench_join.py | 4 +- scripts/groupby_speed.py | 4 +- scripts/hdfstore_panel_perf.py | 2 +- 27 files changed, 356 insertions(+), 231 deletions(-) create mode 100644 pandas/util/depr_module.py diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 36e492df29983..924f286164225 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -7,7 +7,7 @@ from datetime import datetime, timedelta, time import numpy as np import pandas as pd - from pandas import datetools + from pandas import offsets np.random.seed(123456) randn = np.random.randn randint = np.random.randint @@ -1223,7 +1223,7 @@ The shift method accepts an ``freq`` argument which can accept a .. ipython:: python - ts.shift(5, freq=datetools.bday) + ts.shift(5, freq=offsets.BDay()) ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and @@ -1246,7 +1246,7 @@ around ``reindex`` which generates a ``date_range`` and calls ``reindex``. .. 
ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * datetools.bday) + dr = pd.date_range('1/1/2010', periods=3, freq=3 * offsets.BDay()) ts = pd.Series(randn(3), index=dr) ts ts.asfreq(BDay()) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9345f11aca341..282c4ef127391 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1295,6 +1295,7 @@ Deprecations - ``PeriodIndex.to_datetime`` has been deprecated in favour of ``PeriodIndex.to_timestamp`` (:issue:`8254`) - ``Timestamp.to_datetime`` has been deprecated in favour of ``Timestamp.to_pydatetime`` (:issue:`8254`) +- ``pandas.core.datetools`` module has been deprecated and will be removed in a subsequent release (:issue:`14094`) - ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favour of ``pd.to_datetime`` (:issue:`8254`) - ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) - ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index b706d789931b0..d4d8b7e4e9747 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -42,7 +42,7 @@ class TestPDApi(Base, tm.TestCase): 'json', 'lib', 'index', 'parser'] # these are already deprecated; awaiting removal - deprecated_modules = ['ols', 'stats'] + deprecated_modules = ['ols', 'stats', 'datetools'] # misc misc = ['IndexSlice', 'NaT'] @@ -61,14 +61,14 @@ class TestPDApi(Base, tm.TestCase): 'SparseTimeSeries', 'Panel4D', 'SparseList'] - # these should be deperecated in the future + # these should be deprecated in the future deprecated_classes_in_future = ['Term', 'Panel'] # these should be removed from top-level namespace remove_classes_from_top_level_namespace = ['Expr'] # external modules exposed in pandas namespace - modules = ['np', 'datetime', 'datetools'] + modules = ['np', 'datetime'] # top-level functions funcs = ['bdate_range', 'concat', 'crosstab', 'cut', @@ -99,7 +99,7 @@ class TestPDApi(Base, tm.TestCase): funcs_to = ['to_datetime', 'to_msgpack', 'to_numeric', 'to_pickle', 'to_timedelta'] - # these should be deperecated in the future + # these should be deprecated in the future deprecated_funcs_in_future = ['pnow', 'groupby', 'info'] # these are already deprecated; awaiting removal @@ -208,6 +208,19 @@ def test_removed_from_core_common(self): 'ensure_float']: self.assertRaises(AttributeError, lambda: getattr(com, t)) + +class TestDatetools(tm.TestCase): + + def test_deprecation_access_func(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.datetools.to_datetime('2016-01-01') + + def test_deprecation_access_obj(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.datetools.monthEnd + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/core/api.py b/pandas/core/api.py index 579f21eb4ada8..c0f39e2ac4717 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -28,8 +28,18 @@ from pandas.tseries.tdi import TimedeltaIndex, Timedelta from pandas.tseries.period import Period, PeriodIndex -# legacy -import pandas.core.datetools as datetools +# see gh-14094. 
+from pandas.util.depr_module import _DeprecatedModule + +_alts = ['pandas.tseries.tools', 'pandas.tseries.offsets', + 'pandas.tseries.frequencies'] +_removals = ['day', 'bday', 'businessDay', 'cday', 'customBusinessDay', + 'customBusinessMonthEnd', 'customBusinessMonthBegin', + 'monthEnd', 'yearEnd', 'yearBegin', 'bmonthEnd', 'bmonthBegin', + 'cbmonthEnd', 'cbmonthBegin', 'bquarterEnd', 'quarterEnd', + 'byearEnd', 'week'] +datetools = _DeprecatedModule(deprmod='pandas.core.datetools', alts=_alts, + removals=_removals) from pandas.core.config import (get_option, set_option, reset_option, describe_option, option_context, options) diff --git a/pandas/core/datetools.py b/pandas/core/datetools.py index 79718c79f9bdd..bfc3f3d4e4743 100644 --- a/pandas/core/datetools.py +++ b/pandas/core/datetools.py @@ -2,10 +2,16 @@ # flake8: noqa +import warnings + from pandas.tseries.tools import * from pandas.tseries.offsets import * from pandas.tseries.frequencies import * +warnings.warn("The pandas.core.datetools module is deprecated and will be " + "removed in a future version. Please use the pandas.tseries " + "module instead.", FutureWarning, stacklevel=2) + day = DateOffset() bday = BDay() businessDay = bday diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5a17401ea67b1..2834603287f1e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -42,9 +42,9 @@ import pandas.core.algorithms as algos import pandas.core.common as com import pandas.core.missing as missing -import pandas.core.datetools as datetools from pandas.formats.printing import pprint_thing from pandas.formats.format import format_percentiles +from pandas.tseries.frequencies import to_offset from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, @@ -4792,7 +4792,7 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, periods : int Number of periods to move, can be positive or negative freq : DateOffset, timedelta, or time rule string, optional - Increment to use from datetools module or time rule (e.g. 'EOM'). + Increment to use from the tseries module or time rule (e.g. 'EOM'). See Notes. axis : %(axes_single_arg)s @@ -4865,7 +4865,7 @@ def tshift(self, periods=1, freq=None, axis=0): periods : int Number of periods to move, can be positive or negative freq : DateOffset, timedelta, or time rule string, default None - Increment to use from datetools module or time rule (e.g. 'EOM') + Increment to use from the tseries module or time rule (e.g. 
'EOM') axis : int or basestring Corresponds to the axis that contains the Index @@ -4895,11 +4895,11 @@ def tshift(self, periods=1, freq=None, axis=0): return self if isinstance(freq, string_types): - freq = datetools.to_offset(freq) + freq = to_offset(freq) block_axis = self._get_block_manager_axis(axis) if isinstance(index, PeriodIndex): - orig_freq = datetools.to_offset(index.freq) + orig_freq = to_offset(index.freq) if freq == orig_freq: new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index ffe7b9d6b460a..198a4017b5af7 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -37,7 +37,7 @@ from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types -from pandas.core.datetools import format as date_format +from pandas.tseries.tools import format as date_format import pandas.io.sql as sql from pandas.io.sql import read_sql_table, read_sql_query diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 192f6532a148d..5cc765a2c1cf3 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -8,7 +8,7 @@ from pandas import Series, DataFrame, bdate_range, Panel from pandas.tseries.index import DatetimeIndex -import pandas.core.datetools as datetools +from pandas.tseries.offsets import BDay import pandas.util.testing as tm from pandas.compat import lrange from pandas import compat @@ -850,8 +850,8 @@ def _check(frame, orig): exp = exp.to_sparse(frame.default_fill_value) tm.assert_frame_equal(shifted, exp) - shifted = frame.shift(2, freq=datetools.bday) - exp = orig.shift(2, freq=datetools.bday) + shifted = frame.shift(2, freq=BDay()) + exp = orig.shift(2, freq=BDay()) exp = exp.to_sparse(frame.default_fill_value) tm.assert_frame_equal(shifted, exp) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 9d5a1327da53f..de8c63df9c9e6 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -7,9 +7,8 @@ import pandas as pd from pandas import Series, DataFrame, bdate_range -from pandas.core.datetools import BDay -import pandas.core.datetools as datetools from pandas.core.common import isnull +from pandas.tseries.offsets import BDay import pandas.util.testing as tm from pandas.compat import range from pandas import compat @@ -843,7 +842,7 @@ def test_shift(self): f = lambda s: s.shift(2, freq='B') _dense_series_compare(series, f) - f = lambda s: s.shift(2, freq=datetools.bday) + f = lambda s: s.shift(2, freq=BDay()) _dense_series_compare(series, f) def test_shift_nan(self): diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 770f7b35a02ca..6f688649affb0 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -16,7 +16,7 @@ from pandas import date_range, bdate_range from pandas.core.panel import Panel -from pandas import DataFrame, Index, Series, notnull, datetools +from pandas import DataFrame, Index, Series, notnull, offsets from pandas.stats.api import ols from pandas.stats.ols import _filter_data from pandas.stats.plm import NonPooledPanelOLS, PanelOLS @@ -24,7 +24,7 @@ assert_frame_equal, assertRaisesRegexp, slow) import pandas.util.testing as tm import pandas.compat as compat -from .common import BaseTest +from pandas.stats.tests.common import BaseTest _have_statsmodels = True try: @@ 
-898,22 +898,22 @@ class TestOLSFilter(tm.TestCase): def setUp(self): date_index = date_range(datetime(2009, 12, 11), periods=3, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([3, 1, 4], index=date_index) self.TS1 = ts date_index = date_range(datetime(2009, 12, 11), periods=5, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([1, 5, 9, 2, 6], index=date_index) self.TS2 = ts date_index = date_range(datetime(2009, 12, 11), periods=3, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([5, np.nan, 3], index=date_index) self.TS3 = ts date_index = date_range(datetime(2009, 12, 11), periods=5, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([np.nan, 5, 8, 9, 7], index=date_index) self.TS4 = ts diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 578df5ba9101e..720dcdd62dd89 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -17,6 +17,7 @@ date_range) import pandas as pd +from pandas.tseries.offsets import BDay from pandas.types.common import (is_float_dtype, is_integer, is_scalar) @@ -2068,8 +2069,6 @@ def test_at_time_between_time_datetimeindex(self): assert_frame_equal(result, df) def test_xs(self): - from pandas.core.datetools import bday - idx = self.frame.index[5] xs = self.frame.xs(idx) for item, value in compat.iteritems(xs): @@ -2090,7 +2089,7 @@ def test_xs(self): self.assertEqual(xs['B'], '1') with tm.assertRaises(KeyError): - self.tsframe.xs(self.tsframe.index[0] - bday) + self.tsframe.xs(self.tsframe.index[0] - BDay()) # xs get column series = self.frame.xs('A', axis=1) @@ -2772,3 +2771,9 @@ def test_transpose(self): expected = DataFrame(self.df.values.T) expected.index = ['A', 'B'] assert_frame_equal(result, expected) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 4916d81b18c22..9758c2b9c805e 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -10,7 +10,7 @@ from pandas import DataFrame, Series, Index, Timestamp, DatetimeIndex import pandas as pd -import pandas.core.datetools as datetools +import pandas.tseries.offsets as offsets from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -136,14 +136,14 @@ def test_shift(self): assert_frame_equal(unshifted, self.tsframe) # shift by DateOffset - shiftedFrame = self.tsframe.shift(5, freq=datetools.BDay()) + shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay()) self.assertEqual(len(shiftedFrame), len(self.tsframe)) shiftedFrame2 = self.tsframe.shift(5, freq='B') assert_frame_equal(shiftedFrame, shiftedFrame2) d = self.tsframe.index[0] - shifted_d = d + datetools.BDay(5) + shifted_d = d + offsets.BDay(5) assert_series_equal(self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False) @@ -160,7 +160,7 @@ def test_shift(self): ps.ix[:-1, 0].values) shifted2 = ps.shift(1, 'B') - shifted3 = ps.shift(1, datetools.bday) + shifted3 = ps.shift(1, offsets.BDay()) assert_frame_equal(shifted2, shifted3) assert_frame_equal(ps, shifted2.shift(-1, 'B')) @@ -222,7 +222,7 @@ def test_tshift(self): shifted2 = ps.tshift(freq='B') assert_frame_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=datetools.bday) + shifted3 = ps.tshift(freq=offsets.BDay()) assert_frame_equal(shifted, shifted3) assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') @@ -297,7 +297,7 @@ def 
test_truncate_copy(self): self.assertFalse((self.tsframe.values[5:11] == 5).any()) def test_asfreq(self): - offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) + offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd()) rule_monthly = self.tsframe.asfreq('BM') assert_almost_equal(offset_monthly['A'], rule_monthly['A']) @@ -365,3 +365,9 @@ def test_operation_on_NaT(self): res = df.max() exp = pd.Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 64ebaa63cc10f..54cf626858354 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -12,15 +12,15 @@ from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError from pandas.tseries.index import Timestamp +from pandas.tseries.offsets import BDay from pandas.tseries.tdi import Timedelta -import pandas.core.datetools as datetools from pandas.compat import lrange, range from pandas import compat from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm -from .common import TestData +from pandas.tests.series.common import TestData JOIN_TYPES = ['inner', 'outer', 'left', 'right'] @@ -153,7 +153,7 @@ def test_getitem_get(self): self.assertEqual(self.series[5], self.series.get(self.series.index[5])) # missing - d = self.ts.index[0] - datetools.bday + d = self.ts.index[0] - BDay() self.assertRaises(KeyError, self.ts.__getitem__, d) # None @@ -321,7 +321,7 @@ def test_getitem_boolean_object(self): def test_getitem_setitem_boolean_corner(self): ts = self.ts - mask_shifted = ts.shift(1, freq=datetools.bday) > ts.median() + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() # these used to raise...?? 
@@ -1856,3 +1856,8 @@ def test_multilevel_preserve_name(self): result2 = s.ix['foo'] self.assertEqual(result.name, s.name) self.assertEqual(result2.name, s.name) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 07d2abc1bcbb2..6e3d52366a4ec 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -7,14 +7,13 @@ from pandas import Index, Series, date_range, NaT from pandas.tseries.index import DatetimeIndex +from pandas.tseries.offsets import BDay, BMonthEnd from pandas.tseries.tdi import TimedeltaIndex -import pandas.core.datetools as datetools - from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm -from .common import TestData +from pandas.tests.series.common import TestData class TestSeriesTimeSeries(TestData, tm.TestCase): @@ -29,7 +28,7 @@ def test_shift(self): tm.assert_numpy_array_equal(unshifted.valid().values, self.ts.values[:-1]) - offset = datetools.bday + offset = BDay() shifted = self.ts.shift(1, freq=offset) unshifted = shifted.shift(-1, freq=offset) @@ -56,7 +55,7 @@ def test_shift(self): tm.assert_numpy_array_equal(unshifted.valid().values, ps.values[:-1]) shifted2 = ps.shift(1, 'B') - shifted3 = ps.shift(1, datetools.bday) + shifted3 = ps.shift(1, BDay()) assert_series_equal(shifted2, shifted3) assert_series_equal(ps, shifted2.shift(-1, 'B')) @@ -66,7 +65,7 @@ def test_shift(self): shifted4 = ps.shift(1, freq='B') assert_series_equal(shifted2, shifted4) - shifted5 = ps.shift(1, freq=datetools.bday) + shifted5 = ps.shift(1, freq=BDay()) assert_series_equal(shifted5, shifted4) # 32-bit taking @@ -131,7 +130,7 @@ def test_tshift(self): shifted2 = ps.tshift(freq='B') assert_series_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=datetools.bday) + shifted3 = ps.tshift(freq=BDay()) assert_series_equal(shifted, shifted3) self.assertRaises(ValueError, ps.tshift, freq='M') @@ -156,7 +155,7 @@ def test_tshift(self): self.assertRaises(ValueError, no_freq.tshift) def test_truncate(self): - offset = datetools.bday + offset = BDay() ts = self.ts[::3] @@ -417,8 +416,8 @@ def test_asfreq(self): monthly_ts = daily_ts.asfreq('BM') self.assert_series_equal(monthly_ts, ts) - daily_ts = ts.asfreq(datetools.bday) - monthly_ts = daily_ts.asfreq(datetools.bmonthEnd) + daily_ts = ts.asfreq(BDay()) + monthly_ts = daily_ts.asfreq(BMonthEnd()) self.assert_series_equal(monthly_ts, ts) result = ts[:0].asfreq('M') @@ -561,3 +560,9 @@ def test_empty_series_ops(self): assert_series_equal(a, a - b) assert_series_equal(a, b + a) self.assertRaises(TypeError, lambda x, y: x - y, b, a) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 0b266d799cf8c..a197037789fd2 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -10,8 +10,8 @@ import pandas as pd from pandas.types.common import is_float_dtype -from pandas import Series, DataFrame, Index, isnull, notnull, pivot, MultiIndex -from pandas.core.datetools import bday +from pandas import (Series, DataFrame, Index, date_range, isnull, notnull, + pivot, MultiIndex) from pandas.core.nanops import nanall, nanany from pandas.core.panel import Panel from pandas.core.series import remove_na @@ -20,6 +20,7 @@ from 
pandas import compat from pandas.compat import range, lrange, StringIO, OrderedDict, signature +from pandas.tseries.offsets import BDay, MonthEnd from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, ensure_clean, assertRaisesRegexp, @@ -500,11 +501,9 @@ def test_setitem(self): p[0] = np.random.randn(4, 2) def test_setitem_ndarray(self): - from pandas import date_range, datetools - timeidx = date_range(start=datetime(2009, 1, 1), end=datetime(2009, 12, 31), - freq=datetools.MonthEnd()) + freq=MonthEnd()) lons_coarse = np.linspace(-177.5, 177.5, 72) lats_coarse = np.linspace(-87.5, 87.5, 36) P = Panel(items=timeidx, major_axis=lons_coarse, @@ -542,7 +541,7 @@ def test_major_xs(self): self.assertEqual(result.name, 'ItemA') # not contained - idx = self.panel.major_axis[0] - bday + idx = self.panel.major_axis[0] - BDay() self.assertRaises(Exception, self.panel.major_xs, idx) def test_major_xs_mixed(self): @@ -1878,7 +1877,7 @@ def test_tshift(self): shifted2 = ps.tshift(freq='B') assert_panel_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=bday) + shifted3 = ps.tshift(freq=BDay()) assert_panel_equal(shifted, shifted3) assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 493889e579af2..1b5a7b6ee1e83 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -8,10 +8,10 @@ from pandas.types.common import is_float_dtype from pandas import Series, Index, isnull, notnull -from pandas.core.datetools import bday from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D from pandas.core.series import remove_na +from pandas.tseries.offsets import BDay from pandas.util.testing import (assert_panel_equal, assert_panel4d_equal, @@ -479,7 +479,7 @@ def test_major_xs(self): ref.xs(idx), check_names=False) # not contained - idx = self.panel4d.major_axis[0] - bday + idx = self.panel4d.major_axis[0] - BDay() self.assertRaises(Exception, self.panel4d.major_xs, idx) def test_major_xs_mixed(self): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 7a35682eee3b0..929ff43bfaaad 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -12,9 +12,9 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, notnull, concat, Timestamp) -import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow +import pandas.tseries.offsets as offsets from pandas.core.base import SpecificationError from pandas.core.common import UnsupportedFunctionCall import pandas.util.testing as tm @@ -1321,7 +1321,7 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): freq='B') last_date = series_result.index[-1] - prev_date = last_date - 24 * datetools.bday + prev_date = last_date - 24 * offsets.BDay() trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 854b60c17853b..87f9f55e0189c 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -7,11 +7,11 @@ from pandas.tseries.index import DatetimeIndex from pandas import Timestamp -from pandas.tseries.offsets import generate_range +from pandas.tseries.offsets import (BDay, BMonthEnd, CDay, MonthEnd, + generate_range, DateOffset, Minute) from 
pandas.tseries.index import cdate_range, bdate_range, date_range from pandas.core import common as com -import pandas.core.datetools as datetools from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -27,12 +27,12 @@ def eq_gen_range(kwargs, expected): class TestGenRangeGeneration(tm.TestCase): def test_generate(self): - rng1 = list(generate_range(START, END, offset=datetools.bday)) + rng1 = list(generate_range(START, END, offset=BDay())) rng2 = list(generate_range(START, END, time_rule='B')) self.assertEqual(rng1, rng2) def test_generate_cday(self): - rng1 = list(generate_range(START, END, offset=datetools.cday)) + rng1 = list(generate_range(START, END, offset=CDay())) rng2 = list(generate_range(START, END, time_rule='C')) self.assertEqual(rng1, rng2) @@ -78,44 +78,42 @@ def setUp(self): self.rng = bdate_range(START, END) def test_constructor(self): - bdate_range(START, END, freq=datetools.bday) - bdate_range(START, periods=20, freq=datetools.bday) - bdate_range(end=START, periods=20, freq=datetools.bday) + bdate_range(START, END, freq=BDay()) + bdate_range(START, periods=20, freq=BDay()) + bdate_range(end=START, periods=20, freq=BDay()) self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') self.assertRaises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') def test_naive_aware_conflicts(self): - naive = bdate_range(START, END, freq=datetools.bday, tz=None) - aware = bdate_range(START, END, freq=datetools.bday, + naive = bdate_range(START, END, freq=BDay(), tz=None) + aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong") assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", naive.join, aware) assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", aware.join, naive) def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=datetools.bday) - DatetimeIndex._cached_range(START, periods=20, - offset=datetools.bday) - DatetimeIndex._cached_range(end=START, periods=20, - offset=datetools.bday) + DatetimeIndex._cached_range(START, END, offset=BDay()) + DatetimeIndex._cached_range(START, periods=20, offset=BDay()) + DatetimeIndex._cached_range(end=START, periods=20, offset=BDay()) assertRaisesRegexp(TypeError, "offset", DatetimeIndex._cached_range, START, END) assertRaisesRegexp(TypeError, "specify period", DatetimeIndex._cached_range, START, - offset=datetools.bday) + offset=BDay()) assertRaisesRegexp(TypeError, "specify period", DatetimeIndex._cached_range, end=END, - offset=datetools.bday) + offset=BDay()) assertRaisesRegexp(TypeError, "start or end", DatetimeIndex._cached_range, periods=20, - offset=datetools.bday) + offset=BDay()) def test_cached_range_bug(self): rng = date_range('2010-09-01 05:00:00', periods=50, - freq=datetools.DateOffset(hours=6)) + freq=DateOffset(hours=6)) self.assertEqual(len(rng), 50) self.assertEqual(rng[0], datetime(2010, 9, 1, 5)) @@ -155,7 +153,7 @@ def test_getitem(self): self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] - self.assertEqual(sliced.offset, datetools.bday * 5) + self.assertEqual(sliced.offset, BDay() * 5) fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEqual(len(fancy_indexed), 5) @@ -183,9 +181,9 @@ def test_shift(self): self.assertEqual(shifted[0], self.rng[0]) self.assertEqual(shifted.offset, self.rng.offset) - rng = date_range(START, END, freq=datetools.bmonthEnd) - shifted = rng.shift(1, freq=datetools.bday) - self.assertEqual(shifted[0], rng[0] + datetools.bday) + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, 
freq=BDay()) + self.assertEqual(shifted[0], rng[0] + BDay()) def test_pickle_unpickle(self): unpickled = self.round_trip_pickle(self.rng) @@ -217,7 +215,7 @@ def test_union(self): tm.assert_index_equal(right.union(left), the_union) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_union = self.rng.union(rng) tm.assertIsInstance(the_union, DatetimeIndex) @@ -248,14 +246,14 @@ def test_outer_join(self): tm.assertIsInstance(the_join, DatetimeIndex) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_join = self.rng.join(rng, how='outer') tm.assertIsInstance(the_join, DatetimeIndex) self.assertIsNone(the_join.freq) def test_union_not_cacheable(self): - rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_union = rng1.union(rng2) @@ -268,7 +266,7 @@ def test_union_not_cacheable(self): self.assert_index_equal(the_union, expected) def test_intersection(self): - rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_int = rng1.intersection(rng2) @@ -309,7 +307,7 @@ def test_summary_dateutil(self): def test_misc(self): end = datetime(2009, 5, 13) dr = bdate_range(end=end, periods=20) - firstDate = end - 19 * datetools.bday + firstDate = end - 19 * BDay() assert len(dr) == 20 assert dr[0] == firstDate @@ -351,18 +349,18 @@ def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range('12/5/2011', '12/5/2011') rng2 = bdate_range('12/2/2011', '12/5/2011') - rng2.offset = datetools.BDay() + rng2.offset = BDay() result = rng1.union(rng2) tm.assertIsInstance(result, DatetimeIndex) def test_error_with_zero_monthends(self): self.assertRaises(ValueError, date_range, '1/1/2000', '1/1/2001', - freq=datetools.MonthEnd(0)) + freq=MonthEnd(0)) def test_range_bug(self): # GH #770 - offset = datetools.DateOffset(months=3) + offset = DateOffset(months=3) result = date_range("2011-1-1", "2012-1-31", freq=offset) start = datetime(2011, 1, 1) @@ -456,9 +454,9 @@ def test_month_range_union_tz_pytz(self): late_end = datetime(2011, 5, 1) early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) early_dr.union(late_dr) @@ -475,9 +473,9 @@ def test_month_range_union_tz_dateutil(self): late_end = datetime(2011, 5, 1) early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) early_dr.union(late_dr) @@ -595,29 +593,29 @@ def setUp(self): self.rng = cdate_range(START, END) def test_constructor(self): - cdate_range(START, END, freq=datetools.cday) - cdate_range(START, periods=20, freq=datetools.cday) - cdate_range(end=START, periods=20, freq=datetools.cday) + cdate_range(START, END, freq=CDay()) + cdate_range(START, periods=20, freq=CDay()) + cdate_range(end=START, periods=20, freq=CDay()) self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'C') self.assertRaises(ValueError, cdate_range, '2011-1-1', '2012-1-1', 'C') def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=datetools.cday) + 
DatetimeIndex._cached_range(START, END, offset=CDay()) DatetimeIndex._cached_range(START, periods=20, - offset=datetools.cday) + offset=CDay()) DatetimeIndex._cached_range(end=START, periods=20, - offset=datetools.cday) + offset=CDay()) self.assertRaises(Exception, DatetimeIndex._cached_range, START, END) self.assertRaises(Exception, DatetimeIndex._cached_range, START, - freq=datetools.cday) + freq=CDay()) self.assertRaises(Exception, DatetimeIndex._cached_range, end=END, - freq=datetools.cday) + freq=CDay()) self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, - freq=datetools.cday) + freq=CDay()) def test_comparison(self): d = self.rng[10] @@ -642,7 +640,7 @@ def test_getitem(self): self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] - self.assertEqual(sliced.offset, datetools.cday * 5) + self.assertEqual(sliced.offset, CDay() * 5) fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEqual(len(fancy_indexed), 5) @@ -672,9 +670,9 @@ def test_shift(self): self.assertEqual(shifted.offset, self.rng.offset) with tm.assert_produces_warning(com.PerformanceWarning): - rng = date_range(START, END, freq=datetools.bmonthEnd) - shifted = rng.shift(1, freq=datetools.cday) - self.assertEqual(shifted[0], rng[0] + datetools.cday) + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=CDay()) + self.assertEqual(shifted[0], rng[0] + CDay()) def test_pickle_unpickle(self): unpickled = self.round_trip_pickle(self.rng) @@ -706,7 +704,7 @@ def test_union(self): self.assert_index_equal(right.union(left), the_union) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_union = self.rng.union(rng) tm.assertIsInstance(the_union, DatetimeIndex) @@ -737,7 +735,7 @@ def test_outer_join(self): tm.assertIsInstance(the_join, DatetimeIndex) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_join = self.rng.join(rng, how='outer') tm.assertIsInstance(the_join, DatetimeIndex) @@ -767,7 +765,7 @@ def test_summary_dateutil(self): def test_misc(self): end = datetime(2009, 5, 13) dr = cdate_range(end=end, periods=20) - firstDate = end - 19 * datetools.cday + firstDate = end - 19 * CDay() assert len(dr) == 20 assert dr[0] == firstDate @@ -792,7 +790,7 @@ def test_daterange_bug_456(self): # GH #456 rng1 = cdate_range('12/5/2011', '12/5/2011') rng2 = cdate_range('12/2/2011', '12/5/2011') - rng2.offset = datetools.CDay() + rng2.offset = CDay() result = rng1.union(rng2) tm.assertIsInstance(result, DatetimeIndex) diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 6ea6382a9904a..b3da62c8d2db5 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -9,31 +9,31 @@ import numpy as np from pandas.compat.numpy import np_datetime64_compat -from pandas.core.datetools import (bday, BDay, CDay, BQuarterEnd, BMonthEnd, - BusinessHour, CustomBusinessHour, - CBMonthEnd, CBMonthBegin, BYearEnd, - MonthEnd, MonthBegin, SemiMonthBegin, - SemiMonthEnd, BYearBegin, QuarterBegin, - BQuarterBegin, BMonthBegin, DateOffset, - Week, YearBegin, YearEnd, Hour, Minute, - Second, Day, Micro, Milli, Nano, Easter, - WeekOfMonth, format, ole2datetime, - QuarterEnd, to_datetime, normalize_date, - get_offset, get_standard_freq) from pandas.core.series import Series from pandas.tseries.frequencies import (_offset_map, get_freq_code, - 
_get_freq_str, _INVALID_FREQ_ERROR) + _get_freq_str, _INVALID_FREQ_ERROR, + get_offset, get_standard_freq) from pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache -from pandas.tseries.tools import parse_time_string, DateParseError +from pandas.tseries.offsets import (BDay, CDay, BQuarterEnd, BMonthEnd, + BusinessHour, WeekOfMonth, CBMonthEnd, + CustomBusinessHour, WeekDay, + CBMonthBegin, BYearEnd, MonthEnd, + MonthBegin, SemiMonthBegin, SemiMonthEnd, + BYearBegin, QuarterBegin, BQuarterBegin, + BMonthBegin, DateOffset, Week, YearBegin, + YearEnd, Hour, Minute, Second, Day, Micro, + QuarterEnd, BusinessMonthEnd, FY5253, + Milli, Nano, Easter, FY5253Quarter, + LastWeekOfMonth, CacheableOffset) +from pandas.tseries.tools import (format, ole2datetime, parse_time_string, + to_datetime, DateParseError) import pandas.tseries.offsets as offsets from pandas.io.pickle import read_pickle -from pandas.tslib import NaT, Timestamp, Timedelta +from pandas.tslib import normalize_date, NaT, Timestamp, Timedelta import pandas.tslib as tslib from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm -from pandas.tseries.offsets import BusinessMonthEnd, CacheableOffset, \ - LastWeekOfMonth, FY5253, FY5253Quarter, WeekDay from pandas.tseries.holiday import USFederalHolidayCalendar _multiprocess_can_split_ = True @@ -646,38 +646,43 @@ def test_onOffset(self): def test_apply(self): tests = [] - tests.append((bday, {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - tests.append((2 * bday, {datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - tests.append((-bday, {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - tests.append((-2 * bday, {datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) + tests.append((BDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2 * BDay(), {datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)} + )) + + tests.append((-BDay(), {datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)} + )) + + tests.append((-2 * BDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 
1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)} + )) tests.append((BDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), datetime(2008, 1, 4): datetime(2008, 1, 4), datetime(2008, 1, 5): datetime(2008, 1, 7), datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) + datetime(2008, 1, 7): datetime(2008, 1, 7)} + )) for offset, cases in tests: for base, expected in compat.iteritems(cases): @@ -1787,35 +1792,40 @@ def test_onOffset(self): assertOnOffset(offset, d, expected) def test_apply(self): - from pandas.core.datetools import cday tests = [] - tests.append((cday, {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - tests.append((2 * cday, {datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - tests.append((-cday, {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - tests.append((-2 * cday, {datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) + tests.append((CDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2 * CDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)} + )) + + tests.append((-CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)} + )) + + tests.append((-2 * CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)} + )) tests.append((CDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), datetime(2008, 1, 4): datetime(2008, 1, 4), diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index a492abce01086..62cfcf7f1360e 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -16,7 +16,6 @@ import pandas.tseries.period as period import pandas.tseries.offsets 
as offsets -import pandas.core.datetools as datetools import pandas as pd import numpy as np from numpy.random import randn @@ -2910,9 +2909,9 @@ def test_asfreq_ts(self): tm.assert_index_equal(result.index, index.asfreq('D', how='start')) def test_badinput(self): - self.assertRaises(datetools.DateParseError, Period, '1/1/-2000', 'A') - # self.assertRaises(datetools.DateParseError, Period, '-2000', 'A') - # self.assertRaises(datetools.DateParseError, Period, '0', 'A') + self.assertRaises(ValueError, Period, '-2000', 'A') + self.assertRaises(tslib.DateParseError, Period, '0', 'A') + self.assertRaises(tslib.DateParseError, Period, '1/1/-2000', 'A') def test_negative_ordinals(self): Period(ordinal=-1000, freq='A') diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 5ce0bdffe7ad4..ac48fcc2551ea 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -16,7 +16,6 @@ import pandas as pd import pandas.compat as compat import pandas.core.common as com -import pandas.core.datetools as datetools import pandas.tseries.frequencies as frequencies import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -566,7 +565,7 @@ def test_frame_fillna_limit(self): def test_frame_setitem_timestamp(self): # 2155 columns = DatetimeIndex(start='1/1/2012', end='2/1/2012', - freq=datetools.bday) + freq=offsets.BDay()) index = lrange(10) data = DataFrame(columns=columns, index=index) t = datetime(2012, 11, 1) @@ -1918,7 +1917,7 @@ def test_astype_object(self): self.assertEqual(casted.tolist(), exp_values) def test_catch_infinite_loop(self): - offset = datetools.DateOffset(minute=5) + offset = offsets.DateOffset(minute=5) # blow up, don't loop forever self.assertRaises(Exception, date_range, datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) @@ -2544,7 +2543,7 @@ def test_index_to_datetime(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = idx.to_datetime() - expected = DatetimeIndex(datetools.to_datetime(idx.values)) + expected = DatetimeIndex(pd.to_datetime(idx.values)) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning, @@ -3779,7 +3778,7 @@ def test_ns_index(self): dtstart = np.datetime64('2012-09-20T00:00:00') dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, 'ns') - freq = ns * pd.datetools.Nano() + freq = ns * offsets.Nano() index = pd.DatetimeIndex(dt, freq=freq, name='time') self.assert_index_parameters(index) @@ -4134,7 +4133,7 @@ def test_datetimeindex_constructor(self): edate = datetime(2000, 1, 1) idx = DatetimeIndex(start=sdate, freq='1B', periods=20) self.assertEqual(len(idx), 20) - self.assertEqual(idx[0], sdate + 0 * datetools.bday) + self.assertEqual(idx[0], sdate + 0 * offsets.BDay()) self.assertEqual(idx.freq, 'B') idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) @@ -4144,19 +4143,19 @@ def test_datetimeindex_constructor(self): idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') idx2 = DatetimeIndex(start=sdate, end=edate, - freq=datetools.Week(weekday=6)) + freq=offsets.Week(weekday=6)) self.assertEqual(len(idx1), len(idx2)) self.assertEqual(idx1.offset, idx2.offset) idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') idx2 = DatetimeIndex(start=sdate, end=edate, - freq=datetools.QuarterBegin(startingMonth=1)) + freq=offsets.QuarterBegin(startingMonth=1)) self.assertEqual(len(idx1), len(idx2)) self.assertEqual(idx1.offset, idx2.offset) idx1 = DatetimeIndex(start=sdate, end=edate, 
freq='BQ') idx2 = DatetimeIndex(start=sdate, end=edate, - freq=datetools.BQuarterEnd(startingMonth=12)) + freq=offsets.BQuarterEnd(startingMonth=12)) self.assertEqual(len(idx1), len(idx2)) self.assertEqual(idx1.offset, idx2.offset) @@ -5019,7 +5018,7 @@ def test_shift(self): # GH #1063, multiple of same base result = ts.shift(1, freq='4H') - exp_index = ts.index + datetools.Hour(4) + exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) diff --git a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py index 6f58ad3a57b48..d8c01c53fb2e5 100644 --- a/pandas/tseries/tests/test_timeseries_legacy.py +++ b/pandas/tseries/tests/test_timeseries_legacy.py @@ -8,8 +8,8 @@ from pandas import (Index, Series, date_range, Timestamp, DatetimeIndex, Int64Index, to_datetime) -import pandas.core.datetools as datetools -import pandas.tseries.offsets as offsets +from pandas.tseries.frequencies import get_offset, to_offset +from pandas.tseries.offsets import BDay, Micro, Milli, MonthBegin import pandas as pd from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -19,12 +19,11 @@ from pandas import read_pickle from numpy.random import rand import pandas.compat as compat -from pandas.core.datetools import BDay randn = np.random.randn -# infortunately, too much has changed to handle these legacy pickles +# Unfortunately, too much has changed to handle these legacy pickles # class TestLegacySupport(unittest.TestCase): class LegacySupport(object): @@ -65,8 +64,6 @@ def test_unpickle_legacy_frame(self): self.assertEqual(unpickled.index.offset, BDay(1, normalize=True)) def test_unpickle_legacy_series(self): - from pandas.core.datetools import BDay - unpickled = self.series dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', @@ -86,7 +83,7 @@ def test_unpickle_legacy_len0_daterange(self): ex_index = DatetimeIndex([], freq='B') self.assert_index_equal(result.index, ex_index) - tm.assertIsInstance(result.index.freq, offsets.BDay) + tm.assertIsInstance(result.index.freq, BDay) self.assertEqual(len(result), 0) def test_arithmetic_interaction(self): @@ -140,7 +137,7 @@ def test_unpickle_daterange(self): rng = read_pickle(filepath) tm.assertIsInstance(rng[0], datetime) - tm.assertIsInstance(rng.offset, offsets.BDay) + tm.assertIsInstance(rng.offset, BDay) self.assertEqual(rng.values.dtype, object) def test_setops(self): @@ -213,20 +210,15 @@ def test_legacy_time_rules(self): new_rng = date_range(start, end, freq=new_freq) self.assert_index_equal(old_rng, new_rng) - # test get_legacy_offset_name - offset = datetools.get_offset(new_freq) - old_name = datetools.get_legacy_offset_name(offset) - self.assertEqual(old_name, old_freq) - def test_ms_vs_MS(self): - left = datetools.get_offset('ms') - right = datetools.get_offset('MS') - self.assertEqual(left, datetools.Milli()) - self.assertEqual(right, datetools.MonthBegin()) + left = get_offset('ms') + right = get_offset('MS') + self.assertEqual(left, Milli()) + self.assertEqual(right, MonthBegin()) def test_rule_aliases(self): - rule = datetools.to_offset('10us') - self.assertEqual(rule, datetools.Micro(10)) + rule = to_offset('10us') + self.assertEqual(rule, Micro(10)) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index a7a015f273320..b8247fe01b3f2 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py 
@@ -11,7 +11,6 @@ from pandas import DatetimeIndex, to_datetime, NaT from pandas import tslib -import pandas.core.datetools as datetools import pandas.tseries.offsets as offsets from pandas.tseries.index import bdate_range, date_range import pandas.tseries.tools as tools @@ -371,7 +370,7 @@ def test_with_tz(self): # just want it to work start = datetime(2011, 3, 12, tzinfo=pytz.utc) - dr = bdate_range(start, periods=50, freq=datetools.Hour()) + dr = bdate_range(start, periods=50, freq=offsets.Hour()) self.assertIs(dr.tz, pytz.utc) # DateRange with naive datetimes @@ -409,33 +408,33 @@ def test_with_tz_ambiguous_times(self): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, - freq=datetools.Hour()) + freq=offsets.Hour()) self.assertRaises(pytz.NonExistentTimeError, dr.tz_localize, tz) # after dst transition, it works dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, - freq=datetools.Hour(), tz=tz) + freq=offsets.Hour(), tz=tz) # November 6, 2011, fall back, repeat 2 AM hour dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, - freq=datetools.Hour()) + freq=offsets.Hour()) self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz) # UTC is OK dr = date_range(datetime(2011, 3, 13), periods=48, - freq=datetools.Minute(30), tz=pytz.utc) + freq=offsets.Minute(30), tz=pytz.utc) def test_ambiguous_infer(self): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition tz = self.tz('US/Eastern') dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=datetools.Hour()) + freq=offsets.Hour()) self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz) # With repeated hours, we can infer the transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=datetools.Hour(), tz=tz) + freq=offsets.Hour(), tz=tz) times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', '11/06/2011 02:00', '11/06/2011 03:00'] di = DatetimeIndex(times) @@ -449,7 +448,7 @@ def test_ambiguous_infer(self): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=datetools.Hour()) + freq=offsets.Hour()) localized = dr.tz_localize(tz) localized_infer = dr.tz_localize(tz, ambiguous='infer') self.assert_index_equal(localized, localized_infer) @@ -463,7 +462,7 @@ def test_ambiguous_flags(self): # Pass in flags to determine right dst transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=datetools.Hour(), tz=tz) + freq=offsets.Hour(), tz=tz) times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', '11/06/2011 02:00', '11/06/2011 03:00'] @@ -501,7 +500,7 @@ def test_ambiguous_flags(self): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=datetools.Hour()) + freq=offsets.Hour()) is_dst = np.array([1] * 10) localized = dr.tz_localize(tz) localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) diff --git a/pandas/util/depr_module.py b/pandas/util/depr_module.py new file mode 100644 index 0000000000000..7e03a000a50ec --- /dev/null +++ b/pandas/util/depr_module.py @@ -0,0 +1,79 @@ +""" +This module houses a utility class for mocking deprecated modules. +It is for internal use only and should not be used beyond this purpose. +""" + +import warnings +import importlib + + +class _DeprecatedModule(object): + """ Class for mocking deprecated modules. + + Parameters + ---------- + deprmod : name of module to be deprecated. 
+ alts : alternative modules to be used to access objects or methods + available in module. + removals : objects or methods in module that will no longer be + accessible once module is removed. + """ + def __init__(self, deprmod, alts=None, removals=None): + self.deprmod = deprmod + + self.alts = alts + if self.alts is not None: + self.alts = frozenset(self.alts) + + self.removals = removals + if self.removals is not None: + self.removals = frozenset(self.removals) + + # For introspection purposes. + self.self_dir = frozenset(dir(self.__class__)) + + def __dir__(self): + _dir = object.__dir__(self) + + if self.removals is not None: + _dir.extend(list(self.removals)) + + if self.alts is not None: + for modname in self.alts: + module = importlib.import_module(modname) + _dir.extend(dir(module)) + + return _dir + + def __getattr__(self, name): + if name in self.self_dir: + return object.__getattribute__(self, name) + + if self.removals is not None and name in self.removals: + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=FutureWarning) + module = importlib.import_module(self.deprmod) + + warnings.warn( + "{deprmod}.{name} is deprecated and will be removed in " + "a future version.".format(deprmod=self.deprmod, name=name), + FutureWarning, stacklevel=2) + + return object.__getattribute__(module, name) + + if self.alts is not None: + for modname in self.alts: + module = importlib.import_module(modname) + + if hasattr(module, name): + warnings.warn( + "{deprmod}.{name} is deprecated. Please use " + "{modname}.{name} instead.".format( + deprmod=self.deprmod, modname=modname, name=name), + FutureWarning, stacklevel=2) + + return getattr(module, name) + + raise AttributeError("module '{deprmod}' has no attribute " + "'{name}'".format(deprmod=self.deprmod, + name=name)) diff --git a/scripts/bench_join.py b/scripts/bench_join.py index 5223aac40d63b..1ce5c94130e85 100644 --- a/scripts/bench_join.py +++ b/scripts/bench_join.py @@ -12,9 +12,9 @@ a = np.arange(n, dtype=np.int64) b = np.arange(n * pct_overlap, n * (1 + pct_overlap), dtype=np.int64) -dr1 = DatetimeIndex('1/1/2000', periods=n, offset=datetools.Minute()) +dr1 = DatetimeIndex('1/1/2000', periods=n, offset=offsets.Minute()) dr2 = DatetimeIndex( - dr1[int(pct_overlap * n)], periods=n, offset=datetools.Minute(2)) + dr1[int(pct_overlap * n)], periods=n, offset=offsets.Minute(2)) aobj = a.astype(object) bobj = b.astype(object) diff --git a/scripts/groupby_speed.py b/scripts/groupby_speed.py index 34f293d5008c6..3be9fac12418e 100644 --- a/scripts/groupby_speed.py +++ b/scripts/groupby_speed.py @@ -1,12 +1,12 @@ from __future__ import print_function from pandas import * -rng = DatetimeIndex('1/3/2011', '11/30/2011', offset=datetools.Minute()) +rng = DatetimeIndex('1/3/2011', '11/30/2011', offset=offsets.Minute()) df = DataFrame(np.random.randn(len(rng), 5), index=rng, columns=list('OHLCV')) -rng5 = DatetimeIndex('1/3/2011', '11/30/2011', offset=datetools.Minute(5)) +rng5 = DatetimeIndex('1/3/2011', '11/30/2011', offset=offsets.Minute(5)) gp = rng5.asof grouped = df.groupby(gp) diff --git a/scripts/hdfstore_panel_perf.py b/scripts/hdfstore_panel_perf.py index 66b0b52444bc1..c66e9506fc4c5 100644 --- a/scripts/hdfstore_panel_perf.py +++ b/scripts/hdfstore_panel_perf.py @@ -7,7 +7,7 @@ panel = Panel(np.random.randn(i, j, k), items=[rands(10) for _ in range(i)], major_axis=DatetimeIndex('1/1/2000', periods=j, - offset=datetools.Minute()), + offset=offsets.Minute()), minor_axis=[rands(10) for _ in range(k)]) From 
ab4bd36401d97cc288a3db09e0dd57bfdd84bd25 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Wed, 7 Sep 2016 22:18:27 +0900 Subject: [PATCH 345/359] ENH: concat and append now can handle unordered categories (#13767) Concatting categoricals with non-matching categories will now return object dtype instead of raising an error. * ENH: concat and append now can handleunordered categories * reomove union_categoricals kw from concat --- doc/source/categorical.rst | 58 ++++- doc/source/merging.rst | 27 +-- doc/source/whatsnew/v0.19.0.txt | 65 ++++-- pandas/core/internals.py | 7 +- pandas/tests/series/test_combine_concat.py | 4 +- pandas/tests/test_categorical.py | 164 ++++++-------- pandas/tools/merge.py | 25 +-- pandas/tools/tests/test_concat.py | 248 +++++++++++++++++++++ pandas/types/concat.py | 59 ++--- 9 files changed, 473 insertions(+), 184 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index d59ad68c9ea83..59ddfe602c033 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -675,12 +675,60 @@ be lexsorted, use ``sort_categories=True`` argument. union_categoricals([a, b], sort_categories=True) -.. note:: +``union_categoricals`` also works with the "easy" case of combining two +categoricals of the same categories and order information +(e.g. what you could also ``append`` for). + +.. ipython:: python + + a = pd.Categorical(["a", "b"], ordered=True) + b = pd.Categorical(["a", "b", "a"], ordered=True) + union_categoricals([a, b]) + +The below raises ``TypeError`` because the categories are ordered and not identical. + +.. code-block:: ipython + + In [1]: a = pd.Categorical(["a", "b"], ordered=True) + In [2]: b = pd.Categorical(["a", "b", "c"], ordered=True) + In [3]: union_categoricals([a, b]) + Out[3]: + TypeError: to union ordered Categoricals, all categories must be the same + +.. _categorical.concat: + +Concatenation +~~~~~~~~~~~~~ + +This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects` for general description. + +By default, ``Series`` or ``DataFrame`` concatenation which contains the same categories +results in ``category`` dtype, otherwise results in ``object`` dtype. +Use ``.astype`` or ``union_categoricals`` to get ``category`` result. + +.. ipython:: python + + # same categories + s1 = pd.Series(['a', 'b'], dtype='category') + s2 = pd.Series(['a', 'b', 'a'], dtype='category') + pd.concat([s1, s2]) + + # different categories + s3 = pd.Series(['b', 'c'], dtype='category') + pd.concat([s1, s3]) + + pd.concat([s1, s3]).astype('category') + union_categoricals([s1.values, s3.values]) + + +Following table summarizes the results of ``Categoricals`` related concatenations. - In addition to the "easy" case of combining two categoricals of the same - categories and order information (e.g. what you could also ``append`` for), - ``union_categoricals`` only works with unordered categoricals and will - raise if any are ordered. 
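A self-contained sketch of the ``union_categoricals`` behaviour described above (assuming the 0.19-era import location ``pandas.types.concat`` shown in the whatsnew; exact reprs may vary):

.. code-block:: python

   import pandas as pd
   from pandas.types.concat import union_categoricals

   # identical categories and ordering: the result stays ordered
   a = pd.Categorical(["a", "b"], ordered=True)
   b = pd.Categorical(["a", "b", "a"], ordered=True)
   union_categoricals([a, b])          # [a, b, a, b, a], categories [a < b]

   # ordered inputs whose categories differ raise TypeError
   c = pd.Categorical(["a", "b", "c"], ordered=True)
   try:
       union_categoricals([a, c])
   except TypeError as err:
       print(err)

   # unordered inputs with different categories are combined;
   # sort_categories=True lexsorts the resulting categories
   union_categoricals([pd.Categorical(["b", "c"]),
                       pd.Categorical(["a", "b"])], sort_categories=True)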
+| arg1 | arg2 | result | +|---------|-------------------------------------------|---------| +| category | category (identical categories) | category | +| category | category (different categories, both not ordered) | object (dtype is inferred) | +| category | category (different categories, either one is ordered) | object (dtype is inferred) | +| category | not category | object (dtype is inferred) | Getting Data In/Out ------------------- diff --git a/doc/source/merging.rst b/doc/source/merging.rst index f14e5741c6e2e..c6541a26c72b4 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -78,34 +78,35 @@ some configurable handling of "what to do with the other axes": :: pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False) + keys=None, levels=None, names=None, verify_integrity=False, + copy=True) -- ``objs``: a sequence or mapping of Series, DataFrame, or Panel objects. If a +- ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a dict is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. -- ``axis``: {0, 1, ...}, default 0. The axis to concatenate along. -- ``join``: {'inner', 'outer'}, default 'outer'. How to handle indexes on +- ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. +- ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on other axis(es). Outer for union and inner for intersection. -- ``join_axes``: list of Index objects. Specific indexes to use for the other +- ``ignore_index`` : boolean, default False. If True, do not use the index + values on the concatenation axis. The resulting axis will be labeled 0, ..., + n - 1. This is useful if you are concatenating objects where the + concatenation axis does not have meaningful indexing information. Note + the index values on the other axes are still respected in the join. +- ``join_axes`` : list of Index objects. Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic. -- ``keys``: sequence, default None. Construct hierarchical index using the +- ``keys`` : sequence, default None. Construct hierarchical index using the passed keys as the outermost level. If multiple levels passed, should contain tuples. - ``levels`` : list of sequences, default None. Specific levels (unique values) to use for constructing a MultiIndex. Otherwise they will be inferred from the keys. -- ``names``: list, default None. Names for the levels in the resulting +- ``names`` : list, default None. Names for the levels in the resulting hierarchical index. -- ``verify_integrity``: boolean, default False. Check whether the new +- ``verify_integrity`` : boolean, default False. Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. -- ``ignore_index`` : boolean, default False. If True, do not use the index - values on the concatenation axis. The resulting axis will be labeled 0, ..., - n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note - the index values on the other axes are still respected in the join. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. 
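A short sketch of a few of these arguments in action (illustrative only; the worked examples below cover them in depth):

.. code-block:: python

   import pandas as pd

   df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
   df2 = pd.DataFrame({"A": [5, 6], "C": [7, 8]})

   # join='inner' keeps only the columns common to both frames
   pd.concat([df1, df2], join="inner")

   # ignore_index=True relabels the concatenation axis 0, ..., n - 1
   pd.concat([df1, df2], ignore_index=True)

   # keys= builds a hierarchical index on the concatenation axis
   pd.concat([df1, df2], keys=["first", "second"])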
Without a little bit of context and example many of these arguments don't make diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 282c4ef127391..9f468ae6785cb 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -15,6 +15,8 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` +- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`here ` - pandas development api, see :ref:`here ` - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here ` - Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` @@ -218,7 +220,7 @@ they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :is data = '0,1,2\n3,4,5' names = ['a', 'b', 'a'] -Previous behaviour: +Previous Behavior: .. code-block:: ipython @@ -231,7 +233,7 @@ Previous behaviour: The first ``a`` column contains the same data as the second ``a`` column, when it should have contained the values ``[0, 3]``. -New behaviour: +New Behavior: .. ipython :: python @@ -277,6 +279,38 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) df['col3'] +.. _whatsnew_0190.enhancements.union_categoricals: + +Categorical Concatenation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846`) + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +- ``concat`` and ``append`` now can concat ``category`` dtypes wifht different +``categories`` as ``object`` dtype (:issue:`13524`) + +Previous Behavior: + + .. code-block:: ipython + + In [1]: s1 = pd.Series(['a', 'b'], dtype='category') + In [2]: s2 = pd.Series(['b', 'c'], dtype='category') + In [3]: pd.concat([s1, s2]) + ValueError: incompatible categories in categorical concat + +New Behavior: + + .. ipython:: python + + pd.concat([s1, s2]) + .. _whatsnew_0190.enhancements.semi_month_offsets: Semi-Month Offsets @@ -378,11 +412,11 @@ get_dummies dtypes The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`). This should provide an improved memory footprint. -Previous behaviour: +Previous Behavior: .. code-block:: ipython - In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes Out[1]: a float64 @@ -404,7 +438,7 @@ Other enhancements - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`). -- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) +- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. 
The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) .. ipython:: python @@ -448,7 +482,6 @@ Other enhancements - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, :issue:`13846`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) @@ -512,7 +545,7 @@ API changes ``Series.tolist()`` will now return Python types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`) +``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behavior (:issue:`10904`) .. ipython:: python @@ -547,7 +580,7 @@ including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`) .. warning:: Until 0.18.1, comparing ``Series`` with the same length, would succeed even if - the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behaviour or align different indexes, using the flexible comparison methods like ``.eq``. + the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behavior or align different indexes, using the flexible comparison methods like ``.eq``. As a result, ``Series`` and ``DataFrame`` operators behave as below: @@ -615,7 +648,7 @@ Logical operators Logical operators align both ``.index``. -Previous Behavior (``Series``), only left hand side ``index`` is kept: +Previous behavior (``Series``), only left hand side ``index`` is kept: .. code-block:: ipython @@ -935,7 +968,7 @@ Index ``+`` / ``-`` no longer used for set operations Addition and subtraction of the base Index type and of DatetimeIndex (not the numeric index types) previously performed set operations (set union and difference). This -behaviour was already deprecated since 0.15.0 (in favor using the specific +behavior was already deprecated since 0.15.0 (in favor using the specific ``.union()`` and ``.difference()`` methods), and is now disabled. When possible, ``+`` and ``-`` are now used for element-wise operations, for example for concatenating strings or subtracting datetimes @@ -956,13 +989,13 @@ The same operation will now perform element-wise addition: pd.Index(['a', 'b']) + pd.Index(['a', 'c']) Note that numeric Index objects already performed element-wise operations. -For example, the behaviour of adding two integer Indexes: +For example, the behavior of adding two integer Indexes: .. 
ipython:: python pd.Index([1, 2, 3]) + pd.Index([2, 3, 4]) -is unchanged. The base ``Index`` is now made consistent with this behaviour. +is unchanged. The base ``Index`` is now made consistent with this behavior. Further, because of this change, it is now possible to subtract two DatetimeIndex objects resulting in a TimedeltaIndex: @@ -1130,7 +1163,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument. data = 'A,B\n0,1\n2,3\n4,5\n6,7' -Previous behaviour: +Previous Behavior: .. code-block:: ipython @@ -1142,7 +1175,7 @@ Previous behaviour: 0 4 5 1 6 7 -New behaviour: +New Behavior: .. ipython :: python @@ -1268,7 +1301,7 @@ These types are the same on many platform, but for 64 bit python on Windows, ``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many operations on that platform. -Previous behaviour: +Previous Behavior: .. code-block:: ipython @@ -1277,7 +1310,7 @@ Previous behaviour: In [2]: i.get_indexer(['b', 'b', 'c']).dtype Out[2]: dtype('int32') -New behaviour: +New Behavior: .. code-block:: ipython diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bb2d1a9d1b5d3..9a1c7864903d7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4787,10 +4787,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): [get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers], concat_axis) - blocks = [make_block(concatenate_join_units(join_units, concat_axis, - copy=copy), - placement=placement) - for placement, join_units in concat_plan] + blocks = [make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) for placement, join_units in concat_plan] return BlockManager(blocks, axes) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index fd6fd90cd631f..23261c2ef79e2 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -185,9 +185,9 @@ def test_concat_empty_series_dtypes(self): 'category') self.assertEqual(pd.concat([Series(dtype='category'), Series(dtype='float64')]).dtype, - np.object_) + 'float64') self.assertEqual(pd.concat([Series(dtype='category'), - Series(dtype='object')]).dtype, 'category') + Series(dtype='object')]).dtype, 'object') # sparse result = pd.concat([Series(dtype='float64').to_sparse(), Series( diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index eabd118de671d..c4ddd2c0981d9 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2088,8 +2088,8 @@ def test_series_functions_no_warnings(self): def test_assignment_to_dataframe(self): # assignment - df = DataFrame({'value': np.array( - np.random.randint(0, 10000, 100), dtype='int32')}) + df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100), + dtype='int32')}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] df = df.sort_values(by=['value'], ascending=True) @@ -3355,16 +3355,15 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): # GH 7918 - cats = Categorical( - ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]) + cats = Categorical(["a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c"]) idx = Index(["h", "i", "j", "k", "l", "m", "n", ]) values = [1, 2, 2, 2, 3, 4, 5] df = DataFrame({"cats": cats, "values": values}, index=idx) result = df.iloc[2:4, :] expected = DataFrame( - {"cats": 
Categorical( - ['b', 'b'], categories=['a', 'b', 'c']), + {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']), "values": [2, 2]}, index=['j', 'k']) tm.assert_frame_equal(result, expected) @@ -3379,10 +3378,9 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.ix["h":"j", 0:1] - expected = DataFrame({'cats': Series( - Categorical( - ['a', 'b', 'b'], categories=['a', 'b', 'c']), index=['h', 'i', - 'j'])}) + expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], + categories=['a', 'b', 'c'])}, + index=['h', 'i', 'j']) tm.assert_frame_equal(result, expected) def test_assigning_ops(self): @@ -3636,8 +3634,8 @@ def f(): with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical( - ["c", "c"], categories=["a", "b", "c"]) + df.ix["j":"k", 0] = pd.Categorical(["c", "c"], + categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3674,8 +3672,8 @@ def f(): self.assertRaises(ValueError, f) # fancy indexing - catsf = pd.Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]) + catsf = pd.Categorical(["a", "a", "c", "c", "a", "a", "a"], + categories=["a", "b", "c"]) idxf = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = pd.DataFrame({"cats": catsf, "values": valuesf}, index=idxf) @@ -3733,9 +3731,8 @@ def f(): s = orig.copy() s.index = ["x", "y"] s["y"] = "a" - exp = Series( - pd.Categorical(["b", "a"], - categories=["a", "b"]), index=["x", "y"]) + exp = Series(pd.Categorical(["b", "a"], categories=["a", "b"]), + index=["x", "y"]) tm.assert_series_equal(s, exp) # ensure that one can set something to np.nan @@ -3887,7 +3884,7 @@ def test_cat_equality(self): self.assertRaises(TypeError, lambda: a > b) self.assertRaises(TypeError, lambda: b > a) - def test_concat(self): + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] df = pd.DataFrame({"cats": cat, "vals": vals}) @@ -3896,20 +3893,22 @@ def test_concat(self): exp = pd.DataFrame({"cats": cat2, "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - res = pd.concat([df, df]) - tm.assert_frame_equal(exp, res) + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) - # Concat should raise if the two categoricals do not have the same - # categories + # GH 13524 can concat different categories cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) + df_different_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - def f(): - pd.concat([df, df_wrong_categories]) + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = pd.DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) - self.assertRaises(ValueError, f) + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) + def test_concat_append_gh7864(self): # GH 7864 # make sure ordering is preserverd df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], @@ -3926,41 +3925,44 @@ def f(): df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) - dfx['grade'].cat.categories self.assert_index_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + dfa = df1.append(df2) + self.assert_index_equal(df['grade'].cat.categories, + dfa['grade'].cat.categories) + + def test_concat_preserve(self): - # GH 8641 - # series concat not preserving category 
dtype + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories s = Series(list('abc'), dtype='category') s2 = Series(list('abd'), dtype='category') - def f(): - pd.concat([s, s2]) - - self.assertRaises(ValueError, f) + exp = Series(list('abcabd')) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s], ignore_index=True) - expected = Series(list('abcabc')).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), dtype='category') + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s]) - expected = Series( - list('abcabc'), index=[0, 1, 2, 0, 1, 2]).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], + dtype='category') + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) a = Series(np.arange(6, dtype='int64')) b = Series(list('aabbca')) df2 = DataFrame({'A': a, 'B': b.astype('category', categories=list('cab'))}) - result = pd.concat([df2, df2]) - expected = DataFrame({'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype( - 'category', categories=list('cab'))}) - tm.assert_frame_equal(result, expected) + res = pd.concat([df2, df2]) + exp = DataFrame({'A': pd.concat([a, a]), + 'B': pd.concat([b, b]).astype( + 'category', categories=list('cab'))}) + tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): @@ -3968,44 +3970,21 @@ def test_categorical_index_preserver(self): b = Series(list('aabbca')) df2 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'cab'))}).set_index('B') + 'B': b.astype('category', categories=list('cab')) + }).set_index('B') result = pd.concat([df2, df2]) expected = DataFrame({'A': pd.concat([a, a]), 'B': pd.concat([b, b]).astype( - 'category', categories=list( - 'cab'))}).set_index('B') + 'category', categories=list('cab')) + }).set_index('B') tm.assert_frame_equal(result, expected) # wrong catgories df3 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'abc'))}).set_index('B') + 'B': pd.Categorical(b, categories=list('abc')) + }).set_index('B') self.assertRaises(TypeError, lambda: pd.concat([df2, df3])) - def test_append(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - vals = [1, 2] - df = pd.DataFrame({"cats": cat, "vals": vals}) - cat2 = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"]) - vals2 = [1, 2, 1, 2] - exp = pd.DataFrame({"cats": cat2, - "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - - res = df.append(df) - tm.assert_frame_equal(exp, res) - - # Concat should raise if the two categoricals do not have the same - # categories - cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) - vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - - def f(): - df.append(df_wrong_categories) - - self.assertRaises(ValueError, f) - def test_merge(self): # GH 9426 @@ -4470,27 +4449,22 @@ def test_dt_accessor_api_for_categorical(self): def test_concat_categorical(self): # See GH 10177 - df1 = pd.DataFrame( - np.arange(18, dtype='int64').reshape(6, - 3), columns=["a", "b", "c"]) - - df2 = pd.DataFrame( - np.arange(14, dtype='int64').reshape(7, 2), columns=["a", "c"]) - df2['h'] = pd.Series(pd.Categorical(["one", "one", "two", "one", "two", - "two", "one"])) - - df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True) - - df_expected = pd.DataFrame( - {'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 
8, 10, 12], - 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan], - 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]}) - df_expected['h'] = pd.Series(pd.Categorical( - [None, None, None, None, None, None, "one", "one", "two", "one", - "two", "two", "one"])) - - tm.assert_frame_equal(df_expected, df_concat) + df1 = pd.DataFrame(np.arange(18, dtype='int64').reshape(6, 3), + columns=["a", "b", "c"]) + + df2 = pd.DataFrame(np.arange(14, dtype='int64').reshape(7, 2), + columns=["a", "c"]) + + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2['h'] = pd.Series(pd.Categorical(cat_values)) + + res = pd.concat((df1, df2), axis=0, ignore_index=True) + exp = pd.DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, + np.nan, np.nan, np.nan, np.nan, np.nan], + 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + 'h': [None] * 6 + cat_values}) + tm.assert_frame_equal(res, exp) class TestCategoricalSubclassing(tm.TestCase): diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 7a29918c55658..6521acbd0b733 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1290,9 +1290,12 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, join_axes : list of Index objects Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic - verify_integrity : boolean, default False - Check whether the new concatenated axis contains duplicates. This can - be very expensive relative to the actual data concatenation + ignore_index : boolean, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. keys : sequence, default None If multiple levels passed, should contain tuples. Construct hierarchical index using the passed keys as the outermost level @@ -1301,12 +1304,9 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, MultiIndex. Otherwise they will be inferred from the keys names : list, default None Names for the levels in the resulting hierarchical index - ignore_index : boolean, default False - If True, do not use the index values along the concatenation axis. The - resulting axis will be labeled 0, ..., n - 1. This is useful if you are - concatenating objects where the concatenation axis does not have - meaningful indexing information. Note the index values on the other - axes are still respected in the join. + verify_integrity : boolean, default False + Check whether the new concatenated axis contains duplicates. 
This can + be very expensive relative to the actual data concatenation copy : boolean, default True If False, do not copy data unnecessarily @@ -1512,10 +1512,9 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) - new_data = concatenate_block_managers(mgrs_indexers, - self.new_axes, - concat_axis=self.axis, - copy=self.copy) + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, concat_axis=self.axis, + copy=self.copy) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 102f21bcdc535..8e20cfa83c405 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -450,6 +450,254 @@ def test_concatlike_common_period_mixed_dt_to_object(self): res = pd.concat([tds, ps1]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + def test_concat_categorical(self): + # GH 13524 + + # same categories -> category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2], dtype='category') + + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category') + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories => not-category + s1 = pd.Series([3, 2], dtype='category') + s2 = pd.Series([2, 1], dtype='category') + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # completelly different categories (same dtype) => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') + + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_coercion(self): + # GH 13524 + + # category + not-category => not-category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # result shouldn't be affected by 1st elem dtype + exp = pd.Series([2, 1, 2, 1, 2, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all values are not in category => not-category + s1 = pd.Series([3, 2], dtype='category') + s2 = pd.Series([2, 1]) + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([2, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completelly different categories => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series([1, 3, 2]) + + exp = pd.Series([10, 11, np.nan, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([1, 3, 2, 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # different dtype => not-category + s1 = pd.Series([10, 11, 
np.nan], dtype='category') + s2 = pd.Series(['a', 'b', 'c']) + + exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c']) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # if normal series only contains NaN-likes => not-category + s1 = pd.Series([10, 11], dtype='category') + s2 = pd.Series([np.nan, np.nan, np.nan]) + + exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([np.nan, np.nan, np.nan, 10, 11]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_concat_categorical_3elem_coercion(self): + # GH 13524 + + # mixed dtypes => not-category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2], dtype='category') + s3 = pd.Series([1, 2, 1, 2, np.nan]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([1, 3, 4]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([10, 11, 12]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = pd.Series([1, 3], dtype='category') + s2 = pd.Series([3, 4], dtype='category') + s3 = pd.Series([2, 3]) + s4 = pd.Series([2, 2], dtype='category') + s5 = pd.Series([1, np.nan]) + s6 = pd.Series([1, 3, 2], dtype='category') + + # mixed dtype, values are all in categories => not-category + exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = 
s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], + ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + + def test_concat_categorical_coercion_nan(self): + # GH 13524 + + # some edge cases + # category + not-category => not category + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), + dtype='category') + s2 = pd.Series([np.nan, 1]) + + exp = pd.Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = pd.Series([1, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([1, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all category nan-likes => category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan], dtype='category') + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_empty(self): + # GH 13524 + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([1, 2], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([]) + + # different dtype => not-category + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + # empty Series is ignored + exp = pd.Series([np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], 
ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + class TestAppend(ConcatenateBase): diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 29a0fe7d9f8d0..8bdd71348a537 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -5,7 +5,6 @@ import numpy as np import pandas.tslib as tslib from pandas import compat -from pandas.compat import map from pandas.core.algorithms import take_1d from .common import (is_categorical_dtype, is_sparse, @@ -133,19 +132,21 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) - # these are mandated to handle empties as well _contains_datetime = any(typ.startswith('datetime') for typ in typs) _contains_period = any(typ.startswith('period') for typ in typs) - if _contains_datetime or 'timedelta' in typs or _contains_period: + if 'category' in typs: + # this must be priort to _concat_datetime, + # to support Categorical + datetime-like + return _concat_categorical(to_concat, axis=axis) + + elif _contains_datetime or 'timedelta' in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) + # these are mandated to handle empties as well elif 'sparse' in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) - elif 'category' in typs: - return _concat_categorical(to_concat, axis=axis) - if not nonempty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise @@ -181,18 +182,14 @@ def _concat_categorical(to_concat, axis=0): A single array, preserving the combined dtypes """ - from pandas.core.categorical import Categorical - - def convert_categorical(x): - # coerce to object dtype - if is_categorical_dtype(x.dtype): - return x.get_values() - return x.ravel() - - if get_dtype_kinds(to_concat) - set(['object', 'category']): - # convert to object type and perform a regular concat - return _concat_compat([np.array(x, copy=False, dtype=object) - for x in to_concat], axis=0) + def _concat_asobject(to_concat): + to_concat = [x.get_values() if is_categorical_dtype(x.dtype) + else x.ravel() for x in to_concat] + res = _concat_compat(to_concat) + if axis == 1: + return res.reshape(1, len(res)) + else: + return res # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything @@ -200,25 +197,15 @@ def convert_categorical(x): categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] # validate the categories - categories = categoricals[0] - rawcats = categories.categories - for x in categoricals[1:]: - if not categories.is_dtype_equal(x): - raise ValueError("incompatible categories in categorical concat") - - # we've already checked that all categoricals are the same, so if their - # length is equal to the input then we have all the same categories - if len(categoricals) == len(to_concat): - # concating numeric types is much faster than concating object types - # and fastpath takes a shorter path through the constructor - return Categorical(np.concatenate([x.codes for x in to_concat], - axis=0), - rawcats, ordered=categoricals[0].ordered, - fastpath=True) + if len(categoricals) != len(to_concat): + pass else: - concatted = np.concatenate(list(map(convert_categorical, to_concat)), - axis=0) - return Categorical(concatted, rawcats) + # when all categories are identical + first = to_concat[0] + if all(first.is_dtype_equal(other) for other in to_concat[1:]): + return union_categoricals(categoricals) + + return _concat_asobject(to_concat) 
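# A minimal sketch (assuming pandas >= 0.19.0) of the user-facing behaviour enabled by the
# rewritten _concat_categorical above, mirroring the GH 13524 tests earlier in this patch;
# it is an illustration alongside the diff, not an addition to it.
import pandas as pd

s1 = pd.Series(['a', 'b'], dtype='category')   # categories: ['a', 'b']
s2 = pd.Series(['b', 'c'], dtype='category')   # categories: ['b', 'c']

# differing categories are now coerced to object dtype instead of raising ValueError
assert pd.concat([s1, s2], ignore_index=True).dtype == object

# identical categories still combine via union_categoricals and stay categorical
assert pd.concat([s1, s1.copy()], ignore_index=True).dtype == 'category'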
def union_categoricals(to_union, sort_categories=False): From 9b7efd69b2e88f5019d480ea4d5719bfaa7fa26b Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Wed, 7 Sep 2016 09:22:53 -0400 Subject: [PATCH 346/359] Add steps to run gbq integration testing to the contributing docs (#14144) --- .travis.yml | 8 +----- ci/travis_encrypt_gbq.sh | 35 +++++++++++++++++++++++ ci/travis_gbq_config.txt | 3 ++ ci/travis_process_gbq_encryption.sh | 11 ++++++++ doc/source/contributing.rst | 43 +++++++++++++++++++++++++++++ pandas/io/tests/test_gbq.py | 4 +-- 6 files changed, 95 insertions(+), 9 deletions(-) create mode 100755 ci/travis_encrypt_gbq.sh create mode 100644 ci/travis_gbq_config.txt create mode 100755 ci/travis_process_gbq_encryption.sh diff --git a/.travis.yml b/.travis.yml index 4d3908bc35de4..c6f6d8b81ae59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -229,14 +229,8 @@ matrix: - USE_CACHE=true before_install: - # gbq secure key - - if [ -n "$encrypted_1d9d7b1f171b_iv" ]; then - openssl aes-256-cbc -K $encrypted_1d9d7b1f171b_key - -iv $encrypted_1d9d7b1f171b_iv -in ci/travis_gbq.json.enc - -out ci/travis_gbq.json -d; - export VALID_GBQ_CREDENTIALS=True; - fi - echo "before_install" + - source ci/travis_process_gbq_encryption.sh - echo $VIRTUAL_ENV - export PATH="$HOME/miniconda/bin:$PATH" - df -h diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh new file mode 100755 index 0000000000000..719db67f384e0 --- /dev/null +++ b/ci/travis_encrypt_gbq.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +GBQ_JSON_FILE=$1 +GBQ_PROJECT_ID=$2 + +if [[ $# -ne 2 ]]; then + echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ + " " + exit 1 +fi + +if [[ $GBQ_JSON_FILE != *.json ]]; then + echo "ERROR: Expected *.json file" + exit 1 +fi + +if [[ ! -f $GBQ_JSON_FILE ]]; then + echo "ERROR: File $GBQ_JSON_FILE does not exist" + exit 1 +fi + +echo "Encrypting $GBQ_JSON_FILE..." +read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \ +travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); + +echo "Adding your secure key and project id to travis_gbq_config.txt ..." +echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY\n"\ +"GBQ_PROJECT_ID='$GBQ_PROJECT_ID'" > travis_gbq_config.txt + +echo "Done. 
Removing file $GBQ_JSON_FILE" +rm $GBQ_JSON_FILE + +echo -e "Created encrypted credentials file travis_gbq.json.enc.\n"\ + "NOTE: Do NOT commit the *.json file containing your unencrypted" \ + "private key" diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt new file mode 100644 index 0000000000000..3b68d62f177cc --- /dev/null +++ b/ci/travis_gbq_config.txt @@ -0,0 +1,3 @@ +TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv +TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key +GBQ_PROJECT_ID='pandas-travis' diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh new file mode 100755 index 0000000000000..7ff4c08f78e37 --- /dev/null +++ b/ci/travis_process_gbq_encryption.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +source ci/travis_gbq_config.txt + +if [[ -n ${!TRAVIS_IV_ENV} ]]; then + openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ + -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; + export GBQ_PROJECT_ID=$GBQ_PROJECT_ID; + echo 'Successfully decrypted gbq credentials' +fi + diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 54de4d86a48d9..7f336abcaa6d7 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -626,6 +626,44 @@ This will display stderr from the benchmarks, and use your local Information on how to write a benchmark and how to use asv can be found in the `asv documentation `_. +.. _contributing.gbq_integration_tests: + +Running Google BigQuery Integration Tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You will need to create a Google BigQuery private key in JSON format in +order to run Google BigQuery integration tests on your local machine and +on Travis-CI. The first step is to create a `service account +`__. + +Integration tests for ``pandas.io.gbq`` are skipped in pull requests because +the credentials that are required for running Google BigQuery integration +tests are `encrypted `__ +on Travis-CI and are only accessible from the pydata/pandas repository. The +credentials won't be available on forks of pandas. Here are the steps to run +gbq integration tests on a forked repository: + +#. First, complete all the steps in the `Encrypting Files Prerequisites + `__ section. +#. Sign into `Travis `__ using your GitHub account. +#. Enable your forked repository of pandas for testing in `Travis + `__. +#. Run the following command from terminal where the current working directory + is the ``ci`` folder:: + + ./travis_encrypt_gbq.sh + +#. Create a new branch from the branch used in your pull request. Commit the + encrypted file called ``travis_gbq.json.enc`` as well as the file + ``travis_gbq_config.txt``, in an otherwise empty commit. DO NOT commit the + ``*.json`` file which contains your unencrypted private key. +#. Your branch should be tested automatically once it is pushed. You can check + the status by visiting your Travis branches page which exists at the + following location: https://travis-ci.org/your-user-name/pandas/branches . + Click on a build job for your branch. Expand the following line in the + build log: ``ci/print_skipped.py /tmp/nosetests.xml`` . Search for the + term ``test_gbq`` and confirm that gbq integration tests are not skipped. + Running the vbench performance test suite (phasing out) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -814,6 +852,11 @@ updated. Pushing them to GitHub again is done by:: This will automatically update your pull request with the latest code and restart the Travis-CI tests. 
+If your pull request is related to the ``pandas.io.gbq`` module, please see +the section on :ref:`Running Google BigQuery Integration Tests +` to configure a Google BigQuery service +account for your pull request on Travis-CI. + Delete your merged branch (optional) ------------------------------------ diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 7757950592da5..921fd824d6ffd 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -60,12 +60,12 @@ def _skip_if_no_private_key_contents(): def _in_travis_environment(): return 'TRAVIS_BUILD_DIR' in os.environ and \ - 'VALID_GBQ_CREDENTIALS' in os.environ + 'GBQ_PROJECT_ID' in os.environ def _get_project_id(): if _in_travis_environment(): - return 'pandas-travis' + return os.environ.get('GBQ_PROJECT_ID') else: return PROJECT_ID From 1ace12b7473e6444f7bbd56f90343ca7638570c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Sep 2016 15:57:21 +0200 Subject: [PATCH 347/359] DOC: cleanup build warnings (#14172) * DOC: remove examples on Panel4D (caused warnings) and refer to older docs * DOC: fix build warnings * resolve comments --- doc/source/dsintro.rst | 136 +++----------------------------- doc/source/install.rst | 5 +- doc/source/sparse.rst | 36 +-------- doc/source/timeseries.rst | 6 +- doc/source/whatsnew/v0.14.1.txt | 2 +- doc/source/whatsnew/v0.15.1.txt | 4 +- doc/source/whatsnew/v0.8.0.txt | 2 +- pandas/core/generic.py | 2 +- pandas/io/gbq.py | 32 +++++--- 9 files changed, 44 insertions(+), 181 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index b5ad681426b15..6063e3e8bce45 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -935,134 +935,20 @@ method: minor_axis=['a', 'b', 'c', 'd']) panel.to_frame() - -.. _dsintro.panel4d: - -Panel4D (Experimental) ----------------------- - -.. warning:: - - In 0.19.0 ``Panel4D`` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. - -``Panel4D`` is a 4-Dimensional named container very much like a ``Panel``, but -having 4 named dimensions. It is intended as a test bed for more N-Dimensional named -containers. - - - **labels**: axis 0, each item corresponds to a Panel contained inside - - **items**: axis 1, each item corresponds to a DataFrame contained inside - - **major_axis**: axis 2, it is the **index** (rows) of each of the - DataFrames - - **minor_axis**: axis 3, it is the **columns** of each of the DataFrames - -``Panel4D`` is a sub-class of ``Panel``, so most methods that work on Panels are -applicable to Panel4D. The following methods are disabled: - - - ``join , to_frame , to_excel , to_sparse , groupby`` - -Construction of Panel4D works in a very similar manner to a ``Panel`` - -From 4D ndarray with optional axis labels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. ipython:: python - - p4d = pd.Panel4D(np.random.randn(2, 2, 5, 4), - labels=['Label1','Label2'], - items=['Item1', 'Item2'], - major_axis=pd.date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - p4d - - -From dict of Panel objects -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
ipython:: python - - data = { 'Label1' : pd.Panel({ 'Item1' : pd.DataFrame(np.random.randn(4, 3)) }), - 'Label2' : pd.Panel({ 'Item2' : pd.DataFrame(np.random.randn(4, 2)) }) } - pd.Panel4D(data) - -Note that the values in the dict need only be **convertible to Panels**. -Thus, they can be any of the other valid inputs to Panel as per above. - -Slicing -~~~~~~~ - -Slicing works in a similar manner to a Panel. ``[]`` slices the first dimension. -``.ix`` allows you to slice arbitrarily and get back lower dimensional objects - -.. ipython:: python - - p4d['Label1'] - -4D -> Panel - -.. ipython:: python - - p4d.ix[:,:,:,'A'] - -4D -> DataFrame - -.. ipython:: python - - p4d.ix[:,:,0,'A'] - -4D -> Series - -.. ipython:: python - - p4d.ix[:,0,0,'A'] - -Transposing -~~~~~~~~~~~ - -A Panel4D can be rearranged using its ``transpose`` method (which does not make a -copy by default unless the data are heterogeneous): - -.. ipython:: python - - p4d.transpose(3, 2, 1, 0) - .. _dsintro.panelnd: +.. _dsintro.panel4d: -PanelND (Experimental) ----------------------- +Panel4D and PanelND (Deprecated) +-------------------------------- .. warning:: - In 0.19.0 ``PanelND`` is deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. + In 0.19.0 ``Panel4D`` and ``PanelND`` are deprecated and will be removed in + a future version. The recommended way to represent these types of + n-dimensional data are with the + `xarray package `__. + Pandas provides a :meth:`~Panel4D.to_xarray` method to automate + this conversion. -PanelND is a module with a set of factory functions to enable a user to construct N-dimensional named -containers like Panel4D, with a custom set of axis labels. Thus a domain-specific container can easily be -created. - -The following creates a Panel5D. A new panel type object must be sliceable into a lower dimensional object. -Here we slice to a Panel4D. - -.. ipython:: python - :okwarning: - - from pandas.core import panelnd - Panel5D = panelnd.create_nd_panel_factory( - klass_name = 'Panel5D', - orders = [ 'cool', 'labels','items','major_axis','minor_axis'], - slices = { 'labels' : 'labels', 'items' : 'items', - 'major_axis' : 'major_axis', 'minor_axis' : 'minor_axis' }, - slicer = pd.Panel4D, - aliases = { 'major' : 'major_axis', 'minor' : 'minor_axis' }, - stat_axis = 2) - - p5d = Panel5D(dict(C1 = p4d)) - p5d - - # print a slice of our 5D - p5d.ix['C1',:,:,0:3,:] - - # transpose it - p5d.transpose(1,2,3,4,0) - - # look at the shape & dim - p5d.shape - p5d.ndim +See the `docs of a previous version `__ +for documentation on these objects. diff --git a/doc/source/install.rst b/doc/source/install.rst index f8ee0542ea17e..6295e6f6cbb68 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -255,6 +255,7 @@ Optional Dependencies * `matplotlib `__: for plotting * For Excel I/O: + * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) * `openpyxl `__: openpyxl version 1.6.1 or higher (but lower than 2.0.0), or version 2.2 or higher, for writing .xlsx files (xlrd >= 0.9.0) @@ -296,8 +297,8 @@ Optional Dependencies `. 
It explains issues surrounding the installation and usage of the above three libraries * You may need to install an older version of `BeautifulSoup4`_: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian + Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit + Ubuntu/Debian * Additionally, if you're using `Anaconda`_ you should definitely read :ref:`the gotchas about HTML parsing libraries ` diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index b6c5c15bc9081..d3f921f8762cc 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -9,7 +9,7 @@ import pandas as pd import pandas.util.testing as tm np.set_printoptions(precision=4, suppress=True) - options.display.max_rows = 15 + pd.options.display.max_rows = 15 ********************** Sparse data structures @@ -90,38 +90,10 @@ can be converted back to a regular ndarray by calling ``to_dense``: SparseList ---------- -.. note:: The ``SparseList`` class has been deprecated and will be removed in a future version. +The ``SparseList`` class has been deprecated and will be removed in a future version. +See the `docs of a previous version `__ +for documentation on ``SparseList``. -``SparseList`` is a list-like data structure for managing a dynamic collection -of SparseArrays. To create one, simply call the ``SparseList`` constructor with -a ``fill_value`` (defaulting to ``NaN``): - -.. ipython:: python - - spl = pd.SparseList() - spl - -The two important methods are ``append`` and ``to_array``. ``append`` can -accept scalar values or any 1-dimensional sequence: - -.. ipython:: python - :suppress: - -.. ipython:: python - - spl.append(np.array([1., np.nan, np.nan, 2., 3.])) - spl.append(5) - spl.append(sparr) - spl - -As you can see, all of the contents are stored internally as a list of -memory-efficient ``SparseArray`` objects. Once you've accumulated all of the -data, you can call ``to_array`` to get a single ``SparseArray`` with all the -data: - -.. ipython:: python - - spl.to_array() SparseIndex objects ------------------- diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 924f286164225..4132d25e9be48 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1219,7 +1219,7 @@ objects. ts.shift(1) The shift method accepts an ``freq`` argument which can accept a -``DateOffset`` class or other ``timedelta``-like object or also a :ref:`offset alias `: +``DateOffset`` class or other ``timedelta``-like object or also a :ref:`offset alias `: .. ipython:: python @@ -1494,7 +1494,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample(level='d').sum() + df.resample('M', level='d').sum() .. _timeseries.periods: @@ -1630,8 +1630,6 @@ Period Dtypes ``PeriodIndex`` has a custom ``period`` dtype. This is a pandas extension dtype similar to the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). -.. _timeseries.timezone_series: - The ``period`` dtype holds the ``freq`` attribute and is represented with ``period[freq]`` like ``period[D]`` or ``period[M]``, using :ref:`frequency strings `. diff --git a/doc/source/whatsnew/v0.14.1.txt b/doc/source/whatsnew/v0.14.1.txt index 84f2a77203c41..239d6c9c6e0d4 100644 --- a/doc/source/whatsnew/v0.14.1.txt +++ b/doc/source/whatsnew/v0.14.1.txt @@ -156,7 +156,7 @@ Experimental ~~~~~~~~~~~~ - ``pandas.io.data.Options`` has a new method, ``get_all_data`` method, and now consistently returns a - multi-indexed ``DataFrame``, see :ref:`the docs `. 
(:issue:`5602`) + multi-indexed ``DataFrame`` (:issue:`5602`) - ``io.gbq.read_gbq`` and ``io.gbq.to_gbq`` were refactored to remove the dependency on the Google ``bq.py`` command line client. This submodule now uses ``httplib2`` and the Google ``apiclient`` and ``oauth2client`` API client diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index a25e5a80b65fc..cd9298c74539a 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -185,8 +185,6 @@ API changes 2014-11-22 call AAPL141122C00110000 1.02 2014-11-28 call AAPL141128C00110000 1.32 - See the Options documentation in :ref:`Remote Data ` - .. _whatsnew_0151.datetime64_plotting: - pandas now also registers the ``datetime64`` dtype in matplotlib's units registry @@ -257,7 +255,7 @@ Enhancements - Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download()` (:issue:`8482`) -- :ref:`World Bank data requests ` now will warn/raise based +- World Bank data requests now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and the World Bank's JSON response. In prior versions, the error messages didn't look at the World Bank's JSON response. Problem-inducing input were diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt index cf6ac7c1e6ad2..4136c108fba57 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.txt @@ -59,7 +59,7 @@ Time series changes and improvements aggregation functions, and control over how the intervals and result labeling are defined. A suite of high performance Cython/C-based resampling functions (including Open-High-Low-Close) have also been implemented. -- Revamp of :ref:`frequency aliases ` and support for +- Revamp of :ref:`frequency aliases ` and support for **frequency shortcuts** like '15min', or '1h30min' - New :ref:`DatetimeIndex class ` supports both fixed frequency and irregular time diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2834603287f1e..2f78c9acf7972 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3998,7 +3998,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False): converted : type of caller To learn more about the frequency strings, please see `this link - `__. +`__. """ from pandas.tseries.resample import asfreq return asfreq(self, freq, method=method, how=how, normalize=normalize) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 068cfee2b2aa2..8f23e82daf2e3 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -630,16 +630,20 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, https://developers.google.com/api-client-library/python/apis/bigquery/v2 Authentication to the Google BigQuery service is via OAuth 2.0. + - If "private_key" is not provided: - By default "application default credentials" are used. - .. versionadded:: 0.19.0 + By default "application default credentials" are used. + + .. versionadded:: 0.19.0 + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. - If default application credentials are not found or are restrictive, - user account credentials are used. In this case, you will be asked to - grant permissions for product name 'pandas GBQ'. - If "private_key" is provided: - Service account credentials will be used to authenticate. 
+ + Service account credentials will be used to authenticate. Parameters ---------- @@ -747,16 +751,20 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, https://developers.google.com/api-client-library/python/apis/bigquery/v2 Authentication to the Google BigQuery service is via OAuth 2.0. + - If "private_key" is not provided: - By default "application default credentials" are used. - .. versionadded:: 0.19.0 + By default "application default credentials" are used. + + .. versionadded:: 0.19.0 + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. - If default application credentials are not found or are restrictive, - user account credentials are used. In this case, you will be asked to - grant permissions for product name 'pandas GBQ'. - If "private_key" is provided: - Service account credentials will be used to authenticate. + + Service account credentials will be used to authenticate. Parameters ---------- From 957eaa4a01de456207b12c839a084d2c236e3885 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Sep 2016 21:15:38 +0200 Subject: [PATCH 348/359] DOC: clean-up 0.19.0 whatsnew file (#14176) * DOC: clean-up 0.19.0 whatsnew file * further clean-up * Update highlights * consistent use of behaviour/behavior * s/favour/favor --- doc/source/whatsnew/v0.19.0.txt | 331 ++++++++++++++++---------------- 1 file changed, 165 insertions(+), 166 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9f468ae6785cb..a007500322ed4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1,25 +1,28 @@ .. _whatsnew_0190: -v0.19.0 (August ??, 2016) -------------------------- +v0.19.0 (September ??, 2016) +---------------------------- -This is a major release from 0.18.1 and includes a small number of API changes, several new features, +This is a major release from 0.18.1 and includes number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. -.. warning:: - - pandas >= 0.19.0 will no longer silence numpy ufunc warnings upon import, see :ref:`here `. - Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`here ` -- pandas development api, see :ref:`here ` - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here ` -- Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` +- Sparse data structures gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` +- Comparison operations with ``Series`` no longer ignores the index, see :ref:`here ` for an overview of the API changes. +- Introduction of a pandas development API for utility functions, see :ref:`here `. +- Deprecation of ``Panel4D`` and ``PanelND``. We recommend to represent these types of n-dimensional data with the `xarray package `__. +- Removal of the previously deprecated modules ``pandas.io.data``, ``pandas.io.wb``, ``pandas.tools.rplot``. + +.. 
warning:: + + pandas >= 0.19.0 will no longer silence numpy ufunc warnings upon import, see :ref:`here `. .. contents:: What's new in v0.19.0 :local: @@ -35,7 +38,7 @@ New features pandas development API ^^^^^^^^^^^^^^^^^^^^^^ -As part of making pandas APi more uniform and accessible in the future, we have created a standard +As part of making pandas API more uniform and accessible in the future, we have created a standard sub-package of pandas, ``pandas.api`` to hold public API's. We are starting by exposing type introspection functions in ``pandas.api.types``. More sub-packages and officially sanctioned API's will be published in future versions of pandas (:issue:`13147`, :issue:`13634`) @@ -215,12 +218,12 @@ default of the index) in a DataFrame. :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) -.. ipython :: python +.. ipython:: python data = '0,1,2\n3,4,5' names = ['a', 'b', 'a'] -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -230,25 +233,25 @@ Previous Behavior: 0 2 1 2 1 5 4 5 -The first ``a`` column contains the same data as the second ``a`` column, when it should have +The first ``a`` column contained the same data as the second ``a`` column, when it should have contained the values ``[0, 3]``. -New Behavior: +**New behavior**: -.. ipython :: python +.. ipython:: python - In [2]: pd.read_csv(StringIO(data), names=names) + pd.read_csv(StringIO(data), names=names) .. _whatsnew_0190.enhancements.read_csv_categorical: -:func:`read_csv` supports parsing ``Categorical`` directly -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``read_csv`` supports parsing ``Categorical`` directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`read_csv` function now supports parsing a ``Categorical`` column when specified as a dtype (:issue:`10153`). Depending on the structure of the data, this can result in a faster parse time and lower memory usage compared to -converting to ``Categorical`` after parsing. See the io :ref:`docs here ` +converting to ``Categorical`` after parsing. See the io :ref:`docs here `. .. ipython:: python @@ -296,7 +299,7 @@ Categorical Concatenation - ``concat`` and ``append`` now can concat ``category`` dtypes wifht different ``categories`` as ``object`` dtype (:issue:`13524`) -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -305,7 +308,7 @@ Previous Behavior: In [3]: pd.concat([s1, s2]) ValueError: incompatible categories in categorical concat -New Behavior: +**New behavior**: .. ipython:: python @@ -407,12 +410,12 @@ After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from .. _whatsnew_0190.get_dummies_dtypes: -get_dummies dtypes -^^^^^^^^^^^^^^^^^^ +``get_dummies`` now returns integer dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`). This should provide an improved memory footprint. -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -424,22 +427,19 @@ Previous Behavior: c float64 dtype: object -New Behavior: +**New behavior**: .. ipython:: python pd.get_dummies(['a', 'b', 'a', 'c']).dtypes -.. _whatsnew_0190.enhancements.other: - -Other enhancements -^^^^^^^^^^^^^^^^^^ +.. 
_whatsnew_0190.enhancements.to_numeric_downcast: -- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`). +Downcast values to smallest possible dtype in ``to_numeric`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) -- ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) +``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) .. ipython:: python @@ -447,6 +447,16 @@ Other enhancements pd.to_numeric(s, downcast='unsigned') pd.to_numeric(s, downcast='integer') + +.. _whatsnew_0190.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`). + +- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) + - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) - ``Timestamp`` can now accept positional and keyword parameters similar to :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) @@ -471,13 +481,10 @@ Other enhancements df.resample('M', on='date').sum() df.resample('M', level='d').sum() -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the + ``decimal`` (:issue:`12933`), ``na_filter`` (:issue:`13321`) and the ``memory_map`` option (:issue:`13381`). - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - - The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) - - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) @@ -504,43 +511,14 @@ Other enhancements - :meth:`~DataFrame.to_html` now has a ``border`` argument to control the value in the opening ``
    `` tag. The default is the value of the ``html.border`` option, which defaults to 1. This also affects the notebook HTML repr, but since Jupyter's CSS includes a border-width attribute, the visual effect is the same. (:issue:`11563`). - Raise ``ImportError`` in the sql functions when ``sqlalchemy`` is not installed and a connection string is used (:issue:`11920`). - Compatibility with matplotlib 2.0. Older versions of pandas should also work with matplotlib 2.0 (:issue:`13333`) - -.. _whatsnew_0190.api: - - -API changes -~~~~~~~~~~~ - - -- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout. (:issue:`14101`) -- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) -- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) -- ``Panel.to_sparse()`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) -- ``Index.reshape()`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) -- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) -- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) -- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) -- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) -- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) -- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) -- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) -- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) -- Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) -- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) -- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) -- Faceted boxplots from ``DataFrame.boxplot(by=col)`` now return a ``Series`` when ``return_type`` is not None. Previously these returned an ``OrderedDict``. Note that when ``return_type=None``, the default, these still return a 2-D NumPy array. (:issue:`12216`, :issue:`7096`) - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. 
(:issue:`12086`) - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) -- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) -- ``pd.read_csv()``, ``pd.read_table()``, and ``pd.read_hdf()`` raise the builtin ``FileNotFoundError`` exception for Python 3.x when called on a nonexistent file; this is back-ported as ``IOError`` in Python 2.x (:issue:`14086`) -- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) -- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) -- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) +.. _whatsnew_0190.api: -.. _whatsnew_0190.api.tolist: +API changes +~~~~~~~~~~~ ``Series.tolist()`` will now return Python types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -551,9 +529,8 @@ API changes .. ipython:: python s = pd.Series([1,2,3]) - type(s.tolist()[0]) -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -561,7 +538,7 @@ Previous Behavior: Out[7]: -New Behavior: +**New behavior**: .. ipython:: python @@ -572,11 +549,11 @@ New Behavior: ``Series`` operators for different indexes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Following ``Series`` operators has been changed to make all operators consistent, +Following ``Series`` operators have been changed to make all operators consistent, including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`) - ``Series`` comparison operators now raise ``ValueError`` when ``index`` are different. -- ``Series`` logical operators align both ``index``. +- ``Series`` logical operators align both ``index`` of left and right hand side. .. warning:: Until 0.18.1, comparing ``Series`` with the same length, would succeed even if @@ -607,7 +584,7 @@ Comparison operators raise ``ValueError`` when ``.index`` are different. Previous Behavior (``Series``): -``Series`` compares values ignoring ``.index`` as long as both lengthes are the same. +``Series`` compared values ignoring the ``.index`` as long as both had the same length: .. code-block:: ipython @@ -618,7 +595,7 @@ Previous Behavior (``Series``): C False dtype: bool -New Behavior (``Series``): +**New behavior** (``Series``): .. code-block:: ipython @@ -627,13 +604,18 @@ New Behavior (``Series``): ValueError: Can only compare identically-labeled Series objects .. note:: + To achieve the same result as previous versions (compare values based on locations ignoring ``.index``), compare both ``.values``. .. ipython:: python s1.values == s2.values - If you want to compare ``Series`` aligning its ``.index``, see flexible comparison methods section below. + If you want to compare ``Series`` aligning its ``.index``, see flexible comparison methods section below: + + .. ipython:: python + + s1.eq(s2) Current Behavior (``DataFrame``, no change): @@ -646,9 +628,9 @@ Current Behavior (``DataFrame``, no change): Logical operators """"""""""""""""" -Logical operators align both ``.index``. +Logical operators align both ``.index`` of left and right hand side. 
-Previous behavior (``Series``), only left hand side ``index`` is kept: +Previous behavior (``Series``), only left hand side ``index`` was kept: .. code-block:: ipython @@ -661,7 +643,7 @@ Previous behavior (``Series``), only left hand side ``index`` is kept: C False dtype: bool -New Behavior (``Series``): +**New behavior** (``Series``): .. ipython:: python @@ -673,11 +655,11 @@ New Behavior (``Series``): ``Series`` logical operators fill a ``NaN`` result with ``False``. .. note:: - To achieve the same result as previous versions (compare values based on locations ignoring ``.index``), compare both ``.values``. + To achieve the same result as previous versions (compare values based on only left hand side index), you can use ``reindex_like``: .. ipython:: python - s1.values & s2.values + s1 & s2.reindex_like(s1) Current Behavior (``DataFrame``, no change): @@ -714,7 +696,7 @@ A ``Series`` will now correctly promote its dtype for assignment with incompat v s = pd.Series() -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -723,7 +705,7 @@ Previous Behavior: In [3]: s["b"] = 3.0 TypeError: invalid type promotion -New Behavior: +**New behavior**: .. ipython:: python @@ -739,7 +721,7 @@ New Behavior: Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -774,7 +756,7 @@ Merging will now preserve the dtype of the join keys (:issue:`8596`) df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) df2 -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -791,7 +773,7 @@ Previous Behavior: v1 float64 dtype: object -New Behavior: +**New behavior**: We are able to preserve the join keys @@ -820,7 +802,7 @@ Percentile identifiers in the index of a ``.describe()`` output will now be roun s = pd.Series([0, 1, 2, 3, 4]) df = pd.DataFrame([0, 1, 2, 3, 4]) -Previous Behavior: +**Previous behavior**: The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. @@ -847,7 +829,7 @@ The percentiles were rounded to at most one decimal place, which could raise ``V ... ValueError: cannot reindex from a duplicate axis -New Behavior: +**New behavior**: .. ipython:: python @@ -868,10 +850,10 @@ Furthermore: """""""""""""""""""""""""""""""""""""""" ``PeriodIndex`` now has its own ``period`` dtype. The ``period`` dtype is a -pandas extension dtype like ``category`` or :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). (:issue:`13941`). +pandas extension dtype like ``category`` or the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). (:issue:`13941`). As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -886,7 +868,7 @@ Previous Behavior: In [4]: pi.dtype Out[4]: dtype('int64') -New Behavior: +**New behavior**: .. ipython:: python @@ -904,14 +886,14 @@ New Behavior: Previously, ``Period`` has its own ``Period('NaT')`` representation different from ``pd.NaT``. Now ``Period('NaT')`` has been changed to return ``pd.NaT``. (:issue:`12759`, :issue:`13582`) -Previous Behavior: +**Previous behavior**: .. code-block:: ipython In [5]: pd.Period('NaT', freq='D') Out[5]: Period('NaT', 'D') -New Behavior: +**New behavior**: These result in ``pd.NaT`` without providing ``freq`` option. 
@@ -921,9 +903,9 @@ These result in ``pd.NaT`` without providing ``freq`` option. pd.Period(None) -To be compat with ``Period`` addition and subtraction, ``pd.NaT`` now supports addition and subtraction with ``int``. Previously it raises ``ValueError``. +To be compatible with ``Period`` addition and subtraction, ``pd.NaT`` now supports addition and subtraction with ``int``. Previously it raised ``ValueError``. -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -931,7 +913,7 @@ Previous Behavior: ... ValueError: Cannot add integral value to Timestamp without freq. -New Behavior: +**New behavior**: .. ipython:: python @@ -941,10 +923,10 @@ New Behavior: ``PeriodIndex.values`` now returns array of ``Period`` object """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -``.values`` is changed to return array of ``Period`` object, rather than array -of ``int64`` (:issue:`13988`) +``.values`` is changed to return an array of ``Period`` objects, rather than an array +of integers (:issue:`13988`). -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -952,7 +934,7 @@ Previous Behavior: In [7]: pi.values array([492, 493]) -New Behavior: +**New behavior**: .. ipython:: python @@ -982,7 +964,7 @@ Previous behavior: FutureWarning: using '+' to provide set union with Indexes is deprecated, use '|' or .union() Out[1]: Index(['a', 'b', 'c'], dtype='object') -The same operation will now perform element-wise addition: +**New behavior**: the same operation will now perform element-wise addition: .. ipython:: python @@ -1008,7 +990,7 @@ Previous behavior: FutureWarning: using '-' to provide set differences with datetimelike Indexes is deprecated, use .difference() Out[1]: DatetimeIndex(['2016-01-01'], dtype='datetime64[ns]', freq=None) -New behavior: +**New behavior**: .. ipython:: python @@ -1027,7 +1009,7 @@ New behavior: idx1 = pd.Index([1, 2, 3, np.nan]) idx2 = pd.Index([0, 1, np.nan]) -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -1037,7 +1019,7 @@ Previous Behavior: In [4]: idx1.symmetric_difference(idx2) Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64') -New Behavior: +**New behavior**: .. ipython:: python @@ -1050,12 +1032,11 @@ New Behavior: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``Index.unique()`` now returns unique values as an -``Index`` of the appropriate ``dtype``. (:issue:`13395`) - +``Index`` of the appropriate ``dtype``. (:issue:`13395`). Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex``, ``TimedeltaIndex`` and ``PeriodIndex`` returned ``Index`` to keep metadata like timezone. -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -1063,11 +1044,12 @@ Previous Behavior: Out[1]: array([1, 2, 3]) In [2]: pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() - Out[2]: DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00', - '2011-01-03 00:00:00+09:00'], - dtype='datetime64[ns, Asia/Tokyo]', freq=None) + Out[2]: + DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00', + '2011-01-03 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq=None) -New Behavior: +**New behavior**: .. ipython:: python @@ -1076,8 +1058,8 @@ New Behavior: .. 
_whatsnew_0190.api.multiindex: -``MultiIndex`` constructors preserve categorical dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``MultiIndex`` constructors, ``groupby`` and ``set_index`` preserve categorical dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``MultiIndex.from_arrays`` and ``MultiIndex.from_product`` will now preserve categorical dtype in ``MultiIndex`` levels. (:issue:`13743`, :issue:`13854`) @@ -1089,7 +1071,7 @@ in ``MultiIndex`` levels. (:issue:`13743`, :issue:`13854`) midx = pd.MultiIndex.from_arrays([cat, lvl1]) midx -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -1099,7 +1081,7 @@ Previous Behavior: In [5]: midx.get_level_values[0] Out[5]: Index(['a', 'b'], dtype='object') -New Behavior: +**New behavior**: the single level is now a ``CategoricalIndex``: .. ipython:: python @@ -1115,7 +1097,7 @@ As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes df_grouped = df.groupby(by=['A', 'C']).first() df_set_idx = df.set_index(['A', 'C']) -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -1137,7 +1119,7 @@ Previous Behavior: B int64 dtype: object -New Behavior: +**New behavior**: .. ipython:: python @@ -1152,8 +1134,8 @@ New Behavior: ``read_csv`` will progressively enumerate chunks ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When :func:`read_csv` is called with ``chunksize='n'`` and without specifying an index, -each chunk used to have an independently generated index from `0`` to ``n-1``. +When :func:`read_csv` is called with ``chunksize=n`` and without specifying an index, +each chunk used to have an independently generated index from ``0`` to ``n-1``. They are now given instead a progressive index, starting from ``0`` for the first chunk, from ``n`` for the second, and so on, so that, when concatenated, they are identical to the result of calling :func:`read_csv` without the ``chunksize=`` argument. @@ -1163,7 +1145,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument. data = 'A,B\n0,1\n2,3\n4,5\n6,7' -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -1175,7 +1157,7 @@ Previous Behavior: 0 4 5 1 6 7 -New Behavior: +**New behavior**: .. ipython :: python @@ -1188,13 +1170,12 @@ Sparse Changes These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. - ``int64`` and ``bool`` support enhancements """"""""""""""""""""""""""""""""""""""""""" -Sparse data structures now gained enhanced support of ``int64`` and ``bool`` ``dtype`` (:issue:`667`, :issue:`13849`) +Sparse data structures now gained enhanced support of ``int64`` and ``bool`` ``dtype`` (:issue:`667`, :issue:`13849`). -Previously, sparse data were ``float64`` dtype by default, even if all inputs were ``int`` or ``bool`` dtype. You had to specify ``dtype`` explicitly to create sparse data with ``int64`` dtype. Also, ``fill_value`` had to be specified explicitly becuase it's default was ``np.nan`` which doesn't appear in ``int64`` or ``bool`` data. +Previously, sparse data were ``float64`` dtype by default, even if all inputs were of ``int`` or ``bool`` dtype. You had to specify ``dtype`` explicitly to create sparse data with ``int64`` dtype. Also, ``fill_value`` had to be specified explicitly because the default was ``np.nan`` which doesn't appear in ``int64`` or ``bool`` data. .. 
code-block:: ipython @@ -1221,9 +1202,9 @@ Previously, sparse data were ``float64`` dtype by default, even if all inputs we IntIndex Indices: array([0, 1], dtype=int32) -As of v0.19.0, sparse data keeps the input dtype, and assign more appropriate ``fill_value`` default (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). +As of v0.19.0, sparse data keeps the input dtype, and uses more appropriate ``fill_value`` defaults (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). -.. ipython :: python +.. ipython:: python pd.SparseArray([1, 2, 0, 0], dtype=np.int64) pd.SparseArray([True, False, False, False]) @@ -1235,29 +1216,29 @@ Operators now preserve dtypes - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) -.. ipython:: python + .. ipython:: python - s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) - s.dtype + s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) + s.dtype - s + 1 + s + 1 - Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) -.. ipython:: python + .. ipython:: python - s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) - s - s.astype(np.int64) + s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s + s.astype(np.int64) -``astype`` fails if data contains values which cannot be converted to specified ``dtype``. -Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. + ``astype`` fails if data contains values which cannot be converted to specified ``dtype``. + Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. -.. code-block:: ipython + .. code-block:: ipython - In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) - Out[7]: - ValueError: unable to coerce current fill_value nan to int64 dtype + In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) + Out[7]: + ValueError: unable to coerce current fill_value nan to int64 dtype Other sparse fixes """""""""""""""""" @@ -1301,7 +1282,7 @@ These types are the same on many platform, but for 64 bit python on Windows, ``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many operations on that platform. -Previous Behavior: +**Previous behavior**: .. code-block:: ipython @@ -1310,7 +1291,7 @@ Previous Behavior: In [2]: i.get_indexer(['b', 'b', 'c']).dtype Out[2]: dtype('int32') -New Behavior: +**New behavior**: .. code-block:: ipython @@ -1319,6 +1300,35 @@ New Behavior: In [2]: i.get_indexer(['b', 'b', 'c']).dtype Out[2]: dtype('int64') + +.. _whatsnew_0190.api.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout. 
(:issue:`14101`) +- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) +- ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) +- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) +- ``Panel.to_sparse()`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) +- ``Index.reshape()`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) +- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) +- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) +- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) +- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) +- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) +- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) +- Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) +- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) +- Faceted boxplots from ``DataFrame.boxplot(by=col)`` now return a ``Series`` when ``return_type`` is not None. Previously these returned an ``OrderedDict``. Note that when ``return_type=None``, the default, these still return a 2-D NumPy array. (:issue:`12216`, :issue:`7096`) +- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``pd.read_csv()``, ``pd.read_table()``, and ``pd.read_hdf()`` raise the builtin ``FileNotFoundError`` exception for Python 3.x when called on a nonexistent file; this is back-ported as ``IOError`` in Python 2.x (:issue:`14086`) +- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) +- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) +- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) + .. 
_whatsnew_0190.deprecations: Deprecations @@ -1326,10 +1336,10 @@ Deprecations - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) -- ``PeriodIndex.to_datetime`` has been deprecated in favour of ``PeriodIndex.to_timestamp`` (:issue:`8254`) -- ``Timestamp.to_datetime`` has been deprecated in favour of ``Timestamp.to_pydatetime`` (:issue:`8254`) +- ``PeriodIndex.to_datetime`` has been deprecated in favor of ``PeriodIndex.to_timestamp`` (:issue:`8254`) +- ``Timestamp.to_datetime`` has been deprecated in favor of ``Timestamp.to_pydatetime`` (:issue:`8254`) - ``pandas.core.datetools`` module has been deprecated and will be removed in a subsequent release (:issue:`14094`) -- ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favour of ``pd.to_datetime`` (:issue:`8254`) +- ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favor of ``pd.to_datetime`` (:issue:`8254`) - ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) - ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) - ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) @@ -1350,6 +1360,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + - The ``SparsePanel`` class has been removed (:issue:`13778`) - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of @@ -1359,30 +1370,19 @@ Removal of prior version deprecations/changes - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) -- ``pd.Categorical`` has dropped the ``levels`` attribute in favour of ``categories`` (:issue:`8376`) +- ``pd.Categorical`` has dropped the ``levels`` attribute in favor of ``categories`` (:issue:`8376`) - ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) -- ``Panel.shift()`` has dropped the ``lags`` parameter in favour of ``periods`` (:issue:`14041`) -- ``pd.Index`` has dropped the ``diff`` method in favour of ``difference`` (:issue:`13669`) - -- ``pd.DataFrame`` has dropped the ``to_wide`` method in favour of ``to_panel`` (:issue:`14039`) +- ``Panel.shift()`` has dropped the ``lags`` parameter in favor of ``periods`` (:issue:`14041`) +- ``pd.Index`` has dropped the ``diff`` method in favor of ``difference`` (:issue:`13669`) +- ``pd.DataFrame`` has dropped the ``to_wide`` method in favor of ``to_panel`` (:issue:`14039`) - ``Series.to_csv`` has dropped the ``nanRep`` parameter in favor of ``na_rep`` (:issue:`13804`) - ``Series.xs``, ``DataFrame.xs``, ``Panel.xs``, ``Panel.major_xs``, and ``Panel.minor_xs`` have dropped the ``copy`` parameter (:issue:`13781`) - ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) -- Removal of the legacy time rules (offset 
aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`, :issue:`13868`) - - Previous Behavior: - - .. code-block:: ipython - - In [2]: pd.date_range('2016-07-01', freq='W@MON', periods=3) - pandas/tseries/frequencies.py:465: FutureWarning: Freq "W@MON" is deprecated, use "W-MON" as alternative. - Out[2]: DatetimeIndex(['2016-07-04', '2016-07-11', '2016-07-18'], dtype='datetime64[ns]', freq='W-MON') - - Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here ` - +- Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (these have been aliases since 0.8.0) (:issue:`13590`, :issue:`13868`). Legacy time rules now raise ``ValueError``. For the list of currently supported offsets, see :ref:`here `. - The default value for the ``return_type`` parameter for ``DataFrame.plot.box`` and ``DataFrame.boxplot`` changed from ``None`` to ``"axes"``. These methods will now return a matplotlib axes by default instead of a dictionary of artists. See :ref:`here ` (:issue:`6581`). - The ``tquery`` and ``uquery`` functions in the ``pandas.io.sql`` module are removed (:issue:`5950`). + .. _whatsnew_0190.performance: Performance Improvements @@ -1390,8 +1390,7 @@ Performance Improvements - Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) -- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - +- Improved performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`) @@ -1402,7 +1401,6 @@ Performance Improvements - Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`) - .. 
_whatsnew_0190.bug_fixes: Bug Fixes @@ -1568,3 +1566,4 @@ Bug Fixes - Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`) - Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`) +- ``PeriodIndex`` can now accept ``list`` and ``array`` which contain ``pd.NaT`` (:issue:`13430`) From 497a3bcbf6a1f97a9596278f78b9883b50a4c66f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Sep 2016 21:42:12 +0200 Subject: [PATCH 349/359] RLS: v0.19.0rc1 From ff435badcd6ea5efd5f5faa0f4b591955f5822d7 Mon Sep 17 00:00:00 2001 From: Russell Smith Date: Tue, 6 Sep 2016 09:31:06 -0400 Subject: [PATCH 350/359] BUG: bug in setting a slice of a Series with a np.timedelta64 closes #14155 closes #14160 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/internals.py | 2 +- pandas/tests/series/test_indexing.py | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a007500322ed4..d672b9b897fda 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1527,6 +1527,7 @@ Bug Fixes - Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) - Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) +- Bug in ``Series`` when setting a slice with a ``np.timedelta64`` (:issue:`14155`) - Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) - Bug in ``Index`` may ignore specified ``datetime64`` or ``timedelta64`` passed as ``dtype`` (:issue:`13981`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9a1c7864903d7..da72309b8eae1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1696,7 +1696,7 @@ def _try_coerce_args(self, values, other): other = other.value elif isinstance(other, np.timedelta64): other_mask = isnull(other) - other = other.view('i8') + other = Timedelta(other).value elif isinstance(other, timedelta): other = Timedelta(other).value elif isinstance(other, np.ndarray): diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 54cf626858354..5eef06bacfcb0 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1324,6 +1324,13 @@ def test_timedelta_assignment(self): s.loc['A'] = timedelta(1) tm.assert_series_equal(s, expected) + # GH 14155 + s = Series(10 * [np.timedelta64(10, 'm')]) + s.loc[[1, 2, 3]] = np.timedelta64(20, 'm') + expected = pd.Series(10 * [np.timedelta64(10, 'm')]) + expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm')) + tm.assert_series_equal(s, expected) + def test_underlying_data_conversion(self): # GH 4080 From d8cd33b148daba78f1450c19c69cfe4896cfb98c Mon Sep 17 00:00:00 2001 From: John Liekezer Date: Thu, 8 Sep 2016 06:27:21 -0400 Subject: [PATCH 351/359] BUG: fix tz-aware datetime convert to DatetimeIndex (GH 14088) closes #14088 Author: John Liekezer Closes #14090 from conquistador1492/issue_14088 and squashes the following commits: c91425b [John Liekezer] BUG: fix tz-aware datetime convert to DatetimeIndex (GH 14088) --- 
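(Editor's note, not part of the patch: a minimal sketch of the behavior GH 14088 addresses, mirroring the test_sub_datetime_compat test added in this patch; it assumes pytz is installed and a pandas build that includes the fix.)

    import pytz
    from datetime import datetime
    import pandas as pd

    # Subtracting a tz-aware datetime.datetime from a tz-aware Series
    # should behave like subtracting the equivalent pd.Timestamp.
    s = pd.Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT])
    dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc)

    print(s - dt)                # expected: Timedelta('1 days'), NaT
    print(s - pd.Timestamp(dt))  # expected: the same result
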
doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/ops.py | 2 +- pandas/tests/series/test_operators.py | 173 ++++++++++++++------------ 3 files changed, 94 insertions(+), 82 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d672b9b897fda..3f3ebcb6e5830 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1549,6 +1549,7 @@ Bug Fixes - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) +- Bug in subtract tz-aware ``datetime.datetime`` from tz-aware ``datetime64`` series (:issue:`14088`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) - Bug in invalid frequency offset string like "D1", "-2-3H" may not raise ``ValueError (:issue:`13930`) - Bug in ``concat`` and ``groupby`` for hierarchical frames with ``RangeIndex`` levels (:issue:`13542`). diff --git a/pandas/core/ops.py b/pandas/core/ops.py index b84eb0ba4cbf9..b81d62c3cda18 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -453,7 +453,7 @@ def _convert_to_array(self, values, name=None, other=None): values = values.to_series() # datetime with tz elif (isinstance(ovalues, datetime.datetime) and - hasattr(ovalues, 'tz')): + hasattr(ovalues, 'tzinfo')): values = pd.DatetimeIndex(values) # datetime array with tz elif is_datetimetz(values): diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index f7fc45d78af97..197311868b768 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -18,7 +18,8 @@ from pandas.compat import range, zip from pandas import compat -from pandas.util.testing import assert_series_equal, assert_almost_equal +from pandas.util.testing import (assert_series_equal, assert_almost_equal, + assert_frame_equal) import pandas.util.testing as tm from .common import TestData @@ -45,8 +46,8 @@ def test_comparisons(self): # it works! 
exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + assert_series_equal(s == s2, exp) + assert_series_equal(s2 == s, exp) def test_op_method(self): def check(series, other, check_reverse=False): @@ -64,12 +65,12 @@ def check(series, other, check_reverse=False): result = op(series, other) expected = alt(series, other) - tm.assert_almost_equal(result, expected) + assert_almost_equal(result, expected) if check_reverse: rop = getattr(Series, "r" + opname) result = rop(series, other) expected = alt(other, series) - tm.assert_almost_equal(result, expected) + assert_almost_equal(result, expected) check(self.ts, self.ts * 2) check(self.ts, self.ts[::2]) @@ -149,8 +150,8 @@ def _check_op(series, other, op, pos_only=False, cython_or_numpy = op(left, right) python = left.combine(right, op) - tm.assert_series_equal(cython_or_numpy, python, - check_dtype=check_dtype) + assert_series_equal(cython_or_numpy, python, + check_dtype=check_dtype) def check(series, other): simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod'] @@ -187,7 +188,7 @@ def check_comparators(series, other, check_dtype=True): def test_operators_empty_int_corner(self): s1 = Series([], [], dtype=np.int32) s2 = Series({'x': 0.}) - tm.assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) + assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) def test_operators_timedelta64(self): @@ -668,6 +669,16 @@ def run_ops(ops, get_ser, test_ser): self.assertRaises(TypeError, lambda: td1 - dt1) self.assertRaises(TypeError, lambda: td2 - dt2) + def test_sub_datetime_compat(self): + # GH 14088 + tm._skip_if_no_pytz() + import pytz + s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT]) + dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) + exp = Series([Timedelta('1 days'), pd.NaT]) + assert_series_equal(s - dt, exp) + assert_series_equal(s - Timestamp(dt), exp) + def test_sub_single_tz(self): # GH12290 s1 = Series([pd.Timestamp('2016-02-10', tz='America/Sao_Paulo')]) @@ -1175,21 +1186,21 @@ def test_comparison_flex_basic(self): left = pd.Series(np.random.randn(10)) right = pd.Series(np.random.randn(10)) - tm.assert_series_equal(left.eq(right), left == right) - tm.assert_series_equal(left.ne(right), left != right) - tm.assert_series_equal(left.le(right), left < right) - tm.assert_series_equal(left.lt(right), left <= right) - tm.assert_series_equal(left.gt(right), left > right) - tm.assert_series_equal(left.ge(right), left >= right) + assert_series_equal(left.eq(right), left == right) + assert_series_equal(left.ne(right), left != right) + assert_series_equal(left.le(right), left < right) + assert_series_equal(left.lt(right), left <= right) + assert_series_equal(left.gt(right), left > right) + assert_series_equal(left.ge(right), left >= right) # axis for axis in [0, None, 'index']: - tm.assert_series_equal(left.eq(right, axis=axis), left == right) - tm.assert_series_equal(left.ne(right, axis=axis), left != right) - tm.assert_series_equal(left.le(right, axis=axis), left < right) - tm.assert_series_equal(left.lt(right, axis=axis), left <= right) - tm.assert_series_equal(left.gt(right, axis=axis), left > right) - tm.assert_series_equal(left.ge(right, axis=axis), left >= right) + assert_series_equal(left.eq(right, axis=axis), left == right) + assert_series_equal(left.ne(right, axis=axis), left != right) + assert_series_equal(left.le(right, axis=axis), left < right) + assert_series_equal(left.lt(right, axis=axis), left <= right) + assert_series_equal(left.gt(right, 
axis=axis), left > right) + assert_series_equal(left.ge(right, axis=axis), left >= right) # msg = 'No axis named 1 for object type' @@ -1202,44 +1213,44 @@ def test_comparison_flex_alignment(self): right = Series([2, 2, 2], index=list('bcd')) exp = pd.Series([False, False, True, False], index=list('abcd')) - tm.assert_series_equal(left.eq(right), exp) + assert_series_equal(left.eq(right), exp) exp = pd.Series([True, True, False, True], index=list('abcd')) - tm.assert_series_equal(left.ne(right), exp) + assert_series_equal(left.ne(right), exp) exp = pd.Series([False, False, True, False], index=list('abcd')) - tm.assert_series_equal(left.le(right), exp) + assert_series_equal(left.le(right), exp) exp = pd.Series([False, False, False, False], index=list('abcd')) - tm.assert_series_equal(left.lt(right), exp) + assert_series_equal(left.lt(right), exp) exp = pd.Series([False, True, True, False], index=list('abcd')) - tm.assert_series_equal(left.ge(right), exp) + assert_series_equal(left.ge(right), exp) exp = pd.Series([False, True, False, False], index=list('abcd')) - tm.assert_series_equal(left.gt(right), exp) + assert_series_equal(left.gt(right), exp) def test_comparison_flex_alignment_fill(self): left = Series([1, 3, 2], index=list('abc')) right = Series([2, 2, 2], index=list('bcd')) exp = pd.Series([False, False, True, True], index=list('abcd')) - tm.assert_series_equal(left.eq(right, fill_value=2), exp) + assert_series_equal(left.eq(right, fill_value=2), exp) exp = pd.Series([True, True, False, False], index=list('abcd')) - tm.assert_series_equal(left.ne(right, fill_value=2), exp) + assert_series_equal(left.ne(right, fill_value=2), exp) exp = pd.Series([False, False, True, True], index=list('abcd')) - tm.assert_series_equal(left.le(right, fill_value=0), exp) + assert_series_equal(left.le(right, fill_value=0), exp) exp = pd.Series([False, False, False, True], index=list('abcd')) - tm.assert_series_equal(left.lt(right, fill_value=0), exp) + assert_series_equal(left.lt(right, fill_value=0), exp) exp = pd.Series([True, True, True, False], index=list('abcd')) - tm.assert_series_equal(left.ge(right, fill_value=0), exp) + assert_series_equal(left.ge(right, fill_value=0), exp) exp = pd.Series([True, True, False, False], index=list('abcd')) - tm.assert_series_equal(left.gt(right, fill_value=0), exp) + assert_series_equal(left.gt(right, fill_value=0), exp) def test_operators_bitwise(self): # GH 9016: support bitwise op for integer types @@ -1426,13 +1437,13 @@ def test_arith_ops_df_compat(self): exp = pd.Series([3.0, 4.0, np.nan, np.nan], index=list('ABCD'), name='x') - tm.assert_series_equal(s1 + s2, exp) - tm.assert_series_equal(s2 + s1, exp) + assert_series_equal(s1 + s2, exp) + assert_series_equal(s2 + s1, exp) exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, index=list('ABCD')) - tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) - tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) + assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) # different length s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') @@ -1440,13 +1451,13 @@ def test_arith_ops_df_compat(self): exp = pd.Series([3, 4, 5, np.nan], index=list('ABCD'), name='x') - tm.assert_series_equal(s3 + s4, exp) - tm.assert_series_equal(s4 + s3, exp) + assert_series_equal(s3 + s4, exp) + assert_series_equal(s4 + s3, exp) exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, index=list('ABCD')) - tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) - 
tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) + assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) def test_comp_ops_df_compat(self): # GH 1134 @@ -1485,28 +1496,28 @@ def test_bool_ops_df_compat(self): exp = pd.Series([True, False, False, False], index=list('ABCD'), name='x') - tm.assert_series_equal(s1 & s2, exp) - tm.assert_series_equal(s2 & s1, exp) + assert_series_equal(s1 & s2, exp) + assert_series_equal(s2 & s1, exp) # True | np.nan => True exp = pd.Series([True, True, True, False], index=list('ABCD'), name='x') - tm.assert_series_equal(s1 | s2, exp) + assert_series_equal(s1 | s2, exp) # np.nan | True => np.nan, filled with False exp = pd.Series([True, True, False, False], index=list('ABCD'), name='x') - tm.assert_series_equal(s2 | s1, exp) + assert_series_equal(s2 | s1, exp) # DataFrame doesn't fill nan with False exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, index=list('ABCD')) - tm.assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) - tm.assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) + assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, index=list('ABCD')) - tm.assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) - tm.assert_frame_equal(s2.to_frame() | s1.to_frame(), exp) + assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() | s1.to_frame(), exp) # different length s3 = pd.Series([True, False, True], index=list('ABC'), name='x') @@ -1514,27 +1525,27 @@ def test_bool_ops_df_compat(self): exp = pd.Series([True, False, True, False], index=list('ABCD'), name='x') - tm.assert_series_equal(s3 & s4, exp) - tm.assert_series_equal(s4 & s3, exp) + assert_series_equal(s3 & s4, exp) + assert_series_equal(s4 & s3, exp) # np.nan | True => np.nan, filled with False exp = pd.Series([True, True, True, False], index=list('ABCD'), name='x') - tm.assert_series_equal(s3 | s4, exp) + assert_series_equal(s3 | s4, exp) # True | np.nan => True exp = pd.Series([True, True, True, True], index=list('ABCD'), name='x') - tm.assert_series_equal(s4 | s3, exp) + assert_series_equal(s4 | s3, exp) exp = pd.DataFrame({'x': [True, False, True, np.nan]}, index=list('ABCD')) - tm.assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) - tm.assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) + assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) exp = pd.DataFrame({'x': [True, True, True, np.nan]}, index=list('ABCD')) - tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) - tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) + assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) def test_series_frame_radd_bug(self): # GH 353 @@ -1546,7 +1557,7 @@ def test_series_frame_radd_bug(self): frame = DataFrame({'vals': vals}) result = 'foo_' + frame expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) - tm.assert_frame_equal(result, expected) + assert_frame_equal(result, expected) # really raise this time with tm.assertRaises(TypeError): @@ -1571,26 +1582,26 @@ def test_series_radd_more(self): for dtype in [None, object]: res = 1 + pd.Series([1, 2, 3], dtype=dtype) exp = pd.Series([2, 3, 4], dtype=dtype) - tm.assert_series_equal(res, exp) + assert_series_equal(res, exp) res = pd.Series([1, 2, 3], dtype=dtype) + 1 - tm.assert_series_equal(res, 
exp) + assert_series_equal(res, exp) res = np.nan + pd.Series([1, 2, 3], dtype=dtype) exp = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) - tm.assert_series_equal(res, exp) + assert_series_equal(res, exp) res = pd.Series([1, 2, 3], dtype=dtype) + np.nan - tm.assert_series_equal(res, exp) + assert_series_equal(res, exp) s = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), pd.Timedelta('3 days')], dtype=dtype) exp = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), pd.Timedelta('6 days')]) - tm.assert_series_equal(pd.Timedelta('3 days') + s, exp) - tm.assert_series_equal(s + pd.Timedelta('3 days'), exp) + assert_series_equal(pd.Timedelta('3 days') + s, exp) + assert_series_equal(s + pd.Timedelta('3 days'), exp) s = pd.Series(['x', np.nan, 'x']) - tm.assert_series_equal('a' + s, pd.Series(['ax', np.nan, 'ax'])) - tm.assert_series_equal(s + 'a', pd.Series(['xa', np.nan, 'xa'])) + assert_series_equal('a' + s, pd.Series(['ax', np.nan, 'ax'])) + assert_series_equal(s + 'a', pd.Series(['xa', np.nan, 'xa'])) def test_frame_radd_more(self): data = [[1, 2, 3], @@ -1608,32 +1619,32 @@ def test_frame_radd_more(self): for dtype in [None, object]: res = 1 + pd.DataFrame([1, 2, 3], dtype=dtype) exp = pd.DataFrame([2, 3, 4], dtype=dtype) - tm.assert_frame_equal(res, exp) + assert_frame_equal(res, exp) res = pd.DataFrame([1, 2, 3], dtype=dtype) + 1 - tm.assert_frame_equal(res, exp) + assert_frame_equal(res, exp) res = np.nan + pd.DataFrame([1, 2, 3], dtype=dtype) exp = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype) - tm.assert_frame_equal(res, exp) + assert_frame_equal(res, exp) res = pd.DataFrame([1, 2, 3], dtype=dtype) + np.nan - tm.assert_frame_equal(res, exp) + assert_frame_equal(res, exp) df = pd.DataFrame(['x', np.nan, 'x']) - tm.assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax'])) - tm.assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa'])) + assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax'])) + assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa'])) def test_operators_frame(self): # rpow does not work with DataFrame df = DataFrame({'A': self.ts}) - tm.assert_series_equal(self.ts + self.ts, self.ts + df['A'], - check_names=False) - tm.assert_series_equal(self.ts ** self.ts, self.ts ** df['A'], - check_names=False) - tm.assert_series_equal(self.ts < self.ts, self.ts < df['A'], - check_names=False) - tm.assert_series_equal(self.ts / self.ts, self.ts / df['A'], - check_names=False) + assert_series_equal(self.ts + self.ts, self.ts + df['A'], + check_names=False) + assert_series_equal(self.ts ** self.ts, self.ts ** df['A'], + check_names=False) + assert_series_equal(self.ts < self.ts, self.ts < df['A'], + check_names=False) + assert_series_equal(self.ts / self.ts, self.ts / df['A'], + check_names=False) def test_operators_combine(self): def _check_fill(meth, op, a, b, fill_value=0): @@ -1729,12 +1740,12 @@ def test_divide_decimal(self): s = Series([Decimal(10)]) s = s / Decimal(2) - tm.assert_series_equal(expected, s) + assert_series_equal(expected, s) s = Series([Decimal(10)]) s = s // Decimal(2) - tm.assert_series_equal(expected, s) + assert_series_equal(expected, s) def test_datetime64_with_index(self): From 02df7b683734cafd87ad7b5920dd2a628521999c Mon Sep 17 00:00:00 2001 From: "Christopher C. 
Aycock" Date: Thu, 8 Sep 2016 11:02:13 -0400 Subject: [PATCH 352/359] DOC: minor typo in 0.19.0 whatsnew file (#14185) --- doc/source/whatsnew/v0.19.0.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 3f3ebcb6e5830..7e8e1b15654a0 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -296,8 +296,7 @@ Categorical Concatenation b = pd.Categorical(["a", "b"]) union_categoricals([a, b]) -- ``concat`` and ``append`` now can concat ``category`` dtypes wifht different -``categories`` as ``object`` dtype (:issue:`13524`) +- ``concat`` and ``append`` now can concat ``category`` dtypes with different ``categories`` as ``object`` dtype (:issue:`13524`) **Previous behavior**: From 8af626474f6f314527a9ad3f15403aa2dd8c402d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 8 Sep 2016 18:12:53 -0400 Subject: [PATCH 353/359] TST: Make encoded sep check more locale sensitive (#14161) Closes gh-14140. --- pandas/io/parsers.py | 21 +++++++++++++-------- pandas/io/tests/parser/test_unsupported.py | 4 ---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3bd8579d456d3..93c431531355a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -800,17 +800,22 @@ def _clean_options(self, options, engine): " different from '\s+' are"\ " interpreted as regex)" engine = 'python' - - elif len(sep.encode(encoding)) > 1: - if engine not in ('python', 'python-fwf'): - fallback_reason = "the separator encoded in {encoding}"\ - " is > 1 char long, and the 'c' engine"\ - " does not support such separators".format( - encoding=encoding) - engine = 'python' elif delim_whitespace: if 'python' in engine: result['delimiter'] = '\s+' + elif sep is not None: + encodeable = True + try: + if len(sep.encode(encoding)) > 1: + encodeable = False + except UnicodeDecodeError: + encodeable = False + if not encodeable and engine not in ('python', 'python-fwf'): + fallback_reason = "the separator encoded in {encoding}" \ + " is > 1 char long, and the 'c' engine" \ + " does not support such separators".format( + encoding=encoding) + engine = 'python' if fallback_reason and engine_specified: raise ValueError(fallback_reason) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 0bfb8b17349cf..ef8f7967193ff 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -60,10 +60,6 @@ def test_c_engine(self): sep=None, delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', sep='\s') - - # GH 14120, skipping as failing when locale is set - # with tm.assertRaisesRegexp(ValueError, msg): - # read_table(StringIO(data), engine='c', sep='§') with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', skipfooter=1) From 939a22118a531d71a667456daf46964265c79d1e Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 9 Sep 2016 15:32:44 -0400 Subject: [PATCH 354/359] BUG: Categorical constructor not idempotent with ext dtype closes #14190 Author: Chris Closes #14191 from chris-b1/cat-ctor and squashes the following commits: 4cad147 [Chris] add some nulls to tests da865e2 [Chris] BUG: Categorical constructor not idempotent with ext dtype --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/categorical.py | 2 +- pandas/tests/test_categorical.py | 16 ++++++++++++++++ 3 files changed, 18 
insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 7e8e1b15654a0..be4f7473b002a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1426,7 +1426,7 @@ Bug Fixes - Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - Bug where empty ``Series`` were incorrectly coerced in datetime-like numeric operations (:issue:`13844`) - +- Bug in ``Categorical`` constructor when passed a ``Categorical`` containing datetimes with timezones (:issue:`14190`) - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 48054c5bd34fa..0a13c8936eeec 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -259,7 +259,7 @@ def __init__(self, values, categories=None, ordered=False, ordered = values.ordered if categories is None: categories = values.categories - values = values.__array__() + values = values.get_values() elif isinstance(values, (ABCIndexClass, ABCSeries)): pass diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index c4ddd2c0981d9..a494a0d53b123 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -362,6 +362,22 @@ def test_constructor_from_index_series_period(self): result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) + def test_constructor_invariant(self): + # GH 14190 + vals = [ + np.array([1., 1.2, 1.8, np.nan]), + np.array([1, 2, 3], dtype='int64'), + ['a', 'b', 'c', np.nan], + [pd.Period('2014-01'), pd.Period('2014-02'), pd.NaT], + [pd.Timestamp('2014-01-01'), pd.Timestamp('2014-01-02'), pd.NaT], + [pd.Timestamp('2014-01-01', tz='US/Eastern'), + pd.Timestamp('2014-01-02', tz='US/Eastern'), pd.NaT], + ] + for val in vals: + c = Categorical(val) + c2 = Categorical(c) + tm.assert_categorical_equal(c, c2) + def test_from_codes(self): # too few categories From 289cd6d0df66b812921ff4c5cbade937b875406d Mon Sep 17 00:00:00 2001 From: Josh Howes Date: Fri, 9 Sep 2016 18:27:16 -0400 Subject: [PATCH 355/359] BUG: fix str.contains for series containing only nan values closes #14171 Author: Josh Howes Closes #14182 from josh-howes/bugfix/14171-series-str-contains-only-nan-values and squashes the following commits: c7e9721 [Josh Howes] BUG: fix str.contains for series containing only nan values --- doc/source/whatsnew/v0.19.0.txt | 2 +- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/strings.py | 3 ++- pandas/tests/test_strings.py | 20 ++++++++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index be4f7473b002a..a3e8f0c314352 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1552,7 +1552,7 @@ Bug Fixes - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) - Bug in invalid frequency offset string like "D1", "-2-3H" may not raise ``ValueError (:issue:`13930`) - Bug in ``concat`` and ``groupby`` for hierarchical frames with ``RangeIndex`` levels (:issue:`13542`). 
- +- Bug in ``Series.str.contains()`` for Series containing only ``NaN`` values of ``object`` dtype (:issue:`14171`) - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) - Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 695e917c76ba0..4aee6f72b1d53 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -81,3 +81,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b49761367b9b5..3041b17b99b17 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -165,7 +165,8 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if na_mask: mask = isnull(arr) try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8)) + convert = not all(mask) + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError): def g(x): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 92fa7b976eb0e..4019bbe20ea1a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2439,6 +2439,26 @@ def test_more_contains(self): True, False, False]) assert_series_equal(result, expected) + def test_contains_nan(self): + # PR #14171 + s = Series([np.nan, np.nan, np.nan], dtype=np.object_) + + result = s.str.contains('foo', na=False) + expected = Series([False, False, False], dtype=np.bool_) + assert_series_equal(result, expected) + + result = s.str.contains('foo', na=True) + expected = Series([True, True, True], dtype=np.bool_) + assert_series_equal(result, expected) + + result = s.str.contains('foo', na="foo") + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + assert_series_equal(result, expected) + + result = s.str.contains('foo') + expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) + assert_series_equal(result, expected) + def test_more_replace(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', From 678b636ca0cd61a85f8ed803a0b3a9f9473d08ce Mon Sep 17 00:00:00 2001 From: "Barry E. 
Moore II" Date: Thu, 2 Jun 2016 19:03:34 -0400 Subject: [PATCH 356/359] BUG: df.to_string with formatters, header and index False --- pandas/formats/format.py | 2 +- pandas/tests/formats/test_printing.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 4740dd25c419d..224c3a5c0c50e 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -604,7 +604,7 @@ def to_string(self): self._chk_truncate() strcols = self._to_str_columns() text = self.adj.adjoin(1, *strcols) - if not self.index: + if not self.index and self.header: text = text.replace('\n ', '\n').strip() self.buf.writelines(text) diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index 3bcceca1f50a7..f93f37f27b2c5 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -9,6 +9,17 @@ _multiprocess_can_split_ = True +def test_to_string_formatters_index_header(): + from pandas import DataFrame + frame = DataFrame(data={0: 0, 1: 0}, index=[0]) + expected = ' 0 0' + + formatter = lambda x: '{:4d}'.format(x) + + string = frame.to_string(formatters=[formatter, formatter], index=False, + header=False) + assert(string == expected) + def test_adjoin(): data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] expected = 'a dd ggg\nb ee hhh\nc ff iii' From 5a1b743c182d7e8595c8b1d69aa67463318d7cf2 Mon Sep 17 00:00:00 2001 From: "Barry E. Moore II" Date: Thu, 2 Jun 2016 20:59:54 -0400 Subject: [PATCH 357/359] BUG: Fix issue #13032, annotate test --- pandas/formats/format.py | 2 -- pandas/tests/formats/test_printing.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 224c3a5c0c50e..f18e5b4dd51e3 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -604,8 +604,6 @@ def to_string(self): self._chk_truncate() strcols = self._to_str_columns() text = self.adj.adjoin(1, *strcols) - if not self.index and self.header: - text = text.replace('\n ', '\n').strip() self.buf.writelines(text) if self.should_show_dimensions: diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index f93f37f27b2c5..880f7413544dd 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import nose from pandas import compat +from pandas import DataFrame import pandas.formats.printing as printing import pandas.formats.format as fmt import pandas.util.testing as tm @@ -8,9 +9,8 @@ _multiprocess_can_split_ = True - +# Added due to issue #13032 as part of PR #13350 def test_to_string_formatters_index_header(): - from pandas import DataFrame frame = DataFrame(data={0: 0, 1: 0}, index=[0]) expected = ' 0 0' From 4d7559d74424757b51ae25fed5458cfc0971177a Mon Sep 17 00:00:00 2001 From: "Barry E. 
Moore II" Date: Thu, 2 Jun 2016 23:31:32 -0400 Subject: [PATCH 358/359] BUG: spacing issue complete --- pandas/formats/format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index f18e5b4dd51e3..270ed8dc81236 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -733,7 +733,7 @@ def space_format(x, y): fmt_columns = columns.format() dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [[' ' + x if not self._get_formatter(i) and + str_columns = [[x if not self._get_formatter(i) and need_leadsp[x] else x] for i, (col, x) in enumerate(zip(columns, fmt_columns))] @@ -2132,7 +2132,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or (lambda x: '% d' % x) + formatter = self.formatter or (lambda x: '%d' % x) fmt_values = [formatter(x) for x in self.values] return fmt_values From aa91bcd4ed007d59a73ac5ce0ec075336873fbe6 Mon Sep 17 00:00:00 2001 From: "Barry E. Moore II" Date: Fri, 3 Jun 2016 00:53:41 -0400 Subject: [PATCH 359/359] BUG: hunt down remaining leading whitespace --- pandas/formats/format.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 270ed8dc81236..e1f5f97088512 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1985,12 +1985,12 @@ def _format(x): fmt_values = [] for i, v in enumerate(vals): - if not is_float_type[i] and leading_space: - fmt_values.append(' %s' % _format(v)) - elif is_float_type[i]: + if not is_float[i] and leading_space: + fmt_values.append('%s' % _format(v)) + elif is_float[i]: fmt_values.append(float_format(v)) else: - fmt_values.append(' %s' % _format(v)) + fmt_values.append('%s' % _format(v)) return fmt_values
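(Editor's note, not part of the patch series: a minimal sketch of the ``df.to_string`` behavior targeted by the last four patches, reusing the frame and formatter from the test_to_string_formatters_index_header test above; the exact output spacing depends on the formatter width.)

    import pandas as pd

    # With formatters plus index=False and header=False, to_string used to
    # strip the leading whitespace produced by the formatter (issue #13032).
    frame = pd.DataFrame(data={0: 0, 1: 0}, index=[0])
    formatter = lambda x: '{:4d}'.format(x)

    result = frame.to_string(formatters=[formatter, formatter],
                             index=False, header=False)
    print(repr(result))  # with the fix, the formatter's padding is preserved
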

z-U@G}x5``Xt?|}+>%8^e25+Oc$=mF0@wR%~yzSl&Z>P7*+wJY~_Imrg{oVoZpm)eS z>>crrddIxu-U;udcgj2Mo$=0k=e+aY1@EGF$-C@b@veH;yzAZ#@1}RlyY1ca?t1sU z``!cZq4&sp>^cZsE8_(imaljs4AL@u41T|DwYaSu~i%uSH)BDRRWbzB~po15@je; zSxP8dIZ7&5c}gi?1xl+>l~g5D$yExKQl(OCV1JzJ9QjJv;)l@Z8%~cE4Qnga8RU6e-wNvd?2h~w^Qk_*7 z)m3#<-Bl0OQ}t54RUg$?^;7-T05wnzQiIhHHB=2#!_^2iQjJoh)fhEajZ@>*1T|4j zQj^sbHC0Vh)71<$Q_WJd)f_cf%~SK$0<};rQj66RwNx!r%hd|CQms;})f%-{tyAmO z2DMRbQk&HlwN-6X+tm)WQ|(f_)gHB1?Nj^J0d-IvQis(MbyOWw$JGgSQk_z#)fshG zom1!41$9wfQkT^gbyZzc*VPSmQ{7Ux)g5(L-Bb6~1NBfnQjgUW^;A7m&(#a{QoT~I z)f@Fzy;JYi2lY{XQlHfq^;LaS-_;NGQ~grE)gKkc59^2X!}}5Zh<+qLvLD5d>PPdV z`!W2Oek?!4kL}0t?iRJ-}EhC__pu((szB&SHABDzV<`?q<%6# zxu3#M>8J8j`)T~NemXzBpTW=QXYw=qS^TViHb1+c!_VpG@^kxn{JefXKfhnVFX$KY z3;RX6h|L`(^yHemTFqU%{{FSMn?SRs5=cHNU!F!>{Sr@@xBb{JMTU zzrNqVZ|FDj8~aWCrhYTOx!=NX>9_J*`)&NTemlRt-@)(bck(;?UHq5uY9`(ymE{y2ZUKf#~qPx2@GQ~as^ zG=I83!=LHT@@M;V{JH)-f4;xKU+6FL7yC>6rT#L1xxd0+>96uv`)mBQ{yKlXzro+= zZ}K<$Tl}s5Hh;Un!{6!e@^|}t{Js7@f4_ggKj_qy91fxPQVw>7VjX`)B;K z{yG1=f5E@#U-B>eSNyB~HUGMQ!@ud@@^AZh{JZ`=|Gxjgf9OB*ANx=Ir~Whlx&Oj{ z>A&({`)~ZW{yYD@|H1$0fAT;3U;MBBH~+i;!~g03@_+k({4hb-AY2eWh!8{!A_b9y zC_&U9S`a;m5yT8)1tCG~AWjfBh!?~U5(Ei@L_y*pNniwKU*6k_E|w6hX=$RggMJ6Qm8&1?htfLB=3ckU7W_WDT+f*@GNG&LCHiJIE8{4e|x~ zg91UppiodaC=wJ6iUq}k5<$tJR8Tr76O;|g1?7VZLB*g_P&ud)R1K;H)q@&A&7f9L zJE#-X4eAB;g9bsvpi$5`Xc9CHngz{+7D3CPRnR(U6SNK51?__lLC2s|&^hQ5bPc)% z-Gd%M&!AV(JLnVi4f+NBg8{+7U{EkP7!nK(h6Tfe5y8k{R4_Ui6O0YU1>=JW!Ng!v zFgchKObw<5(}Nko%wSe9JD3y94dw;&g9X9DU{SC*SQ0D^mIcd$6~W42Rj@i(6RZu^ z1?z(i!Ny=yusPTgYz?*r+k+j!&R|!tJJ=KK4fX~5g9E|A;81WlI1(HUjs?eq6T!*g zRB$>t6Pyjs1?Phc!NuTGa5=aVTn(-T*Ml3u&EQsWJGc|v4ekZ^g9pLG;8E~6coIAf zo(0c?7s1QmRq#4^6TA)H1@D6o!N=fJ@HzMrd=0(@--92)&)`?^JNOfX(P4Es9bQM! z5p^UTSx3=Pbu=Aa$Ivl#EFGd_>o_{Dj;G`61UjKkq!a5T+R&!9w9vM8wA8Nlw9>u~ zwAP_IsZOSo>l8YrPNh@pG&-$Lr_<{UI-|~{GwUomtInpg>l`|#&ZTqfJUXw=r}OIq zx}Ywk3+p1fs4k|9>k_)8E~QKBGPl(VIuBB`1I=Zf| zr|atmx}k2Q8|x;zscxp5>lV7DZlzo6HoC2Dr`ziex})x-JL@jGtL~<|>mItN?xlO{ zKDw{&r~B&xdY~Sp2kRkvs2-+=>k)dS9;HX?F?y^Xr^o9FdZM1BC+jJCs-C8&>lu2c zo~38&IeM<1r|0VhdZAvV7waW@sa~d+>lJ#XUZq#-HF~XHr`PKZdZXT?H|s5WtKO!! z>m7Qh-lcczJ$kR+r}ygv`k+3f59=fPs6M8T>l6Bl^x}zNK&LJNmA^r|;_r`k{WLAL}RjseY!P>lgZ^ex+aQH~Ou9r{C)j`lJ4& zKkG00tNy0H>mT~3{-uBGKRQfk*wApH;X@;YMhuM<8aXsdXw=YXq0vKQgvJbw6&e!S zv3~PZ|76GwYo7X_$A2>KM6dSO{yF{MnRnO!K1cqX99FS!)_;2Hzti8O?!Wan-}7(% zzwX2Po1gi&qW_P7vyuNEW75aM5R8rgWIz8~|11*xx8h-ZOn?b75hlhYXrPG}3bfHd zi7tAm(8mBZhT?xatdh|u#}t?nQ(-)B{|z(z=Uj&Vzf{VqJ8LMDb ztcKOG2G+z{SR3nLU95-ou>m&3M%WmeU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot z?1tU32lm8X*cY> zoQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8 z+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H z{D$B02mZug_#6M=Kf$v9R#*&&;V}Y6#J`!le=9OZ!KfGwqhkz=iLo#QV`ChQi}5f% zCcuQ42oqxxG|)r~1={G~-#FjD<)ViQeGE`zDE^K8{aeW}IsVQ5{aY#V@Ad3&{k@?5 ztu&Yx)8XHA0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFH zhRv}Bw!~K08rxu7Y=`Z!19rqt*crQESL}w}u?P0VUf3J^U|;Nq{c!*e#6dV1hu}~g zhQo0Lj>J(o8pq&R9Eam^0#3w9I2otlRGfy>aR$!BSvVW#;9Q)C^Kk(##6`Fmm*7%d zhRbmUuEbTi8rR@jT!-s%18&4kxEZ(LR@{c$aR=_iUAPT6{+Tmj!~HvM!i2+r zme2iL|G!tq$KriMFgC`)xEK%PV**Twi7+uHK?6;+P@s(tN_5dfg+2zTF%*+xGE9ys zFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+%#Q`IAQr;HSOkk=F)WTH zuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-ztd9+_AvVIs*aVwmGi;76 zuqC#_*4PHyVmoY)9k3&I!p_(QyJ9!&jyZzFARfZQcm$8)F+7eZ z@FbqX(|88Y;yFBz7w{rp!pnFCui`bljyLco-oo2>2k+uNypIp?AwI&#_ynKgGklIO z@Fl*&*Z2nC;yZkgAMhi7!q4~xzv4Iijz91x{=(n*2mkw_KWupBKZeJDx0(I(QbhVl z7#X8rRE&nvF$TuOSQvt_F%HJXco-iOU_wlUi7^QpXrhGzZFEqgiykWUF+h!>m=u#? 
za!i3KF%_o9G?*6CVS3Df88H)P#w?f>vtf43fjKc3=Egjj7xQ6$EPw^E5EjNFSQLw4 zaV&u)u@siZGFTSNVR@{86|oXl#wz&l{-=LFpVjEAV-2i{wXinU!Ma!v>th3Kh>fr@ zHo>OY44Y#MY>BO~HMYUF*bduc2keNQurqeSuGkH`V-M_!y|6d-!M@lJ`{Mu{h=Xu2 z4#A-~42R*ZsI1b0-1e}PIa57H8sW=U%;|!dMvv4-f!MQjO=i>rgh>LJB zF2SX^442~yT#2i2HLk(6xDMCj2Hc37a5HYft+)-h;||=3yKpz|!M(T-_u~OPh==en z9>Jq{43FapJc+09G@ik;cn;6w1-yut@G@S(t9T8s;|;utx9~RJ!Mk`5@8bh}h>!3w zKEbE>44>l*e2K5{HNL^O_zvIW2mFYi@H2kFulNnW;}86azwkHy!TtTItfDN$`HpV8{ z6q{jlY=JGY6}HAU*cRJid+dN6u@iR2F4z^jVR!6-J+T+|#y;2=`(b|^fCF(54#puk z6o=t(9DyTo6pqF*I2Om@c$|O}aS~3(DL56U;dGpVGjSHq#yL0_=iz)@fD3UEF2*Ie z6qn(0T!AZb6|TlLxE9ypdfb2;aT9LFEw~l8;db1CJ8>88#yz+f_u+m#fCupq9>ybh z6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b z6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-#y|MaLfL;SEQZ7I7y%<< zB#ewvFe*mF=okZIVk`{7*cb=nVmyqG2{0ih!o-*a4K&e0fi^lQ(M1mx`WT?bP)v%+ zFgd2cl$Z)rV;W40=`cNJz>Js)Gh-IairFwb=D?ho3v**0%!~OjKNi4(SO^Pa5iE+u zusD{$l2{5$V;L-q<*+cgytO@Bj4Gu?E(}T38$FU|p<-^|1jq#75W{ zn_yFHhRv}Bw!~K08rxu7Y=`Z!19rqt*crQESL}w}u?P0VUf3J^U|;Nq{c!*e#6dV1 zhu}~ghQo0Lj>J(o8pq&R9Eam^0#3w9I2otlRGfy>aR$!BSvVW#;9Q)C^Kk(##6`Fm zm*7%dhRbmUuEbTi8rR@jT!-s%18&4kxEZ(LR@{c$aR=_iUAPAQF&ak47#I^{VF<>?I2ae>VSG%02{92S#w2K8}ndZ%!m2002ahTSQv|7 zQ7neVu>_XHQdkv02a#7(#vx8PRXhTCxm?!;ZV8~5N|+=u(|03O6cco>i1 zQ9Opn@dTd4Q+OKB;8{F}=kWqw#7lS?ui#a@hS%{1-o#sY8}Hy_!yty zQ+$Tc@ddubSNIy=;9Go$@9_hE#83Dczu;H=hTriA{={GS8~tTItfDN$`HpV8{6q{jl zY=JGY6}HAU*cRJid+dN6u@iR2F4z^jVR!6-J+T+|#y;2=`(b|^fCF(54#puk6o=t( z9DyTo6pqF*I2Om@c$|O}aS~3(DL56U;dGpVGjSHq#yL0_=iz)@fD3UEF2*Ie6qn(0 zT!AZb6|TlLxE9ypdfb2;aT9LFEw~l8;db1CJ8>88#yz+f_u+m#fCupq9>ybh6p!I? zJb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b6rbU9 ze1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-#y=P)3iBVsVR(#y5it@*#wZvS zqhWN6fiW=_hG1-rgK;q)#>WJh5EEfyOo9fQXrVwG9hB&zhYEcRP-7@2#blTqQ(#I= zg{d(Orp0ua9y4G@%!HXS3ueV^m>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSX zg{83!mc?>d9xGr)tb~=Z3RcBxSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9U zg{`p-w#9bX9y?%1?1Y`M3wFhB*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bG!cN8m^t zg`;r{j>T~}9w*>LoP?8c3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxq zg{yH5uElk@9yj1d+=QEP3vR`2xE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*B zg{Schp2c%`9xvcUyo8tW3SPx)cpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8 zg|G1qzQuR=9zWnm{DhzJ3x36K_#J=XPyB_y@ehWH%KXQ07#<^FM2v)yF$zY-Xc!%1 zU`&jKAs8FuU|fuc@i74=#6*}Flc0enS}4#)2PL}bp+X-6)EJ6MF&QSu6qpiIVQNf+ zX)zt9#|)SeGht@Tf>|*eX2%?u6LVp1%!7F`ALhpbSP%p5^R>vAx6Ki2@tb=v29@fVO*bp0GV{C#=u^BeU7T6M7VQXxI zZLuA;#}3#LJ7H(+f?cs2cE=vr6MJEA?1O!=ANI!qI1mTnU>t%&aTpHA5jYY@;bUuCPRAKI6KCOUoP%?59?r)FxDXfNVqAhtaTzYh6}S>t;c8ri zYjGW}#|^j-exUdJ1F6K~;dyn}b~9^S_X_z)lAV|;>7@fkkH7x)ri;cI+@ zZ}ASbyT1i(0EQZCg1eU~7SQ^Vw}aN>~}IU{$P!)v*TF#9CMz>tJ21hxM@mHpE8Q7@J^IY=+IT1-8Ui*c#hlTWp8z zu>*F*PS_c{U{~yh-LVJu#9r7N`(R(}hy8H?4#Yt?7>D3c9EQVj1dhZ}I2y;`SR9Ar zaRN@nNjMp&;8dK3({TpQ#925S=ipqNhx2g(F2qH+7?_uyXKhx_pW9>ha<7?0plJch^d1fIlGcpA^(Sv-g5 z@d94NOL!Tt;8nba*YO74#9Me9@8Dg$hxhRTKEy}(7@y!%e1^~Q1-`^r_!{5fTYQJ_ z@dJLuPxu+X;8*;H-|+|j#9#Ou|KNW?y{VlK>$c`z^L!~9qP3t}NGj76|07Q^CL0!v~kERAKbESAIa zSOF_yC9I59uqsx=>R1D7VlAwVb+9hh!}{0&8)74Dj7_j9HpAxF0$XA$Y>jQOEw;n< z*a16YC+v(}uq$@M?$`r+VlV8CeXuX~!~Qq`2jUa4Js2={N&t;w+qvb8s%s!}+)X7vdsZj7xASF2m)x0$1WHT#ajREw01$ zxB)lfCftl$a4T-Z?YIMX;x62cdvGuA!~J*w58@#_j7RV&9>e2!0#D*8JdJ1YES|&j zcmXfsCA^GR@G4%z>v#ii;w`+5cknLW!~6IEAL1i?j8E_>KEvnu0$<`Qe2s7LExyC| z_yIrSC;W_G@GE}9@Aw0M;xGJ-fAGJ6{;)Ba{}>)4U_^|BkueHJ#b_8EV_-~-g&`Ol z<6vBjhw(81Cd5RT7?YrZCR!-aMh7Lj=%GR%1JoFbNii8F#}t?nQ(|SQBeuZLEWJu^!gP2G|fAVPkB9O|cm^#}?QUTVZQ#gKe=Lw#N?G z5j$aL?1Ejf8+OMY*b{qUZ|sA8u^;xw0XPr`;b0tsLva`m#}POZN8xB3gJW?Vj>ic& 
z5hvkfoPtwv8cxR%z-&E7v{!1m>2V5ek_0mu@Dxmq=6{}%&tbsML7S_f(SQqPIeQbaYu@N@LCfF34VRLMOEwL50#x~d%+hKd`fE}?D zcE&E)6}w?~?14S87xu^N zPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW*Wr5HfE#fW zZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p&*6EzfEV!+ zUdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(=#y9vD-{E`wfFJP_ ze#S5O6~Ezk{DD957yiaS7$z3~|6@1|j}b5;M#9J#1*2j#jE*rdCdR@LjE!+HF2=+7 zm;e)EB20`)&_EL{6lkM^5?%CAp^pJ-48^3F43lFDOo^#5HKxI|m=4op2F!?=Ff(Ss zte6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4EQzJCG?u}#SPsi$1+0jburgM` zs#p!HV-2i{wXinU!Ma!v>th3Kh>fr@Ho>OY44Y#MY>BO~HMYUF*bduc2keNQurqeS zuGkH`V-M_!y|6d-!M@lJ`{Mu{h=Xu24#A-~42R*ZsI1b0-1e}PIa57H8 zsW=U%;|!dMvv4-f!MQjO=i>rgh>LJBF2SX^442~yT#2i2HLk(6xDMCj2Hc37a5HYf zt+)-h;||=3yKpz|!M(T-_u~OPh==en9>Jq{43FapJc+09G@ik;cn;6w1-yut@G@S( zt9T8s;|;utx9~RJ!Mk`5@8bh}h>!3wKEbE>44>l*e2K5{HNL^O_zvIW2mFYi@H2kF zulNnW;}86azwkHy!7w4re+-A=F#<-!NEjKTU{s8T(J= z%z-&E7v{!1m>2V5ek_0mu@Dxmq=6{}%& ztbsML7S_f(SQqPIeQbaYu@N@LCfF34VRLMOEwL50#x~d%+hKd`fE}?DcE&E)6}w?~ z?14S87xu^NPR1!X6{q2J zoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb z+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p&*6EzfEV!+UdAhU6|doS zyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk z{DD957yiaS7$!FJAH!jIjDQg_5=O=-7!{*obc}&9F&2hkY>b0(F&@Up1eg#LVPZ^z z2AXK0KpP#D=%R-TeGE`zC?>^Zm>g4JN=${RF%720beJA9U`EV@nK27y#cY@zb6`%) zg}E^g=EZ!N9}8eXEQE!z2o}X+SR6}WNi2n>u?&{Qa#$WKU`4Eim9Yv|#cEg`YhX>R zg|)E`*2Q{Q9~)ppY=n)m2{y%M*c@A6OKgR$u?@DxcGw;}U`OnPov{mc#ctRgdtguO zg}t#4_QihK9|zz-9E5{$2oA+zI2=ddNF0TuaSV>daX20);6$8+lW_`8#c4PlXW&en zg|l%E&c%5+9~a<4T!f2p2`Lk zg}ZSN?!|q$9}nO`JcNhw2p+{_cpOjQNj!z8@eH2Db9f#v;6=QIm+=Z-#cOySZ{SV5 zg}3nz-o<-(A0OaDe1wnj2|mSV_#9v0OMHc|@eRJkclaJZ;79y~pYaQR#c%i>f8bC2 zg}?C+hKa-c$8Z=PBVa^~gpn}{M#X3t9b;fjjD;Z>8{=SHjEC_t0Vc#mm>83wfhJlg z&_)L(y6B-o9|P1Fib*jUCdU+*5>sJnOoM4L9j3<&m=QB!X3T_y7RM4;5=&ueEQ4jS9G1rlSP?5>Wvqf#u^Lv#8dwu+VQs8~ zb+I1S#|GFC8)0K?f=#g*Hpdp&5?f(wY=dpF9k#~~*bzHnXY7Jqu^V>B9@rCmVQ=h% zeX$?*#{oDH2jO5GfxDhwuX54~XaT{*O9k>&B;cnc6 zdvPD`#{+l}58+`vf=BTf9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKw zckv$H#|QWjAK_zsf=}@oKF1gM5?|qKe1mWC9lpm8_z^$hXZ(U+@f&`}ANUi0;cxtd zVd66XF&u`+2pAC~VPuSgQ85}u#~2tBV_^u!#yA)k<6(SEfC(`XCdMRapota=w9!F{ zE_$fY#{e~kVp2?o$uR|{#8j9X(_mUmhv_i`X2eXG8M9zk%!b)92j;|Fm>ct8Ud)I2 zu>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG2G+z{SR3nLU95-o zu>m&3M%WmeU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cY>oQBhJ2F}D;I2-5ST%3pV zaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK z@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H{D$B02mZug_#6LVn0U;8 z42R({0!GA07#X8rRE&nvF$TuOSQvt_F%HJXco-iOU_wlUi7^QpXrhGzZFEqgiykWU zF+h!>m=u#?a!i3KF%_o9G?*6CVS3Df88H)P#w?f>vtf43fjKc3=Egjj7xQ6$EPw^E z5EjNFSQLw4aV&u)u@siZGFTSNVR@{86|oXl#wu79t6_Dlfiv=P-xoa4yf~Jf6q%c>(8h0T*%+7xO|c;YGZdOSz28c?mD& zWxSkM@Je3At9cEt<#oKCH}FQ@#1*`mw{RtIe>tpX5_~n$PfAuH|!lp6j@tFYraa#FzOBU*!gF7l2`#&Sb%*XsJz=ABq!Ysm~EXLw2!ICV+(k#QWEXVS! 
zz>2KI%B;ewtj6lB!J4ea+N{IFSeJ*h9*@$woYijoE}v*^JHEf-TvKt=Wcc z*^cemfgRb2N3%1#uq(TGdYP_Je89LDp&gPjsi)V8V zb9fHt@?6g2c|4yNa6T7sAs2BmFXR$l#EZF<%eb7E@KRpJ%XtN_RoKNscKEs%KF8;|j_dgXU*t=CnXm9wZs104;%2_aEqtAC@J+tOt$dsBa2wy{d)&?)+{sI<=2mj<> z49dIzGh)nq%+CTW$U-d4A}q>cEY1=v$x)&h=Ws618_SMW++#jAM@ujO^To;UDD-ozEWnYVByZ{=-V#oKuY z@8n&)oA>Zu-pBj7nh)?nKE#Ll2p{DdKE}uS1fS$ne45YjS+3=Ce4gvLo-go4zQmXL z3SZ?0ZsaCz=4;%-*ZBtDh?cBkg+{NA8!@Yc;AMiu&<9>d`kNF8d z~h3EWm;+#KJ7X zqAbSZEWwg2#nLRpvMk5)tiXz_#LBF~s;tK9tihVB#oDaH!&sMxvmTFNeKuf29?3>L zijCQXP1%gi*@7+Eimlm(ZP||P*?}F|iAS?DyRa*}u{(RPCws9sk6|D7Wk2@k01o6J z4(72O!l4|-3=ZdU9Kqvx0#D>g9LbY8il=Zi$8apiaXcq*A~QLOSv-}KIfYX>jnjD= zvpIt^Ig6+B49@16Jd0;@4s&=8=ki?6<9R%v7jQloa3L3QF)!p2Uc`&Jl*_oBm+(?v z#>;sHujEy{n%D4JUdQWs18?L_T)~@p3s>@1-o{nDopOyq-7k zM&85~yqULfC2!?zT*cdY2k+!vyqov%Uf#$1xtb5~K|aKX`3N878a~Fy`2?TjQ+%4w z@L8_qb9|obxSlWYMZUzB`3hg<25#ggZsu#;!q@o*-{f1|%D4FrxA9%R$L-v~o!rIU z+{3+mpC9l;?&E%b#Ea4++)5NgT{0cOzQwJ4o9}QN-{pJU&K=yzUEIw*+{^d*0YBtE?&nARn4j=d ze#X!F1;6B1{F>kJTYksy`2&CCPyCs`@K^rE-}wjs_e5-iD5EX^`3%W^Ew3arRVtjsE`%4)368m!4$tj#(+jCFZ9>+uNIX9G6mk!-}H z*qBY&l+Dv81`Xb_G5nz;6M)I zU>?gM9LizL;BX$t5j>tJ@I;=(kvy5BcnU{z499XD$8!QFGLw^-#Zx(%Q#h5=IGv|4 zn=?3*vv@ks;B21Bvv@Y=Fo)-GF3;sWp2zcf0q1i87jh97^Fl7+MZB0xxs1zs2`}Yk zyqs6?N?ygQc@3}Sb-bQ8@J8Om6}*|Za3yc$ZCu6Mc?a+0UA&w3@Lt}>`?;DA@IgMr zhxrH}$sjT@I}7Fm-z}`$NVh7f-Jau{V!lANFNG_U8Z&v;oj zbr)RlJ>d@J`;vyLk`q<$b)LtN8#Q-hp-+ukclF;6`rZX1>NPe4TIbO}@pge4FoZ8{g%7+|C``$z9yd zJ>1Lp`2j!VKJMp7{FtBcQ+~$J`31k^SNxja@LPVz@A(6N}mvkSYj8@say zd$JdM^BDGFU-n~v4&Xoz;$R-jAsotK%;0by#}Pc9C-6j`#F0Fiqj(BOa}39F9LIA4 zCo+?hn8i~$nNv8G(>R@{F`F|ule2g_&){sH$+LJi=P-xoa4yf~Jf6q%c>(8h0T*%+ z7xO|c;YGZdOSz28c?mD&WxSkM@Je3At9cEt<#oKCH}FQ@#1*`mw{RtIe>tpX5_~n$PfAuH|!lp6j@tFYraa#FzOB zU*!gF7l0`#&Sb%*XsJz=ABq!Ysm~ zEXLw2!ICV+(k#QWEXVS!z>2KI%B;ewtj6lB!J4ea+N{IFSeJ*h9*@$woYi zjoE}v*^JHEf-TvKt=Wcc*^cemfgRb2N3%1#uq(TGdYP_Je89LDp&gPjsi)V8Vb9fHt@?6g2c|4yNa6T7sAs2BmFXR$l#EZF<%eb7E@KRpJ z%XtN_RoKNscKEs%KF8;|j_dgXU*t=CnXm9wZs104;%2_aEqtAC@J+tO zt$dsBa2wy{d)&?)+{sI<=2mj<>463{TGh)nq%+CTW$U-d4A}q>cEY1=v$x)&h z=Ws618_SMW++#jAM@ujO^To;UDD z-ozEWnYVByZ{=-V#oKuY@8n&)oA>Zu-pBj7nh)?nKE#Ll2p{DdKE}uS1fS$ne45Yj zS+3=Ce4gvLo-go4zQmXL3SZ?0ZsaCz=4;%-*ZBtDh?cBkg+{NA8 z!@Yc;AMiu&<9>d`kNF8d~h3EWm;+#KJ7XqAbSZEWwg2#nLRpvMk5)tiXz_#LBF~s;tK9tihVB#oDaH z!&sMxvmTFNeKuf29?3>LijCQXP1%gi*@7+Eimlm(ZP||P*?}F|iAS?DyRa*}u{(RP zCws9sk6|D7Wk2@k01o6J4(72O!l4|-3=ZdU9Kqvx0#D>g9LbY8il=Zi$8apiaXcq* zA~QLOSv-}KIfYX>jnjD=vpIt^Ig6+B49@16Jd0;@4s&=8=ki?6<9R%v7jQloa3L3Q zF)!p2Uc`&Jl*_oBm+(?v#>;sHujEy{n%D4JUdQWs18?L_T)~@p3s>@1-o{nDopOyq-7kM&85~yqULfC2!?zT*cdY2k+!vyqov%Uf#$1xtb5~K|aKX z`3N878a~Fy`2?TjQ+%4w@L8_qb9|obxSlWYMZUzB`3hg<25#ggZsu#;!q@o*-{f1| z%D4FrxA9%R$L-v~o!rIU+{3+mpC9l;?&E%b#Ea4++)5 zNgT{0c zOzQwJ4o9}QN-{pJU&K=yzUEIw* z+{^d*0YBtE?&nARn4j=de#X!F1;6B1{F>kJTYksy`2&CCPyCs`@K^rE-}wjsh=X}7hj1u|F@wW-97phYp1>1%5=Zi6j^ZgC%`qIyaU9PH zoXAX0Vir&3WKQ8!PUCc*#%#{uOwQuzJcF}&CePy8oWmTR!?`?{^LQT5=LMY41zgBQ zT+9o(gctE*F6A;V=Ow(9m+^95!7F(cujVzpme=um-oP7q6IbwN-oll zFXt7!l2`F+Uc+m79k1sNypcC?1#jjpT*+H`8&~mm-oZP07w_ghyqEX!ey-*Ne2@?E zVLrk~xrUGNaX!H(`4pe#Gklh7`5d3;IER$*0EV|CVGP1a&<*5P5S%fnfZN3cE{upy6RBOb-ZY{I5&#^!9nmTbk=Y{Rx} z$M)>Nj_kyv*_mC~mEG8#J=l}I*qg_&5Bsto`*Q#Xau5geSPtP(4r2y~^Ei&+@jQVi z@+6Mr$sEN~IGSTPmg6{{6F8BXoWv}i%E_Ft@*zIVNBAh$@G(BlC-@|v;?sPF z&vGrFIjqmb3Zs!i}! 
z9`5D){D2>FANTVke#}q!DL>=q{DNQdD}K#y_$|NV_xyoB@+bbxU-&D306NF=jsIX8{&uAr@v47G*IOX9<>MDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio z9>%&nob`AF>$3qH@<=w~QEbd6Y|3VA&K7LRR&32SY|D0R&kpR!PCS~O*@a!%josOU zJ=u%Bc?|ooFZ;1S2XG(%qg78X`If}n9UiS$yq#|XK*&p&!6 zuW|!7auYZ6HE!YSe1mWDEpFx8e23fkF5lyJ?%+=D;%@HYUcS!{_#yXkKR@Ef{DhzK zGk(r5_$9yM*ZhXx@;iRdANV7G;?Mkrzw$T!&Oi7k|6*{s`#&Sb%*XsJz=ABq!Ysm~ zEXLw2!ICV+(k#QWEXVS!z>2KI%B;ewtj6lB!J4ea+N{IFSeJ*h9*@$woYi zjoE}v*^JHEf-TvKt=Wcc*^cemfgRb2N3%1#uq(TGdYP_Je89LDp&gPjsi)V8Vb9fHt@?6g2c|4yNa6T7sAs2BmFXR$l#EZF<%eb7E@KRpJ z%XtN_RoKNscKEs%KF8;|j_dgXU*t=CnXm9wZs104;%2_aEqtAC@J+tO zt$dsBa2wy{d)&?)+{sI<=2mj<>4C=Z6Gh)nq%+CTW$U-d4A}q>cEY1=v$x)&h z=Ws618_SMW++#jAM@ujO^To;UDD z-ozEWnYVByZ{=-V#oKuY@8n&)oA>Zu-pBj7nh)?nKE#Ll2p{DdKE}uS1fS$ne45Yj zS+3=Ce4gvLo-go4zQmXL3SZ?0ZsaCz=4;%-*ZBtDh?cBkg+{NA8 z!@Yc;AMiu&<9>d`kNF8d@ju$_C|v@IR1g^-^6_KWGp%42}#M1xE#qgC;@Kpjps7Xc4pwS_Q3xHbL8< zUC=)05OfSW1xE*+gDyeWpj*&A=n?b`dIi0MV}d?G-=JU6KNt`U3*gJXjs!O&n> zkP!?IjtfQv#|I|_Ck7`4BZHHJQNby}=wM7RHW(L-4<-Z?gUn!3kQJO7Ob(_5Q-f*2 z^x(80JD3s73}ywV2WJGcgEND(g0q7;K~8W^FgG|im=~NEoF7~e%nud>3xh?$;^4wy zNpMkcaj-O47Ay}g2`&vT3oZ|?2(ApS3a$>W39b#U3$71t2yP5+3RVO+2e$+(gIj~! zf>puo!5zV!!Ck@K!9Bsf!F|E~!Rp|F;KAUb;Njqr;L%`B@L2G8@I>%r@Ko@0@J#S* zur_!ucs^JctPfrYUJPCeUJhOfUJW(`8-q>3=HRtpOYnN|M(}3vR%J_!%ZC-hieaU&a#$s-8deLdhc&{QVXd%sSSLIztQ#I4)(eja>xT`(hT)N6 zqwuJ(ao8km8a4}?hb_XEVXLrp*d}ZnwhP;b9m0-br|{^obJ!*98g>i2hdsicVXv@v zcud$Q>>KtA`-cOaAbILI4V3P9374c z$A;s=@!^DUVwf3D3bVpf!^z>4aB4U$oF1MQW`{Gvnc=MP^ze*uc6erZR(N(eC(H@Y z3Fn6AhV#Pn!t=um!ujEXaACM8TpV5)E(tFRFAkT6%fjX1CE=yvW#Q%F72%cPRpHg) zHQ}}4b>a2l4dIR9P2r00=J1wqWq50NTevE`J-j2lGrTLjJG>{nH@q*rKU^I?5Iz__ z6h0h25aAUYB z+#J3ZZV6ux-w59f-wL;eZ-?)M+roFl_rmSrj&NtVE8HFK3HOHYhaZF=hWo<(;YZ=e z;V0pz;b-CJ;TPeT;aB0;;Wy#8;dkNp;Sb@D;ZNbu;Ve7iONRhqViFNsA5zpsvK2`sz%kK>QRlT zW>hPx9o2~ri|R&)NA;p3qWV#Ts9|(u)F?VCY8*9*nnul{=245NWz;Ha9kq$tM(v{Z zQHQ8w)G0bT>Kt{6x<=ii?op4ZXVfd|9UT+(iTXzUqW;l61X zMcL7eXl67kIz2ignjM`PofVxO&53fNbE3J?xzW7nyy*Psf@prUAX*qLiWWx~MoXfL zqKl)Y(XwcHbV+n+bXjzHbVYP!bX9b9bWL<^bX|0PbVGDwbW^k?v3t??vGYS4@3_}4@D10k3^3~Yof=Z$D=2rC!?pLr=w@0XQQ>z zbJ6qBx@dj$LiA$vQuK24O7v>9A=(&iiZ(~DMO&iRqc@^Aqqm~1(c95G(YEN_=)Gur zv?JOX?TU6sd!oJ3`_Tu{hta-hfAmrGar8;_Y4ln2dGtl}W%O0_b@Wa2ZS-CAee^^0 zWAsz>bM#B}YxG<6d-O;2XY^O<(jUfA9LM?M{BeP}U|c9J92bd;#>L{|af!HOTq-Ud zmx;^9<>K;jg}7o|DXtt>iL1ud;_7jYxMo}{t{vBj4~y%@hsX8eBjWmTgScUQWZWn| zDsCJ%iJQjF;^uLSxMkcbZXLIY+s5tU_Hl=}W85h|I_?~IiMz(#;_h*exM$oe?j0W! 
z_lf(){o?-dfOud$C>|Ui8xM(x#>3)_czAqVJR&|mJ|R9aJ}DjzD#}C90#t+30 z$B)F1#%too;>Y7B;wR&$;-}+h;%DQv@pJL>@w#|@{6hR<{8IdK{7U?4ydmBgZ;Cg^ zuf<#9*W)+hH{-YBt?}FOJMp&o-T1wDd%Ppw8SjdB$9v+v@%!-y@rUugcz^s+JabIe zQOV6ct8wZ*vsz~Loa{ZRt@@c|4|y2ao8;v_3}n_?JgiI2+-IBI=YibqNLI{z%+CT$ zKSZRq3rQDd5f)`J7H0{TWGR+r8J1-^mS+W4WF=N+6;@?6R%Z?7oq8?l+Wg1YK2uLP zxvzgL2jx8$rk;FqUk%uhN3s!*Vq-R8Q#NCBwqQ%PVr#ZxTef3+c3?+#;?eBPF6_!~ z?9LwS$zJTuW7vm%*^m7>fCD**gLy26a43f{gTr|oNAP%_z!P~ANAhHj;wc=>F&xWr z9M1`y$V^UR7Ek45PT^Ee<8+?JY|h|J&f@7jgR^-i&*Isf!yKN&xjdKicplH^1)R?X zT*yUS%nP}M7x7{)K<~6*Q*YSGZz#DlJSMX-u!j-(0w{aD3 z=N-J0ckyoC!+Uuj@8@bhzz6vdALb)`lxz4HALkQ%l27qzKEr3Zme28duH$;Xz!&)v zU*;=(l^eK`o4A>;aSLDP8+?;*aVy{EJKV;1`5w1(2X}H8cXJQ-@_l~554n%~`4K6?z#sV&f95azmA~V)A&FypjjB zGdYP_Je89LDp&gPjsi)V8Vb9fHt@?6g2c|4yNa6T7sAs2Bm zFXR$l#EZF<%eb7E@KRpJ%XtN_RoKNscKEs%KF8;|j_dgXU*t=CnXm9w zZs104;%2_aEqtAC@J+tOt$dsBa2wy{d)&?)+{sI<=2mj<>O#K=l{R$Z|W_s9vjH3ONH*e8 zY|JKX%4TfN7Hr8@Y|S=o%XVzf4(!NIJer-^g{UDV)k_oX*pj z%^94@Sv;L*a5m56Sv;F_n8R~8m*;XG&*S;Lfb+S43%Q7kc_EkZB3{g;T*l?RgqQL% zUd}6cC9mSuyoT5EI$qBkcq4D(3f|0HxRST>Hm>6Byn}c0F5b<1crWkc{anol_#hwR z!+eB~at$Bj<9vco@+m&eXZS4F@;N@wbzILE_#$88%Y22easxMV6F2iUZsF^EgKzRJ zZsps2huior-{W@f;7;!1Ztme;zRwT%A@^}VKjO#ygrD*=e$FrWCBNd={D$B1JATg} z_#=Pf&-{hI@;Cm@KlmsAV(Lo(x%YoYjG2%5S%3vuh=o~%MOlo+S%M{5ilteGWm%5p zS%DQYHD%YN+70UXFd9L!@mghM%u863{zID*IX z1fIx~IFcuG6i?x3j^S92<9JTsL}qdlvv?{ea|)+&8mIF#W^)E-au!eL8Jx{Cc^1#+ z9Om#G&gHqB$MbkTFW`JG;6g6qVqVB4yoeWbDVK3MFX5%UjF{9^`#JFIhpz$xhY}{^jd=$#O#K zH}d@o&d(gXIHO{M@^eLe(bc# z*~x0f*y*FEjvGCr@z`nE;~S@|7wIv2=ICC@jzzMQb&S+^50hn#oa|%~qkd-lk=e;A zM&|IWWbrApV^*@Zn5vU3Y1AK?ovdioPklEbbu!7K#y?LcS>Q-cCY+y{oJ_LLF(UOp z=sNnq!6$1TsRp{_WGAa0^)tKm3o^U+3p0E4i+o8kSp^wCGCNrW$?N`PAtWa|SqAx+ z?oU=lk`Im2-5-U)zw~^vV)D;}O;%6rmmF-6dYG!8)jXA%RUq~DcXx-Wf{X&A#*QAF zIX*i|C33RkBrD(4ag(NG=Syw&3unw4GkRop{?vAE=M+e7{qyq*CV78%PN7utpPf@U zl{Y#kyGV-6{yEu2lRYvAW+lCzIcQ{du~a%IyLdWxNaoOfsSoZKroJCqBB`H|mGsiA z(X(dE9FvOS;z=Z=27lCq$s)saFVAlBDNX)F77k4dF zxjEUb(mVZ^xNDshj?8Y8u9J?tw&~47d#GKKmmE`W54BH{{~mW8Qbn0PQ+?hs**G|? zI_0&|FMm3$28*HtVf8GSBEeBM7Is!QH6@B>C^> z(JfV!x-fK4k&IW5q-JWol733Yt7pob>|W{I+zUhQaP>~gGqRF%&ATvUF3uZ{W70eI zN$>PuhNEv%I5N9mx=#8u`lmM!9gYD>-rvJ9FiHOVX$(pgWiCq%$KYh+;NdtnuZ?6l zW-ZP*FdVs^{B>u&-6hvT{CRF}`oEQzI#4e?97B?iO`pfmnUg&(-Oi|y+2hk22;*{3DDXXZAU9A467b}Dat=4t7_*O-xPkIbH#-tpjYot5OIIwjRD z>2R%|8-{uB|2-VjJ@@xtcAS22JE@!7fp+q)BB}E9%adyAk3Di870PwVwGykb7VEGs>#;r? zvQd}xb?~y}@Y20pNfH`?~@p~ zHf2usb?MyP5Qx*y64xi?8Cglsbm&9o46w7j@16CNVzlJbpFf&IoWrmHw$I1$}BwaA478Y-yc>nBO1^)dGt>` z_}`Px&X*B(95`@b-@j6?^jp6mr+!8>bw=teQ~zczm^mi7vea5VW%Ky0W&S4*{G@ho zs`1S9f!*i8R@TpmyZ0QHeSa!Fpj3%|L2hz&DmimV_5(@%|2XmolYLW%k?N*{kNlx@ z_W#QxU%O;f<)7lR!T;caBY!y6c&ck3Nj@Vt%pXl}9vTR1(uZ*1Is36xGB*zXzPdf0 z%FAuwiGOL}$wM1>DzAZqE{{*A@=}+_XHq0nw*BvO&yF9MXEl*DLz~%9`ys7d+DmN$l#q>`9WvaZC6pqY(IbA0`RbEMN z9@;~%CV8o`%a?m0*pMU-o+|0>?&4|| zYLWfUL8+G6+YU;#%6>PMN_BjyFW*aTiS_O2%|l0WN0OIxZ0<$_5Uxv{?c zUmDnRXajrm8aOD{-#^g62mjK*hle(>FRy{ra#Y@5$L>GSz(@bmz{iI+@JU_+-T!Xj z(^Ot+wtbc&nQfmZd!%Mt66MKk`yxr_WPh2?K6ti$l~l;cN}?|{+rBDfP~^7H2I zPpR$H(pc*KzuxA4NeWZzW9d5Sx%*ps^Ux9hJ;}?x?j`~EN0R(+b2m5t?-~4Os&eiO z{wv+`f6m|_^*ox2x-dokL!NV@B&UbJ-{}(ejzjmVJ`7PV! 
zNhSX|Ws9cravLc2FAWqww1E%(V>^o$kJzZX8a8j#C_Ti}tsWr)Z>Dou6YvE`&OTYZ7UQ3p5^8O-1!2kdE<4x`dHa63mvKgDR1zWNeTeA(@ zvK`yA13R))?vGWil8#A_SnH%m{_zaoCa;%r@4;=8AzV2>m(HR@(}&m@^%Tj?`%{FJWQy;A9{Fk3qup^QR8Cl85q%M%hA1Keez@Cs>p7dkh z1v0h5EYwa+<>q9cl-}vTgxbiYaAfw$={o8D8I|5Vw0}-X^8OCB(MdAhK?mZkTlSb# z=HH<fHep&)o{^RGUEY;t)~v?=567C4*8l(j literal 0 HcmV?d00001 diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py index bfa8ff6d30a9c..25fd86d899c08 100644 --- a/pandas/io/tests/generate_legacy_storage_files.py +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -80,6 +80,7 @@ def create_data(): [u'one', u'two', u'one', u'two', u'one', u'two', u'one', u'two']])), names=[u'first', u'second'])) + series = dict(float=Series(data[u'A']), int=Series(data[u'B']), mixed=Series(data[u'E']), @@ -135,6 +136,10 @@ def create_data(): items=[u'A', u'B', u'A']), mixed_dup=mixed_dup_panel) + cat = dict(int8=Categorical(list('abcdefg')), + int16=Categorical(np.arange(1000)), + int32=Categorical(np.arange(10000))) + return dict(series=series, frame=frame, panel=panel, @@ -143,7 +148,8 @@ def create_data(): mi=mi, sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), - sp_frame=dict(float=_create_sp_frame())) + sp_frame=dict(float=_create_sp_frame()), + cat=cat) def create_pickle_data(): diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index c12d6e02e3a2e..e337ad4dcfed2 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -109,8 +109,12 @@ def compare_series_dt_tz(self, result, expected, typ, version): tm.assert_series_equal(result, expected) def compare_series_cat(self, result, expected, typ, version): - # Categorical.ordered is changed in < 0.16.0 - if LooseVersion(version) < '0.16.0': + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_series_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': tm.assert_series_equal(result, expected, check_categorical=False) else: tm.assert_series_equal(result, expected) @@ -125,8 +129,12 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): tm.assert_frame_equal(result, expected) def compare_frame_cat_onecol(self, result, expected, typ, version): - # Categorical.ordered is changed in < 0.16.0 - if LooseVersion(version) < '0.16.0': + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_frame_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': tm.assert_frame_equal(result, expected, check_categorical=False) else: tm.assert_frame_equal(result, expected) From d38ee272f3060cb884f21f9f7d212efc5f7656a8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 5 Jul 2016 18:14:36 -0500 Subject: [PATCH 072/359] CLN: Check Warnings in test_graphics_others (#13188) closes #13185 Also a MatplotlibDeprecationWarning for use of .get_axes() vs. .axes in some tests. 
--- pandas/tests/test_graphics.py | 128 ++++++++++++------- pandas/tests/test_graphics_others.py | 183 +++++++++++++++++---------- pandas/tools/plotting.py | 6 +- 3 files changed, 200 insertions(+), 117 deletions(-) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index b09185c19bffb..bd19a83ce2b64 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -450,8 +450,9 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None, self.assertIsInstance(value.lines, dict) elif return_type == 'dict': line = value['medians'][0] + axes = line.axes if self.mpl_ge_1_5_0 else line.get_axes() if check_ax_title: - self.assertEqual(line.get_axes().get_title(), key) + self.assertEqual(axes.get_title(), key) else: raise AssertionError @@ -820,10 +821,13 @@ def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) _check_plot_works(self.ts.hist, figsize=(8, 10)) - _check_plot_works(self.ts.hist, filterwarnings='ignore', - by=self.ts.index.month) - _check_plot_works(self.ts.hist, filterwarnings='ignore', - by=self.ts.index.month, bins=5) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, + by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, + by=self.ts.index.month, bins=5) fig, ax = self.plt.subplots(1, 1) _check_plot_works(self.ts.hist, ax=ax) @@ -857,32 +861,40 @@ def test_hist_layout(self): def test_hist_layout_with_by(self): df = self.hist_df - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.gender, layout=(2, 1)) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.gender, layout=(2, 1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.gender, layout=(3, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.gender, layout=(3, -1)) self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(4, 1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(2, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(2, -1)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(3, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(3, -1)) self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(-1, 4)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, + by=df.category, layout=(-1, 4)) self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.classroom, layout=(2, 2)) + with tm.assert_produces_warning(UserWarning): + axes = 
_check_plot_works(df.height.hist, + by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) @@ -899,7 +911,7 @@ def test_hist_no_overlap(self): subplot(122) y.hist() fig = gcf() - axes = fig.get_axes() + axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() self.assertEqual(len(axes), 2) @slow @@ -1300,17 +1312,21 @@ def setUp(self): @slow def test_plot(self): df = self.tdf - _check_plot_works(df.plot, filterwarnings='ignore', grid=False) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True) + _check_plot_works(df.plot, grid=False) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True, layout=(-1, 2)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + subplots=True, layout=(-1, 2)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True, use_index=False) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + subplots=True, use_index=False) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({'x': [1, 2], 'y': [3, 4]}) @@ -1326,8 +1342,8 @@ def test_plot(self): _check_plot_works(df.plot, xticks=[1, 5, 10]) _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) - _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True, title='blah') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot, subplots=True, title='blah') # We have to redo it here because _check_plot_works does two plots, # once without an ax kwarg and once with an ax kwarg and the new sharex @@ -2217,7 +2233,9 @@ def test_plot_bar(self): _check_plot_works(df.plot.bar) _check_plot_works(df.plot.bar, legend=False) - _check_plot_works(df.plot.bar, filterwarnings='ignore', subplots=True) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot.bar, subplots=True) _check_plot_works(df.plot.bar, stacked=True) df = DataFrame(randn(10, 15), @@ -2433,8 +2451,10 @@ def test_boxplot_vertical(self): self._check_text_labels(ax.get_yticklabels(), labels) self.assertEqual(len(ax.lines), self.bp_n_objects * len(numeric_cols)) - axes = _check_plot_works(df.plot.box, filterwarnings='ignore', - subplots=True, vert=False, logx=True) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.box, + subplots=True, vert=False, logx=True) self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) self._check_ax_scales(axes, xaxis='log') for ax, label in zip(axes, labels): @@ -2494,8 +2514,9 @@ def test_kde_df(self): ax = df.plot(kind='kde', rot=20, fontsize=5) self._check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5) - axes = _check_plot_works(df.plot, filterwarnings='ignore', kind='kde', - subplots=True) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, kind='kde', + subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) axes = df.plot(kind='kde', logy=True, subplots=True) @@ -2522,8 +2543,9 @@ def test_hist_df(self): expected = [pprint_thing(c) for c in df.columns] self._check_legend_labels(ax, labels=expected) - axes = _check_plot_works(df.plot.hist, filterwarnings='ignore', - subplots=True, logy=True) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.hist, + subplots=True, logy=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) self._check_ax_scales(axes, yaxis='log') @@ -2902,8 +2924,9 @@ def test_line_colors_and_styles_subplots(self): # Color contains shorthand hex value results in ValueError custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] # Forced show plot - _check_plot_works(df.plot, color=custom_colors, subplots=True, - filterwarnings='ignore') + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot, color=custom_colors, subplots=True) rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) for cmap in ['jet', cm.jet]: @@ -3294,8 +3317,10 @@ def test_pie_df(self): ax = _check_plot_works(df.plot.pie, y=2) self._check_text_labels(ax.texts, df.index) - axes = _check_plot_works(df.plot.pie, filterwarnings='ignore', - subplots=True) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.pie, + subplots=True) self.assertEqual(len(axes), len(df.columns)) for ax in axes: self._check_text_labels(ax.texts, df.index) @@ -3304,9 +3329,10 @@ def test_pie_df(self): labels = ['A', 'B', 'C', 'D', 'E'] color_args = ['r', 'g', 'b', 'c', 'm'] - axes = _check_plot_works(df.plot.pie, filterwarnings='ignore', - subplots=True, labels=labels, - colors=color_args) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.pie, + subplots=True, labels=labels, + colors=color_args) self.assertEqual(len(axes), len(df.columns)) for ax in axes: @@ -3362,9 +3388,12 @@ def test_errorbar_plot(self): self._check_has_errorbars(ax, xerr=2, yerr=2) ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) self._check_has_errorbars(ax, xerr=2, yerr=2) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - yerr=df_err, xerr=df_err, subplots=True, - kind=kind) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + yerr=df_err, xerr=df_err, + subplots=True, + kind=kind) self._check_has_errorbars(axes, xerr=1, yerr=1) ax = _check_plot_works((df + 1).plot, yerr=df_err, @@ -3455,8 +3484,11 @@ def test_errorbar_timeseries(self): self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) - axes = _check_plot_works(tdf.plot, filterwarnings='ignore', - kind=kind, yerr=tdf_err, subplots=True) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(tdf.plot, + kind=kind, yerr=tdf_err, + subplots=True) self._check_has_errorbars(axes, xerr=0, yerr=1) def test_errorbar_asymmetrical(self): diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py index 7285d84865542..f9a210a492594 100644 --- a/pandas/tests/test_graphics_others.py +++ b/pandas/tests/test_graphics_others.py @@ -5,7 +5,6 @@ import itertools import os import string -import warnings from distutils.version import LooseVersion from pandas import Series, DataFrame, MultiIndex @@ -61,8 +60,11 @@ def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) _check_plot_works(self.ts.hist, figsize=(8, 10)) - _check_plot_works(self.ts.hist, by=self.ts.index.month) - _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) fig, ax = self.plt.subplots(1, 1) _check_plot_works(self.ts.hist, ax=ax) @@ -96,29 +98,42 @@ def test_hist_layout(self): def test_hist_layout_with_by(self): df = self.hist_df - axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) + # _check_plot_works adds an `ax` kwarg to the method call + # so we get a warning about an axis being cleared, even + # though we don't explicing pass one, see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, + layout=(2, 1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, + layout=(3, -1)) self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) - axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, + layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(2, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(2, -1)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(3, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(3, -1)) self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(-1, 4)) + with 
tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(-1, 4)) self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) - axes = _check_plot_works( - df.height.hist, by=df.classroom, layout=(2, 2)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) @@ -135,7 +150,7 @@ def test_hist_no_overlap(self): subplot(122) y.hist() fig = gcf() - axes = fig.get_axes() + axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() self.assertEqual(len(axes), 2) @slow @@ -203,33 +218,43 @@ def test_boxplot_legacy(self): _check_plot_works(df.boxplot, return_type='dict') _check_plot_works(df.boxplot, column=[ 'one', 'two'], return_type='dict') - _check_plot_works(df.boxplot, column=['one', 'two'], by='indic') + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, column=['one', 'two'], + by='indic') _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) - _check_plot_works(df.boxplot, by='indic') - _check_plot_works(df.boxplot, by=['indic', 'indic2']) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='indic') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by=['indic', 'indic2']) _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict') _check_plot_works(df.boxplot, notch=1, return_type='dict') - _check_plot_works(df.boxplot, by='indic', notch=1) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='indic', notch=1) df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) df['Y'] = Series(['A'] * 10) - _check_plot_works(df.boxplot, by='X') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='X') # When ax is supplied and required number of axes is 1, # passed ax should be used: fig, ax = self.plt.subplots() axes = df.boxplot('Col1', by='X', ax=ax) - self.assertIs(ax.get_axes(), axes) + ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() + self.assertIs(ax_axes, axes) fig, ax = self.plt.subplots() axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') - self.assertIs(ax.get_axes(), axes['A']) + ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() + self.assertIs(ax_axes, axes['A']) # Multiple columns with an ax argument should use same figure fig, ax = self.plt.subplots() - axes = df.boxplot(column=['Col1', 'Col2'], - by='X', ax=ax, return_type='axes') + with tm.assert_produces_warning(UserWarning): + axes = df.boxplot(column=['Col1', 'Col2'], + by='X', ax=ax, return_type='axes') self.assertIs(axes['Col1'].get_figure(), fig) # When by is None, check that all relevant lines are present in the @@ -304,11 +329,13 @@ def test_boxplot_empty_column(self): @slow def test_hist_df_legacy(self): from matplotlib.patches import Rectangle - _check_plot_works(self.hist_df.hist) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.hist_df.hist) # make sure layout is handled df = DataFrame(randn(100, 3)) - axes = _check_plot_works(df.hist, grid=False) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, grid=False) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) self.assertFalse(axes[1, 
1].get_visible()) @@ -317,17 +344,21 @@ def test_hist_df_legacy(self): # make sure layout is handled df = DataFrame(randn(100, 6)) - axes = _check_plot_works(df.hist, layout=(4, 2)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, layout=(4, 2)) self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) # make sure sharex, sharey is handled - _check_plot_works(df.hist, sharex=True, sharey=True) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, sharex=True, sharey=True) # handle figsize arg - _check_plot_works(df.hist, figsize=(8, 10)) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, figsize=(8, 10)) # check bins argument - _check_plot_works(df.hist, bins=5) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, bins=5) # make sure xlabelsize and xrot are handled ser = df[0] @@ -401,22 +432,30 @@ def test_scatter_plot_legacy(self): def scat(**kwds): return plotting.scatter_matrix(df, **kwds) - _check_plot_works(scat) - _check_plot_works(scat, marker='+') - _check_plot_works(scat, vmin=0) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, marker='+') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, vmin=0) if _ok_for_gaussian_kde('kde'): - _check_plot_works(scat, diagonal='kde') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='kde') if _ok_for_gaussian_kde('density'): - _check_plot_works(scat, diagonal='density') - _check_plot_works(scat, diagonal='hist') - _check_plot_works(scat, range_padding=.1) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='density') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='hist') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, range_padding=.1) def scat2(x, y, by=None, ax=None, figsize=None): return plotting.scatter_plot(df, x, y, by, ax, figsize=None) _check_plot_works(scat2, x=0, y=1) grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) - _check_plot_works(scat2, x=0, y=1, by=grouper) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat2, x=0, y=1, by=grouper) def test_scatter_matrix_axis(self): tm._skip_if_no_scipy() @@ -607,8 +646,7 @@ class TestDataFrameGroupByPlots(TestPlotBase): @slow def test_boxplot_legacy(self): grouped = self.hist_df.groupby(by='gender') - with warnings.catch_warnings(): - warnings.simplefilter('ignore') + with tm.assert_produces_warning(UserWarning): axes = _check_plot_works(grouped.boxplot, return_type='axes') self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2)) @@ -620,7 +658,8 @@ def test_boxplot_legacy(self): index=MultiIndex.from_tuples(tuples)) grouped = df.groupby(level=1) - axes = _check_plot_works(grouped.boxplot, return_type='axes') + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3)) axes = _check_plot_works(grouped.boxplot, subplots=False, @@ -628,7 +667,8 @@ def test_boxplot_legacy(self): self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) grouped = df.unstack(level=1).groupby(level=0, axis=1) - axes = _check_plot_works(grouped.boxplot, return_type='axes') + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') 
self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2)) axes = _check_plot_works(grouped.boxplot, subplots=False, @@ -774,18 +814,22 @@ def test_grouped_box_layout(self): self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], by=df.gender, layout=(-1, -1)) - box = _check_plot_works(df.groupby('gender').boxplot, column='height', - return_type='dict') + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('gender').boxplot, + column='height', return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - return_type='dict') + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) # GH 6769 - box = _check_plot_works(df.groupby('classroom').boxplot, - column='height', return_type='dict') + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('classroom').boxplot, + column='height', return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) # GH 5897 @@ -803,13 +847,15 @@ def test_grouped_box_layout(self): column=['height', 'weight', 'category'], return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, 2), return_type='dict') + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + layout=(3, 2), return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, -1), return_type='dict') + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + layout=(3, -1), return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) box = df.boxplot(column=['height', 'weight', 'category'], by='gender', @@ -848,8 +894,7 @@ def test_grouped_box_multiple_axes(self): axes_num=4, layout=(2, 2)) fig, axes = self.plt.subplots(2, 3) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') + with tm.assert_produces_warning(UserWarning): returned = df.boxplot(column=['height', 'weight', 'category'], by='gender', return_type='axes', ax=axes[0]) returned = np.array(list(returned.values())) @@ -858,8 +903,7 @@ def test_grouped_box_multiple_axes(self): self.assertIs(returned[0].figure, fig) # draw on second row - with warnings.catch_warnings(): - warnings.simplefilter('ignore') + with tm.assert_produces_warning(UserWarning): returned = df.groupby('classroom').boxplot( column=['height', 'weight', 'category'], return_type='axes', ax=axes[1]) @@ -871,7 +915,8 @@ def test_grouped_box_multiple_axes(self): with tm.assertRaises(ValueError): fig, axes = self.plt.subplots(2, 3) # pass different number of axes from required - axes = df.groupby('classroom').boxplot(ax=axes) + with tm.assert_produces_warning(UserWarning): + axes = df.groupby('classroom').boxplot(ax=axes) @slow def test_grouped_hist_layout(self): @@ -883,12 +928,14 @@ def test_grouped_hist_layout(self): self.assertRaises(ValueError, df.hist, column='height', by=df.category, layout=(-1, -1)) - axes = 
_check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, 1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, 1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, -1)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, -1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) axes = df.hist(column='height', by=df.category, layout=(4, 1)) @@ -904,12 +951,14 @@ def test_grouped_hist_layout(self): tm.close() # GH 6769 - axes = _check_plot_works( - df.hist, column='height', by='classroom', layout=(2, 2)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.hist, column='height', by='classroom', layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) # without column - axes = _check_plot_works(df.hist, by='classroom') + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, by='classroom') self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.hist(by='gender', layout=(3, 5)) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index baca8045f0cc1..b6c1926c1e7fc 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -3353,7 +3353,8 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if sharex or sharey: warnings.warn("When passing multiple axes, sharex and sharey " "are ignored. These settings must be specified " - "when creating axes", UserWarning) + "when creating axes", UserWarning, + stacklevel=4) if len(ax) == naxes: fig = ax[0].get_figure() return fig, ax @@ -3370,7 +3371,8 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, return fig, _flatten(ax) else: warnings.warn("To output multiple subplots, the figure containing " - "the passed axes is being cleared", UserWarning) + "the passed axes is being cleared", UserWarning, + stacklevel=4) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) From cc0a188addb46f7b4986dce32947e66295f1bb3b Mon Sep 17 00:00:00 2001 From: adneu Date: Wed, 6 Jul 2016 17:41:28 -0400 Subject: [PATCH 073/359] BUG: Groupby.nth includes group key inconsistently #12839 closes #12839 Author: adneu Closes #13316 from adneu/12839 and squashes the following commits: 16f5cd3 [adneu] Name change ac1851a [adneu] Added docstrings/comments, and new tests. 
4d73cbf [adneu] Updated tests 9b75df4 [adneu] BUG: Groupby.nth includes group key inconsistently #12839 --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/core/groupby.py | 35 ++++++++++++++++++++++++--------- pandas/tests/test_groupby.py | 31 ++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index be1f745537d05..b9afa7fcb7959 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -521,7 +521,7 @@ Bug Fixes - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - +- Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 04e4db9d1fdc6..8d33c27481d93 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -95,7 +95,7 @@ def _groupby_function(name, alias, npfunc, numeric_only=True, @Appender(_doc_template) @Appender(_local_template) def f(self): - self._set_selection_from_grouper() + self._set_group_selection() try: return self._cython_agg_general(alias, numeric_only=numeric_only) except AssertionError as e: @@ -457,8 +457,21 @@ def _selected_obj(self): else: return self.obj[self._selection] - def _set_selection_from_grouper(self): - """ we may need create a selection if we have non-level groupers """ + def _reset_group_selection(self): + """ + Clear group based selection. Used for methods needing to return info on + each group regardless of whether a group selection was previously set. + """ + if self._group_selection is not None: + self._group_selection = None + # GH12839 clear cached selection too when changing group selection + self._reset_cache('_selected_obj') + + def _set_group_selection(self): + """ + Create group based selection. Used when selection is not passed + directly but instead via a grouper. 
+ """ grp = self.grouper if self.as_index and getattr(grp, 'groupings', None) is not None and \ self.obj.ndim > 1: @@ -468,6 +481,8 @@ def _set_selection_from_grouper(self): if len(groupers): self._group_selection = ax.difference(Index(groupers)).tolist() + # GH12839 clear selected obj cache when group selection changes + self._reset_cache('_selected_obj') def _set_result_index_ordered(self, result): # set the result index on the passed values object and @@ -511,7 +526,7 @@ def _make_wrapper(self, name): # need to setup the selection # as are not passed directly but in the grouper - self._set_selection_from_grouper() + self._set_group_selection() f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): @@ -979,7 +994,7 @@ def mean(self, *args, **kwargs): except GroupByError: raise except Exception: # pragma: no cover - self._set_selection_from_grouper() + self._set_group_selection() f = lambda x: x.mean(axis=self.axis) return self._python_agg_general(f) @@ -997,7 +1012,7 @@ def median(self): raise except Exception: # pragma: no cover - self._set_selection_from_grouper() + self._set_group_selection() def f(x): if isinstance(x, np.ndarray): @@ -1040,7 +1055,7 @@ def var(self, ddof=1, *args, **kwargs): if ddof == 1: return self._cython_agg_general('var') else: - self._set_selection_from_grouper() + self._set_group_selection() f = lambda x: x.var(ddof=ddof) return self._python_agg_general(f) @@ -1217,7 +1232,7 @@ def nth(self, n, dropna=None): raise TypeError("n needs to be an int or a list/set/tuple of ints") nth_values = np.array(nth_values, dtype=np.intp) - self._set_selection_from_grouper() + self._set_group_selection() if not dropna: mask = np.in1d(self._cumcount_array(), nth_values) | \ @@ -1325,7 +1340,7 @@ def cumcount(self, ascending=True): dtype: int64 """ - self._set_selection_from_grouper() + self._set_group_selection() index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) @@ -1403,6 +1418,7 @@ def head(self, n=5): 0 1 2 2 5 6 """ + self._reset_group_selection() mask = self._cumcount_array() < n return self._selected_obj[mask] @@ -1429,6 +1445,7 @@ def tail(self, n=5): 0 a 1 2 b 1 """ + self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 10362cbb24888..d6d601f03d561 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -354,6 +354,35 @@ def test_nth_multi_index_as_expected(self): names=['A', 'B'])) assert_frame_equal(result, expected) + def test_group_selection_cache(self): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index @@ -6116,7 +6145,7 @@ def test_cython_transform(self): 
# bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior if op == 'shift': - gb._set_selection_from_grouper() + gb._set_group_selection() for (op, args), targop in ops: if op != 'shift' and 'int' not in gb_target: From 2655daef1b7346feabd00d4d40910a80386d0812 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Thu, 7 Jul 2016 03:26:12 -0400 Subject: [PATCH 074/359] In gbq, use googleapiclient instead of apiclient #13454 (#13458) closes #13454 --- doc/source/whatsnew/v0.18.2.txt | 2 ++ pandas/io/gbq.py | 38 ++++++++++++++++++++++++++------- pandas/io/tests/test_gbq.py | 19 +++++++++++++++-- 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b9afa7fcb7959..64644bd9a7a26 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -528,3 +528,5 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) + +- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index e706434f29dc5..140f5cc6bb6e3 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -46,8 +46,12 @@ def _test_google_api_imports(): try: import httplib2 # noqa - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa + try: + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + except: + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa from oauth2client.client import AccessTokenRefreshError # noqa from oauth2client.client import OAuth2WebServerFlow # noqa from oauth2client.file import Storage # noqa @@ -266,7 +270,10 @@ def sizeof_fmt(num, suffix='b'): def get_service(self): import httplib2 - from apiclient.discovery import build + try: + from googleapiclient.discovery import build + except: + from apiclient.discovery import build http = httplib2.Http() http = self.credentials.authorize(http) @@ -315,7 +322,10 @@ def process_insert_errors(self, insert_errors): raise StreamingInsertError def run_query(self, query): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError from oauth2client.client import AccessTokenRefreshError _check_google_client_version() @@ -420,7 +430,10 @@ def run_query(self, query): return schema, result_pages def load_data(self, dataframe, dataset_id, table_id, chunksize): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError job_id = uuid.uuid4().hex rows = [] @@ -474,7 +487,10 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): self._print("\n") def verify_schema(self, dataset_id, table_id, schema): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError try: return (self.service.tables().get( @@ -765,7 +781,10 @@ class _Table(GbqConnector): def __init__(self, project_id, dataset_id, reauth=False, verbose=False, private_key=None): - from apiclient.errors import HttpError + 
try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError self.http_error = HttpError self.dataset_id = dataset_id super(_Table, self).__init__(project_id, reauth, verbose, private_key) @@ -865,7 +884,10 @@ class _Dataset(GbqConnector): def __init__(self, project_id, reauth=False, verbose=False, private_key=None): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError self.http_error = HttpError super(_Dataset, self).__init__(project_id, reauth, verbose, private_key) diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 5cb681f4d2e7d..278c5d7215624 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -73,8 +73,12 @@ def _test_imports(): if _SETUPTOOLS_INSTALLED: try: - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa + try: + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + except: + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa from oauth2client.client import OAuth2WebServerFlow # noqa from oauth2client.client import AccessTokenRefreshError # noqa @@ -280,6 +284,17 @@ class GBQUnitTests(tm.TestCase): def setUp(self): test_requirements() + def test_import_google_api_python_client(self): + if compat.PY2: + with tm.assertRaises(ImportError): + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa + else: + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + def test_should_return_bigquery_integers_as_python_floats(self): result = gbq._parse_entry(1, 'INTEGER') tm.assert_equal(result, float(1)) From f11b9c1eef4bb161a35a1a5695aebb934f7c8b96 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jul 2016 17:08:02 +0200 Subject: [PATCH 075/359] RLS: switch master from 0.18.2 to 0.19.0 (#13586) --- doc/source/categorical.rst | 2 +- doc/source/merging.rst | 2 +- doc/source/text.rst | 2 +- doc/source/whatsnew.rst | 2 +- doc/source/whatsnew/v0.18.2.txt | 532 -------------------------------- doc/source/whatsnew/v0.19.0.txt | 485 +++++++++++++++++++++++++++-- doc/source/whatsnew/v0.20.0.txt | 83 +++++ pandas/computation/ops.py | 2 +- pandas/core/base.py | 4 +- pandas/core/categorical.py | 2 +- pandas/core/generic.py | 2 +- pandas/indexes/base.py | 6 +- pandas/indexes/category.py | 2 +- pandas/io/html.py | 2 +- pandas/io/pytables.py | 2 +- pandas/tools/merge.py | 4 +- pandas/tseries/base.py | 2 +- pandas/tseries/index.py | 2 +- pandas/tseries/offsets.py | 4 +- pandas/tslib.pyx | 4 +- pandas/types/concat.py | 2 +- 21 files changed, 574 insertions(+), 574 deletions(-) delete mode 100644 doc/source/whatsnew/v0.18.2.txt create mode 100644 doc/source/whatsnew/v0.20.0.txt diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index e971f1f28903f..f0e01ddc3fc2d 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -653,7 +653,7 @@ The same applies to ``df.append(df_different)``. Unioning ~~~~~~~~ -.. versionadded:: 0.18.2 +.. 
versionadded:: 0.19.0 If you want to combine categoricals that do not necessarily have the same categories, the `union_categorical` function will diff --git a/doc/source/merging.rst b/doc/source/merging.rst index b69d0d8ba3015..f14e5741c6e2e 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1133,7 +1133,7 @@ fill/interpolate missing data: Merging AsOf ~~~~~~~~~~~~ -.. versionadded:: 0.18.2 +.. versionadded:: 0.19.0 A :func:`merge_asof` is similar to an ordered left-join except that we match on nearest key rather than equal keys. For each row in the ``left`` DataFrame, we select the last row in the ``right`` DataFrame whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. diff --git a/doc/source/text.rst b/doc/source/text.rst index 3822c713d7f85..3a4a57ff4da95 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -316,7 +316,7 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as ``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the same result as a ``Series.str.extractall`` with a default index (starts from 0). -.. versionadded:: 0.18.2 +.. versionadded:: 0.19.0 .. ipython:: python diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 685f1d2086c69..77dc249aeb788 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,7 +18,7 @@ What's New These are new features and improvements of note in each release. -.. include:: whatsnew/v0.18.2.txt +.. include:: whatsnew/v0.19.0.txt .. include:: whatsnew/v0.18.1.txt diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt deleted file mode 100644 index 64644bd9a7a26..0000000000000 --- a/doc/source/whatsnew/v0.18.2.txt +++ /dev/null @@ -1,532 +0,0 @@ -.. _whatsnew_0182: - -v0.18.2 (July ??, 2016) ------------------------ - -This is a minor bug-fix release from 0.18.1 and includes a large number of -bug fixes along with several new features, enhancements, and performance improvements. -We recommend that all users upgrade to this version. - -Highlights include: - -- :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - -.. contents:: What's new in v0.18.2 - :local: - :backlinks: none - -.. _whatsnew_0182.new_features: - -New features -~~~~~~~~~~~~ - -.. _whatsnew_0182.enhancements.asof_merge: - -:func:`merge_asof` for asof-style time-series joining -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A long-time requested feature has been added through the :func:`merge_asof` function, to -support asof style joining of time-series. (:issue:`1870`). Full documentation is -:ref:`here ` - -The :func:`merge_asof` performs an asof merge, which is similar to a left-join -except that we match on nearest key rather than equal keys. - -.. ipython:: python - - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) - - left - right - -We typically want to match exactly when possible, and use the most -recent value otherwise. - -.. ipython:: python - - pd.merge_asof(left, right, on='a') - -We can also match rows ONLY with prior data, and not an exact match. - -.. ipython:: python - - pd.merge_asof(left, right, on='a', allow_exact_matches=False) - - -In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. -This also illustrates using the ``by`` parameter to group data before merging. - -.. 
ipython:: python - - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) - -.. ipython:: python - - trades - quotes - -An asof merge joins on the ``on``, typically a datetimelike field, which is ordered, and -in this case we are using a grouper in the ``by`` field. This is like a left-outer join, except -that forward filling happens automatically taking the most recent non-NaN value. - -.. ipython:: python - - pd.merge_asof(trades, quotes, - on='time', - by='ticker') - -This returns a merged DataFrame with the entries in the same order as the original left -passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. - -.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: - -:func:`read_csv` has improved support for duplicate column names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:ref:`Duplicate column names ` are now supported in :func:`read_csv` whether -they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) - -.. ipython :: python - - data = '0,1,2\n3,4,5' - names = ['a', 'b', 'a'] - -Previous behaviour: - -.. code-block:: ipython - - In [2]: pd.read_csv(StringIO(data), names=names) - Out[2]: - a b a - 0 2 1 2 - 1 5 4 5 - -The first 'a' column contains the same data as the second 'a' column, when it should have -contained the array ``[0, 3]``. - -New behaviour: - -.. ipython :: python - - In [2]: pd.read_csv(StringIO(data), names=names) - -.. _whatsnew_0182.enhancements.semi_month_offsets: - -Semi-Month Offsets -^^^^^^^^^^^^^^^^^^ - -Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS'). -These provide date offsets anchored (by default) to the 15th and end of month, and 15th and 1st of month respectively. -(:issue:`1543`) - -.. ipython:: python - - from pandas.tseries.offsets import SemiMonthEnd, SemiMonthBegin - -SemiMonthEnd: - -.. ipython:: python - - Timestamp('2016-01-01') + SemiMonthEnd() - - pd.date_range('2015-01-01', freq='SM', periods=4) - -SemiMonthBegin: - -.. ipython:: python - - Timestamp('2016-01-01') + SemiMonthBegin() - - pd.date_range('2015-01-01', freq='SMS', periods=4) - -Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. - -.. ipython:: python - - pd.date_range('2015-01-01', freq='SMS-16', periods=4) - - pd.date_range('2015-01-01', freq='SM-14', periods=4) - -.. _whatsnew_0182.enhancements.other: - -Other enhancements -^^^^^^^^^^^^^^^^^^ - -- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. 
The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - -- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) -- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) - - .. ipython:: python - - idx = pd.Index(["a1a2", "b1", "c1"]) - idx.str.extractall("[ab](?P\d)") - -- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) - - .. ipython:: python - - pd.Timestamp(2012, 1, 1) - - pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) - -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) - -- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - - .. ipython:: python - - idx = pd.Index(['a', 'b', 'c']) - idx.where([True, False, True]) - -- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) -- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) -- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) -- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) -- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) -- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) -- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - -.. _whatsnew_0182.api: - -API changes -~~~~~~~~~~~ - - -- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) -- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) -- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) - -.. _whatsnew_0182.api.tolist: - -``Series.tolist()`` will now return Python types -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`) - - -.. 
ipython:: python - - s = pd.Series([1,2,3]) - type(s.tolist()[0]) - -Previous Behavior: - -.. code-block:: ipython - - In [7]: type(s.tolist()[0]) - Out[7]: - - -New Behavior: - -.. ipython:: python - - type(s.tolist()[0]) - -.. _whatsnew_0182.api.promote: - -``Series`` type promotion on assignment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) - - -.. ipython:: python - - s = pd.Series() - -Previous Behavior: - -.. code-block:: ipython - - In [2]: s["a"] = pd.Timestamp("2016-01-01") - - In [3]: s["b"] = 3.0 - TypeError: invalid type promotion - -New Behavior: - -.. ipython:: python - - s["a"] = pd.Timestamp("2016-01-01") - s["b"] = 3.0 - s - s.dtype - -.. _whatsnew_0182.api.to_datetime_coerce: - -``.to_datetime()`` when coercing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). -Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pd.to_datetime([1, 'foo'], errors='coerce') - Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) - -This will now convert integers/floats with the default unit of ``ns``. - -.. ipython:: python - - pd.to_datetime([1, 'foo'], errors='coerce') - -.. _whatsnew_0182.api.merging: - -Merging changes -^^^^^^^^^^^^^^^ - -Merging will now preserve the dtype of the join keys (:issue:`8596`) - -.. ipython:: python - - df1 = pd.DataFrame({'key': [1], 'v1': [10]}) - df1 - df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) - df2 - -Previous Behavior: - -.. code-block:: ipython - - In [5]: pd.merge(df1, df2, how='outer') - Out[5]: - key v1 - 0 1.0 10.0 - 1 1.0 20.0 - 2 2.0 30.0 - - In [6]: pd.merge(df1, df2, how='outer').dtypes - Out[6]: - key float64 - v1 float64 - dtype: object - -New Behavior: - -We are able to preserve the join keys - -.. ipython:: python - - pd.merge(df1, df2, how='outer') - pd.merge(df1, df2, how='outer').dtypes - -Of course if you have missing values that are introduced, then the -resulting dtype will be upcast (unchanged from previous). - -.. ipython:: python - - pd.merge(df1, df2, how='outer', on='key') - pd.merge(df1, df2, how='outer', on='key').dtypes - -.. _whatsnew_0182.describe: - -``.describe()`` changes -^^^^^^^^^^^^^^^^^^^^^^^ - -Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) - -.. ipython:: python - - s = pd.Series([0, 1, 2, 3, 4]) - df = pd.DataFrame([0, 1, 2, 3, 4]) - -Previous Behavior: - -The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. - -.. code-block:: ipython - - In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - Out[3]: - count 5.000000 - mean 2.000000 - std 1.581139 - min 0.000000 - 0.0% 0.000400 - 0.1% 0.002000 - 0.1% 0.004000 - 50% 2.000000 - 99.9% 3.996000 - 100.0% 3.998000 - 100.0% 3.999600 - max 4.000000 - dtype: float64 - - In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - Out[4]: - ... - ValueError: cannot reindex from a duplicate axis - -New Behavior: - -.. 
ipython:: python - - s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - -Furthermore: - -- Passing duplicated ``percentiles`` will now raise a ``ValueError``. -- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) - -.. _whatsnew_0182.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) -- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) -- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) -- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) -- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - -.. _whatsnew_0182.deprecations: - -Deprecations -^^^^^^^^^^^^ - -- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) -- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) -- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) -- top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - -.. _whatsnew_0182.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) -- Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) -- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - -- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) -- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - - -.. 
_whatsnew_0182.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) -- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) -- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) -- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) -- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) -- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - - -- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) -- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) - -- Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) - -- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - -- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - -- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) -- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - - -- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) -- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) -- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) -- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) -- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. 
(:issue:`13231`) -- Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) - -- Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) -- Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) -- Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) - -- Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) -- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) -- Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) - - -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) -- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a tempfile.TemporaryFile on Windows with Python 3 (:issue:`13398`) -- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) -- Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) -- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) -- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) -- Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - - - -- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) - - -- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) -- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, `int16`` dtypes (:issue:`13451`) -- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - -- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) -- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) -- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) -- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) -- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - - -- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) -- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) -- Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) - -- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - - -- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) -- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) - -- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 42db0388ca5d9..70d54ea0d364d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1,7 +1,7 @@ .. _whatsnew_0190: -v0.19.0 (????, 2016) --------------------- +v0.19.0 (August ??, 2016) +------------------------- This is a major release from 0.18.2 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all @@ -9,75 +9,524 @@ users upgrade to this version. Highlights include: +- :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` -Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. - -.. contents:: What's new in v0.19.0 +.. contents:: What's new in v0.18.2 :local: :backlinks: none -.. _whatsnew_0190.enhancements: +.. _whatsnew_0190.new_features: New features ~~~~~~~~~~~~ +.. _whatsnew_0190.enhancements.asof_merge: + +:func:`merge_asof` for asof-style time-series joining +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A long-time requested feature has been added through the :func:`merge_asof` function, to +support asof style joining of time-series. (:issue:`1870`). Full documentation is +:ref:`here ` + +The :func:`merge_asof` performs an asof merge, which is similar to a left-join +except that we match on nearest key rather than equal keys. + +.. ipython:: python + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + left + right + +We typically want to match exactly when possible, and use the most +recent value otherwise. + +.. ipython:: python + + pd.merge_asof(left, right, on='a') + +We can also match rows ONLY with prior data, and not an exact match. + +.. 
ipython:: python + + pd.merge_asof(left, right, on='a', allow_exact_matches=False) + + +In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. +This also illustrates using the ``by`` parameter to group data before merging. + +.. ipython:: python + + trades = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', + 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100]}, + columns=['time', 'ticker', 'price', 'quantity']) + + quotes = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.023', + '20160525 13:30:00.030', + '20160525 13:30:00.041', + '20160525 13:30:00.048', + '20160525 13:30:00.049', + '20160525 13:30:00.072', + '20160525 13:30:00.075']), + 'ticker': ['GOOG', 'MSFT', 'MSFT', + 'MSFT', 'GOOG', 'AAPL', 'GOOG', + 'MSFT'], + 'bid': [720.50, 51.95, 51.97, 51.99, + 720.50, 97.99, 720.50, 52.01], + 'ask': [720.93, 51.96, 51.98, 52.00, + 720.93, 98.01, 720.88, 52.03]}, + columns=['time', 'ticker', 'bid', 'ask']) + +.. ipython:: python + + trades + quotes + +An asof merge joins on the ``on``, typically a datetimelike field, which is ordered, and +in this case we are using a grouper in the ``by`` field. This is like a left-outer join, except +that forward filling happens automatically taking the most recent non-NaN value. + +.. ipython:: python + + pd.merge_asof(trades, quotes, + on='time', + by='ticker') + +This returns a merged DataFrame with the entries in the same order as the original left +passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. + +.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: + +:func:`read_csv` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:ref:`Duplicate column names ` are now supported in :func:`read_csv` whether +they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) + +.. ipython :: python + + data = '0,1,2\n3,4,5' + names = ['a', 'b', 'a'] + +Previous behaviour: + +.. code-block:: ipython + + In [2]: pd.read_csv(StringIO(data), names=names) + Out[2]: + a b a + 0 2 1 2 + 1 5 4 5 + +The first 'a' column contains the same data as the second 'a' column, when it should have +contained the array ``[0, 3]``. + +New behaviour: + +.. ipython :: python + + In [2]: pd.read_csv(StringIO(data), names=names) + +.. _whatsnew_0190.enhancements.semi_month_offsets: +Semi-Month Offsets +^^^^^^^^^^^^^^^^^^ + +Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS'). +These provide date offsets anchored (by default) to the 15th and end of month, and 15th and 1st of month respectively. +(:issue:`1543`) + +.. ipython:: python + + from pandas.tseries.offsets import SemiMonthEnd, SemiMonthBegin + +SemiMonthEnd: + +.. ipython:: python + + Timestamp('2016-01-01') + SemiMonthEnd() + + pd.date_range('2015-01-01', freq='SM', periods=4) + +SemiMonthBegin: + +.. ipython:: python + Timestamp('2016-01-01') + SemiMonthBegin() + pd.date_range('2015-01-01', freq='SMS', periods=4) + +Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. + +.. ipython:: python + + pd.date_range('2015-01-01', freq='SMS-16', periods=4) + + pd.date_range('2015-01-01', freq='SM-14', periods=4) .. 
_whatsnew_0190.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ +- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) + +- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) +- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) + + .. ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P\d)") + +- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) + + .. ipython:: python + pd.Timestamp(2012, 1, 1) + pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) -.. _whatsnew_0190.api_breaking: + .. ipython:: python -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) + +- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) +- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) +- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) +- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) +- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) +- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) +- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) +- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) .. _whatsnew_0190.api: +API changes +~~~~~~~~~~~ +- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) +- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) +- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. 
For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) +.. _whatsnew_0190.api.tolist: +``Series.tolist()`` will now return Python types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Other API Changes -^^^^^^^^^^^^^^^^^ +``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`) -.. _whatsnew_0190.deprecations: -Deprecations -^^^^^^^^^^^^ +.. ipython:: python + + s = pd.Series([1,2,3]) + type(s.tolist()[0]) + +Previous Behavior: + +.. code-block:: ipython + + In [7]: type(s.tolist()[0]) + Out[7]: + + +New Behavior: + +.. ipython:: python + + type(s.tolist()[0]) + +.. _whatsnew_0190.api.promote: + +``Series`` type promotion on assignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) + + +.. ipython:: python + + s = pd.Series() +Previous Behavior: +.. code-block:: ipython + In [2]: s["a"] = pd.Timestamp("2016-01-01") + In [3]: s["b"] = 3.0 + TypeError: invalid type promotion -.. _whatsnew_0190.prior_deprecations: +New Behavior: -Removal of prior version deprecations/changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. ipython:: python + s["a"] = pd.Timestamp("2016-01-01") + s["b"] = 3.0 + s + s.dtype +.. _whatsnew_0190.api.to_datetime_coerce: +``.to_datetime()`` when coercing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). +Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. + +Previous Behavior: + +.. code-block:: ipython + + In [2]: pd.to_datetime([1, 'foo'], errors='coerce') + Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) + +This will now convert integers/floats with the default unit of ``ns``. + +.. ipython:: python + + pd.to_datetime([1, 'foo'], errors='coerce') + +.. _whatsnew_0190.api.merging: + +Merging changes +^^^^^^^^^^^^^^^ + +Merging will now preserve the dtype of the join keys (:issue:`8596`) + +.. ipython:: python + + df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 + df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.merge(df1, df2, how='outer') + Out[5]: + key v1 + 0 1.0 10.0 + 1 1.0 20.0 + 2 2.0 30.0 + + In [6]: pd.merge(df1, df2, how='outer').dtypes + Out[6]: + key float64 + v1 float64 + dtype: object + +New Behavior: + +We are able to preserve the join keys + +.. ipython:: python + + pd.merge(df1, df2, how='outer') + pd.merge(df1, df2, how='outer').dtypes + +Of course if you have missing values that are introduced, then the +resulting dtype will be upcast (unchanged from previous). + +.. ipython:: python + + pd.merge(df1, df2, how='outer', on='key') + pd.merge(df1, df2, how='outer', on='key').dtypes + +.. _whatsnew_0190.describe: + +``.describe()`` changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) + +.. ipython:: python + + s = pd.Series([0, 1, 2, 3, 4]) + df = pd.DataFrame([0, 1, 2, 3, 4]) + +Previous Behavior: + +The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. + +.. 
code-block:: ipython + + In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[3]: + count 5.000000 + mean 2.000000 + std 1.581139 + min 0.000000 + 0.0% 0.000400 + 0.1% 0.002000 + 0.1% 0.004000 + 50% 2.000000 + 99.9% 3.996000 + 100.0% 3.998000 + 100.0% 3.999600 + max 4.000000 + dtype: float64 + + In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[4]: + ... + ValueError: cannot reindex from a duplicate axis + +New Behavior: + +.. ipython:: python + + s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + +Furthermore: + +- Passing duplicated ``percentiles`` will now raise a ``ValueError``. +- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) + +.. _whatsnew_0190.api.other: + +Other API changes +^^^^^^^^^^^^^^^^^ + +- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) +- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) +- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) +- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) + +.. _whatsnew_0190.deprecations: + +Deprecations +^^^^^^^^^^^^ + +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) +- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) +- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) +- top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) .. _whatsnew_0190.performance: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) +- Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) +- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - +- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) +- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) .. 
_whatsnew_0190.bug_fixes: Bug Fixes ~~~~~~~~~ + +- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) +- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) +- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) +- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) + + +- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) +- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) + +- Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) + +- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) + +- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) + +- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) +- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) + + +- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) +- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) +- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) +- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) +- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. 
(:issue:`13231`) +- Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) + +- Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) +- Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) +- Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) + +- Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) +- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) +- Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) + + +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) +- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a tempfile.TemporaryFile on Windows with Python 3 (:issue:`13398`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) +- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) +- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) +- Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) + + + +- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) + + +- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) +- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, `int16`` dtypes (:issue:`13451`) +- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) + +- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) +- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) +- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) +- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) +- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) + + +- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) +- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) +- Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) + +- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) + + +- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) +- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) + +- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt new file mode 100644 index 0000000000000..695e917c76ba0 --- /dev/null +++ b/doc/source/whatsnew/v0.20.0.txt @@ -0,0 +1,83 @@ +.. _whatsnew_0200: + +v0.20.0 (????, 2016) +-------------------- + +This is a major release from 0.19 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + + +Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. + +.. contents:: What's new in v0.19.0 + :local: + :backlinks: none + +.. _whatsnew_0200.enhancements: + +New features +~~~~~~~~~~~~ + + + + + +.. _whatsnew_0200.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + + + + + + +.. _whatsnew_0200.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0200.api: + + + + + + +Other API Changes +^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0200.deprecations: + +Deprecations +^^^^^^^^^^^^ + + + + + +.. _whatsnew_0200.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + + + + +.. _whatsnew_0200.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + + + + +.. _whatsnew_0200.bug_fixes: + +Bug Fixes +~~~~~~~~~ diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index bf6fa35cf255f..7a0743f6b2778 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -286,7 +286,7 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): acceptable_dtypes : list of acceptable numpy.dtype Will not cast if term's dtype in this list. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 dtype : str or numpy.dtype The dtype to cast to. 
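
The rest of this release-switch patch is a mechanical sweep: every ``.. versionadded:: 0.18.2`` directive in docstrings and docs is rewritten to ``0.19.0`` (and a few malformed ``.. versionadded 0.18.2`` directives are normalized to the ``::`` form). A minimal sketch of how such a sweep could be scripted is shown below; the walk root, file extensions, and in-place rewrite are assumptions for illustration only and are not part of the commit, which simply records the resulting one-line hunks.

    import os
    import re

    OLD, NEW = "0.18.2", "0.19.0"
    # Match ".. versionadded:: 0.18.2" as well as the malformed
    # ".. versionadded 0.18.2".  Note: this sketch only bumps the version
    # number; it does not add the missing "::" that the actual patch also fixes.
    pattern = re.compile(r"(\.\.\s+versionadded(?:::)?\s+)" + re.escape(OLD))

    def bump_versionadded(root="pandas"):
        """Rewrite versionadded directives under ``root`` from OLD to NEW."""
        for dirpath, _, filenames in os.walk(root):
            for name in filenames:
                if not name.endswith((".py", ".pyx", ".rst", ".txt")):
                    continue
                path = os.path.join(dirpath, name)
                with open(path) as fh:
                    text = fh.read()
                new_text = pattern.sub(lambda m: m.group(1) + NEW, text)
                if new_text != text:
                    with open(path, "w") as fh:
                        fh.write(new_text)

Running a helper like this (or an equivalent ``sed`` one-liner) and then reviewing ``git diff`` would produce exactly the kind of single-line hunks that follow.
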
diff --git a/pandas/core/base.py b/pandas/core/base.py index 96732a7140f9e..13a6b4b7b4ce0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1001,7 +1001,7 @@ def is_monotonic(self): Return boolean if values in the object are monotonic_increasing - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- @@ -1017,7 +1017,7 @@ def is_monotonic_decreasing(self): Return boolean if values in the object are monotonic_decreasing - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6dba41a746e19..f4aeaf9184d09 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -348,7 +348,7 @@ def astype(self, dtype, copy=True): If copy is set to False and dtype is categorical, the original object is returned. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 """ if is_categorical_dtype(dtype): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cc5c45158bf4f..7b271df4085cc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3642,7 +3642,7 @@ def asof(self, where, subset=None): The last row without any NaN is taken (or the last row without NaN considering only the subset of columns in the case of a DataFrame) - .. versionadded:: 0.18.2 For DataFrame + .. versionadded:: 0.19.0 For DataFrame If there is no good value, NaN is returned. diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 96472698ba9d9..ad27010714f63 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -378,7 +378,7 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): def _deepcopy_if_needed(self, orig, copy=False): """ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Make a copy of self if data coincides (in memory) with orig. Subclasses should override this if self._base is not an ndarray. @@ -494,7 +494,7 @@ def repeat(self, n, *args, **kwargs): def where(self, cond, other=None): """ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from @@ -813,7 +813,7 @@ def _to_embed(self, keep_tz=False): satisfied, the original data is used to create a new Index or the original Index is returned. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 """ diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 3b7c660f5faa1..84b8926f4177f 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -313,7 +313,7 @@ def _can_reindex(self, indexer): def where(self, cond, other=None): """ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from diff --git a/pandas/io/html.py b/pandas/io/html.py index 48caaa39dd711..609642e248eda 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -837,7 +837,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, Character to recognize as decimal point (e.g. use ',' for European data). - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index cbe04349b5105..d4ca717ddbc4e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -276,7 +276,7 @@ def read_hdf(path_or_buf, key=None, **kwargs): path_or_buf : path (string), buffer, or path object (pathlib.Path or py._path.local.LocalPath) to read from - .. 
versionadded:: 0.18.2 support for pathlib, py.path. + .. versionadded:: 0.19.0 support for pathlib, py.path. key : group identifier in the store. Can be omitted a HDF file contains a single pandas object. diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 4b7162398738e..d65dfc3254465 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -182,7 +182,7 @@ def merge_ordered(left, right, on=None, * outer: use union of keys from both frames (SQL: full outer join) * inner: use intersection of keys from both frames (SQL: inner join) - .. versionadded 0.18.2 + .. versionadded:: 0.19.0 Examples -------- @@ -263,7 +263,7 @@ def merge_asof(left, right, on=None, Optionally perform group-wise merge. This searches for the nearest match on the 'on' key within the same group according to 'by'. - .. versionadded 0.18.2 + .. versionadded:: 0.19.0 Parameters ---------- diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 42631d442a990..2e3d1ace9734c 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -747,7 +747,7 @@ def repeat(self, repeats, *args, **kwargs): def where(self, cond, other=None): """ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 77500081be62c..83cb768b37aaa 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1857,7 +1857,7 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): - 'coerce' will return NaT if the timestamp can not be converted into the specified timezone - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 infer_dst : boolean, default False (DEPRECATED) Attempt to infer fall dst-transition hours based on order diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f4b75ddd72126..d0b1fd746d0d5 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1258,7 +1258,7 @@ class SemiMonthEnd(SemiMonthOffset): Two DateOffset's per month repeating on the last day of the month and day_of_month. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Parameters ---------- @@ -1317,7 +1317,7 @@ class SemiMonthBegin(SemiMonthOffset): Two DateOffset's per month repeating on the first day of the month and day_of_month. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Parameters ---------- diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 8837881af0b6c..df6554fe1d5de 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -246,7 +246,7 @@ class Timestamp(_Timestamp): :func:`datetime.datetime` Parameters ------------------------------------ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 year : int month : int @@ -539,7 +539,7 @@ class Timestamp(_Timestamp): - 'coerce' will return NaT if the timestamp can not be converted into the specified timezone - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 53db9ddf79a5c..44338f26eb2e8 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -206,7 +206,7 @@ def union_categoricals(to_union): Combine list-like of Categoricals, unioning categories. All must have the same dtype, and none can be ordered. - .. versionadded 0.18.2 + .. 
versionadded:: 0.19.0 Parameters ---------- From ba82b511c76d87421c8900348efebe4577548ec6 Mon Sep 17 00:00:00 2001 From: Haleemur Ali Date: Fri, 8 Jul 2016 17:16:15 +0200 Subject: [PATCH 076/359] BUG: Datetime64Formatter not respecting ``formatter`` - [x] closes #10690 - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` - [x] whatsnew entry the Datetime64Formatter class did not accept a `formatter` argument, so custom formatters passed in through `df.to_string` or `df.to_html` were silently ignored. Author: Haleemur Ali This patch had conflicts when merged, resolved by Committer: Joris Van den Bossche Closes #13567 from haleemur/fix/dt64_outputformat and squashes the following commits: 8d84283 [Haleemur Ali] fix bug in Datetime64Formatter, which affected custom date formatted output for df.to_string, df.to_html methods --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/formats/format.py | 4 + pandas/tests/formats/test_format.py | 128 ++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 70d54ea0d364d..657de7ec26efc 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -522,6 +522,7 @@ Bug Fixes - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) +- Bug in ``.to_html``, ``.to_latex`` and ``.to_string`` silently ignore custom datetime formatter passed through the ``formatters`` key word (:issue:`10690`) - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index a8e184ce94c89..0c6a15db4ccfe 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2239,9 +2239,13 @@ def _format_strings(self): """ we by definition have DO NOT have a TZ """ values = self.values + if not isinstance(values, DatetimeIndex): values = DatetimeIndex(values) + if self.formatter is not None and callable(self.formatter): + return [self.formatter(x) for x in values] + fmt_values = format_array_from_datetime( values.asi8.ravel(), format=_get_format_datetime64_from_values(values, diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index e67fe2cddde77..c5e9c258b293a 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -456,6 +456,28 @@ def test_to_string_with_formatters(self): '2 0x3 [ 3.0] -False-')) self.assertEqual(result, result2) + def test_to_string_with_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({'months': months}) + + def format_func(x): + return x.strftime('%Y-%m') + result = x.to_string(formatters={'months': format_func}) + expected = 'months\n0 2016-01\n1 2016-02' + self.assertEqual(result.strip(), expected) + + def test_to_string_with_datetime64_hourformatter(self): + + x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], + format='%H:%M:%S.%f')}) + + def format_func(x): + return x.strftime('%H:%M') + + result = x.to_string(formatters={'hod': format_func}) + expected = 'hod\n0 10:10\n1 12:12' + self.assertEqual(result.strip(), expected) + def 
test_to_string_with_formatters_unicode(self): df = DataFrame({u('c/\u03c3'): [1, 2, 3]}) result = df.to_string(formatters={u('c/\u03c3'): lambda x: '%s' % x}) @@ -1233,6 +1255,63 @@ def test_to_html_index_formatter(self): self.assertEqual(result, expected) + def test_to_html_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({'months': months}) + + def format_func(x): + return x.strftime('%Y-%m') + result = x.to_html(formatters={'months': format_func}) + expected = """\ + + + + + + + + + + + + + + + + + +
    months
    02016-01
    12016-02
    """ + self.assertEqual(result, expected) + + def test_to_html_datetime64_hourformatter(self): + + x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], + format='%H:%M:%S.%f')}) + + def format_func(x): + return x.strftime('%H:%M') + result = x.to_html(formatters={'hod': format_func}) + expected = """\ + + + + + + + + + + + + + + + + + +
    hod
    010:10
    112:12
    """ + self.assertEqual(result, expected) + def test_to_html_regression_GH6098(self): df = DataFrame({u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')], u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), @@ -2775,6 +2854,33 @@ def test_to_latex_format(self): self.assertEqual(withindex_result, withindex_expected) + def test_to_latex_with_formatters(self): + df = DataFrame({'int': [1, 2, 3], + 'float': [1.0, 2.0, 3.0], + 'object': [(1, 2), True, False], + 'datetime64': [datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3)]}) + + formatters = {'int': lambda x: '0x%x' % x, + 'float': lambda x: '[% 4.1f]' % x, + 'object': lambda x: '-%s-' % str(x), + 'datetime64': lambda x: x.strftime('%Y-%m'), + '__index__': lambda x: 'index: %s' % x} + result = df.to_latex(formatters=dict(formatters)) + + expected = r"""\begin{tabular}{llrrl} +\toprule +{} & datetime64 & float & int & object \\ +\midrule +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(result, expected) + def test_to_latex_multiindex(self): df = DataFrame({('x', 'y'): ['a']}) result = df.to_latex() @@ -4161,6 +4267,28 @@ def test_dates_display(self): self.assertEqual(result[1].strip(), "NaT") self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.000000004") + def test_datetime64formatter_yearmonth(self): + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) + + def format_func(x): + return x.strftime('%Y-%m') + + formatter = fmt.Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + self.assertEqual(result, ['2016-01', '2016-02']) + + def test_datetime64formatter_hoursecond(self): + + x = Series(pd.to_datetime(['10:10:10.100', '12:12:12.120'], + format='%H:%M:%S.%f')) + + def format_func(x): + return x.strftime('%H:%M') + + formatter = fmt.Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + self.assertEqual(result, ['10:10', '12:12']) + class TestNaTFormatting(tm.TestCase): From f95576b883d919cdde30fdbaa6065cf9f5a6c1f4 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Sun, 10 Jul 2016 02:01:59 +0900 Subject: [PATCH 077/359] BUG: Fix TimeDelta to Timedelta (#13600) --- pandas/tseries/tests/test_timedeltas.py | 4 ++-- pandas/tslib.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index e515ba624d203..1586d0385732f 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -137,12 +137,12 @@ def test_construction(self): self.assertRaises(ValueError, lambda: Timedelta('3.1415')) # invalid construction - tm.assertRaisesRegexp(ValueError, "cannot construct a TimeDelta", + tm.assertRaisesRegexp(ValueError, "cannot construct a Timedelta", lambda: Timedelta()) tm.assertRaisesRegexp(ValueError, "unit abbreviation w/o a number", lambda: Timedelta('foo')) tm.assertRaisesRegexp(ValueError, - "cannot construct a TimeDelta from the passed " + "cannot construct a Timedelta from the passed " "arguments, allowed keywords are ", lambda: Timedelta(day=10)) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index df6554fe1d5de..61c0f9c5a093b 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2615,7 +2615,7 @@ class Timedelta(_Timedelta): if value is None: if not len(kwargs): - raise ValueError("cannot construct a TimeDelta without a value/unit or descriptive keywords 
(days,seconds....)") + raise ValueError("cannot construct a Timedelta without a value/unit or descriptive keywords (days,seconds....)") def _to_py_int_float(v): if is_integer_object(v): @@ -2630,7 +2630,7 @@ class Timedelta(_Timedelta): nano = kwargs.pop('nanoseconds',0) value = convert_to_timedelta64(timedelta(**kwargs),'ns',False) + nano except TypeError as e: - raise ValueError("cannot construct a TimeDelta from the passed arguments, allowed keywords are " + raise ValueError("cannot construct a Timedelta from the passed arguments, allowed keywords are " "[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]") if isinstance(value, Timedelta): From 5701c69369264f3aa6f571384602ceec1133dabc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 7 Jul 2016 13:36:49 -0700 Subject: [PATCH 078/359] COMPAT: 32-bit compat fixes mainly in testing closes #13566 closes #13584 --- pandas/core/internals.py | 2 +- pandas/tests/indexes/test_datetimelike.py | 7 +-- pandas/tests/indexes/test_multi.py | 8 ++-- pandas/tests/series/test_analytics.py | 24 +++++----- pandas/tests/test_algos.py | 6 ++- pandas/tests/test_categorical.py | 23 +++++----- pandas/tests/test_groupby.py | 16 +++---- pandas/tools/merge.py | 3 +- pandas/tools/tests/test_merge.py | 12 ++--- pandas/tools/tests/test_tile.py | 5 ++- pandas/tseries/tests/test_base.py | 54 +++++++++++++---------- pandas/tseries/tests/test_timedeltas.py | 6 ++- pandas/tslib.pyx | 20 ++++----- 13 files changed, 104 insertions(+), 82 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c931adc9a31df..1ea567f15cb7f 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3085,7 +3085,7 @@ def reduction(self, f, axis=0, consolidate=True, transposed=False, # compute the orderings of our original data if len(self.blocks) > 1: - indexer = np.empty(len(self.axes[0]), dtype='int64') + indexer = np.empty(len(self.axes[0]), dtype=np.intp) i = 0 for b in self.blocks: for j in b.mgr_locs: diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 4a664ed3542d7..9eba481a66685 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -534,9 +534,9 @@ def test_get_loc(self): # time indexing idx = pd.date_range('2000-01-01', periods=24, freq='H') tm.assert_numpy_array_equal(idx.get_loc(time(12)), - np.array([12], dtype=np.int64)) + np.array([12]), check_dtype=False) tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), - np.array([], dtype=np.int64)) + np.array([]), check_dtype=False) with tm.assertRaises(NotImplementedError): idx.get_loc(time(12, 30), method='pad') @@ -587,7 +587,8 @@ def test_time_loc(self): # GH8667 ts = pd.Series(np.random.randn(n), index=idx) i = np.arange(start, n, step) - tm.assert_numpy_array_equal(ts.index.get_loc(key), i) + tm.assert_numpy_array_equal(ts.index.get_loc(key), i, + check_dtype=False) tm.assert_series_equal(ts[key], ts.iloc[i]) left, right = ts.copy(), ts.copy() diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index fb5576bed90b4..e6a8aafc32be4 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1750,12 +1750,12 @@ def test_reindex_level(self): exp_index2 = self.index.join(idx, level='second', how='left') self.assertTrue(target.equals(exp_index)) - exp_indexer = np.array([0, 2, 4], dtype=np.int64) - tm.assert_numpy_array_equal(indexer, exp_indexer) + exp_indexer = np.array([0, 2, 4]) + 
tm.assert_numpy_array_equal(indexer, exp_indexer, check_dtype=False) self.assertTrue(target2.equals(exp_index2)) - exp_indexer2 = np.array([0, -1, 0, -1, 0, -1], dtype=np.int64) - tm.assert_numpy_array_equal(indexer2, exp_indexer2) + exp_indexer2 = np.array([0, -1, 0, -1, 0, -1]) + tm.assert_numpy_array_equal(indexer2, exp_indexer2, check_dtype=False) assertRaisesRegexp(TypeError, "Fill method not supported", self.index.reindex, self.index, method='pad', diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 433f0f4bc67f5..0dbff0a028619 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -262,7 +262,7 @@ def test_kurt(self): self.assertTrue((df.kurt() == 0).all()) def test_argsort(self): - self._check_accum_op('argsort') + self._check_accum_op('argsort', check_dtype=False) argsorted = self.ts.argsort() self.assertTrue(issubclass(argsorted.dtype.type, np.integer)) @@ -289,8 +289,10 @@ def test_argsort_stable(self): mexpected = np.argsort(s.values, kind='mergesort') qexpected = np.argsort(s.values, kind='quicksort') - self.assert_series_equal(mindexer, Series(mexpected)) - self.assert_series_equal(qindexer, Series(qexpected)) + self.assert_series_equal(mindexer, Series(mexpected), + check_dtype=False) + self.assert_series_equal(qindexer, Series(qexpected), + check_dtype=False) self.assertFalse(np.array_equal(qindexer, mindexer)) def test_cumsum(self): @@ -487,10 +489,11 @@ def testit(): except ImportError: pass - def _check_accum_op(self, name): + def _check_accum_op(self, name, check_dtype=True): func = getattr(np, name) self.assert_numpy_array_equal(func(self.ts).values, - func(np.array(self.ts))) + func(np.array(self.ts)), + check_dtype=check_dtype) # with missing values ts = self.ts.copy() @@ -499,7 +502,8 @@ def _check_accum_op(self, name): result = func(ts)[1::2] expected = func(np.array(ts.valid())) - self.assert_numpy_array_equal(result.values, expected) + self.assert_numpy_array_equal(result.values, expected, + check_dtype=False) def test_compress(self): cond = [True, False, True, False, False] @@ -1360,13 +1364,13 @@ def test_searchsorted_numeric_dtypes_scalar(self): self.assertEqual(r, e) r = s.searchsorted([30]) - e = np.array([2], dtype=np.int64) + e = np.array([2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_searchsorted_numeric_dtypes_vector(self): s = Series([1, 2, 90, 1000, 3e9]) r = s.searchsorted([91, 2e6]) - e = np.array([3, 4], dtype=np.int64) + e = np.array([3, 4], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_search_sorted_datetime64_scalar(self): @@ -1380,14 +1384,14 @@ def test_search_sorted_datetime64_list(self): s = Series(pd.date_range('20120101', periods=10, freq='2D')) v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')] r = s.searchsorted(v) - e = np.array([1, 2], dtype=np.int64) + e = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_searchsorted_sorter(self): # GH8490 s = Series([3, 1, 2]) r = s.searchsorted([0, 3], sorter=np.argsort(s)) - e = np.array([0, 2], dtype=np.int64) + e = np.array([0, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_is_unique(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8af93ad0ecb2e..cb90110c953c1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -702,12 +702,14 @@ def test_unique_label_indices(): left = unique_label_indices(a) right = np.unique(a, return_index=True)[1] - tm.assert_numpy_array_equal(left, right) + 
tm.assert_numpy_array_equal(left, right, + check_dtype=False) a[np.random.choice(len(a), 10)] = -1 left = unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] - tm.assert_numpy_array_equal(left, right) + tm.assert_numpy_array_equal(left, right, + check_dtype=False) def test_rank(): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cff5bbe14f1eb..90876a4541da6 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -515,17 +515,20 @@ def f(): def test_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) - expected = np.array([2, 4, 1, 3, 0], dtype=np.int64) - tm.assert_numpy_array_equal(c.argsort(ascending=True), expected) + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal(c.argsort(ascending=True), expected, + check_dtype=False) expected = expected[::-1] - tm.assert_numpy_array_equal(c.argsort(ascending=False), expected) + tm.assert_numpy_array_equal(c.argsort(ascending=False), expected, + check_dtype=False) def test_numpy_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) - expected = np.array([2, 4, 1, 3, 0], dtype=np.int64) - tm.assert_numpy_array_equal(np.argsort(c), expected) + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal(np.argsort(c), expected, + check_dtype=False) msg = "the 'kind' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.argsort, @@ -1505,7 +1508,7 @@ def test_searchsorted(self): # Single item array res = c1.searchsorted(['bread']) chk = s1.searchsorted(['bread']) - exp = np.array([1], dtype=np.int64) + exp = np.array([1], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) @@ -1514,21 +1517,21 @@ def test_searchsorted(self): # np.array.searchsorted() res = c1.searchsorted('bread') chk = s1.searchsorted('bread') - exp = np.array([1], dtype=np.int64) + exp = np.array([1], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) # Searching for a value that is not present in the Categorical res = c1.searchsorted(['bread', 'eggs']) chk = s1.searchsorted(['bread', 'eggs']) - exp = np.array([1, 4], dtype=np.int64) + exp = np.array([1, 4], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) # Searching for a value that is not present, to the right res = c1.searchsorted(['bread', 'eggs'], side='right') chk = s1.searchsorted(['bread', 'eggs'], side='right') - exp = np.array([3, 4], dtype=np.int64) # eggs before milk + exp = np.array([3, 4], dtype=np.intp) # eggs before milk self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) @@ -1538,7 +1541,7 @@ def test_searchsorted(self): chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) # eggs after donuts, after switching milk and donuts - exp = np.array([3, 5], dtype=np.int64) + exp = np.array([3, 5], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index d6d601f03d561..efcba758e3b38 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -5934,49 +5934,49 @@ def test_nargsort(self): result = _nargsort(items, kind='mergesort', ascending=True, na_position='last') exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), 
check_dtype=False) # mergesort, ascending=True, na_position='first' result = _nargsort(items, kind='mergesort', ascending=True, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' result = _nargsort(items, kind='mergesort', ascending=False, na_position='last') exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' result = _nargsort(items, kind='mergesort', ascending=False, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='last') exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='last') exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) def test_datetime_count(self): df = DataFrame({'a': [1, 2, 3] * 2, diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d65dfc3254465..075dff9cf6c38 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -436,7 +436,8 @@ def _merger(x, y): # if we DO have duplicates, then # we cannot guarantee order - sorter = np.concatenate([groupby.indices[g] for g, _ in groupby]) + sorter = com._ensure_platform_int( + np.concatenate([groupby.indices[g] for g, _ in groupby])) if len(result) != len(sorter): if check_duplicates: raise AssertionError("invalid reverse grouping") diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 2505309768997..c8d1bae78dad3 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -91,8 +91,8 @@ def test_cython_left_outer_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_numpy_array_equal(ls, exp_ls) - self.assert_numpy_array_equal(rs, exp_rs) + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def 
test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -117,8 +117,8 @@ def test_cython_right_outer_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_numpy_array_equal(ls, exp_ls) - self.assert_numpy_array_equal(rs, exp_rs) + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -141,8 +141,8 @@ def test_cython_inner_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_numpy_array_equal(ls, exp_ls) - self.assert_numpy_array_equal(rs, exp_rs) + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index bb5429b5e8836..16731620a1dcd 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -19,8 +19,9 @@ class TestCut(tm.TestCase): def test_simple(self): data = np.ones(5) result = cut(data, 4, labels=False) - desired = np.array([1, 1, 1, 1, 1], dtype=np.int64) - tm.assert_numpy_array_equal(result, desired) + desired = np.array([1, 1, 1, 1, 1]) + tm.assert_numpy_array_equal(result, desired, + check_dtype=False) def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 7077a23d5abcb..7eadbfb031222 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -505,7 +505,8 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) self.assertEqual(ordered.freq, idx.freq) ordered, indexer = idx.sort_values(return_indexer=True, @@ -513,7 +514,8 @@ def test_order(self): expected = idx[::-1] self.assert_index_equal(ordered, expected) self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0], dtype=np.int64)) + np.array([2, 1, 0]), + check_dtype=False) self.assertEqual(ordered.freq, expected.freq) self.assertEqual(ordered.freq.n, -1) @@ -550,16 +552,16 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertIsNone(ordered.freq) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertIsNone(ordered.freq) def test_getitem(self): @@ -1271,7 +1273,8 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) self.assertEqual(ordered.freq, idx.freq) ordered, indexer = idx.sort_values(return_indexer=True, @@ -1309,16 +1312,16 @@ def test_order(self): ordered, indexer 
= idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertIsNone(ordered.freq) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertIsNone(ordered.freq) def test_getitem(self): @@ -2074,14 +2077,16 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, idx[::-1]) self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0], dtype=np.int64)) + np.array([2, 1, 0]), + check_dtype=False) _check_freq(ordered, idx[::-1]) pidx = PeriodIndex(['2011', '2013', '2015', '2012', @@ -2103,16 +2108,17 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, + check_dtype=False) _check_freq(ordered, idx) pidx = PeriodIndex(['2011', '2013', 'NaT', '2011'], name='pidx', @@ -2148,7 +2154,8 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) self.assertEqual(ordered.freq, idx.freq) self.assertEqual(ordered.freq, freq) @@ -2157,7 +2164,8 @@ def test_order(self): expected = idx[::-1] self.assert_index_equal(ordered, expected) self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0], dtype=np.int64)) + np.array([2, 1, 0]), + check_dtype=False) self.assertEqual(ordered.freq, expected.freq) self.assertEqual(ordered.freq, freq) @@ -2191,16 +2199,16 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertEqual(ordered.freq, 'D') ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertEqual(ordered.freq, 'D') def test_getitem(self): diff --git 
a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 1586d0385732f..c3bd62849bf82 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1547,12 +1547,14 @@ def test_sort_values(self): ordered, dexer = idx.sort_values(return_indexer=True) self.assertTrue(ordered.is_monotonic) self.assert_numpy_array_equal(dexer, - np.array([1, 2, 0], dtype=np.int64)) + np.array([1, 2, 0]), + check_dtype=False) ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) self.assertTrue(ordered[::-1].is_monotonic) self.assert_numpy_array_equal(dexer, - np.array([0, 2, 1], dtype=np.int64)) + np.array([0, 2, 1]), + check_dtype=False) def test_insert(self): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 61c0f9c5a093b..0db4282808a26 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3754,11 +3754,11 @@ except: def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): cdef: - ndarray[int64_t] utc_dates, tt, result, trans, deltas, posn + ndarray[int64_t] utc_dates, tt, result, trans, deltas Py_ssize_t i, j, pos, n = len(vals) - int64_t v, offset + ndarray[Py_ssize_t] posn + int64_t v, offset, delta pandas_datetimestruct dts - Py_ssize_t trans_len if not have_pytz: import pytz @@ -3790,7 +3790,6 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if not len(tt): return vals - trans_len = len(trans) posn = trans.searchsorted(tt, side='right') j = 0 for i in range(n): @@ -3826,18 +3825,19 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): # Convert UTC to other timezone trans, deltas, typ = _get_dst_info(tz2) - trans_len = len(trans) - - # if all NaT, return all NaT - if (utc_dates==NPY_NAT).all(): - return utc_dates # use first non-NaT element # if all-NaT, return all-NaT if (result==NPY_NAT).all(): return result - posn = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT], side='right') + # if all NaT, return all NaT + tt = utc_dates[utc_dates!=NPY_NAT] + if not len(tt): + return utc_dates + + posn = trans.searchsorted(tt, side='right') + j = 0 for i in range(n): v = utc_dates[i] From 713eaa6837127f619619bca8a5a32ed02b145754 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 10 Jul 2016 17:01:51 -0400 Subject: [PATCH 079/359] BUG: DatetimeIndex - Period shows ununderstandable error closes #13078 Author: sinhrks Closes #13581 from sinhrks/dti_period_error and squashes the following commits: c957541 [sinhrks] BUG: DatetimeIndex - Period shows ununderstandable error --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/tseries/tests/test_base.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 657de7ec26efc..6a1d450cf083f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -473,7 +473,7 @@ Bug Fixes - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - +- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`) - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) 
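A minimal standalone sketch of the behaviour that :issue:`13078` targets, assuming pandas 0.19.0 or later; the tests added in the hunk below assert the same ``TypeError`` for both operand orders:

    import pandas as pd

    p = pd.Period('2011-01-01', freq='D')
    idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'])

    # Subtraction between a DatetimeIndex and a Period is not a supported
    # operation; the fix makes both operand orders raise a clear TypeError
    # instead of an opaque ValueError or AttributeError.
    for lhs, rhs in [(idx, p), (p, idx)]:
        try:
            lhs - rhs
        except TypeError as err:
            print('TypeError:', err)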
diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 7eadbfb031222..360944e355b4d 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -443,6 +443,20 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'D']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) + + with tm.assertRaises(TypeError): + idx - p + + with tm.assertRaises(TypeError): + p - idx + def test_value_counts_unique(self): # GH 7735 for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: @@ -1159,6 +1173,20 @@ def test_dti_tdi_numeric_ops(self): expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) tm.assert_index_equal(result, expected) + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'H']: + idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + + with tm.assertRaises(TypeError): + idx - p + + with tm.assertRaises(TypeError): + p - idx + def test_addition_ops(self): # with datetimes/timedelta and tdi/dti From 675a6e35cc78063f68a14338ae69c099588e23d1 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 10 Jul 2016 17:06:14 -0400 Subject: [PATCH 080/359] ENH: add downcast to pd.to_numeric Title is self-explanatory. Closes #13352. Author: gfyoung Closes #13425 from gfyoung/to-numeric-enhance and squashes the following commits: 4758dcc [gfyoung] ENH: add 'downcast' to pd.to_numeric --- asv_bench/benchmarks/inference.py | 21 +++++- doc/source/basics.rst | 102 ++++++++++++++++++++++------- doc/source/whatsnew/v0.19.0.txt | 7 ++ pandas/tools/tests/test_util.py | 77 ++++++++++++++++++++++ pandas/tools/util.py | 103 ++++++++++++++++++++++++++---- 5 files changed, 273 insertions(+), 37 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 3fceed087facb..6809c351beade 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -135,4 +135,23 @@ def setup(self): self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) def time_dtype_infer_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) \ No newline at end of file + (self.df_uint32['A'] + self.df_uint32['B']) + + +class to_numeric(object): + N = 500000 + + param_names = ['data', 'downcast'] + params = [ + [(['1'] * N / 2) + ([2] * N / 2), + (['-1'] * N / 2) + ([2] * N / 2), + np.repeat(np.array('1970-01-01', '1970-01-02', + dtype='datetime64[D]'), N), + (['1.1'] * N / 2) + ([2] * N / 2), + ([1] * N / 2) + ([2] * N / 2), + np.repeat(np.int32(1), N)], + [None, 'integer', 'signed', 'unsigned', 'float'], + ] + + def time_to_numeric(self, data, downcast): + pd.to_numeric(data, downcast=downcast) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 8145e9536a82a..63a7c8fded2db 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1754,39 +1754,93 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype` object conversion ~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types. -To force conversion of specific types that are *number like*, e.g. could be a string that represents a number, -pass ``convert_numeric=True``. 
This will force strings and numbers alike to be numbers if possible, otherwise -they will be set to ``np.nan``. +pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. +The following functions are available for one dimensional object arrays or scalars: + +- :meth:`~pandas.to_numeric` (conversion to numeric dtypes) + + .. ipython:: python + + m = ['1.1', 2, 3] + pd.to_numeric(m) + +- :meth:`~pandas.to_datetime` (conversion to datetime objects) + + .. ipython:: python + + import datetime + m = ['2016-07-09', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m) + +- :meth:`~pandas.to_timedelta` (conversion to timedelta objects) + + .. ipython:: python + + m = ['5us', pd.Timedelta('1day')] + pd.to_timedelta(m) + +To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements +that cannot be converted to desired dtype or object. By default, ``errors='raise'``, meaning that any errors encountered +will be raised during the conversion process. However, if ``errors='coerce'``, these errors will be ignored and pandas +will convert problematic elements to ``pd.NaT`` (for datetime and timedelta) or ``np.nan`` (for numeric). This might be +useful if you are reading in data which is mostly of the desired dtype (e.g. numeric, datetime), but occasionally has +non-conforming elements intermixed that you want to represent as missing: .. ipython:: python - :okwarning: - df3['D'] = '1.' - df3['E'] = '1' - df3.convert_objects(convert_numeric=True).dtypes + import datetime + m = ['apple', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors='coerce') - # same, but specific dtype conversion - df3['D'] = df3['D'].astype('float16') - df3['E'] = df3['E'].astype('int32') - df3.dtypes + m = ['apple', 2, 3] + pd.to_numeric(m, errors='coerce') + + m = ['apple', pd.Timedelta('1day')] + pd.to_timedelta(m, errors='coerce') -To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``. -This will convert any datetime-like object to dates, forcing other values to ``NaT``. -This might be useful if you are reading in data which is mostly dates, -but occasionally has non-dates intermixed and you want to represent as missing. +The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it +encounters any errors with the conversion to a desired data type: .. ipython:: python - import datetime - s = pd.Series([datetime.datetime(2001,1,1,0,0), - 'foo', 1.0, 1, pd.Timestamp('20010104'), - '20010105'], dtype='O') - s - pd.to_datetime(s, errors='coerce') + import datetime + m = ['apple', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors='ignore') + + m = ['apple', 2, 3] + pd.to_numeric(m, errors='ignore') + + m = ['apple', pd.Timedelta('1day')] + pd.to_timedelta(m, errors='ignore') + +In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the +option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: + +.. ipython:: python + + m = ['1', 2, 3] + pd.to_numeric(m, downcast='integer') # smallest signed int dtype + pd.to_numeric(m, downcast='signed') # same as 'integer' + pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype + pd.to_numeric(m, downcast='float') # smallest float dtype + +As these methods apply only to one-dimensional arrays, lists or scalars; they cannot be used directly on multi-dimensional objects such +as DataFrames. 
However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column efficiently: -In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all -the objects in a Series are of the same type, the Series will have that dtype. +.. ipython:: python + + import datetime + df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + df + df.apply(pd.to_datetime) + + df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O') + df + df.apply(pd.to_numeric) + + df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O') + df + df.apply(pd.to_timedelta) gotchas ~~~~~~~ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 6a1d450cf083f..40ae38f12fccb 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -186,6 +186,13 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) +- ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) + + .. ipython:: python + + s = ['1', 2, 3] + pd.to_numeric(s, downcast='unsigned') + pd.to_numeric(s, downcast='integer') - ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index c592b33bdab9a..5b738086a1ad4 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -291,6 +291,83 @@ def test_non_hashable(self): with self.assertRaisesRegexp(TypeError, "Invalid object type"): pd.to_numeric(s) + def test_downcast(self): + # see gh-13352 + mixed_data = ['1', 2, 3] + int_data = [1, 2, 3] + date_data = np.array(['1970-01-02', '1970-01-03', + '1970-01-04'], dtype='datetime64[D]') + + invalid_downcast = 'unsigned-integer' + msg = 'invalid downcasting method provided' + + smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) + smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) + + # support below np.float32 is rare and far between + float_32_char = np.dtype(np.float32).char + smallest_float_dtype = float_32_char + + for data in (mixed_data, int_data, date_data): + with self.assertRaisesRegexp(ValueError, msg): + pd.to_numeric(data, downcast=invalid_downcast) + + expected = np.array([1, 2, 3], dtype=np.int64) + + res = pd.to_numeric(data) + tm.assert_numpy_array_equal(res, expected) + + res = pd.to_numeric(data, downcast=None) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_int_dtype) + + for signed_downcast in ('integer', 'signed'): + res = pd.to_numeric(data, downcast=signed_downcast) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_float_dtype) + res = pd.to_numeric(data, downcast='float') + tm.assert_numpy_array_equal(res, expected) + + # if we can't successfully cast the given + # data to a 
numeric dtype, do not bother + # with the downcast parameter + data = ['foo', 2, 3] + expected = np.array(data, dtype=object) + res = pd.to_numeric(data, errors='ignore', + downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an unsigned integer because + # we have a negative number + data = ['-1', 2, 3] + expected = np.array([-1, 2, 3], dtype=np.int64) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an integer (signed or unsigned) + # because we have a float number + data = ['1.1', 2, 3] + expected = np.array([1.1, 2, 3], dtype=np.float64) + + for downcast in ('integer', 'signed', 'unsigned'): + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) + + # the smallest integer dtype need not be np.(u)int8 + data = ['256', 257, 258] + + for downcast, expected_dtype in zip( + ['integer', 'signed', 'unsigned'], + [np.int16, np.int16, np.uint16]): + expected = np.array([256, 257, 258], dtype=expected_dtype) + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 61d2c0adce2fe..d70904e1bf286 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -50,7 +50,7 @@ def compose(*funcs): return reduce(_compose2, funcs) -def to_numeric(arg, errors='raise'): +def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. @@ -61,6 +61,27 @@ def to_numeric(arg, errors='raise'): - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input + downcast : {'integer', 'signed', 'unsigned', 'float'} , default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + .. 
versionadded:: 0.19.0 Returns ------- @@ -74,10 +95,37 @@ def to_numeric(arg, errors='raise'): >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast='float') + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast='signed') + 0 1 + 1 2 + 2 -3 + dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object >>> pd.to_numeric(s, errors='coerce') + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 """ + if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): + raise ValueError('invalid downcasting method provided') + is_series = False is_index = False is_scalar = False @@ -102,20 +150,51 @@ def to_numeric(arg, errors='raise'): else: values = arg - if com.is_numeric_dtype(values): - pass - elif com.is_datetime_or_timedelta_dtype(values): - values = values.astype(np.int64) - else: - values = com._ensure_object(values) - coerce_numeric = False if errors in ('ignore', 'raise') else True + try: + if com.is_numeric_dtype(values): + pass + elif com.is_datetime_or_timedelta_dtype(values): + values = values.astype(np.int64) + else: + values = com._ensure_object(values) + coerce_numeric = False if errors in ('ignore', 'raise') else True - try: values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) - except: - if errors == 'raise': - raise + + except Exception: + if errors == 'raise': + raise + + # attempt downcast only if the data has been successfully converted + # to a numerical dtype and if a downcast method has been specified + if downcast is not None and com.is_numeric_dtype(values): + typecodes = None + + if downcast in ('integer', 'signed'): + typecodes = np.typecodes['Integer'] + elif downcast == 'unsigned' and np.min(values) > 0: + typecodes = np.typecodes['UnsignedInteger'] + elif downcast == 'float': + typecodes = np.typecodes['Float'] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported + float_32_char = np.dtype(np.float32).char + float_32_ind = typecodes.index(float_32_char) + typecodes = typecodes[float_32_ind:] + + if typecodes is not None: + # from smallest to largest + for dtype in typecodes: + if np.dtype(dtype).itemsize < values.dtype.itemsize: + values = com._possibly_downcast_to_dtype( + values, dtype) + + # successful conversion + if values.dtype == dtype: + break if is_series: return pd.Series(values, index=arg.index, name=arg.name) From 1edc1df161f3274218fcd19c23663ea63386f105 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 10 Jul 2016 17:16:00 -0400 Subject: [PATCH 081/359] CLN: remove radd workaround in ops.py Remove workaround added in #353. 
Author: sinhrks Closes #13606 from sinhrks/ops_radd_cln and squashes the following commits: d873aad [sinhrks] CLN: remove radd workaround --- pandas/core/ops.py | 36 +++---------- pandas/sparse/series.py | 3 +- pandas/tests/series/test_operators.py | 75 +++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 34 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index f27a83f50e115..34ab3ae6863b5 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -34,7 +34,7 @@ # methods -def _create_methods(arith_method, radd_func, comp_method, bool_method, +def _create_methods(arith_method, comp_method, bool_method, use_numexpr, special=False, default_axis='columns'): # creates actual methods based upon arithmetic, comp and bool method # constructors. @@ -55,14 +55,14 @@ def names(x): return "__%s__" % x else: names = lambda x: x - radd_func = radd_func or operator.add + # Inframe, all special methods have default_axis=None, flex methods have # default_axis set to the default (columns) # yapf: disable new_methods = dict( add=arith_method(operator.add, names('add'), op('+'), default_axis=default_axis), - radd=arith_method(radd_func, names('radd'), op('+'), + radd=arith_method(lambda x, y: y + x, names('radd'), op('+'), default_axis=default_axis), sub=arith_method(operator.sub, names('sub'), op('-'), default_axis=default_axis), @@ -149,7 +149,7 @@ def add_methods(cls, new_methods, force, select, exclude): # ---------------------------------------------------------------------- # Arithmetic -def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, +def add_special_arithmetic_methods(cls, arith_method=None, comp_method=None, bool_method=None, use_numexpr=True, force=False, select=None, exclude=None): @@ -162,8 +162,6 @@ def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, arith_method : function (optional) factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) - radd_func : function (optional) - Possible replacement for ``operator.add`` for compatibility comp_method : function, optional, factory for rich comparison - signature: f(op, name, str_rep) use_numexpr : bool, default True @@ -176,12 +174,11 @@ def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, exclude : iterable of strings (optional) if passed, will not set functions with names in exclude """ - radd_func = radd_func or operator.add # in frame, special methods have default_axis = None, comp methods use # 'columns' - new_methods = _create_methods(arith_method, radd_func, comp_method, + new_methods = _create_methods(arith_method, comp_method, bool_method, use_numexpr, default_axis=None, special=True) @@ -218,7 +215,7 @@ def f(self, other): exclude=exclude) -def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, +def add_flex_arithmetic_methods(cls, flex_arith_method, flex_comp_method=None, flex_bool_method=None, use_numexpr=True, force=False, select=None, exclude=None): @@ -231,9 +228,6 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, flex_arith_method : function factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) - radd_func : function (optional) - Possible replacement for ``lambda x, y: operator.add(y, x)`` for - compatibility flex_comp_method : function, optional, factory for rich comparison - signature: f(op, name, str_rep) use_numexpr : bool, default True @@ -246,9 
+240,8 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, exclude : iterable of strings (optional) if passed, will not set functions with names in exclude """ - radd_func = radd_func or (lambda x, y: operator.add(y, x)) # in frame, default axis is 'columns', doesn't matter for series and panel - new_methods = _create_methods(flex_arith_method, radd_func, + new_methods = _create_methods(flex_arith_method, flex_comp_method, flex_bool_method, use_numexpr, default_axis='columns', special=False) @@ -858,17 +851,6 @@ def wrapper(self, other): return wrapper -def _radd_compat(left, right): - radd = lambda x, y: y + x - # GH #353, NumPy 1.5.1 workaround - try: - output = radd(left, right) - except TypeError: - raise - - return output - - _op_descriptions = {'add': {'op': '+', 'desc': 'Addition', 'reversed': False, @@ -963,11 +945,9 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): series_flex_funcs = dict(flex_arith_method=_flex_method_SERIES, - radd_func=_radd_compat, flex_comp_method=_comp_method_SERIES) series_special_funcs = dict(arith_method=_arith_method_SERIES, - radd_func=_radd_compat, comp_method=_comp_method_SERIES, bool_method=_bool_method_SERIES) @@ -1209,11 +1189,9 @@ def f(self, other): frame_flex_funcs = dict(flex_arith_method=_arith_method_FRAME, - radd_func=_radd_compat, flex_comp_method=_flex_comp_method_FRAME) frame_special_funcs = dict(arith_method=_arith_method_FRAME, - radd_func=_radd_compat, comp_method=_comp_method_FRAME, bool_method=_arith_method_FRAME) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 519068b97a010..5c7762c56ec6d 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -7,7 +7,6 @@ import numpy as np import warnings -import operator from pandas.compat.numpy import function as nv from pandas.core.common import isnull, _values_from_object, _maybe_match_name @@ -803,7 +802,7 @@ def from_coo(cls, A, dense_index=False): # overwrite basic arithmetic to use SparseSeries version # force methods to overwrite previous definitions. 
ops.add_special_arithmetic_methods(SparseSeries, _arith_method, - radd_func=operator.add, comp_method=None, + comp_method=None, bool_method=None, use_numexpr=False, force=True) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 1e23c87fdb4ca..6ab382beb7973 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1259,8 +1259,6 @@ def _check_op(arr, op): _check_op(arr, operator.floordiv) def test_series_frame_radd_bug(self): - import operator - # GH 353 vals = Series(tm.rands_array(5, 10)) result = 'foo_' + vals @@ -1273,7 +1271,78 @@ def test_series_frame_radd_bug(self): tm.assert_frame_equal(result, expected) # really raise this time - self.assertRaises(TypeError, operator.add, datetime.now(), self.ts) + with tm.assertRaises(TypeError): + datetime.now() + self.ts + + with tm.assertRaises(TypeError): + self.ts + datetime.now() + + def test_series_radd_more(self): + data = [[1, 2, 3], + [1.1, 2.2, 3.3], + [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), + pd.NaT], + ['x', 'y', 1]] + + for d in data: + for dtype in [None, object]: + s = Series(d, dtype=dtype) + with tm.assertRaises(TypeError): + 'foo_' + s + + for dtype in [None, object]: + res = 1 + pd.Series([1, 2, 3], dtype=dtype) + exp = pd.Series([2, 3, 4], dtype=dtype) + tm.assert_series_equal(res, exp) + res = pd.Series([1, 2, 3], dtype=dtype) + 1 + tm.assert_series_equal(res, exp) + + res = np.nan + pd.Series([1, 2, 3], dtype=dtype) + exp = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) + tm.assert_series_equal(res, exp) + res = pd.Series([1, 2, 3], dtype=dtype) + np.nan + tm.assert_series_equal(res, exp) + + s = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), + pd.Timedelta('3 days')], dtype=dtype) + exp = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), + pd.Timedelta('6 days')]) + tm.assert_series_equal(pd.Timedelta('3 days') + s, exp) + tm.assert_series_equal(s + pd.Timedelta('3 days'), exp) + + s = pd.Series(['x', np.nan, 'x']) + tm.assert_series_equal('a' + s, pd.Series(['ax', np.nan, 'ax'])) + tm.assert_series_equal(s + 'a', pd.Series(['xa', np.nan, 'xa'])) + + def test_frame_radd_more(self): + data = [[1, 2, 3], + [1.1, 2.2, 3.3], + [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), + pd.NaT], + ['x', 'y', 1]] + + for d in data: + for dtype in [None, object]: + s = DataFrame(d, dtype=dtype) + with tm.assertRaises(TypeError): + 'foo_' + s + + for dtype in [None, object]: + res = 1 + pd.DataFrame([1, 2, 3], dtype=dtype) + exp = pd.DataFrame([2, 3, 4], dtype=dtype) + tm.assert_frame_equal(res, exp) + res = pd.DataFrame([1, 2, 3], dtype=dtype) + 1 + tm.assert_frame_equal(res, exp) + + res = np.nan + pd.DataFrame([1, 2, 3], dtype=dtype) + exp = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype) + tm.assert_frame_equal(res, exp) + res = pd.DataFrame([1, 2, 3], dtype=dtype) + np.nan + tm.assert_frame_equal(res, exp) + + df = pd.DataFrame(['x', np.nan, 'x']) + tm.assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax'])) + tm.assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa'])) def test_operators_frame(self): # rpow does not work with DataFrame From 2a96ab7bd9614be79f349975908b42c676a244ab Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 10 Jul 2016 17:21:11 -0400 Subject: [PATCH 082/359] DEPR: rename Timestamp.offset to .freq closes #12160 Author: sinhrks Closes #13593 from sinhrks/depr_timestamp_offset and squashes the following commits: c7749d5 [sinhrks] DEPR: rename Timestamp.offset to 
.freq --- doc/source/whatsnew/v0.19.0.txt | 32 +++--- pandas/io/packers.py | 11 +- .../0.18.1_x86_64_darwin_2.7.12.msgpack | Bin 0 -> 119258 bytes .../0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack | Bin 0 -> 119206 bytes .../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin 127220 -> 127584 bytes .../0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle | Bin 0 -> 125826 bytes .../io/tests/generate_legacy_storage_files.py | 12 ++- pandas/io/tests/test_packers.py | 28 +++-- pandas/io/tests/test_pickle.py | 6 ++ pandas/lib.pxd | 1 + pandas/src/inference.pyx | 5 +- pandas/src/period.pyx | 7 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/series/test_constructors.py | 4 +- pandas/tests/test_multilevel.py | 2 +- pandas/tseries/index.py | 7 +- pandas/tseries/tests/test_base.py | 35 ++++--- pandas/tseries/tests/test_timeseries.py | 60 +++++------ pandas/tseries/tests/test_tslib.py | 19 +++- pandas/tslib.pyx | 98 +++++++++--------- 20 files changed, 187 insertions(+), 142 deletions(-) create mode 100644 pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack create mode 100644 pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack create mode 100644 pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 40ae38f12fccb..a6c3c0c5d7f79 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -194,7 +194,7 @@ Other enhancements pd.to_numeric(s, downcast='unsigned') pd.to_numeric(s, downcast='integer') -- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) +- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here ` (:issue:`10008`, :issue:`13156`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) .. ipython:: python @@ -202,7 +202,7 @@ Other enhancements idx = pd.Index(["a1a2", "b1", "c1"]) idx.str.extractall("[ab](?P\d)") -- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) +- ``Timestamp`` can now accept positional and keyword parameters similar to :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) .. ipython:: python @@ -227,8 +227,7 @@ Other enhancements - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) -- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) +- A top-level function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) .. 
_whatsnew_0190.api: @@ -238,9 +237,16 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) +- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) - Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) - ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) +- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) +- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) +- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) +- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) + .. _whatsnew_0190.api.tolist: @@ -361,7 +367,7 @@ We are able to preserve the join keys pd.merge(df1, df2, how='outer').dtypes Of course if you have missing values that are introduced, then the -resulting dtype will be upcast (unchanged from previous). +resulting dtype will be upcast, which is unchanged from previous. .. ipython:: python @@ -419,17 +425,6 @@ Furthermore: - Passing duplicated ``percentiles`` will now raise a ``ValueError``. - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) -.. _whatsnew_0190.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) -- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) -- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) -- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) -- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - .. _whatsnew_0190.deprecations: Deprecations @@ -439,6 +434,7 @@ Deprecations - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) - top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) +- ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) .. 
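A minimal sketch of the renamed spelling described in the deprecation note above, assuming the post-rename API of this change (``freq='D'`` is only an illustrative frequency string):

    import pandas as pd

    ts = pd.Timestamp('2016-01-01', freq='D')  # 'freq' replaces the deprecated 'offset' keyword
    ts.freq                                    # new property; accessing ts.offset now warns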
_whatsnew_0190.performance: @@ -503,7 +499,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a tempfile.TemporaryFile on Windows with Python 3 (:issue:`13398`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) - Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) @@ -516,7 +512,7 @@ Bug Fixes - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) -- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, `int16`` dtypes (:issue:`13451`) +- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 23aa133125213..ff06a5f212f8b 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -481,12 +481,12 @@ def encode(obj): tz = obj.tzinfo if tz is not None: tz = u(tz.zone) - offset = obj.offset - if offset is not None: - offset = u(offset.freqstr) + freq = obj.freq + if freq is not None: + freq = u(freq.freqstr) return {u'typ': u'timestamp', u'value': obj.value, - u'offset': offset, + u'freq': freq, u'tz': tz} if isinstance(obj, NaTType): return {u'typ': u'nat'} @@ -556,7 +556,8 @@ def decode(obj): if typ is None: return obj elif typ == u'timestamp': - return Timestamp(obj[u'value'], tz=obj[u'tz'], offset=obj[u'offset']) + freq = obj[u'freq'] if 'freq' in obj else obj[u'offset'] + return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq) elif typ == u'nat': return NaT elif typ == u'period': diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack b/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack new file mode 100644 index 0000000000000000000000000000000000000000..978c2c5045314bbac06fd0965346f15cfb9db120 GIT binary patch literal 119258 zcmeEqWq965x}{3Y%<#owW@ct)2FH*%i6OSb%uEh*97C9ynVFfHnQ86V>7LuSd+y9! 
z*xhGWKXgj-C9YqLx)i zdJ`QI8dNp;!-T--9uYyylfO>xD^#F)SX9gS;K-oPu@&NyUj6o__0MsWhxI>D2ZEse z4=>~AWC+rFhBr*cKn9IZ#x=91y-r3rSI)BMld$fNUE`8*UY$nQlkvr^$2l$}VX3Ho zy^?VZsvf=|$A|fc2JMUs35y<@@|Yr5(qf_u75IG>zfYeeLDB_M0D@ByNkudju~fuU zkw`@{6{%EA(ahqi$+P`bOY+iFG?^F?6d4j8IP4!03r&>+OGP{tiBu$0kxE566`53I zQ;|!>6tR}n`6r{PM&75R@1m92^xCy)!W?X@a1r;mPVBma^(pE5t`dheu!uN#jLELrIOG z2#~6(dj4VUQamVmiY1ZB%~TgjF=bM3>bSp~FXi9F`ah@pTMMVyHA|{Bez)tdj$d~E z)t(<3@=MT^VG_gJb_fcH{(Y*SBTkM1NDM$@U?c`kx?`!b*X+_MI^;JqF9}U?ie+!g(k{4p}D3^D?^V5CF76p zZPzAat??h=q}1a#JMHMY=jHj)QK{1Z-hAw~ax zRT@9``}^<3GAZwqiqcPJySP$v0sghXwuv(fqRp|8k>A|A)HKpG4>X zW;gm3F2CIBSNjiclq%&Ps^34IBrp8NvHs!cd@}FTcvXU+V)@42NX8L+(&f^U@M41v zy^}HX{pv?FDMj4hw(~#pUaHLhzUusAh4sR3JL!K!*2(cnY$|W^Eno62fAXzB@~vQs zXiIAU+?4%u33<{lnw2ufA4%4KYk6Fnw72;gQ%O6Vs9$mQD?R_6Ey%y%6TkUf3UT?9 zOgQ0h6ZUU=nqPaHKgSfEtw^f(KTmTSH2YhpSd&DZem~^N;a9$5gO>k-q>$x#a(agzE3m$pgPv#9xd1MfCo>(){T`(SMw;llL3Hyy_PN{+EmX z0rQs&{Z>_f37u*gU@PioaOGFVf|v_Gd`NY^eyP zqMC|+shBAhbNqW&@ef;h;?Lah7u&D@*37UdB`y4o0snuw{o75}pRvndu>_@Fe{83O z{BNx7|G{#5@wamO>%clSbpP%?c1CiE{nN7hpFR;uG0DGFwcka?Xr6>6zsV7BGi~mVNAK)K7DEU+_ zf4*S}0pWo`|Nf*SZ=t^_$jK5PtmT&!Js`I>9h=&A7gd|9Y z6i9_MNQVr_ge=I09LR+{$Ool?{GhZ@Iw(Dq0m=wvf-*x{psY|fC_9t`$_eFyazlBb zyih(UKU4rJ2o-_~Lq(vXP%)@DR01jqm4ZqqCG@gTIjB5T0jda9f+|B*psG+cs5(>w zstMJCYD0CPx==l+KGXne2sMHlLrtKjP&24G)Br-)DP+p4S)thgP_6C z5NIeg3>prNfJQ>2pwZA6Xe=}i8V^l?VxWo8Bxo`;1)2&?gQi0>pqbDtXf`wlnhVW? zVxjp^925^FK#9--Xd$!+S`00LmO{&*<0p!?7R=ppn7dJH{*o+Ww4>Ev^AQQ+8vVg208^{iFfSe!~$PMy;YXbsu`f6x{LfItugfsu{1Hm9L7z_bJ!7wl!i~u9SC@>m~0b{{9 zFdj?*F<>H?1SW$iU@Djfrh^$^CYS|ggE?R>m|nw3&29K2rLFmz*4Xb zEC(yVO0Wv725Z1tunw#T8^A`e32X*iz*evgYzI5QPOuB?27AC>un+792f#sa2pk4S zz)^4v90w=BNpK3B24}!oa1NXY7r;eu30wwOz*TS!Tn9J6O>hg`26w<+a1Y!E55Pn4 z2s{Q)z*F!HJO?kpOYjQ325-Py@D98OAHYZO348`$z*q1Md<6cX)4}QC3~)v`6Py{& z0%wJ@!P(&)a85WEoEy#q=Y{jZ`QZX^LAVfH7%l=Ag^R((;Sz93xD;F(E(4c^%faR0 z3UEcZ5?mRs0#}8r!PVg!a80-tTpO+f*M;lB_2C9^L%0##7;XYLg`2_6;TCX9xE0(Q zZUg(nZQ%eo5DtQa;dXF)I0WthcZ55^p>P-+4oAS9;Yc_Nj)uFyUEywUcen@K6Yd4~ zhWo&M;eK#`cmO;Q9t01DhrmPOVeoKx1UwQR1&@Zuz+>TY@OXFv90N~;C&829DezQy z8ay4I0ndbI!L#8x@LYHv91G8fh=G`hh1iILxQK`NkTi%Nk`_saq(?F!8Iep#W+V%e z70HHVM{*!Jkz7b_BoC4o$%o`e3LpiMLP%kx2vQU&h7?CiASIDfNNJ=DQWhzPlt(Hc z6_H9vWuyvH6{&_)M`|E7ky=P?qz+OSsfW}@8XygkMo43%3DOj4hBQZ7AT5zrNNc1G z;*YdN0+2u?2nj~oA?=Y6qyy3s>4bzLVMsU4J1cx*^?>9!O857t$N) zgY-rEA^nj7$UtNeG8h?x3`K?^!;ullNMsZ;8X1F(MaCiHkqJl)G7*`COh%?4Q;})N zbYuoH6Pbm~M&=-Mk$FfgG9QUU;*kU-5m|sNL>3{7ktN7dWErv?S%IuXRw1jAHON|I z9kL$TfNVrIA)Apc$W~+_m1UyOBM}USuD#A31;=L=GW`kt4`a?iJktfJg zi$XDbW@*Vk+RQLf3qX>$k7>c6=N}?1> zqYTQT9Ll2tDxwl9qYA2`8mglPYN8fuqYmn#9_mBWpnhmtG##2A&46Y^GohK$ENE6V z8=4)>f#yVWp}EmKXkIiQnjbBI7DNl7h0!8tQM4FZ94&#CL`$Ki(K2XRv>aL>xedvDl0D2HTgdRqZphwYT=yCJ}dJ;W_ zo<`50XVG)$dGrE$5xs<7Mz5e((QD{+^agqpy@lRJ@1S?ld+2@i0s0Vqgg!=}pij|f z=yUW1`VxJGzDD1mZ_#(?d-MbP5&eXIM!%q6(QoK?^al!I0E00ELop1)F#;no3ZpRw zV=)fnF#!`X36n7eQ!x$GF#|I(3$rl?b1@I|VQDZwEG?D}OOIv1GGdvq%vcsIE0zt* zj^)5|V!5!~SRO1dmJiF16~GE&g|Na{5v(Xy3@eV6z)E7Ju+mr=tSnXzE00ycDq@we z%2*YwDpn1vj@7_wVzsc^SRJe`Ru8L>HNYBTjj+a86RauL3~P?Hz*=Ihu+~@`%pYru z1z>?#5EhKJ!`fpZSO=^l)(H#6!mw~G0_%)LVo_K$)&=W|b;G)2J+PiwFRVA#2kVRV z!}?DUZx zCN>M3jm^R4V)L+AY(5r;#bXIrBDMfqh%LevV@t55*fMN6wgOv;t-@AgYp}K0I&3|* z0o#ae!Zu@Du&vlOY&*6C+llSMc4K?6z1Ti%KXw2+h#kTXV@I%~*fH!lb^<$zox)CI zXRx!_IqW=k0lSD@!Y*T1u&dZL>^gP>yNTVxZew?_yVyPKKK1~6h&{p{V^6TB*fZ=o z_5yo}y~18&Z?L!6JM2C70sDx3!aiePu&>xR>^t@YgK&VuID(@%hT}MalQ@ObID@k| zhx53Ai@1c#xPq&=hU>V2o4AGBxP!a6hx_m}xF4PtPlu<+GvFEVOn7EI3!W9vhG)lf z;5qSJcy2roo)^!D=f?}+1@S_7VY~=l6fcGs$4lTP@ltqcybN9zFNc@ME8rFJN_b_w 
z3SJejhF8aH;5G4Dcx}86UKg*2*T);+4e>^JW4sC86mNz%$6Men@m6?iybbP;x5WeS zKs*Q!#@pfT@esTN-VyJFhvH#)I39s_#v}14JR0wUcg4Hm-SHlHPrMi28}Ebn#rxs? z@d5Zid=NeuAA%3XhvCEV5%@@a6h0asgOA0>;p6cMcnm%fpM+1wr{GiZY4~(}20jy? zh0n(4;B)bLcq~32kHh2f1UwO6fG@-s;fwJl_)>fsz8qhHuf$j3tMN7XT6`V89^Zg( z#5dua@h$jPd>g(U-+}MMcj3G7J@{UHAHE+yfFHyU;fL`f_)+{AejGo6pTtk$r|~oR zS^OM+9>0KJ#4q8O@hkXM{2G28zk%PxZ{fG`JNRAv9)2HxfIq|^;g9hr_*48D{v3aS zzrj`6NK2$6(i0hoj6^0PGm(YJN@OFl6FG>S zL@pvXk%!1j}DpqBK#4C`*(h$`cicibN%%GEs%7 zN>n4N6E%pML@lB=QHQ8Y)FbK>4Ty$BBcd_UglI}MBbpN}h?YbvqBYTm@F&_50Yo4X zL05{rn%#1djDv5Z(wtRPkrtBBRa8e%Q6j#y7@AT|=4 zh|RE^&{zPdp$V5|4<-#1rBv@r-y*ydYi@ zuZY*g8{#eTj(AUeAU+bGh|k0q;w$lu_)h#FAQF%;iI6CXkvK_^BuSAp$&f6`kvu7o zA}NtFsgNqEkveIRCTWp2>5wkzkv=jF=|`p|(~;@P3}i+!6PcOJLS`kik=e-{WKJ>{ znVZZ*<|Xry`N;xgL9!58m@GmTC5w^8$r5BqvJ_dGEJKzh%aP^D3S>pH5?Pt7LRKZK zk=4l>WKFUbS(~gw)+Ota^~nZgL$VRsm~28eC7Y4W$rfZwvK85yY(x5!ZOH&KkPIS& z$#!IWGKB0vb|gEIp=1~tPDYTO$w)Gaj3&E~UCC}_cd`fBlk7$ICi{?m$$n&iasWAy z97GN#hmb?bVdQXf1UZr%MUE!NkYmYl_4tbZnN8TqNkPpd6B!e~=IbD40Shl)@;SA}ErgD4Jp@mf|R$5-5?9 zD49|ymC`7kGANU>D4TL9m+~kdm4@=8(o*TD^i&2aBbABDOl6_6QrW2NR1PX9m5a(v z<)QLY`KbI<0jeNXh$>7Kp^8$)sNz%!sw7p4DovH4%2MU1@>B(?B2|g1OjV((Qq`#H zR1K;oRg0=k)uHNA^{Dz(1F9j_h-yqVp_)?7sOD4)swLHmYE8AF{HeB702N3DQNdI@ zsy!7#b)Y&@ov2VMj0&eBsLoU*6-7l;U8t^9H>x|;gX&53qIy$(sJ>J`sy{V=8b}SI z22(?*q0}&HI5mPANsXdLQ)8&H)HrH9HGztuCQ_5A$N0hOx=LN6u2VOto764pHg$)(OWmXHQxB+z)FbLK^@Ms#J)@peFQ}K)E9y1% zhI&iAqux^=sE^bq>NE9)`bvGHzEeLahz2xFBQ#25G)@yVNmDdUGc-$cG*1h(NK3R# zE3`^$v`!neNn5l{JG4uCw2w|h`_XCXbaZ+;1D%o1L}#Y6&{^qhbapxios-T*=ce<} zdFgy~e!2i%kS;_Qri;);>0)$ox&&R4E=8B7%g|-%a&&pR0$q`=L|3M(&{gSbbalE0 zU6Zaw*QV>xb?JI^eYyeNkZwdbrkl`B>1K3ux&_^mZbi4I+tB`WTRMOaq=V>Sx*gq~ z4xu~H9qCSVC>=(J(-CxMI+Bi}qvDs zx6#|_9rR9m7rmR_L+_>c(fjEG^g;R%eV9H%AEl4c$LSOFN%|Chnm$9HrO(ml=?nBl z`VxJazCvH6uhG}(8}v>37JZw(L*J$E(f8>G^h5d){g{42Kc%11&*>NROZpZ4ntnsS zrQgx-=@0Zr`V;+`{z8AHztP|6A2h@O24)ZjWiSS32!>=RhGrOsWjKas1V&^eMrIU7 zWi&=-48~+E#%3JGWjw~mq+$G+v`jiCJ(GdS$Yf$NGg+9dOg1JvlY`00LR zJ|;g?fGNlnVhS@wn4(NErZ`iADan*#N;74cvP?OqJX3+G$W&q~GgX+XOf{xDQ-i6= z)M9Egb(p$LJ*Ga>fN97yVj43|n5IlKra9AsY00!=S~G1Jf2J)HzyvZuOfb`qY0rc( z9hiCX&c1~P+~!ORe5 zC^L*1&WvD2GNYK$%ot`YGmaV0OkiS|iOeKsGBbsl%1mRXGc%Z(%q(U$Gl!YW%wuAi z`Ai%W&m=I3%mQX1vxr&DEMb;1%b4ZN3T7p6 zxy)Q)t}@q{>&y-2CUc9q&D>$`GWVGK%md~j^N4xOJYk+P&zR@T3+5&Bih0evVcs(D znD@*F<|Ffo`OJJ_zB1pK@5~PdVgU=Y2#c~9i?akvvJ^|R49l_{%d-M2vJxw^3ahdj ztFs1cvKDKz4(qZW>toZfer#Gc9h;ubz-DAKv6Y*n@zTb-@J)?{n3 zwb?psUA7)upKZW4WE-)K*(PjLwi(-;ZNau=Td}R#HmpC}mJMJ7*&sHUZO67}L)Z>% zN466i%7(GwYy{hxjbx+PXtoR6mF>oMXM3}YljJC+^Cj%O#ZG3-Qk5<8il!cJwUvD4WZ>`ZnRJDZ)u&SmGZvFv;{ zj*Vv%*hF>#yO3SPE@qdoOW9@Ya&`r~l3m5FX4kN5*>&uCb_2VS-NbHYx3F8;ZR~b- z2fLHq#qMVJuzT5k?0)tDdyqZE9%hfQN7-ZSarOjzl0C(qX3wx^*>mi9_5yp6y~JK- zudr9yYwUIQ278me#olJ`uy@&e?0xnD`;dLaK4zb=PuXYebM^)Ml6}R#X5X-H*>~)F z_5=Ho{ltD|zp!7~Z|ryW2Mck4gE@plIgG2;a+$cyTox`XmyOHL<=}F1xwzb19xgAJkIT;$ z;0khuxWZf!t|(WGE6$bRN^+&R(p(v?ELV;z&sE?ma+SEsTotY=SB&!)RQCu|Fh3m?7&x}y`f~%gf!rW&FgJu7$_?X& zb0fHs+$e4|H-;O_jpN316Sx>|A~%Vf%uV5@a?`l!+zf6eH;bFi&Ee*9^SD@UJ{QNu za|v7`w}4y7E#ek)OSq-nGHyAyf?LV0;#PBOxV79mZauew+sJL=Hgj9Jt=u+lJGX<| z$?f8Hb9=bG+&*qUcYr&{9pVmiN4TThG442bf;-8b;!bmCxU<|j?mTyayU1PQE^}A7 ztK2p2I(LJ+$=%{^b9cDA+&%6-_kerIJ>nj7Pq?StGwwO}f_urm;$CxaxVPLp?mhQ` z`^bIbK678VuiQ88JNJWwc)-Iv!lOLK<2=EWJjK&I!?Qfc^Sr=|yu{1A!mGT->%766 zyv5tR!@Io4`}j1xAD@;_$EW8r@EQ3`d}clipOw$XXXkV9Ir&_CZaxp6m(R!N=L_%! 
z`9geQz6f8GFUA+=OYkN6QhaH?3}2Qn$Cu|T@D=$=d}Y20UzM-MSLbW+HThb6ZN3g) zm#@dy=Ns@1`9^$Wz6sxyZ^k$0TktLUR(xx|4e!slk7!Vl$#@x%EM z{78NjKbjxIkLAbld#82j@@KgC|{B(W>Ka-!u&*tawbNP9EEI*%*l`8E7nejUG_-@tF=H}RYKE&Nt~8^4|3!SCdE z@w@pw{9b+^zn?$AALI}5hxsG?QT`ZzoIk;z@wfRq{9XPYf1iKAKja_rkNGG3Q~nwMoPWW;gE!9xNNU;ziMgn~jL zp|DUyC@K^aiVG!#l0qq=v`|JUE0hz;3l)TlLM5THP(`RJR1>NTHH4Z%EupqhN2n{* z6Y2{MgoZ*Rp|Q|JXeu-lnhPz2mO?9`wa`ZJ7upH|LZA>N1Pkqi_CkozLFg!S5<-PA zAzX+MIt!6Pln^a+5xNT9gziEQp{LMG=q>aS`U?Go{=xuZpfE@nEDRBb3d4lq!U$oc zFiIFLj1k5P5@JcQlvr9UBbF7*iRHx#Vnwl%SXrzhRu!v>)x{cOO|h0(TdX7273+!h z#Rg(Sv60wVY$7%ln~BZE7Gg`WmDpNrBl?SN#Q-r-3=)IIc4B)mMC>4T6g!EbVwe~% zMu?roNHI!`7Q2XD#cpDEv4_}G>?QUV`-pwTeqw)dfH+VbBn}pbh(pC;;&5?P2y&8i?~(XCT!#%SVAOJ!X#WGBvPUzT4E$t;v`-YBvFziSyCib(j;9nBvY~^TXG~< z@+6;>M)H%=O6jEZQU)oblu61gWs$N<*`(}J4k@RUOUfTQU$4^R7t8VRgtPn)uifD4XLJ7OR6o^k?KnIr20|= zsiD+JYAiL8no7;2=28o(rPNAlEwz#SrM6Ol6etBr!BRV^y%ZvKkUC18q);hL3YQ|J z&QhclB}GeJq^?posk_ue>M8Y-dP{wzzEVG_zcfG^C=HSZOGBig(lBYbG(s9Fjgm%7 zW2CXtIBC2zL5h(kN|U6?(iCZ`G)la@;>q?OVtX|=RQS}U!S)=L|tjnXD*v$RFpDs7XtOFN{U(k^MYv`5-2 z?UVLP2c(12A?dJmL^>)Rla5O#q?6Jq>9ll4IxC%%&Px}hi_#_OvUEkdDqWMVOE;vO z(kbVs@?-IMN152T0EBk8g9M0zSclb%a2q?ghw>9zDmdMmw?-b){(kJ2aUv-Cy! zDt(i_OFtw?1~M!oGAd&-E)y~-Q!*_xGAna3FAK6LOR_90vMOt`E*r8bTe2-XvMYPC zPfjEI$!XXSbicum7mGa? z$=~H4GNb?nRuBbMFa=i#g;XeoRv3jS+QgSPKl)OqlCBITYDX0`u3M)mFqDnEP zxKctXsgzPmD`k|jN;##xQbDPxR8lG{Rg|hqHKn>zL#e6MQfe!8l)6eirM}WYX{a<( z8Y@kdrb;uVxza*uskBmBD{T~irL7X61S&yFu+mOxuY@Qal#WU#B~%Gh!j%Z6vl6L9 zDbY$7rK{3S>8|updMdq?-bx>(uhLKHuMAKIDua~4$`EC!GE5n+j8H}@qmMbN13b4Q(~3*N}Lj}Bq)i>0%f7HNLj2b zQI;ypl;z3_Wu>x8S*@&5)++0i^~wfiqq0fatZY%XD%+Ip$_{0xvP;>m>{0eA`;`64 z0p*}_NI9$=QI0Cdl;g??<)m^-Ijx*g&MN1W^U4L~qH;;OtXxs9D%X_j$_?eFa!a|b z+)?f-_mum}1LdLeNO`P0QJyN#l;_F|<)!jUd9A!r-YV~u_sR$5qw-1ltb9?vD&Lgv z$`1unfeNdLimI53tAt9bluE0N%Bq~otAZ-3k}9i;s;Zi*tA=W-mTIex>Z+dVQ`4w^ zYFag&nqJMIW>hn&nbj<6RyCWNUCp89RCB4h)jVomHJ_SaEua=u3#ox-e zCTdf)nc7@!p|(_8sjbyEs=wM+4NwEsAT?NRr?yu^)DCJ#wUZjEhNR@$O^&tI$52fPF1I=)72U3Om&tzTb-lMRp+U(>U=d$jaL)YM0J6>P+g=hR+p$t z)n)2(b%nZ8U8Sy8*QjgNb?SO`gSt`Oq;6KXs9V)->UMR9x>Mby?pF7xd)0mFe)WKQ zP(7p`R*$Gh)nn>$^@Ms-J*A#j&!}hBbLx5Zf_hQCq+V99s8`i%>UH&odQ-in-d69Z zch!69ef5F*P<^C6R-dR()o1E+^@aLUeWkuu->7fZcj|lfgZfeZq<&Vvs9)7@>UZ^r z3TZ%tHAF)-Ov5!oBQ;8+HAZ7KPUAH}6E#VbHAPc3P17|)Gc`-IHAizbPxEPMG(Rn^ zmQG8rWzaHenY7GW7A>omP0Oz3&~j?IwA@-AEw7eO%dZvC3TlP4!delns8&oXu9eVA zYNfQ&S{bdZR!%FgRnRJGm9)xQ6|JgPO{=ce&}wS6wAxx7t*%y2tFJZC8fuNS##$4t zsn$$uuC>rwYOS=^S{uz@YpVrlfm)CjthLkHYav<(t)tdS3)RB3a4kaXtVL>3TC~#B9rx@$eOo?0)hx7J7NtM$|RYXh`_+8}MPHbfh$4bz5eBeap)C~dShMjNY*)5dEP zv>0uoHc6YTP0^-m)3oW@3~i=1OPj6D(dKINv{-Gv7N^B)30k7IKwGFS(iUq=w58fI zZMn8WTdA$mR%>gtwc0vuy|zKysBO|VYg@Ff+BR*wwnN*g?b3E@d$hgUK5f5tKs%@% z(hh4!w4>TF?YMSAJE@)0PHShhv)VcBymmpms9n-7Yge?Z+BNOEc0;?V-O_GrceK0O zJ?*~sKzpb?(jIG1w5QrL?YZ_sd#SzBUTbf(x7s`Hz4k%-sD08tYhSdl+BfaH_Cteo zpu;+%qdKPJI-!#~rPDg2vpT2qx}b}?q|3UZtGcG^x}lr8rQ5orySk_Q^fbDko>ot% zr`I#+8TCwhW<86ZRnMkp*K_DO^;~*xJ&&GO&!^|t3+M&)LV97nh+b4LrWe;s=q2@1 zdTG6kURE!sm)9%k74=GbWxa}CRj;O3*K6oC^;&vuy^da2ucz178|V%7MtWntiQZIi zrZ?AH=q>eDdTYIn?ytAi1N1;WNDtQA>FxCpy@TFS@1%$7VS2b8p?B6J^(Z}B@1l3r zyXoEa9(qr`m)=|NqxaSO>HYNq`ape3bbW?CQ=g^J*5~MR^?7=%K3|X16`T}`c{3LzFps;@6>ncyY)T#UVWdwUq7H9)DP*0 z^&|RG{g{4SKcS!0PwA)iGx}NmoPJ)vpkLH4>6i5@`c?g!eqFzz-_&pExAi;vUHzVZ zUw@!K)F0`O^(XpM{h9t;f1$tBU+J&)H~L%so&H|`pnudq>7Vs4`d9s%{$2l}Lk2Kl z12IqoGjM}2NP{wHgE3fxGk8NVL_;!ULorlCGjzi+Ov5s4!!caLGkiuG!_P=-q%+bR z8H|iZCL^KhG=hDIZ!vC+h6YBV#N z8!e2MMk}MW(Z=vM+8O~ypb=yQ8|{qtMu^eD=xB5@LX9vZ+=wtb8<9qo5p8rax*FY# 
z?nV!zr_sykZS*nv8vTs^#sFiWF~}Hf3^9fp!;Im^2xFu%${1~oF~%C>jPb?yG#uj6%vCY_S>@ap3yNun&9%HYu&)9DqFb*1rjKjte zCFsg zMl+L{+00^QHM5!7%^YSzVb<24+LEk=fX6Vm39Kna#}> zW=pe`+1hMl`kQUd05i}GGK0-_W_vTl>|k~@JDH(om>F(Hn4Qf?Gs=uMyO>?gZf1A0 zhuPEYW%f4vn0?KDW`A>lInW$r4mO9FL(O64aC3w?(i~-uHpiG_&2i>bPa=4Nw?xz*feZZ~(BJI!6@ZgY>h*W73BHxHNx%|qs4^N4xW zJZ2s@PnajoQ|4*&jCs~PXP!4Nm>11U=4JDWdDXmTUN>);H_cn-ZS#(K*Su%mHy@Y} z%}3^A^NIP?d}cm3Uzjh=SLSQ;jrrDmXTCQ-m>8$it1}meL z$;xbHv9em(tn5|}E2ovq%5CMb@>=<<{8j<0pjF5!Y!$JJTE(p5Rtc-5Rmv)Dm9ffN z<*f2n1*@V}$*OEsv8r0ttm;+`tEN@Us%_P=>RR=z`c?z0q1DK0Y&Ef(TFtEHRtu}8 z)yisZwXyuIwpM@@Xa!lpRy(V`6=HR;I$E8qP%F#|w<4_0R-_eWMO$60u2wgzyVb+$ zY4x&tTYap)RzIu1HNYBZ4YCGXL#(0JFl)Fq!WwCfvPN5Dtg+TOYrHkVim@hIldQ?s z6lVmDVb2 zwYA1tYpt`^TN|v6)+TGSwZ+#%jiI%*xW zj$0?Jlh!Hgv~|WhYn`*sTNkX0)+Ot*b;Y`BU9+xRH>{i1E$g;*$GU6Xv+i3DtcTVk z>#_C3dTKqho?9=hm)0xmwe`k&YrV7HTOX{C)+g(;^~L&XeY3t>KP<=wHf$p{YGXET z6E)3VedUk!gf!)w|nc{-QEtdJJ=oVPIjmrW{2Amc4s@%j+3W2M_C|Y?z1iMkZ?(7C+wC3pPJ5TV+umdEwfEWk?F05f`;dLuK4KrWkJ-oV z6ZT2_lzrMhW1qFp+2`#G_C@=Wec8TZU$w8<*X9K@hj#==bRBayxmPyiPtRzf-^|=oE4aJ4KwLPBEvrQ^G0flyXWtWt_53Ij6i+ z!Kvs}awI=rnQ~J58LXPBW*u)52-#v~pTI zZ5)56trOq`Izdjb)6Qw{gg70Xj!q{h)CqIKod~D16X`@b(M}hqtJBTt?(}eaI=!6U zP9LYQ)6ePe3~&ZIgPg(65ND_}%o*;Ca7H?#oYBr0XRI^M8ShMRVw{Q2BxkZS#hL0% zbEZ2poSDunXSOrPnd{7RVx9R;oD=UPIEl^zXQ8vmS?nxvmO9Iv<<1IcrL)Re?W}Ru zI_sSE&IV_rv&q@)Y;m?a+nnvr4riyc%h~PharQdM;mYq_=EI&NLJo?G8-;5Kv{xsBZ>Zd13J+uUv8wsc#$t=%@R zzuVRga0A^SH`s0Gws%9^4sJ)clN;)Wx#4bv+u4nDqugk>i`&)h=5}{`xINuoZg01b z+t=;q_IC%k1KmOHV0VZ+)E(vycSpD*-BIpncZ@sM9p{dBC%7^0M0b)q*`4A}b*H)0 z-5Ksoca}Tbo#W1R=ee=&d^gUGcN5%1cY(XmUF0rym$*yaW$to!g}c&S<*s(uxNF^Y z?s|8FyV2d`Zg#i0TitE$c6W!n)7|CncK5h@-F@zU_kerQJ>(vCkGMzOWA1VHgnQCG z<(_uWxM$sS?s@lud(pk*UUsjzSKVvwb@zsQ)4k>1cJH`%-FxnR_ksJ+edIoNpSVxm zXYOJ;{?j#Zx`a(>=p8JrM%K!8LzBY&MWU#@G5$h zyvkk`uc}wgtM1kCYI?Q2+Fl*6u2;{i?=|omdX2osUK6jW*UW3~weVVct-RJ=8_(Zs z>jij$UXT~;we#A0AzlZsqu0p`^}@VxFT(5WMS4+QwAaP!>UHzFdp*3KUN5h=*T?JY z_4E3B1H6IWAaAfY#2e}j^M-pPypi50Z?reY8|#hp#(NXI7;mCC$(!s=@uqsyyy@Ny zZ>BfPo9)f<=6dtISZ}@;=f!&oUZS_aTj(wF7JEy)rQR}cxwpbw>87DXU zduP0}-Z}5Qcfq^pUGgq_SG=pAmt^dvCnA-aGHT_rd$cf1vkMNN`%18Sc zAM4|Myif3nKFKHh6rbwTe7eu@nLf*B`y8L^^L##E8lRsptuLJ~y)T0=qc4*$voDJ; zt1p``yDx_?r!SW;w=a+Hf3bECLAP#ex2D&cwvD&Df77;Y+qP}nwr$(CZQC}_KIgx8 zWJVS;vXH%Ujb}uRQEj!>(gzuWj6tR#bC4y-8e|Ky2RVYAL9QTokSE9+ zp`dV3Bq$mb3yKFNf|5b0pmb0sC>xXu$_EvKib18Ia!@6x8dM9a2Q`A4L9L*6P$#Gx z)C=kd4T6S2qo8rnBxo8m3z`Qlf|fz6pmoqDXdAQ(+6NtijzOoObI>K|8gvV~2R(wG zL9d{9&?o2{^b7h21A>9UpkQz?Bp4bD3x)?Hf|0?fV017h7#oZW#s?FEiNT~`axf*B z8cYkO2Qz}1!K`3*FejKB%nRlR3xb8gqF`~bBv=|O3zi2ff|bFlV0Ex2SR1Sh)(0Dc zjlrg1bFd}Y8f*);2RnkD!LDF;uqW6X>2ZDpaq2O?EBsdxz3yudTf|J3i;B;^% zI2)V`&IcEQi@~Mfa&RTM8e9vm2RDM7!L8tSa3{DM+zajp4}yomqu_DyBzPJ;3!Vor zf|tRo;C1jOcpJP6-UlCokHM$lbMPhj8hi`B2S0+J!LQ(V@F(~i{3HGqp+smAMuZjN zM0gQFL==%kWD!L~712a=5ktfju|#YUN5mEJM0}AzBov86Vv$576@d^!3MI5K!U`w6 z2ocFda*;x$6sbgNkw&By=|p;wL1Yw}L}rmiWEI&&c9BEm6uCrhkw@ee`9ywEKok^( zL}5`x6cxopaZy5)6s1IIQAU&%RgL39+IL}$@ObQRr1chN)i6um@m(MR+Z z{X~B;KnxUv#9%Q*3>Cw~a4|xR6r;pwF-D9P%@AoL2MM8#AdNYY!%zYcCkb3 z6uZQ3u}AC``^0{6KpYf@#9?tn92LjJadASN6sN>#aYmdK=frt&L0lA<#AR_sTou>E zb#X)76t~1}aYx)0_r!hiKs*$W#AES9JQdHxbMZpF6tBc<@kYEA@5FoYL3|XS#AoqE zd==lsckx5~6u-o8@kjg>|Hyx3C>dIYkzr*x8D2(^5oIJ9Sw@jjWi%OG#*i^(EE!wI zk#S`_8DA!l31uRgSSFE4Wgvx=Qb{e1w9-j0Lu4|UT&9pIWh$9krjco7I+cQwvlaR zJK0`#kR4?w*;#gxU1c}fUG|VYWiQ!V_K|&MKiOXnkOSo)Iam&nL*+0zT#k?<Kk8oGg;x<&L={OzR#8+`6-`A~F;q+yOT|`kR9qEL#a9VbLX}7*R!LM+6)2&kQc5eM zta8e$5S2_NS1D9Vl}e>nX;fO3PNi2FR7RCaWmZ{KR+UX`R7F)uRaRA0RaH$@S2a{kRZG=YbyQtd 
zPt{iqR72HBHC9bjQ`Jm0S1nXa)k?KiZB$#;PPJDZR7cfGbyi(eSJh2*S3Oiu)l2nO zeNz4OTE&O;%IXR5eXaS2NU1 zHA~G_bJSclPt8{g)IzmLEmlj^QngGiS1Z&?wMwm4Yt&k`POVoP)JC;QZB|>XFPwiI+)IoJf9acxwQFTlmS0~g-bxNI9XVh7BPMud5)J1hkT~=4r zRdr2WS2xs6bxYk=chp^VPu*7!)I;@1JyuWDQ}s+eS1;5{^-8@~Z`51$PQ6zj)JOG6 zeO6!8SM^POS3lHG^-KL$f7D;~kN#JO(xG)29ae|a;dKNZQAg5|brc;{N7K=D3>{O) z(y?_M9aqQG@pS^7P$$xfbrPLa2U=*UmDbv5tDW{bL?_e9bqbwQr_!l)8l6_B)9G~v zol$4fnROPORcF)Lbq<|V=hC@#9-UX`)A@A)T~HU&g>?~KR2S35bqQTkm(rzm8C_PF z)8%yqT~Sxkm30+eRaev1bq!rp*V46h9bH$~)Ae-&-B35ujdc^_fbr0QB_tL#}AKh2?)BW`TJx~wQgY^(SR1ed`^$0yu zkJ6*{7(G^x)8q97JyB26ll2rmRZr8?^$a~z&(gE?96eXh)ARKLy-+XGi}ez{R4>!Z z^$NXGuhOga8ogGp)9dvHy-{z{oAnmGRd3VV^$xvL@6x;V9=%uZ)BE)SeNZ3LhxHMC zR3FpF^$C4apVFuG8GTlt)93XCeNkW1m-Q8WRbSKB^$mSf-_p1B9er2d)A#iQ{ZK#B zkM$G%R6o{M{ZW6?pY<30Re#gp^$-11|I)wpAN^PVWBxUv zOlT9vgf-zzcoV@yG?7eX6U9U|(M)s`!^AYPOl%X!#5M6ue3QT=G>J@Nlf)!7fe}U; zWwbHI8fUx-G09AFlftAlsZ45<#-ugrOnQ^SWHgyfW|PHaHQ7vdlf&dRxlC@8$K*Bn zOny_q6f}iQVN=8uHN{MEQ^J%qrA%p4#*{VXOnFnmR5X=LWmCmeHPuXYQ^V9WwM=bO z$J90TOnuY9G&GG&W7EVmHO)+O)55egtxRjv#&^uY%-hNrm!h(Dx2D-v1x5O zo8D%y8Eqz;*=Dg>Z8n?T=CC zv1M&JTi#Z%6>TM3*;cVtZ8cln*042gEnC~xv2|@dTi-UY4Q(Ua*fz0EZ8O{4wy-U2 zE8E((v2ATT+unAt9c?Gu*>EIZrIv2*P_JKrv_3+*Di*e160?KZpJ?yx)UF1y?Av3u=4yWbwL2kjwy z*dDP*?J;}Yp0FqFDSO(Uv1jc$d){8K7wsi`*KL08BXc12uKSIiZ6C0t2Y%9VCyTv=Dnm3I|fMOVpHc2!(eSIt#-HC#JU*UU9{EnG|2%C&ZFTwB-9wRas{N7u=9c3oUo*Ufcz zJzP)M%k_4BTwmAE^>+i@KsU$@c0=4yH_Q!pBiu+g%8hno+*mixjdv5=L^sJzc2nF` zH_c6VGu%u!%guIk+*~)$&36mjLbu2*c1zq+x6CbfE8I%A%B^;5+*-HJt#=#TMz_gr zc3a$5x6N&LJKRpU%k6f1++MfO?RN*_sl(aFWgJ_%Dr}P+*|j~ zy>}nnNB7Bnc3<3A_sxBGKip6E%l&qL++X*P|JR4|p?w%1)`#=qeFPuTNAi(<6d%<` z^U-|_AJfP3v3(pL*T?hmeFC4*C-RAX5}(uuUU=!1*WP&Ro%cS(C-cdD3ZK%a@~M3q zpVp`I>3s&D(P#3ReHNe9XY<*84xiKK^0|E;pV#N}`F#Oj&=>NBeGy;O7xTq^318Be z@}+$lU)GoN<$VQT(O2@7eHCBTSM$|<4PVpO^0j>(U)R_3^?d{1&^PjpeG}i*H}lPX z3*XYW@~wRv-`2PD?R^K|(RcEleHY)=ck|tS58u=G^1Xc@-`Dr^{rvzx&=2y1{SZIY z5A(zQ2tU$~@}vD2Kh}@)_FZ0X&3cu2?@~izCzt*qw>-`45(Qopb{T9E~Z}Z#z4!_gy^1J;Wzt`{c`~3lb z&>!-L{SklEAM?lk34hX`@~8b7f7YM#=lunL(O>eH{S|-JU-Q@f4S&<$^0)mRf7jph z_x%I^&_D8z{S*JxKl9K13;)u;@~{0H|JJ|r@BIh=(SP!v{TKh$fAin{5C7Bu^1uBb z|JVN$@^46}kkBDvLc)fG3ke?*AtYi*q>#uVQ9`1IL<@-?LTBLVkQo2>!q25`J2YzC zp>d-F|1(SQfAz}I@mKIaWw1@#c4+>e-I)LVvt|D&r~BV`_Z58j&matqeg7}bsmK5M zH(S5|6x99C`)|QxDDMAfdFubyHJ(WPJ~BqZs2B~SV+{OHGxGm=|5+9KKQA`M!MGR? 
z85)v!9&z?xVKYhxX(i}kQRHo%712peM)Y>LgWIkv!-*a}-?8*Gd1uswFbj@Su1 zV;Ag-|E&V<&fNohVlV8CeXuX~!~Qq`2jUa4Js2={N&t;w+qvb8s%s!}+)X7vdsZj7xASF2m)x0$1WHT#ajREw01$xB)lf zCftl$a4T-Z?YIMX;x62cdvGuA!~J*w58@#_j7RV&9>e2!0#D*8JdJ1YES|&jcmXfs zCA^GR@G4%z>v#ii;w`+5cknLW!~6IEAL1i?j8E_>KEvnu0$<`Qe2s7LExyC|_yIrS zC;W_G@GE}9@Aw0M;xGJ-fABB7|{8{=SHjEC_t0Vc#mm>82_QVdX_M1>j+T6E|!1e0NMOo1se6{f~Cm=@Dvddz?s zF%xFSESMFuVRp=cIWZUJ#ypr8^I?80fCaG-7RDl26pLYTEP*Al6qd#^SQg7+d8~jH zu@Y9sDp(b(VRfv5HL(`f#yVIR>tTItfDN$`HpV8{6q{jlY=JGY6}HAU*cRJid+dN6 zu@iR2F4z^jVR!6-J+T+|#y;2=`(b|^fCF(54#puk6o=t(9DyTo6pqF*I2Om@c$|O} zaS~3(DL56U;dGpVGjSHq#yL0_=iz)@fD3UEF2*Ie6qn(0T!AZb6|TlLxE9ypdfb2; zaT9LFEw~l8;db1CJ8>88#yz+f_u+m#fCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ z@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO z@e_W=FZdO|;dlIjKk*m-#y|KM|GQ%FZz%E~L*xJ7v$)at-{=?vV`40fjd3t8#>4oS z025*&OpHk|DF!G|qC$-ZEjsiVg2^yBrofb#3R7bmOpEC-J!Zg+mVx%J$As3*acz=gO77vmCKipy|0uE3SJ3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6 zowy5k;~w0L`*1%Vz=L=Q591L$ipTIcp1_lM3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bW zn|KRv;~l(<_wYVGz=!wRk0dY#~N4@ zYhi7ygLSbU*2f0e5F24*Y=TX(88*ij*b-Y|YixsUu^qO@4%iVpVQ1`uU9lT>#~#=d zdtq{5Fg=Ve1cE$89v7s_!3{?YkY%m@g2U$5BL#3;b;7UU-27$#~=6; zf8lTZgMaZq%XI(e{fnV6G={;j7!Jc@1dNE0FfvBLs2B~SV+@Rmu`o8q!MGR?<6{C$ zh>0*UCc&f_pg@TVH5#<&&|?TD!{nF(Q(`JijcG6~ro;4@0W)GI%#2wuD`vy&m;-ZS zF3gR2FfZoA{8#`BVj(PyMX)Fq!{S&1OJXT3jb*Sbmc#N`0V`r9tc+E#DptelSOaTf zEv$`ourAia`q%&)Vk2yfO|U68!{*omTVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfF zFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQzX&Do(@cI0I+m zES!yVa4ycn`M3ZV;v!s(OK>SJ!{xXFSK=yMjcaf%uEX`X0XO0%+>BdrD{jN>xC3|M zF5HcKa4+t|{dfQm;vqbYNAM^f!{c}YPvR*&jc4#Ip2PEa0Wabuyo^`yDqh3ucmr?Z zExe6)@GjoN`}hDK;v;;FPw*)|!{_({U*ao#jc@QRzQgzU0YBm={ET1lD}KZ8_yd39 zFZ_*v@Gt&nm-c`1A46ei41-}Y9EQgT7!f03WQ>AQF&ak47#I^{VQh?paWNjo#{`%V z6JcUZf=Mwzff5yJG-%PG#}G`0$uR|{#8j9X(_mUmhv_i`X2eXG8M9zk%!b)92j;|F zm>ct8Ud)I2u>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG2G+z{ zSR3nLU95-ou>m&3M%WmeU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot?1tU32lm8X z*cY>oQBhJ2F}D; zI2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjO zxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H{D$B02mZug z_#6M=U;NK1{r}`YhQiPo2E$@F437~oB1Xc<7zLwZG>nchFeb*r*cb=nVmyqG2{0ih z!o-*alVX4ZB`VZt(4s?+A(#x4V+u@(sW3IB!L*nT(_;qAh?y`mX2GnO4YOko%!#=$ zH|D{-m=E(~0W64xurL7)R4Xa}ftckU- zHrBzqSP$!C18j(murW5lrq~RdV+(AFt*|w=!M4~A+hYgph@G%AcEPUL4ZC9x?1{aw zH}=84*bn>T033*ea4-(Rp*ReO;|Lsyqi{5i!Lc|F$KwQ?h?8(KPQj@-4X5J_oQbn= zHqODhI1lIJ0$hlTa4{~yrML{2;|g4ft8g{0!L_&!*W(7kM!LxV{&*KHWh?np(Ucsw)4X@)3yotB) zHr~Ozcn|O61AK^&@G(BYr}zw?;|qL=ukba#!MFGh-{S}Th@bE?e!;K!4Zq_L{E5Ht zH~zuD_)mE9A46ei41-}Y9EQgT7!f03WQ>AQF&ak47#I^{VQh?paWNjo#{`%V6JcUZ zf=Mwzff5yJG-%PG#}G`0$uR|{#8j9X(_mUmhv_i`X2eXG8M9zk%!b)92j;|Fm>ct8 zUd)I2u>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG2G+z{SR3nL zU95-ou>m&3M%WmeU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cY>oQBhJ2F}D;I2-5S zT%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjOxEuH2 zUfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H{D$B02mZug_#6M= zU;HNm`H!J6G={;j7!Jc@1dNE0FfvBLs2B~SV+@Rmu`o8q!MGR?<6{C$h>0*UCc&f_ zpg@TVH5#<&&|?TD!{nF(Q(`JijcG6~ro;4@0W)GI%#2wuD`vy&m;-ZSF3gR2FfZoA z{8#`BVj(PyMX)Fq!{S&1OJXT3jb*Sbmc#N`0V`r9tc+E#DptelSOaTfEv$`ourAia z`q%&)Vk2yfO|U68!{*omTVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz* z{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQzX&Do(@cI0I+mES!yVa4ycn z`M3ZV;v!s(OK>SJ!{xXFSK=yMjcaf%uEX`X0XO0%+>BdrD{jN>xC3|MF5HcKa4+t| z{dfQm;vqbYNAM^f!{c}YPvR*&jc4#Ip2PEa0Wabuyo^`yDqh3ucmr?ZExe6)@GjoN z`}hDK;v;;FPw*)|!{_({U*ao#jc@QRzQgzU0YBm={ET1lD}KZ8_yd39FZ_*v@Gt%o 
zk^IL{7#hQ1SPX~ZF#<-!NEjKTU{s8T(J=JeU{rVSX%t z1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCku@=_GI#?I$VSQ|X z4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2BckF>Zu^0BnKG+xgVSgNe z191=z#vwQqhv9G>fg^Dgj>a)K7RTXuoPZN?5>Cb`I2EVibew@RaTd4oS025*&OpHk|DF!G|qC$-Z zEjsiVg2^yBrofb#3R7bmOpEC-J!Zg+mVx%J$As3*acz=gO7 z7vmCKipy|0uE3SJ3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q z591L$ipTIcp1_lM3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!w< zALA2ziqG&lzQC9G3SZ+Je2ee!J$}HC_z6Gb7yOFf@H_s%pZE)Z;~)Hs|3oJLF%*Wz zFc=oYVR(#y5it@*#wZvSqhWN6fiW=_#>O}p7vo`kOn?b75hlhYm=psPC{dwCgBBfn z48dfW98+LQOogd24W`9(m>x4=M$CknF$-qJY?vK$U{1`1xiJss#eA3_3t&MkgoUvP z7R6#%97|wHEQO`943@=mSRN~2MXZFCu?kkjYFHg>U`?!rwXqJ?#d=sD8(>3hgpIKY zHpOPx99v*ZY=y0{4YtL0*d9AzN9=^1u?u#^ZrB}rU{CCYy|EAW#eUcy2jD;)goAMi z4#irsL98cg$JcXz644%bvcpfj{MZAQU@d{qWYj_=R;7z=RxA6|%#d~-kAK*iLgpctF zKE-GF9ADr|e1)&^4Zg*9_#QvtNBo4J@e6*%Z}=U5;7|O8zwr8GV;BsJ z;V?W#z=#+LBV!bdiqSAS#=w{u3u9v(jEnIwJ|@6~mta2uj}5RPHp0f(1e;q9kCAPxW*aq8TJ8X{~up@TD&e#RJVmIuLJ+LSC!rs^i`(i)rj{|TZ4#L4W1c%}< z9F8M!B#y$-I0nb!I2?}?a3W5^$v6e4;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ1efA6 zT#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD|9>T+T1drk| zJdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6KElWN1fSwF ze2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@|KdMU$$t!mp)m}G#c&uN zBVa^~gpn}{M#X3t9b;fjjD@i=4#vfJ7#|a0LQI5-F$pHc00l}^sL`NBhaN*P879XR zm=aTAYD|M^F&(DI444r!VP?#NSuq=C#~hdwb75}GgLyF@=Enk95DQ^pEP_R`7#7D8 zSQ1NNX)J?fu^g7i3Rn>Rk0dY#~N4@Yhi7ygLSbU*2f0e5F24*Y=TX(88*ij z*b-Y|YixsUu^qO@4%iVpVQ1`uU9lT>#~#=ddtq{5Fg=Ve1cE$89v7s z_!3{?YkY%m@g2U$5BL#3;b;7UU-27$#~=6;f8lTZgMaa#XyiYJ!q6B7!(uoLj}b5; zM#9J#1*2j#jE*rdCdR_p7zg8GJdBSCFd-(w#Fzw=Vt@iAD%5DuqC<}%m<*F+3QUQq zFg2#Zw3rUlV+PEKnJ_bE!K|1Kvttg-iMcR0=E1y}5A$OIEQp1$Fc!h0SPY9}2`q`F zur!vzvRDqwV+E{;m9R2a!Kzpdt78qUiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4 zur;>9w%88aV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{ za5Rp=u{aLL;{=?DlW;Ol!KpY6r{fHqiL-Dv&cV4j59i|oT!@QsF)qQSxD1!$3S5b+ za5b*MwYUz~;|AP_n{YF3!L7Irx8n}niMwz&?!mpd5BK8%Jcx(zFdo69cnpu@2|S6X z@HC#mvv>~A;|08km+&%P!K-);uj388iMQ}J-od+g5AWjxe29b0(F&@Up1eg#LVPZ^zNijfy5*2DRXwjj^5KM;2F$Jc?RG1pm zU|LLv=`jOl#7vkOvtU-thS@O(=EPi>8}ndZ%!m2002ahTSQv|7Q7neVu>_XHQdkv02a#7(#vx8PRXhTCxm?!;ZV8~5N|+=u(|03O6cco>i1Q9Opn@dTd4Q+OKB z;8{F}=kWqw#7lS?ui#a@hS%{1-o#sY8}Hy_!ytyQ+$Tc@ddubSNIy= z;9Go$@9_hE#83Dczu;H=hTriA{={GS8~@;6{3izakD)L$hQY8H4#Q&vjEIpiGDg9u z7!9Li42+4fFgC`)xEK%PV**Twi7+uH!K4_VK#2-98no!pV+bb0SI818ZU}tc`WBF4n{P*Z>=1BW#RKuqigf=GX#TVk>NoZLlr2 z!}iz#J7Op7j9suRcEj%21AAgG?2Ub}FZRR!H~D z!}YiUH{vGTj9YLkZo}=k19##s+>Lv1FYd$rcmNOLAv}yn@F*U`<9Gs3;we0hXYeeZ z!}E9nFXAP8n18?Fjyp4D8F5biY_y8Z`BYccc@F_mS=lB9&;wyZOZ}2U? 
z!}s_BKjJ6+j9>68e#7th1ApQ#{EdI`Fa8sg{Krrj8pB{%42R({0!GA07#X8rRE&nv zF$TuOSQs1QU|fuc@i74=#6*}FlVDN|P@qJG8Vy=>=rIJ7VRB4?DKQnM#x$4~(_wnd zfEh6pX2vX-6|-S>%z-&E7v{!1m>2V5ek_0mu@Dxmq=6{}%&tbsML7S_f(SQqPIeQbaYu@N@LCfF34VRLMOEwL50#x~d%+hKd` zfE}?DcE&E)6}w?~?14S87xu^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW*Wr5H zfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p&*6Ez zfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(=#y9vD-{E`w zfFJP_e#S5O6~Ezk{DD957yiaS_!s|)MgC(b42@whEQZ7I7y%<;O(V-YNh#jrS*z>-)BOJf-us$}xhS&%jV-swO&9FJPz?RqwTVoq+i|w#IcEFC< z2|HsK?26s6JNCey*b94OAMA_$us;sKfj9^U;}9H*!*Do`z>zo#N8=bAi{o%SPQZyc z2`A$eoQl(MI?lkEI16Xv9Gr{ua6T@;g}4Y8;}Tqo%Wyfaz?HZPSK}I7i|cSbZorMW z2{+>w+=|<9JMO@pxC?jV9^8xja6cZvgLnuJ;}JZH$M86wz>|0iPvaRpi|6n>UcifZ z2`}Rnyo%TGI^MvWcnfdi9lVS8@IF4khxiB|;}d*}&+s|Ez?b+6U*j8mi|_C~e!!3T z2|wc({EFZ3JO03*_zQpIAN-5|#3uhS6o$qy7#71}c#MD%F%m|`C>Rx^VRVdvF)qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr)tb~=Z z3RcBxSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1?1Y`M z3wFhB*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bG!cN8m^tg`;r{j>T~}9w*>LoP?8c z3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@9yj1d+=QEP z3vR`2xE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%`9xvcUyo8tW z3SPx)cpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR=9zWnm{DhzJ z3x36K_#J=XPyB_y@elsRf8vn;7z#sU7z~TyFg!-Uh!_bYV-$>v(J(s3z?c{dV`ChQ zi}5f%CcuQ42oqxxOo{;tl&DanL5mJOhF~&GjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9 zHq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$M zHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jxDeyw!+rf2HRpgY>yqVBX+{h*af>{ zH|&l*uqXDy-q;8GVn6JU18^V?!ofHMhvG0Cjw5g+j>6G62FKz!9FG%lB2L1|I0dKT zG@Onza3;>e**FL1;yj#>3veMW!o|1*m*O&9jw^5_uEN#02G`;`T#p-YBW}XYxCOW3 zHr$Roa3}7<-M9z$;y&Du2k;;s!ozq3kK!>rjwkRWp2E|32G8O-JdYRfB3{DFcm=QG zHN1{D@Fw2E+js}>;yt{N5AY#A!pHaopW-uojxX>fzQWh|2H)a4e2*XSBYwiq_yxb> zH~fx2@F)Jl-}ndr;y-c8e+-48F${*qa2OsVU_^|BkueHJ#b_8EV_-~-g|RUX#>IFT z9}{3gOoWLs2`0q=1xi$?(V#_#9z!r0CdU+*5>sJnOoM4L9j3<&m=QB!X3T_y7RM4;5=&ueEQ4jS9G1rlSP?5>Wvqf#u^Lv# z8dwu+VQs8~b+I1S#|GFC8)0K?f=#g*Hpdp&5?f(wY=dpF9k#~~*bzHnXY7Jqu^V>B z9@rCmVQ=h%eX$?*#{oDH2jO5GfxDhwuX54~XaT{*O z9k>&B;cnc6dvPD`#{+l}58+`vf=BTf9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o z8+a3M;cdKwckv$H#|QWjAK_zsf=}@oKF1gM5?|qKe1mWC9lpm8_z^$hXZ(U+@f&`} zANUi0;cxtdfAODq~8VmJ(s5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNqjE@O0 zAtu7am;{qzfC42d)M(J6LysYt43lFDOo^#5HKxI|m=4op2F!?=Ff(Sste6e6V-C!T zxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4EQzJCG?u}#SPsi$1+0jburgM`s#p!HV-2i{ zwXinU!Ma!v>th3Kh>fr@Ho>OY44Y#MY>BO~HMYUF*bduc2keNQurqeSuGkH`V-M_! 
zy|6d-!M@lJ`{Mu{h=Xu24#A-~42R*ZsI1b0-1e}PIa57H8sW=U%;|!dM zvv4-f!MQjO=i>rgh>LJBF2SX^442~yT#2i2HLk(6xDMCj2Hc37a5HYft+)-h;||=3 zyKpz|!M(T-_u~OPh==en9>Jq{43FapJc+09G@ik;cn;6w1-yut@G@S(t9T8s;|;ut zx9~RJ!Mk`5@8bh}h>!3wKEbE>44>l*e2K5{HNL^O_zvIW2mFYi@H2kFulNnW;}86a zzwkHy!N2%VeDWVdVQ374VKE$r#|Rh^BVlBWf>ALVM#mT!6Jud)jDvA89>&K6m=F_T zVoZWbF+hP5|5x*N9dTfwkP?vY?ha{0>F(|ZX=zZ9l5Pa4+d(6Mo9i_&LAem;8!f^BaE4@Ay4`;E%k; zpZGI>;jjFSzw-|UX^sDk7&8GAG7%Fq36nAzlQRWVG8OM&YTn7acsJAV9^T9Qn3n07 zo*9^tnV6Ybn3eZ48?!S9b21knU~cANUOvct%+CTW$U-d4A}q>cEY1=v$xXLAncavtaN2|mdMT*#-mh>N*|OSz0sb2(RVC0B7Z z*KjS@aXmM1BR6p~pWzlh%dOnT=eV6axRcNG1-{5#e2Fh}H}`Na_wg0(=c_!x*LaYx z^AHd72#@j@kMjgi@)S?=4Bz0JJj-)@i*NHhFYq0{%Zt3k%e=y?yvFOi!T0z+Kj4SF z$&dIkKjEkRjGyxhe#x)+HNWAv{EpxA2mZ)g{E0vF7yioM_&fh#kk0teh%pl|Armn% zlQ1chF*#E(B~$SZrskczi+3{(@8P|?k7=2X>6w8UnTeU1g;{w&voSk!Feh{I0p?~N z=H-LT$NVh7f-Jay;iGKHR&32SY|D0R&kpR!PVCGs?89LixF&Ji5RQ5?-N9LsSW&&N1{6FG^KIfYX>jgNCWXK*HGaW?00 zF6VJRpWu^Rz=eE@i@2CexRlHIG?#M)S8^3sa}C#W9oKUMH*ym<^BHd8v)syUe2&|> zgFE>=U*L<}#h3UpcXJQ-avxvee!j{Be2oYBIuG$MkMJmu@iC^8(-DyS&Ityv!@S%4@vN8+?!N^8^yf8nqEjlc5`2I-Cej2JTk6EYDKGYOM28Iv;wQ!*9rU~1mUyLdO#@E+dF z`h8VP1%gi`7m4X5kAV6 zY{k}W!?tY4_Uyop?8MIO!mjMb?(D&y?8V;f!@lgt{v5!89K^vK!l4|-;T*w{9L3Qb z!?7I4@qCOEIFXY$nNv8G)A%^2a|UN}7H4w~=W-tB^9er51zgCdxQL6ngiE=MPjfj} za3xo9HP>)0*Ks{Ja3eQyGoRrWKFh7##^<=5JGhh2^98=hU3`fzb2s;JFZb~k?&qsK zz}I+?uk#QO^9Yaf7?1M=Px2H`^9@>Ne2Z`MJTLGazRQce#LK+GtGveRyutVQ zK0n}xyvdLFF+bs_{EVOT3x3J3_%*-bxBQOZ^9TOOTl|SX^B4Zg-}pQKV35K1&xkP- zFd-8$F_SPUlQB6{FeOv*4yNXvyo+};4e#N-ypL&_j_H|!8JUThnT1(-KeI7Ab1)}! z@d4&$9_HnP%*XsJz=ABq!Ysm~EXLw2!ICV+(k#QWEXVS!z>2KI%B;ewtj6lB!J4ea z+N{I6tjGF%hz;0~jo6q?*p$uKoDZ`FAK{~H$yRL5Hf+mwY|jqt$WH9cF6_!~?9LwS z$zJTuKJ3eW?9Txl$Uz*;AsotK9L^CO$x$55F&xWr9M8u%ffG52lR1S`IgO8VI%jYu zXK^;?a4zR@KA+%|T)>5Vii^0IOSqKF_%xSu1y^zvS91;5avj%m12=LLH}e^8;j`Sz zZG4W~xr00TJYV38+{KspGIw(i_i`U!;eNi#1AL7K`8p5rFpuykkMTH9@FY+1G|%u2 zzR9yZ$G7-4&+`J`;k&%ZOT5f0yvl35&KrD>@ACtG$ea9#AM+D_%Fp;Yzu=erieK{^ ze#`IpJ%8Yjyv3jRGk@W){Eff!4+a^H|BM(j0TVJ26Eg{uG8vOI1yeE=?_g@)$-8(r z)9@bN%lnv?>6o4wn30*7nOT^X_cI%_GY4}r7aw45=3!nw$b8Jt0xZZvEX*P-%3>_e z5-iD5EX^`3%W^Ew3arRVtjsE`%4)368m!4$tj#*C%X+NOhuDA(*@%tVgiYCu&G|4} z@DV=BmTbk=Y{Rx}$M)>Nj_kzF?82_>#_sIFp6tcm?8Cn7$Nn6^fgHra9KxX-#^D^n zksQU*9K*33$MJlO6F8BRIGIy8mDBh*r*j5pau#QE4(D%77D_&z`2hrG#;_%T1>r~Hhc^9z2-ulO~;;kW#b-}49l$Xoo0Kl2y<%HQ}q|6q{G z_|J$j6EGnYF)@=cDU&fdQ!ph{@eZcuoxF>8GY#+Iy}XZUnU3k1ff<>JnVE%Ic|Wr; zJ9986bMXP@W*+9{gUrYLEWm;+#KJ7XqAbSZEWwg2#nLRpvMk5)tiXz_#LBF~s;tK9 ztihVB#oDaHx~#|ge25L$kd4@wP1uyp*qjfu1s~y~Y{^z^%{FYyc5KfM?8r{+%r5N8 zZtTt;?8#p2%|7hQe(cWy9LPZ&%pn}gVI0m89LZ4}%`qIyaU9RbIDr#6iIX{nQ#p-~ zb2?{mCTDRr=Ws6PaXz2mlU%@se2R;>m`k{n%lI^xa|Ks&6<2c&*K!@#a|1VW6F2i2 zZsD`s%58j(+qr`~`8;3Xi`>PR_%e5M5BG8(U*Uef$^(3j2l+Y=@i33@D39?tPw*s9 z@ifoy4Zg{e(1&g{aj?8ffw!Jh2J-t5D^?8p8bz=0gZ!5qS& z9LC`s!I2!r(Hz6E9LMo|j1xGKlQ@}EIF-}*IHz+4XL1&2a}MWn9_RB3KFI}K$fvl7 zi@AhLxr|S9IahEcS8+Aha4pwyJvVS8H*qtc;TAs2t=z`vxScz=lh5-7zQ|pCi7#_E z_i!)w@fGgpt31Hhc#yC25D)VRkMbCg^8`=w6i@RE-{6}(%X55-Z}U7a@EyL(i@e0k zyuz!z#_PPn_xL_P;D@})kN7b^;ivqJpYscT$*=e|zu~w1j^FbK{>WSWi9hof{>tC@ zJO5yi#rV&NF%vK$6EQK9Fe#HUIa4qtQ}GU_=AFEYcQXy|;k~?%X_=1cnSmLZiJ6&& zS$RLRF*|cGCv))u=4Kw|<%7(}{4BtNEX2Yr!lEq3;w-_EEXC3+!?G;L@~ps$ti;N! 
z!m6ys>a4+d(6Mo9i z_&LAem;8!f^BaE4@Ay4`;E%k;pZGI>;jjFSzw-|US&jdU7&8GAG7%Fq36nAzlQRWV zG8OM&YTn7acsJAV9^T9Qn3n07o*9^tnV6Ybn3eZ48?!S9b21knU~cANUOvct%+CTW z$U-d4A}q>cEY1=v$xXLAncavtaN2|mdM zT*#-mh>N*|OSz0sb2(RVC0B7Z*KjS@aXmM1BR6p~pWzlh%dOnT=eV6axRcNG1-{5# ze2Fh}H}`Na_wg0(=c_!x*LaYx^AHd72#@j@kMjgi@)S?=4Bz0JJj-)@i*NHhFYq0{ z%Zt3k%e=y?yvFOi!T0z+Kj4SF$&dIkKjEkRjGyxhe#x)+HNWAv{EpxA2mZ)g{E0vF z7yioM_&fh#aDVWRC1k{y37C+Hn3zeJl*yQ!DVUO}cn4GSPTs}4nTGf9Uf##FOvm)h zz>Lhq%*?{9yr0>aojI73x%dEcGY|9fLFQwA7GOaZVqq3xQ5IuymS9PiVriCPS(amY zR$xU|Vr5ogRaRql)?iK6Vr|x8UDjiLKEwuW$VP0;CTz-PY|e+-f{*Y~wqz@|W*fF; zJGN&Bc4Q}ZW*2s4H+E+a_GB;iW*_!tKlbMU4&)#X<`53$Fb?Mkj^rqg<`|CUIF9FI zoWO~k#L1k(shq~gIh`{&le0LRb2yjtIG<1ONiN_*KE*{`%q3jPWqg{;xq>UXimSPX zYq^f=xq%zGiJSQhxA0kR4km zNtukvnSv>qigz$I@8n&)n`w9t@8x|=%XCc749v((%*-sz%KMp(*_nemnTroFH}fzr zA7nn}X8{&uAr@v47G*IOX9<>MDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41` z=R<73hHS*fY{I5&#^!vOE%*o@WlOeVYqnuqwqtvCU`KXhXLey%c4K$;U{Cg9Z}wqd z_G5nz;6M)IU=HC>4&!i+;7E?*XpZ4nj^lVf#tEFrNu10noXTl@oYOgjGdYX1IfrvO zkMsEipX35A;e3U*s;n#Fx37d$^bT_zL&)RUY7LJjmC1h=+NEM|q6Ld4eZ-il=#oZ}3f? zv`okJ%)pGy#LUdXth}Gun4LM8lezc+b2AU~@$!m&xrv+k47c!EZsj&U$L-v~oqV1z@I~(8OMIETxrckXkFRh)U*!S5 z#)EvFhj^Grc$CL@oF{mar+AuY_y*tPS)Sute4FQaf$#8LUgRZS<`rJ$HD2cpzQ_0Z z0YBtTe#DRY2|wj${G4C#OMb<#`3=A2cl@3|@JHU_PyCs`@K^rE-}wiF9L9e}jG2H5 znTUy*gh`o<$(e#FnTmHXHSgqIyqjrw5AWrDOv`jk&kW4SOw7zI%*y+jjoF!lIhl(O zFgNosFCSz+=4SyGWFZ!25f)`J7H0{TWGR+r8J1-^mS+W4WF=N+6;@?6R%Z>?WG&Wa z9oA(%*5^ZPz=mwZ#%#i-Y{uq%m@W7SA7x9nVr#ZxTef3+c3?+#VrO9yYq*x{xSkuhk(;=g&u|N$Ntl$$n4Bq?lBswHQ}a&V#k-k? z_wZic$Fxkx^vuAF%*4#h!mPZX*_fRER$*0EV|CVGP1a&<)?r=NV|_lv25iViY|JKX%4TfNhuMOU z@KLs8E4F4Ewq-lEX9spY9GGdPp8IGb}gm-9HEPw+`D;6gsdMO@4!T*_s9 zn#;L@E4hlRxrS@Gj_bLB8@Y*_`3$%4S#ISvKF96c!JT}bFYra~;!Av)ySayZxsR`K zKVRhmzQ%)moridsM|hOSc$_DAlBal@XZQx+OLnT^?* zgE^Ut4=^|LFfSiuKIUfu7Gxn7W)T);F&1YDmSicGW*L@cIhJPyR%9hsW))UtHCAU0 z)?_W#W*ydLJ=W(#Y`}(W#KvsGrfkOMe3&ix2p?rjwqk3xVOzFidv;()c4B9CVOMrz zclKaU_F`}LVPE!Re-7Y44&q=A;ZP3aaE{%qg78X?&d1 zIfFAfi?cb0b2*Rm`2?Tj0xslJT*Sp(!lhisr@5RfxRR^5nrpb0>$sj9xRINdpRbJzD-r#$DpC9l;-sDI8n4j=de#X!F1;6B1 z{F>kJTYksy`2&CCE&jxx`3ryLZ~UEqFnGZD&xkP-Fd-8$F_SPUlQB6{FeOv*4zBIl ztNp-%n`-wNRH9hzKAk!bTRmuazqQ@}TKn^LP~j3&)^_aMsq?_eYx;ER-aA~|>Cerb z|K75pW8dEW26P@caQ~W4?FY3#lK9M@ml6h15C;i@gh8SpagZcP8YByn2PuM-L8{=6 zAa!tOa9415kS4e%xHq^jNE@UJ(gzuWj6tR#bC4y-8r&ab3$h0}f}BCFKW|O$AWx7t zcreHpN9uDg+gSNDYZ7E}*v z1T}+NLG7SUP&cR-)DIpC8Uzi4MnU7CNzgQC7BmkY4q60{1dj$SgH}Q7piR&=Xcx2( zIs_eqPC@6OOVBmw7IY7K1U-XZLGPeX&^PE8^bZCE1A{@q;9y8FG#C~P4@LwdgHgfg zU`#MJ7#EBW9t$P}6N5>?8J%pT?lbB4LX2g2N8o-l9tV3;q=9~KA;hK0hyVUe(CSS&0amIzCRrNYu-nXqhF zE-W8b2rGt_!pdQluxeN>tRB_~YlgML+F_lrZdfm@A3hW|2pfit!p32fuxZ#VY#u%w zwg?{y9}QcEt-{t}o3L%zE^Hrm2s?(I!p>oruxr>Y>>lI4m3GHhEIiy!o}f|aA~+Kd^%hnt_W9#tHRacns9BnE?ggO2sehC z!p-3`;g;~(aBH|Nd@kG`?g)2=&xbFBFNV9qm%^9B-Qk{aZ@4dfCEOpr8XgE=3lD~` zhlj$$;gRrYcq}{~o(NBdr^3_WnedJ9&G2k^E_^F|J3Jp=2;T|c4KIe5!pq^6@M?H1 zydK^N-wWRlKL|ezZ-yU*ABUfWpN5}>pNC(BUxr_WUx(j>--h3X--kbhKZdu$pTeKR zU&3F*-@@O+KmKg|VH8DilpsnNC5jS9Nus1tvM70!B1##hitdO~M|VbdMR!MOqI;ry zqx+(?QMxF7lp)F(Wr{LKS)#1b{ZY0kdz2%}8Rd!|h;m1HqP)?AQNAdDR3IuC6^aT+ zMWUinv8Z@dA}Se`ib_XiqOwuBsC-l*su)#@Do0hKs!_G5dQ>B-8P$qvM|GmQQN5^s z^ib3wY8W+&8b?i{rctw~dGv79B6=ixG-?^Oidsi)qP9`HsD0ES>KJv3I!9fiu2HwB zd(KM}4BcQNO5vG$0xn4T=UwL!zP4uxNNRA{rTuibh9cqOsAqXngcoG$EQ8 zO^PN*Q=+NSwCM3@dNd=N8O@4jM{}aN(Y$DW^hESzv>;j-JrymA7Dr2>rO~qJ>1cVh zB3c=(idIK!qP5YwXnnLH+8Aw$Hb>7yTcT&9tyIvBkk9f}S|N1~(AvFLboB03qJicUvoqBo*9qqEVu=&k7O z=zMe`dMA1}x)@!GE=O0QtI@USdUPXtFM2=vAo?)68GRIe9DNdf8hsXh9(@sg8GRLf 
z9eoph8+{jjAN>&h7~P6~ihhoMiGGcKi++#(__ON|<0y{f1aZPRQJgqV5+{w5#mVCo zamqMVd`Fx*zB9fnzB^76-xJ>(-xsHi)5YoI3~|OdQ=B=@5@(I?kF&+u;~a6$I9L2Y zoIB1F=ZznX^Tqk&0&&5(P+T}J5*Lk&#l_Lx*#kf*jIj#~{ zjjP4g;~H_zxK>;{t`pad>&5lshvEit!?;o0IBpU*jhn^I
    uG@gwo0am%<>+&XR( zw~gDy?c)w{$GB76Iqnj7jl0F&;~sI(xL4de?i2To`^Ej^0r9|iP&_yu5)X}s#lzzf z@yK{oJUSi|kB!I0Jh%#gE6+;~DYHcvd_+o)gcF=f(5mC*mjL z1@Xf8sd!PmI9?JjjhDqw$IIgt@yd8rygFVJuZ`Em>*Eda#&}b_IesSI5|UJMp{m#rRTuIldBKjjzSm;~VjN@%!-y@rUuv zcVa6Im+#Ju~{pq~h<@$R{xq zFd-8$F_SPUlQB6{FeOv*4yNXvyo+};4e#N-ypL&_j_H|!8JUThnT1*TFaIm@=LG5R z9ENM{*QLa}39F9LMu9PT)jN z;$%+YR8HgLoX#1X$yuDuIh@ORoX;otBo}ZYpW-4e<`ORDGCs}aT)~xG#noKHwOq&b z+`x_8#Law$Tlg%uavPuHcJAO#KF=5UB6sm6zRca+!@bY}i zpYk()&M){Szv9>YhTrl#e$OBHBX991{>)$aD}Uqf{DXh~n(&V$WW<;Wn2?E>m`RwF z$(Woen3Ab@2UGJ--o?9_hWGGZ-p8~|$Mnp=jLgK$%)+d^pV^q5Ihd2V_yBV=5A*Ut z=3{;qU_lmQVHROg7GrUiU`du@X_jGGmScHVU`1A9WmaKTR%3P6U`^IyZPsC3)? z#0G50Mr_O`Y|3VA&WG88kML2pWGl928@6RTwr2-+WG8lJ7j|Vgc4rUvWH0t+ANFNG z_U8Z&?yQj^|^Xz=@p1$(+KeoW{pFoijL-vpAb`IG6J{ zpHJ{fF5p5w#YJ4qC0xp7e45L-f-AX-tGR}2xsL0(fg8DroB0g4@L6u)nY+1%d%2IVa6ez=0lvnAe4U4Qm`8Y&$9SA4c#@}hnrHY1-{e`I<6C^2 z=Xrte@LgWyC0^zgUgb4j=MBEc_xS-oI<=2mgGQ@b~!7h%pl|Armn%lQ1chF*#E(B~$SZrskczi+3{(@8P|? zk7=2X>6w8UnTeU1g;{w&voSk!Feh{I0p?~N=H-LT$NVh7f-Jay;iGKH zR&32SY|D0R&kpR!PVCGs?89LixF&Ji5RQ5?-N z9LsSW&&N1{6FG^KIfYX>jgNCWXK*HGaW?00F6VJRpWu^Rz=eE@i@2CexRlHIG?#M) zS8^3sa}C#W9oKUMH*ym<^BHd8v)syUe2&|>gFE>=U*L<}#h3UpcXJQ-avxvee!j{B ze2oYBIuG$MkMJmu@iC^8(-DyS&Ityv!@S%4@vN8+?!N z^8^yf8nqEjlc5`{+UVe_xR6< zF%vK$6EQK9Fe#HUIa4qtQ}GU_=AFEYcQXy|;k~?%X_=1cnSmLZiJ6&&S$RLRF*|cG zCv))u=4Kw|<%7(}{4BtNEX2Yr!lEq3;w-_EEXC3+!?G;L@~ps$ti;N!!m6ys>a4+< zti{@_!@8`;`h18D*pQ9bm`&J}&Dfj|vjrdFqio4mY|S=o%XVzf4(!NI?949g%5Ln= z9_-0p?9D#x%YN+70UXFd9Lymc%3&PN5gf@;9L+Ht%W)jf$2fr#If;`wg;P0=k8?U_ za3*JQHs^3I=W#xt;FDayg?x&OxR^`0l*{-umvaSIauru|4cBrV*K-3mauYZ68E)aT z+{$fyj@!9|JNZ0c;EUYFm-sSwa}W1&A79~qzRCl9jR*NU5AiUM@FjOe=zKDFzk3R>~t{fd@$^C zFzovG6|Vi4%*Bm^y7%rpa8Uc+{U)vH(05R`iK_;U*i?C7_x1&w_U$>m?_VeX&t29I zY2Ryb=hJt7HzjxV9qod3eY;AR<+V|<) zYr^VIgZoWf*P&P6jy(s?o&0xJ#riJ2`nLZwtMlL8$3M6K=S0r`u*YACoQKo?J?r}O z^al*ZJPg2KoE5`EkN%-9PT+z{9s68{Iz4R~4;Q`r9N-%`{BQjLgbx%)wmD z&AiOV0xZPBEW)BJ#^Nl&k}Sp2EW@%a$MTglv>Jci@hX4S!kYHOIuD%xXVUCHvt0h` zu%5rN^2#bJEC1JVto)jRgZlPMxc1M09xy2Q^G)Y|p)YLQzE9WA{}%k42mJN6?LU%t z|9#Qu|E%a=mAmC1vDCJA`#$Zvb{^pD8vYe)8+&&j*11#Le;07`{+|Zi?Z>6T?f44+ zi}(uv8D9;9|G3A!9d^ZsfN;_cr8`|o`8|2Zz#{-eA5f2|X|{pkLmeW0q0{)^cBGu!oc-?!p+ yP4RY2-u8A(-u8A(-u6Go=HCbUKP`XzIJf)O{~^}@*L~H$503w)j{bid9RC*<;i_Z+ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack b/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack new file mode 100644 index 0000000000000000000000000000000000000000..ea8efdc86dd2d45a151ae49e78790a694a115d4f GIT binary patch literal 119206 zcmeErWpLO^vhD~*%#02*Gcz+YIDF~xS6nVA`Ehnbm~nVFdxU!2%`md@_pJ-c=5 zRz3aDCyi!C-P2$9v|5AXL&AfCJNJtX36B{V-!3#TI(kus@R+>0D2gP)W4F1vO zN5g`a5n+*0!O_t>Vt+LL8WtBG7#6%EE+{Z2aDR1>bRdZrXd+^Xh$kXUL?RK%M5Gdt zPQ-+Hg9rZ-YH8^o*D)bs!9V4i8`E{iFWHtQ^hs#Xo4ZkXbd%qNjB6DY+NO6AKfxAG%zZ<&%y-zM#lt(MfOjS;af{!a_iAxXm0WdG3RjcUWm>oaa(JJbPe1l?xI{X^6v_)t{%XZDe0>cAa2S@!f z&cYv@3M`w@Z{E+R(PIYA`#EFu{?3UMOaiL-uPZANQzl~iM0`;U{5+(@doK}FCt}7# z1c|66V)8^xn~0fvfvGyWzVjGI$pPRThX zlIT@4Fue6I*NuygiHL+h*%Xz~n8>Cd=8cL0KOP20BEKAIVOU7#;GkyzBLj~AlL6Bw zshWsM21+tOl0iT8`fn^a@^it@|4D|wSziSH9Z8d{k~HkE?|(_U;754=Gh+M-&i@uI zlB7&}&p(s$@$8SIH5q_{IY(Ykz#+R+X4QY-h1%=8Ct$jJl@9CjFDd^Num4QcKgHo+ zvHB0moV2I>1DU7B{Rk~N5>7c2PPr0Jxf4!#4*zL||9(J``%gV`Y~)`Cl-M5w4;N1y z_)Evd)qcjvKhrQVYW`cAp+7lu0VGCpkhEL=jfUtS_00)CeY4b@(tpT5e{3N?SMOhn ztYiNU-G8m+?>VOQUx|cClushS-z@Gwbj&2vB#|%4ME?tp`3J-E*aQkB1?T_V-ap}A z+xvfH1Mx9!qJo2$1jRJ_73O2QM*o{^O1>iOSn!W*?VsQ;8!b;tBWpMOGq>IJ7fEF1 zUA={Z|T>u6*{-`Aq 
zz8L?Q9g=kWYhC~PJ}K-b_5bVp|8)5Mzhi^=pTYBAx5EF=kN-|N<99!slEQcr@BUe` z@P8~no{*3W|K0rfmqYz$)7zxQOmg18vzUJ{rx~A=)+XgO|IYRj85kZM+V3Bh4dSaO z^!b~T!HcA@m59He*#7E}R02pUDI}E)k{AE=JY^y#l@O9j1W6@_q>@7B#C};5F?%BB zNW`3pm@5%;Ct{vN%$tb$5;1=w7D&W`iC8ER3nyZc|4M@XKQ2{_!yx*TIOF~oYZbr4 zIP^bL=J*T3{q`mGdkM{76AvUBEUDy?guf*+;OFt*NobP%{?|JI_-nr(hd}gCABVrG zloWH5Fez*#O_S7~gunOvKb;#Ssgv~Hzo*V0q~qB53NgW9C4QZMhgtth0qx)Rho70@ zKS{Lzf|I``*|U@ElZZ)NOu{5CCSej6lkjgj`D>#8>G-61l7ehfQvcuK_`jMkjzj-& zton0u{OJS!-`ME>7&(3`!4LRRZ2y^m{??ROm6zYF`S$!s@;|T2FZ%7nIyO8orcXk6 zOt|>@ujcn=iJSE$$Dc}Zb4n)^_veL2M1=*0{!1f%{tbw05fRho_qq>CUuJdlBr)ey z`SF59C89$Dv(=7h*CishWnfIdgc@tEoJqfQO8xlb)H)(6y$_8~IW3!+uozARqtQpteL+9a9}EBk!5}ag3;{#IFfbg903*RDFdB>j zW5GBu9!vle!6YylOaW8DG%y{^05icXFdNJPb3rVK1My%Um=6|!gbUU>R5r zR)CdY6<7_{fVE&9SPwRUjbIbl47Px+U>n#Dc7UB=7uXHi{KKt46cBy;2O9NZh)KM7Pt-WfV~+kKhyd48DM`;2ZeA{c@r5Fro}p%5Bj z5EkJO9uW``kq{YC5Eao79Wf9Su@D<^5Et5&Xb zMkEuG8OefVMY18;ksL@)Bo~q!$%EuY@*(+=0!Trm5KLK-!21rAs5z-iGf;2^%A)w=$V6lkG8vhI zOhu+4(~%j-Ok@@^8<~U5MPiXSBp#WE%tsa=3z0?0Vq^)j6j_EWM^+#!kyXfQWDT+w zS%<7gHXs|3O~__s3$hj2hHOW6AUly=$Zlj0vKQHh>_-kD2a!X_VdMyM6gh?*M@}Fo zkyFTN>hFnK(AUBa)$Zg~fau>OW+(#ZD50OX6W8?|) z6nTa`M_wQ=kyprTfVc8VHm`DU?PT zltnp|M+H!5YfdT4#L0oo94gf>Q-piR+cXmd0W zZGpB#gV12K71|nYgNC4O(ROGk8it0W5ojdZ9*shy(HOJ?+7a!9c1F9PUD0l6ceDrE z6YYieM*EhoD2zVd!vl1UeEOg^otYpkvW-=y-GjIuV_OPDZDo zQ_*SYbaVzf6P<<5M(3b&(O5JNjYsF9^U($9LUa+j7+r!cMVFz=(G}=QbQQW9U4yPg z*P-jt4d_O66S^7Qf^J2(q1(|N=uUJOx*Oet?nU>Z`_TjFLG%!M7(Id>MUSD!(G%!N z^b~p;J%gS_&!Okh3+P4k5_%cEf?h?hq1VwH=uPw%dKzL) zi*Xo_37CjUn2afyifNdR8JLM#n2kA@i+Pxj1z^dr4p7tRPkhD~uJviekmE;#di+BvuM5jg`U5 zV&$;%SOu&iRtc+&Rl%xa)v)SV4Xh?s3#*OQ!Rli5u=-d7tRdD2Ym7C)nqtkc=2#%s z0&9r{VZm4{tTomK3&GlA?XXZR3=791ut=;u7KKG)F<1wzBi0G)jCH}fV%@OrSP!fx z)(h*6^}+gL{jmPn0Bj&O2pfzI!G>bPu;JJUY$P@c8;y;@#$w~J@z?}xA~p$|j7`C& zV$-nc*bHnYHVd1L&B5kku~-}ykIlp8V+*i_*dlB(wgg*>EyI>$E3lQ=Dr_~j23w1* z!`5RPu#MOzY%{h6+lp<&wqrZ6o!BmHH?{}ci|xbqV+XK<*dgpNb_6?$9m9@eC$N*) zDeN?M20M$L!_H$Du#4Cw>@s!*yNX@Iu46Z_o7gSvHg*TQi`~QSV-K*0*dy#Q_5^#1 zJ;R=3FR+)`E9^D)278OW!`@>bu#ea$>@)TS`-*+TzW>R;36*JGOpk%uHiav;3jV2Htygq?%_TjfG5L~<0NiWkF+<0bHtcqzOzUIs6V zm&42B74V99CA>0T1+R)%!>i*p@S1onyf$73uZ!2i>*EdZhIk{qG2R4kiZ{cX>oS;#>i z3Q&X+l%WDus6ibX(1aGWp#xp$K_3RdWH32Q0aLEKP&(X!a}exECP$dVz4+Y0ZYPCurw?K%ffQ7JgfjK!b-3* ztOBdTYOp%20c*lqur{m%>%w}lK5PIR!bY$$Yyz9YX0SO7ge_o87zBf1E7%&gfg!Lh zYzISO7z~FIFcP+hQ7{_Dzz(n@>;yZ*F0d=?2D`%^uqW&Vd&54kFYE{V!vSz090Ui$ zA#f-h28Y8Da3mZBN5e62EF1^N!wGOAoCGJsDR3&B2B*Ura3-7uXTv#gE{uh7Fdoi> z^Wg%x5H5m?;S#tME`!VA3b+!kf~(;gxE8L1>){5t5pIH;;TE_RZiCz54!9HUg1g}! 
[GIT binary patch data (base85-encoded blob) omitted — not human-readable; apply the original patch file to obtain the binary content]