From ee8ba611a8e3b0b081e7a74fe596ec8e0dbc16ab Mon Sep 17 00:00:00 2001 From: alistair Date: Sat, 16 Jun 2018 00:30:42 +0100 Subject: [PATCH 001/113] Fix Timestamp rounding --- pandas/_libs/tslibs/timestamps.pyx | 61 ++++++++++++------- .../tests/scalar/timestamp/test_unary_ops.py | 24 +++++++- 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ba5ebdab82ddc..b9ea2da2c18d5 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -72,31 +72,50 @@ def round_ns(values, rounder, freq): ------- int or :obj:`ndarray` """ - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos - if unit < 1000: - # for nano rounding, work with the last 6 digits separately - # due to float precision - buff = 1000000 - r = (buff * (values // buff) + unit * - (rounder((values % buff) * (1 / float(unit)))).astype('i8')) - else: - if unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - - # GH19206 - # to deal with round-off when unit is large - if unit >= 1e9: - divisor = 10 ** int(np.log10(unit / 1e7)) + def _round_non_int_multiple(value): + if unit < 1000: + # for nano rounding, work with the last 6 digits separately + # due to float precision + buff = 1000000 + r = (buff * (value // buff) + unit * + (rounder((value % buff) * (1 / float(unit)))).astype('i8')) + else: + if unit % 1000 != 0: + msg = 'Precision will be lost using frequency: {}' + warnings.warn(msg.format(freq)) + + # GH19206 + # to deal with round-off when unit is large + if unit >= 1e9: + divisor = 10 ** int(np.log10(unit / 1e7)) + else: + divisor = 10 + + r = (unit * rounder((value * (divisor / float(unit))) / divisor) + .astype('i8')) + + return r + + # GH21262 If the Timestamp is multiple of the freq str + # then we don't apply _round_non_int_multiple + + def _apply_round(value): + if value % unit == 0: + return value else: - divisor = 10 + return _round_non_int_multiple(value) - r = (unit * rounder((values * (divisor / float(unit))) / divisor) - .astype('i8')) + from pandas.tseries.frequencies import to_offset + unit = to_offset(freq).nanos - return r + if type(values) is int: + if values % unit == 0: + return values + else: + return _round_non_int_multiple(values) + else: + return np.fromiter((_apply_round(item) for item in values), np.int64) # This is PITA. Because we inherit from datetime, which has very specific # construction requirements, we need to do object instantiation in python diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index aecddab8477fc..f37642569167a 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -118,6 +118,27 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = Timestamp(expected) assert result == expected + @pytest.mark.parametrize('test_input, freq, expected', [ + ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'), + ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'), + ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'), + ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'), + ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'), + ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), + ]) + def test_round_minute_freq(self, test_input, freq, expected): + # Ensure timestamps that shouldnt round dont! 
+ # GH#21262 + dt = Timestamp(test_input) + expected = Timestamp(expected) + + result_ceil = dt.ceil(freq) + assert result_ceil == expected + result_floor = dt.floor(freq) + assert result_floor == expected + result_round = dt.round(freq) + assert result_round == expected + def test_ceil(self): dt = Timestamp('20130101 09:10:11') result = dt.ceil('D') @@ -257,7 +278,6 @@ def test_timestamp(self): if PY3: # datetime.timestamp() converts in the local timezone with tm.set_timezone('UTC'): - # should agree with datetime.timestamp method dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() + assert dt.timestamp() == ts.timestamp() \ No newline at end of file From 50986c6c6d4eb89cd367a991620f23e88ee0e14b Mon Sep 17 00:00:00 2001 From: alistair Date: Sat, 16 Jun 2018 01:18:34 +0100 Subject: [PATCH 002/113] Pep8 fixes --- pandas/tests/scalar/timestamp/test_unary_ops.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index f37642569167a..4fb4f8f36ab86 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -119,12 +119,12 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): assert result == expected @pytest.mark.parametrize('test_input, freq, expected', [ - ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'), - ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'), - ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'), + ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'), + ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'), + ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'), ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'), ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'), - ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), + ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), ]) def test_round_minute_freq(self, test_input, freq, expected): # Ensure timestamps that shouldnt round dont! @@ -280,4 +280,4 @@ def test_timestamp(self): with tm.set_timezone('UTC'): # should agree with datetime.timestamp method dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() \ No newline at end of file + assert dt.timestamp() == ts.timestamp() From d1c6e6f372e35b4f2f47931a1485902912d41bd7 Mon Sep 17 00:00:00 2001 From: alistair Date: Sun, 17 Jun 2018 14:07:17 +0100 Subject: [PATCH 003/113] Pep8 fixes and add additional test cases --- pandas/_libs/tslibs/timestamps.pyx | 9 +++++---- pandas/tests/indexes/datetimes/test_scalar_compat.py | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b9ea2da2c18d5..0115b90fb2ff6 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -109,14 +109,15 @@ def round_ns(values, rounder, freq): unit = to_offset(freq).nanos if type(values) is int: - if values % unit == 0: - return values - else: - return _round_non_int_multiple(values) + if values % unit == 0: + return values + else: + return _round_non_int_multiple(values) else: return np.fromiter((_apply_round(item) for item in values), np.int64) + # This is PITA. Because we inherit from datetime, which has very specific # construction requirements, we need to do object instantiation in python # (see Timestamp class above). 
This will serve as a C extension type that diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 9180bb0af3af3..441676dd36ea7 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -143,6 +143,10 @@ def test_round(self, tz): ['1823-01-01 00:00:01.000000020']), (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), + (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']), + (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']), + (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']), + (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']), (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', ('NaT', '1823-01-01 00:00:01')), (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', From 03a42b82ed04b9c6a088c1c9eecacb6b457cefcf Mon Sep 17 00:00:00 2001 From: alistair Date: Sun, 17 Jun 2018 15:09:02 +0100 Subject: [PATCH 004/113] Futher test cases --- .../tests/indexes/datetimes/test_scalar_compat.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 441676dd36ea7..801dcb91b124e 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -134,6 +134,21 @@ def test_round(self, tz): ts = '2016-10-17 12:00:00.001501031' DatetimeIndex([ts]).round('1010ns') + def test_no_rounding_occurs(self, tz): + # GH 21262 + rng = date_range(start='2016-01-01', periods=5, + freq='2Min', tz=tz) + + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'), + ]) + + tm.assert_index_equal(rng.round(freq='2T'), expected_rng) + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), From a3020741ae4e854f9f7d10c9af8c089ff52e8fe9 Mon Sep 17 00:00:00 2001 From: alistair Date: Thu, 21 Jun 2018 01:27:16 +0100 Subject: [PATCH 005/113] Refactor timestamp rounding --- pandas/_libs/tslibs/timestamps.pyx | 51 ++++++++++--------- pandas/core/indexes/datetimelike.py | 4 +- .../tests/scalar/timestamp/test_unary_ops.py | 14 +++-- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 0115b90fb2ff6..66e087568df51 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -58,21 +58,31 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base -def round_ns(values, rounder, freq): +cdef inline round_ns(values, rounder, freq): + """ Applies rounding function at given frequency Parameters ---------- - values : int, :obj:`ndarray` - rounder : function + values : np.array + rounder : function, eg. 
'Ceil', 'Floor', 'round' freq : str, obj Returns ------- - int or :obj:`ndarray` + np.array """ def _round_non_int_multiple(value): + + from pandas.tseries.frequencies import to_offset + unit = to_offset(freq).nanos + + # GH21262 If the Timestamp is multiple of the freq str + # don't apply any rounding + if value % unit == 0: + return value + if unit < 1000: # for nano rounding, work with the last 6 digits separately # due to float precision @@ -96,26 +106,7 @@ def round_ns(values, rounder, freq): return r - # GH21262 If the Timestamp is multiple of the freq str - # then we don't apply _round_non_int_multiple - - def _apply_round(value): - if value % unit == 0: - return value - else: - return _round_non_int_multiple(value) - - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos - - if type(values) is int: - if values % unit == 0: - return values - else: - return _round_non_int_multiple(values) - - else: - return np.fromiter((_apply_round(item) for item in values), np.int64) + return np.fromiter((_round_non_int_multiple(item) for item in values), np.int64) # This is PITA. Because we inherit from datetime, which has very specific @@ -663,13 +654,23 @@ class Timestamp(_Timestamp): return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) + @staticmethod + def round_values(values, rounder, freq): + """ + DatetimeIndex/PeriodIndex also use Timestamp rounding + """ + return round_ns(values, rounder, freq) + def _round(self, freq, rounder): if self.tz is not None: value = self.tz_localize(None).value else: value = self.value - r = round_ns(value, rounder, freq) + value = np.array([value], dtype=np.int64) + + # Will only ever contain 1 element for timestamp + r = Timestamp.round_values(value, rounder, freq).item() result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c7cb245263df8..45dceb53c54bb 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -15,7 +15,7 @@ from pandas._libs import lib, iNaT, NaT, Timedelta from pandas._libs.tslibs.period import Period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds -from pandas._libs.tslibs.timestamps import round_ns +from pandas._libs.tslibs.timestamps import Timestamp from pandas.core.dtypes.common import ( _ensure_int64, @@ -183,7 +183,7 @@ class TimelikeOps(object): def _round(self, freq, rounder): # round the local times values = _ensure_datetimelike_to_i8(self) - result = round_ns(values, rounder, freq) + result = Timestamp.round_values(values, rounder, freq) result = self._maybe_mask_results(result, fill_value=NaT) attribs = self._get_attributes_dict() diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 4fb4f8f36ab86..dbe31ccb11114 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -126,18 +126,16 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'), ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), ]) - def test_round_minute_freq(self, test_input, freq, expected): + @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round']) + def test_round_minute_freq(self, test_input, freq, expected, rounder): # Ensure timestamps that shouldnt round dont! 
# GH#21262 + dt = Timestamp(test_input) expected = Timestamp(expected) - - result_ceil = dt.ceil(freq) - assert result_ceil == expected - result_floor = dt.floor(freq) - assert result_floor == expected - result_round = dt.round(freq) - assert result_round == expected + func = getattr(dt, rounder) + result = func(freq) + assert result == expected def test_ceil(self): dt = Timestamp('20130101 09:10:11') From 33335b4300c412c5421e5a63ef33f5e6ffdd4ba0 Mon Sep 17 00:00:00 2001 From: alistair Date: Thu, 21 Jun 2018 01:48:20 +0100 Subject: [PATCH 006/113] Parameterize test cases --- pandas/_libs/tslibs/timestamps.pyx | 9 +-------- pandas/core/indexes/datetimelike.py | 4 ++-- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 66e087568df51..92781c63aeaf7 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -58,7 +58,7 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base -cdef inline round_ns(values, rounder, freq): +def round_ns(values, rounder, freq): """ Applies rounding function at given frequency @@ -654,13 +654,6 @@ class Timestamp(_Timestamp): return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) - @staticmethod - def round_values(values, rounder, freq): - """ - DatetimeIndex/PeriodIndex also use Timestamp rounding - """ - return round_ns(values, rounder, freq) - def _round(self, freq, rounder): if self.tz is not None: value = self.tz_localize(None).value diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 45dceb53c54bb..c7cb245263df8 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -15,7 +15,7 @@ from pandas._libs import lib, iNaT, NaT, Timedelta from pandas._libs.tslibs.period import Period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds -from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.timestamps import round_ns from pandas.core.dtypes.common import ( _ensure_int64, @@ -183,7 +183,7 @@ class TimelikeOps(object): def _round(self, freq, rounder): # round the local times values = _ensure_datetimelike_to_i8(self) - result = Timestamp.round_values(values, rounder, freq) + result = round_ns(values, rounder, freq) result = self._maybe_mask_results(result, fill_value=NaT) attribs = self._get_attributes_dict() From 0363a7201a7344948f3f5f344d5110ec626e2198 Mon Sep 17 00:00:00 2001 From: alistair Date: Thu, 21 Jun 2018 01:55:04 +0100 Subject: [PATCH 007/113] Update function error --- pandas/_libs/tslibs/timestamps.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 92781c63aeaf7..de852e066685c 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -663,7 +663,7 @@ class Timestamp(_Timestamp): value = np.array([value], dtype=np.int64) # Will only ever contain 1 element for timestamp - r = Timestamp.round_values(value, rounder, freq).item() + r = round_ns(value, rounder, freq).item() result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) From 7353c2fa05c69a6c12f1506f745f947923ffa6c1 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 24 Jun 2018 23:22:45 +0100 Subject: [PATCH 008/113] Perform manipulation with vectorization --- pandas/_libs/tslibs/timestamps.pyx | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 
deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index de852e066685c..f7d3367d4b0b3 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -65,23 +65,15 @@ def round_ns(values, rounder, freq): Parameters ---------- - values : np.array + values : :obj:`ndarray` rounder : function, eg. 'Ceil', 'Floor', 'round' freq : str, obj Returns ------- - np.array + :obj:`ndarray` """ - def _round_non_int_multiple(value): - - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos - - # GH21262 If the Timestamp is multiple of the freq str - # don't apply any rounding - if value % unit == 0: - return value + def _round_non_int_multiple(value, unit): if unit < 1000: # for nano rounding, work with the last 6 digits separately @@ -106,7 +98,19 @@ def round_ns(values, rounder, freq): return r - return np.fromiter((_round_non_int_multiple(item) for item in values), np.int64) + from pandas.tseries.frequencies import to_offset + unit = to_offset(freq).nanos + + values = np.copy(values) + + # GH21262 If the Timestamp is multiple of the freq str + # don't apply any rounding + mask = values % unit == 0 + if mask.all(): + return values + values[~mask] = _round_non_int_multiple(values[~mask], unit) + + return values # This is PITA. Because we inherit from datetime, which has very specific @@ -663,7 +667,7 @@ class Timestamp(_Timestamp): value = np.array([value], dtype=np.int64) # Will only ever contain 1 element for timestamp - r = round_ns(value, rounder, freq).item() + r = round_ns(value, rounder, freq)[0] result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) From 82c7db1d8cb10d4146f322a371efd04e59d6f23d Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 25 Jun 2018 22:00:53 +0100 Subject: [PATCH 009/113] Lower case doc string --- pandas/_libs/tslibs/timestamps.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index f7d3367d4b0b3..d2ce8225443fa 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -66,7 +66,7 @@ def round_ns(values, rounder, freq): Parameters ---------- values : :obj:`ndarray` - rounder : function, eg. 'Ceil', 'Floor', 'round' + rounder : function, eg. 
'ceil', 'floor', 'round' freq : str, obj Returns From 559bb50906841568b7fa6d74bc9cbb93d7cf170a Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 26 Jun 2018 21:50:14 +0100 Subject: [PATCH 010/113] Update copy function --- pandas/_libs/tslibs/timestamps.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index d2ce8225443fa..046d5d090876d 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -101,7 +101,7 @@ def round_ns(values, rounder, freq): from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos - values = np.copy(values) + values = values.copy() # GH21262 If the Timestamp is multiple of the freq str # don't apply any rounding From bfe0d169db1c49c21cee3a08a41544e173c0b8dd Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 26 Jun 2018 23:50:35 +0100 Subject: [PATCH 011/113] Refactor to use just one function --- pandas/_libs/tslibs/timestamps.pyx | 50 +++++++++++++----------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 046d5d090876d..7e8e2777c371c 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -73,44 +73,36 @@ def round_ns(values, rounder, freq): ------- :obj:`ndarray` """ - def _round_non_int_multiple(value, unit): - - if unit < 1000: - # for nano rounding, work with the last 6 digits separately - # due to float precision - buff = 1000000 - r = (buff * (value // buff) + unit * - (rounder((value % buff) * (1 / float(unit)))).astype('i8')) - else: - if unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - - # GH19206 - # to deal with round-off when unit is large - if unit >= 1e9: - divisor = 10 ** int(np.log10(unit / 1e7)) - else: - divisor = 10 - - r = (unit * rounder((value * (divisor / float(unit))) / divisor) - .astype('i8')) - - return r from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos - values = values.copy() - # GH21262 If the Timestamp is multiple of the freq str # don't apply any rounding mask = values % unit == 0 if mask.all(): return values - values[~mask] = _round_non_int_multiple(values[~mask], unit) - - return values + r = values.copy() + + if unit < 1000: + # for nano rounding, work with the last 6 digits separately + # due to float precision + buff = 1000000 + r[~mask] = (buff * (values[~mask] // buff) + unit * + (rounder((values[~mask] % buff) * (1 / float(unit)))).astype('i8')) + else: + if unit % 1000 != 0: + msg = 'Precision will be lost using frequency: {}' + warnings.warn(msg.format(freq)) + # GH19206 + # to deal with round-off when unit is large + if unit >= 1e9: + divisor = 10 ** int(np.log10(unit / 1e7)) + else: + divisor = 10 + r[~mask] = (unit * rounder((values[~mask] * (divisor / float(unit))) / divisor) + .astype('i8')) + return r # This is PITA. 
Because we inherit from datetime, which has very specific From 4249fd7c33f19277b4b2786e38bbfac83aa61b6a Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 27 Jun 2018 22:24:22 +0100 Subject: [PATCH 012/113] Pep8 --- pandas/_libs/tslibs/timestamps.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 7e8e2777c371c..123ccebf83a56 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -88,8 +88,9 @@ def round_ns(values, rounder, freq): # for nano rounding, work with the last 6 digits separately # due to float precision buff = 1000000 - r[~mask] = (buff * (values[~mask] // buff) + unit * - (rounder((values[~mask] % buff) * (1 / float(unit)))).astype('i8')) + r[~mask] = (buff * (values[~mask] // buff) + + unit * (rounder((values[~mask] % buff) * + (1 / float(unit)))).astype('i8')) else: if unit % 1000 != 0: msg = 'Precision will be lost using frequency: {}' @@ -100,8 +101,9 @@ def round_ns(values, rounder, freq): divisor = 10 ** int(np.log10(unit / 1e7)) else: divisor = 10 - r[~mask] = (unit * rounder((values[~mask] * (divisor / float(unit))) / divisor) - .astype('i8')) + r[~mask] = (unit * rounder((values[~mask] * + (divisor / float(unit))) / divisor) + .astype('i8')) return r From 2abd8e2f94dd961bdc90d4156dea83c9a2501d1c Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 11 Jun 2018 12:28:54 +0100 Subject: [PATCH 013/113] DOC: MultiIndex Fixes (#21414) --- pandas/core/indexes/multi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 20805e33bb1d3..75b6be96feb78 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1407,7 +1407,7 @@ def _sort_levels_monotonic(self): This is an *internal* function. - create a new MultiIndex from the current to monotonically sorted + Create a new MultiIndex from the current to monotonically sorted items IN the levels. This does not actually make the entire MultiIndex monotonic, JUST the levels. @@ -1465,8 +1465,8 @@ def _sort_levels_monotonic(self): def remove_unused_levels(self): """ - create a new MultiIndex from the current that removing - unused levels, meaning that they are not expressed in the labels + Create a new MultiIndex from the current that removes + unused levels, meaning that they are not expressed in the labels. The resulting MultiIndex will have the same outward appearance, meaning the same .values and ordering. It will also From 3546c00d937bb4fbf32dc9da771c05b91004128d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Jun 2018 17:14:47 +0200 Subject: [PATCH 014/113] DOC: fix grammar of deprecation message (#21421) --- pandas/util/_decorators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 6b55554cdc941..7d5753d03f4fc 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -140,8 +140,8 @@ def wrapper(*args, **kwargs): if new_arg_name is None and old_arg_value is not None: msg = ( "the '{old_name}' keyword is deprecated and will be " - "removed in a future version " - "please takes steps to stop use of '{old_name}'" + "removed in a future version. 
" + "Please take steps to stop the use of '{old_name}'" ).format(old_name=old_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) kwargs[old_arg_name] = old_arg_value From 342fd405396cdb898dde1eb66cc8c040264c3cfa Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 12 Jun 2018 01:05:45 +0100 Subject: [PATCH 015/113] MAINT: Deprecate encoding from stata reader/writer (#21400) Deprecate the encoding parameter from all Stata reading and writing methods and classes. The encoding depends only on the file format and cannot be changed by users. --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 9 +++++---- pandas/io/stata.py | 25 +++++++++++-------------- pandas/tests/io/test_stata.py | 15 +++++---------- 4 files changed, 22 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index de985d4db5fa3..68c1839221508 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -45,7 +45,7 @@ Other API Changes Deprecations ~~~~~~~~~~~~ -- +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`). - - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca572e2e56b6c..0985de3126c5a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -80,7 +80,8 @@ from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import (Appender, Substitution, - rewrite_axis_style_signature) + rewrite_axis_style_signature, + deprecate_kwarg) from pandas.util._validators import (validate_bool_kwarg, validate_axis_style_args) @@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', startcol=startcol, freeze_panes=freeze_panes, engine=engine) + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None, version=114, @@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True, kwargs['convert_strl'] = convert_strl writer = statawriter(fname, self, convert_dates=convert_dates, - encoding=encoding, byteorder=byteorder, - time_stamp=time_stamp, data_label=data_label, - write_index=write_index, + byteorder=byteorder, time_stamp=time_stamp, + data_label=data_label, write_index=write_index, variable_labels=variable_labels, **kwargs) writer.write_file() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8584e1f0e3f14..b2a5bec2a4837 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -33,11 +33,7 @@ from pandas.core.series import Series from pandas.io.common import (get_filepath_or_buffer, BaseIterator, _stringify_path) -from pandas.util._decorators import Appender -from pandas.util._decorators import deprecate_kwarg - -VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', - 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1') +from pandas.util._decorators import Appender, deprecate_kwarg _version_error = ("Version of given Stata file is not 104, 105, 108, " "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " @@ -169,6 +165,7 @@ @Appender(_read_stata_doc) +@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def read_stata(filepath_or_buffer, 
convert_dates=True, convert_categoricals=True, encoding=None, index_col=None, @@ -952,6 +949,7 @@ def __init__(self): class StataReader(StataParser, BaseIterator): __doc__ = _stata_reader_doc + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def __init__(self, path_or_buf, convert_dates=True, convert_categoricals=True, index_col=None, @@ -970,7 +968,7 @@ def __init__(self, path_or_buf, convert_dates=True, self._preserve_dtypes = preserve_dtypes self._columns = columns self._order_categoricals = order_categoricals - self._encoding = encoding + self._encoding = None self._chunksize = chunksize # State variables for the file @@ -1962,17 +1960,14 @@ class StataWriter(StataParser): _max_string_length = 244 + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): super(StataWriter, self).__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - if encoding is not None: - if encoding not in VALID_ENCODINGS: - raise ValueError('Unknown encoding. Only latin-1 and ascii ' - 'supported.') - self._encoding = encoding + self._encoding = 'latin-1' self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2731,6 +2726,7 @@ class StataWriter117(StataWriter): _max_string_length = 2045 + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None, convert_strl=None): @@ -2738,9 +2734,10 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._convert_strl = [] if convert_strl is None else convert_strl[:] super(StataWriter117, self).__init__(fname, data, convert_dates, - write_index, encoding, byteorder, - time_stamp, data_label, - variable_labels) + write_index, byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels) self._map = None self._strl_blob = None diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index e5585902a9dd6..bfb72be80400e 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -361,7 +361,8 @@ def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) - encoded = read_stata(self.dta_encoding, encoding="latin-1") + with tm.assert_produces_warning(FutureWarning): + encoded = read_stata(self.dta_encoding, encoding='latin-1') result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -369,8 +370,9 @@ def test_encoding(self, version): assert isinstance(result, compat.string_types) with tm.ensure_clean() as path: - encoded.to_stata(path, encoding='latin-1', - write_index=False, version=version) + with tm.assert_produces_warning(FutureWarning): + encoded.to_stata(path, write_index=False, version=version, + encoding='latin-1') reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) @@ -1349,13 +1351,6 @@ def test_out_of_range_float(self): assert 'ColumnTooBig' in cm.exception assert 'infinity' in cm.exception - def test_invalid_encoding(self): - # GH15723, validate encoding - original = self.read_csv(self.csv3) - with pytest.raises(ValueError): - with tm.ensure_clean() as path: - original.to_stata(path, 
encoding='utf-8') - def test_path_pathlib(self): df = tm.makeDataFrame() df.index.name = 'index' From a1111b38b419caab4253d97dd78a7e125148e911 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 11 Jun 2018 17:15:29 -0700 Subject: [PATCH 016/113] DOC: Add 0.23.2 whatsnew template (#21433) --- doc/source/whatsnew/v0.23.2.txt | 82 +++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 doc/source/whatsnew/v0.23.2.txt diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt new file mode 100644 index 0000000000000..ec2eddcfd4d41 --- /dev/null +++ b/doc/source/whatsnew/v0.23.2.txt @@ -0,0 +1,82 @@ +.. _whatsnew_0232: + +v0.23.2 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. contents:: What's new in v0.23.2 + :local: + :backlinks: none + +.. _whatsnew_0232.enhancements: + +New features +~~~~~~~~~~~~ + + +.. _whatsnew_0232.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0232.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- +- + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0232.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +- +- + +Conversion +^^^^^^^^^^ + +- +- + +Indexing +^^^^^^^^ + +- +- + +I/O +^^^ + +- +- + +Plotting +^^^^^^^^ + +- +- + +Reshaping +^^^^^^^^^ + +- +- + +Categorical +^^^^^^^^^^^ + +- From 4423dc9d5daa9e9b5f52f60cfe1c23a48ab6bbf3 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 11 Jun 2018 17:16:36 -0700 Subject: [PATCH 017/113] MAINT: More friendly error msg on Index overflow (#21377) * MAINT: More useful error msg on Index overflow Display a more friendly error message when there is an OverflowError during Index construction. Partially addresses gh-15832. * DOC: Clarify how Index.__new__ handles dtype Partially addresses gh-15823. --- pandas/core/indexes/base.py | 12 +++++++++++- pandas/tests/indexes/test_base.py | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dff6b5421d5ab..bf1051332ee19 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -181,6 +181,9 @@ class Index(IndexOpsMixin, PandasObject): ---------- data : array-like (1-dimensional) dtype : NumPy dtype (default: object) + If dtype is None, we find the dtype that best fits the data. + If an actual dtype is provided, we coerce to that dtype if it's safe. + Otherwise, an error will be raised. 
copy : bool Make a copy of input ndarray name : object @@ -306,7 +309,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if is_integer_dtype(dtype): inferred = lib.infer_dtype(data) if inferred == 'integer': - data = np.array(data, copy=copy, dtype=dtype) + try: + data = np.array(data, copy=copy, dtype=dtype) + except OverflowError: + # gh-15823: a more user-friendly error message + raise OverflowError( + "the elements provided in the data cannot " + "all be casted to the dtype {dtype}" + .format(dtype=dtype)) elif inferred in ['floating', 'mixed-integer-float']: if isna(data).any(): raise ValueError('cannot convert float ' diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f9f16dc0ce8b7..c264f5f79e47e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -474,6 +474,13 @@ def test_constructor_nonhashable_name(self, indices): tm.assert_raises_regex(TypeError, message, indices.set_names, names=renamed) + def test_constructor_overflow_int64(self): + # see gh-15832 + msg = ("the elements provided in the data cannot " + "all be casted to the dtype int64") + with tm.assert_raises_regex(OverflowError, msg): + Index([np.iinfo(np.uint64).max - 1], dtype="int64") + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', From 0550ef78be45744075bd12e010114d5706262111 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Jun 2018 09:54:11 +0200 Subject: [PATCH 018/113] DOC: follow 0.23.1 template for 0.23.2 whatsnew (#21435) --- doc/source/whatsnew/v0.23.2.txt | 36 +++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ec2eddcfd4d41..c636e73fbd6c2 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -10,16 +10,11 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none -.. _whatsnew_0232.enhancements: -New features -~~~~~~~~~~~~ +.. _whatsnew_0232.fixed_regressions: - -.. 
_whatsnew_0232.deprecations: - -Deprecations -~~~~~~~~~~~~ +Fixed Regressions +~~~~~~~~~~~~~~~~~ - - @@ -43,40 +38,41 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +**Groupby/Resample/Rolling** + - - -Conversion -^^^^^^^^^^ +**Conversion** + - - -Indexing -^^^^^^^^ +**Indexing** - - -I/O -^^^ +**I/O** - - -Plotting -^^^^^^^^ +**Plotting** - - -Reshaping -^^^^^^^^^ +**Reshaping** - - -Categorical -^^^^^^^^^^^ +**Categorical** + +- + +**Other** - From 57c82220f9a3ada5b20864e83a1874343de43bb2 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 12 Jun 2018 08:57:03 +0100 Subject: [PATCH 019/113] DOC: Loading sphinxcontrib.spelling to sphinx only if it's available (#21397) --- ci/environment-dev.yaml | 1 + ci/requirements_dev.txt | 1 + doc/source/conf.py | 13 +++++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml index f9f9208519d61..5733857b55dd4 100644 --- a/ci/environment-dev.yaml +++ b/ci/environment-dev.yaml @@ -13,3 +13,4 @@ dependencies: - pytz - setuptools>=24.2.0 - sphinx + - sphinxcontrib-spelling diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 3430e778a4573..83ee30b52071d 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -9,3 +9,4 @@ python-dateutil>=2.5.0 pytz setuptools>=24.2.0 sphinx +sphinxcontrib-spelling \ No newline at end of file diff --git a/doc/source/conf.py b/doc/source/conf.py index 97081bec863b7..909bd5a80b76e 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -16,9 +16,11 @@ import re import inspect import importlib -from sphinx.ext.autosummary import _import_by_name +import logging import warnings +from sphinx.ext.autosummary import _import_by_name +logger = logging.getLogger(__name__) try: raw_input # Python 2 @@ -73,9 +75,16 @@ 'sphinx.ext.ifconfig', 'sphinx.ext.linkcode', 'nbsphinx', - 'sphinxcontrib.spelling' ] +try: + import sphinxcontrib.spelling +except ImportError as err: + logger.warn(('sphinxcontrib.spelling failed to import with error "{}". ' + '`spellcheck` command is not available.'.format(err))) +else: + extensions.append('sphinxcontrib.spelling') + exclude_patterns = ['**.ipynb_checkpoints'] spelling_word_list_filename = ['spelling_wordlist.txt', 'names_wordlist.txt'] From 8a51d075343dafb073ea2eb2c054e5518686053e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Jun 2018 11:15:12 +0200 Subject: [PATCH 020/113] Fix flake8 in conf.py (#21438) --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 909bd5a80b76e..5534700f0734a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -78,7 +78,7 @@ ] try: - import sphinxcontrib.spelling + import sphinxcontrib.spelling # noqa except ImportError as err: logger.warn(('sphinxcontrib.spelling failed to import with error "{}". ' '`spellcheck` command is not available.'.format(err))) From 7f2f340fb43963ec1444326239d98b3faa9b4b13 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 12 Jun 2018 12:28:37 +0100 Subject: [PATCH 021/113] Doc Fixes (#21415) --- pandas/core/indexing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 2c40be17ce781..0e4f040253560 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -46,6 +46,15 @@ class _IndexSlice(object): """ Create an object to more easily perform multi-index slicing + See Also + -------- + MultiIndex.remove_unused_levels : New MultiIndex with no unused levels. 
+ + Notes + ----- + See :ref:`Defined Levels ` + for further info on slicing a MultiIndex. + Examples -------- From 56477a032893c307189cbe68e374c1457d6f9f89 Mon Sep 17 00:00:00 2001 From: testvinder Date: Tue, 12 Jun 2018 13:31:10 +0200 Subject: [PATCH 022/113] Reapply all patches by @testvinder against master (#21413) --- pandas/core/reshape/pivot.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 9a2ad5d13d77a..3390451c60c0f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -446,7 +446,18 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, - ... # but they still will be counted in the output + # and will not be shown in the output because + # dropna is True by default. Set 'dropna=False' + # to preserve categories with no data + ... # doctest: +SKIP + col_0 d e + row_0 + a 1 0 + b 0 1 + + >>> crosstab(foo, bar, dropna=False) # 'c' and 'f' are not represented + # in the data, but they still will be counted + # and shown in the output ... # doctest: +SKIP col_0 d e f row_0 From ea922c64a19a703f43c2c5a771db7f535bbdaab4 Mon Sep 17 00:00:00 2001 From: Antti Kaihola Date: Tue, 12 Jun 2018 14:37:01 +0300 Subject: [PATCH 023/113] Two tests didn't properly assert an exception was raised. Fixed. (#21409) --- .../tests/indexes/datetimes/test_indexing.py | 5 ++--- pandas/tests/io/parser/c_parser_only.py | 22 +++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index dd192db4b0eb3..8cffa035721b0 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -583,7 +583,6 @@ def test_get_indexer(self): def test_reasonable_keyerror(self): # GH#1062 index = DatetimeIndex(['1/3/2000']) - try: + with pytest.raises(KeyError) as excinfo: index.get_loc('1/1/2000') - except KeyError as e: - assert '2000' in str(e) + assert '2000' in str(excinfo.value) diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index e0422249289b7..9dc7b070f889d 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -23,21 +23,19 @@ class CParserTests(object): - def test_buffer_overflow(self): + @pytest.mark.parametrize( + 'malf', + ['1\r1\r1\r 1\r 1\r', + '1\r1\r1\r 1\r 1\r11\r', + '1\r1\r1\r 1\r 1\r11\r1\r'], + ids=['words pointer', 'stream pointer', 'lines pointer']) + def test_buffer_overflow(self, malf): # see gh-9205: test certain malformed input files that cause # buffer overflows in tokenizer.c - - malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer - malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer - malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer - cperr = 'Buffer overflow caught - possible malformed input file.' 
- - for malf in (malfw, malfs, malfl): - try: - self.read_table(StringIO(malf)) - except Exception as err: - assert cperr in str(err) + with pytest.raises(pd.errors.ParserError) as excinfo: + self.read_table(StringIO(malf)) + assert cperr in str(excinfo.value) def test_buffer_rd_bytes(self): # see gh-12098: src->buffer in the C parser can be freed twice leading From a37c3d228c905be6cd7e41e7b57b55aab0753c0d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Jun 2018 11:42:55 -0500 Subject: [PATCH 024/113] DOC: 0.23.1 release (#21446) --- doc/source/release.rst | 49 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index fa03d614ed42c..7bbd4ba43e66f 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,10 +37,57 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: https://pypi.org/project/pandas * Documentation: http://pandas.pydata.org +pandas 0.23.1 +------------- + +**Release date**: June 12, 2018 + +This is a minor release from 0.23.0 and includes a number of bug fixes and +performance improvements. + +See the :ref:`full whatsnew ` for a list of all the changes. + +Thanks +~~~~~~ + +A total of 30 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Adam J. Stewart +* Adam Kim + +* Aly Sivji +* Chalmer Lowe + +* Damini Satya + +* Dr. Irv +* Gabe Fernando + +* Giftlin Rajaiah +* Jeff Reback +* Jeremy Schendel + +* Joris Van den Bossche +* Kalyan Gokhale + +* Kevin Sheppard +* Matthew Roeschke +* Max Kanter + +* Ming Li +* Pyry Kovanen + +* Stefano Cianciulli +* Tom Augspurger +* Uddeshya Singh + +* Wenhuan +* William Ayd +* chris-b1 +* gfyoung +* h-vetinari +* nprad + +* ssikdar1 + +* tmnhat2001 +* topper-123 +* zertrin + + pandas 0.23.0 ------------- -**Release date**: May 15, 2017 +**Release date**: May 15, 2018 This is a major release from 0.22.0 and includes a number of API changes, new features, enhancements, and performance improvements along with a large number From 9f9b63644fac430e8e2b541d00aa100a541ac324 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Jun 2018 02:55:53 -0500 Subject: [PATCH 025/113] DOC: Fixed warning in doc build (#21449) --- doc/source/ecosystem.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 56fea1ccfd9dc..f683fd6892ea5 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -39,7 +39,7 @@ Use pandas DataFrames in your `scikit-learn `__ ML pipeline. `Featuretools `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. 
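The rounding patches earlier in this series (001-012) converge on a vectorized round_ns that first masks out values already sitting on an exact multiple of the frequency, so that ceil/floor/round leave them untouched (GH21262). What follows is a minimal standalone sketch of that mask-based short-circuit, not part of any patch above; the function and variable names (round_to_freq, two_min, ts) are invented for illustration, and it omits the float-precision handling (the buff split for sub-microsecond units and the GH19206 divisor for large units) that the real implementation keeps.

    import numpy as np

    def round_to_freq(values, unit, rounder=np.rint):
        # values: int64 nanoseconds since the epoch; unit: frequency in nanoseconds
        values = np.asarray(values, dtype=np.int64)
        # GH21262: exact multiples of the frequency are returned unchanged
        # instead of being pushed through the floating-point rounding path.
        mask = values % unit == 0
        if mask.all():
            return values
        r = values.copy()
        # Round the remaining values in units of `unit`, then scale back to ns.
        r[~mask] = unit * rounder(values[~mask] / float(unit)).astype('i8')
        return r

    two_min = 120 * 10 ** 9                # 2 minutes in nanoseconds
    ts = np.array([1514764920 * 10 ** 9])  # 2018-01-01 00:02:00 UTC
    # Already a multiple of 2 minutes, so ceil, floor and round all leave it alone.
    for rounder in (np.ceil, np.floor, np.rint):
        assert (round_to_freq(ts, two_min, rounder) == ts).all()

Working on an int64 array with a boolean mask is also what lets the scalar Timestamp._round path reuse the same code: after the refactor patches above it wraps its single value in a one-element array and takes element 0 of the result.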
From c5a11d6a6ca943ae5296b2857fad403b397f2b6a Mon Sep 17 00:00:00 2001 From: Joel Ostblom Date: Wed, 13 Jun 2018 03:57:29 -0400 Subject: [PATCH 026/113] DOC: add favicon to doc pages (#21440) --- doc/source/_static/favicon.ico | Bin 0 -> 3902 bytes doc/source/conf.py | 10 +++++----- 2 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 doc/source/_static/favicon.ico diff --git a/doc/source/_static/favicon.ico b/doc/source/_static/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..d15c4803b62e6dd2f706a5ebe1861fe438f5d98f GIT binary patch literal 3902 zcmeH}Jxg3c6oyApR`3f%m!K?eBQf?Wh`|)vXg~{V3qO)NRjPnkB%oMmVIkVur4U3B z#Si=g5-b(;7x+=Fpl+U5MjU7FF1y@&!D455cJ4W6_PpoLoteAFRPafq4c}?g*=S7C z7}E(U_yY2)%{CZwVtICy@U```-9QrNV8OCTKMEyeDt@T)LKoaZ)?u0J;uDoHQhDJM zT!X8*q*xqHd7-Qs!{ollxuvE`j|$ZprWLqP?a?9Fg&oT_eK&-W)SAt=ZgoCPgS&rp z{Z+pS)AV}?+AGqW1XuG3dl&*Gd z|AG%N0W+5G^u6#AzFD7Qn(GuO?&kQ7-3Y4Ft@{ys01iF>8Fn4yt3AlS>E*b>ZiRWz zpTYhNd!GLk`&l#aA$d;5s)oN_jtlIvW_fPC)e>yJ!!`7WnjYZZqI0Gn_QBn^QSOzr z)sT+pgC;nDxHN;#k1F*1b11U=^j82{s-Ze2&2#d$#;n-K?he$_D5it9RZdY(XtVE&1|1)i*;M m=p8FsAoPD$6`Va9$mGWmd*#gyCLW9_f*Z!XlFi&C-tr%v>Bf=( literal 0 HcmV?d00001 diff --git a/doc/source/conf.py b/doc/source/conf.py index 5534700f0734a..29f947e1144ea 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -213,16 +213,16 @@ # of the sidebar. # html_logo = None -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -# html_favicon = None - # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = os.path.join(html_static_path[0], 'favicon.ico') + # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. 
# html_last_updated_fmt = '%b %d, %Y' From bcc6b8c0386e35800400f8e4f0b41df3932c42e0 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 13 Jun 2018 11:25:58 +0100 Subject: [PATCH 027/113] Fix tests fragile to PATH (#21453) --- pandas/tests/plotting/test_converter.py | 3 ++- pandas/tests/test_downstream.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 47cded19f5300..bb976a1e3e81c 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -1,4 +1,5 @@ import subprocess +import sys import pytest from datetime import datetime, date @@ -27,7 +28,7 @@ def test_register_by_default(self): "import pandas as pd; " "units = dict(matplotlib.units.registry); " "assert pd.Timestamp in units)'") - call = ['python', '-c', code] + call = [sys.executable, '-c', code] assert subprocess.check_call(call) == 0 def test_warns(self): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index afd7993fefc70..cf98cff97669a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -3,6 +3,7 @@ Testing that we work in the downstream packages """ import subprocess +import sys import pytest import numpy as np # noqa @@ -57,7 +58,7 @@ def test_xarray(df): def test_oo_optimizable(): # GH 21071 - subprocess.check_call(["python", "-OO", "-c", "import pandas"]) + subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) @tm.network From 402905d869184c2ad23cd08869c10a2788ba2efe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 13 Jun 2018 03:32:54 -0700 Subject: [PATCH 028/113] use ccalendar.get_days_in_month, deprecate tslib.monthrange (#21451) --- pandas/_libs/tslib.pyx | 16 +--------------- pandas/_libs/tslibs/resolution.pyx | 4 ++-- pandas/core/indexes/datetimes.py | 5 +++-- pandas/tests/tseries/offsets/test_offsets.py | 6 ------ pandas/tseries/offsets.py | 4 ++-- setup.py | 1 + 6 files changed, 9 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 0f58cfa761f21..4f73f196b0d9d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -25,9 +25,7 @@ from tslibs.np_datetime cimport (check_dts_bounds, _string_to_dts, dt64_to_dtstruct, dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, - get_datetime64_value, - days_per_month_table, - dayofweek, is_leapyear) + get_datetime64_value) from tslibs.np_datetime import OutOfBoundsDatetime from tslibs.parsing import parse_datetime_string @@ -763,18 +761,6 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult): # Some general helper functions -def monthrange(int64_t year, int64_t month): - cdef: - int64_t days - - if month < 1 or month > 12: - raise ValueError("bad month number 0; must be 1-12") - - days = days_per_month_table[is_leapyear(year)][month - 1] - - return (dayofweek(year, month, 1), days) - - cpdef normalize_date(object dt): """ Normalize datetime.datetime value to midnight. 
Returns datetime.date as a diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index d0a9501afe566..2f185f4142a09 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -25,6 +25,7 @@ from fields import build_field_sarray from conversion import tz_convert from conversion cimport tz_convert_utc_to_tzlocal from ccalendar import MONTH_ALIASES, int_to_weekday +from ccalendar cimport get_days_in_month from pandas._libs.properties import cache_readonly from pandas._libs.tslib import Timestamp @@ -487,7 +488,6 @@ class _FrequencyInferer(object): days = self.fields['D'] weekdays = self.index.dayofweek - from calendar import monthrange for y, m, d, wd in zip(years, months, days, weekdays): if calendar_start: @@ -496,7 +496,7 @@ class _FrequencyInferer(object): business_start &= d == 1 or (d <= 3 and wd == 0) if calendar_end or business_end: - _, daysinmonth = monthrange(y, m) + daysinmonth = get_days_in_month(y, m) cal = d == daysinmonth if calendar_end: calendar_end &= cal diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d44b13172f86d..66622814f172d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -55,6 +55,7 @@ from pandas._libs import (lib, index as libindex, tslib as libts, join as libjoin, Timestamp) from pandas._libs.tslibs import (timezones, conversion, fields, parsing, + ccalendar, resolution as libresolution) # -------- some conversion wrapper functions @@ -1451,14 +1452,14 @@ def _parsed_string_to_bounds(self, reso, parsed): Timestamp(datetime(parsed.year, 12, 31, 23, 59, 59, 999999), tz=self.tz)) elif reso == 'month': - d = libts.monthrange(parsed.year, parsed.month)[1] + d = ccalendar.get_days_in_month(parsed.year, parsed.month) return (Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz), Timestamp(datetime(parsed.year, parsed.month, d, 23, 59, 59, 999999), tz=self.tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead - d = libts.monthrange(parsed.year, qe)[1] # at end of month + d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month return (Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz), Timestamp(datetime(parsed.year, qe, d, 23, 59, diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 5369b1a94a956..a1c5a825054ec 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -41,12 +41,6 @@ from .common import assert_offset_equal, assert_onOffset -def test_monthrange(): - import calendar - for y in range(2000, 2013): - for m in range(1, 13): - assert tslib.monthrange(y, m) == calendar.monthrange(y, m) - #### # Misc function tests #### diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index c294110d89ec5..a5a983bf94bb8 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1140,7 +1140,7 @@ def apply(self, other): # shift `other` to self.day_of_month, incrementing `n` if necessary n = liboffsets.roll_convention(other.day, self.n, self.day_of_month) - days_in_month = tslib.monthrange(other.year, other.month)[1] + days_in_month = ccalendar.get_days_in_month(other.year, other.month) # For SemiMonthBegin on other.day == 1 and # SemiMonthEnd on other.day == days_in_month, @@ -1217,7 +1217,7 @@ class SemiMonthEnd(SemiMonthOffset): def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False - _, days_in_month = 
tslib.monthrange(dt.year, dt.month) + days_in_month = ccalendar.get_days_in_month(dt.year, dt.month) return dt.day in (self.day_of_month, days_in_month) def _apply(self, n, other): diff --git a/setup.py b/setup.py index 90ec8e91a0700..d6890a08b09d0 100755 --- a/setup.py +++ b/setup.py @@ -603,6 +603,7 @@ def pxd(name): 'pyxfile': '_libs/tslibs/resolution', 'pxdfiles': ['_libs/src/util', '_libs/khash', + '_libs/tslibs/ccalendar', '_libs/tslibs/frequencies', '_libs/tslibs/timezones'], 'depends': tseries_depends, From 22d65e17a837e25430bd04873db0036134eb6938 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 13 Jun 2018 03:51:41 -0700 Subject: [PATCH 029/113] BUG: Construct Timestamp with tz correctly near DST border (#21407) --- doc/source/whatsnew/v0.23.2.txt | 4 ++++ pandas/_libs/tslibs/conversion.pyx | 22 ++++--------------- pandas/tests/frame/test_timezones.py | 10 +++++++++ .../indexes/datetimes/test_construction.py | 9 ++++++++ .../indexes/datetimes/test_date_range.py | 14 ++++++++++++ .../tests/scalar/timestamp/test_timestamp.py | 8 +++++++ 6 files changed, 49 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index c636e73fbd6c2..1de44ffeb4160 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -73,6 +73,10 @@ Bug Fixes - +**Timezones** +- Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) +- Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) + **Other** - diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f4841e6abb7e8..3cbef82437544 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -347,25 +347,11 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, if tz is not None: tz = maybe_get_tz(tz) - # sort of a temporary hack if ts.tzinfo is not None: - if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'): - ts = tz.normalize(ts) - obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - else: - # tzoffset - try: - tz = ts.astimezone(tz).tzinfo - except: - pass - obj.value = pydatetime_to_dt64(ts, &obj.dts) - ts_offset = get_utcoffset(ts.tzinfo, ts) - obj.value -= int(ts_offset.total_seconds() * 1e9) - tz_offset = get_utcoffset(tz, ts) - obj.value += int(tz_offset.total_seconds() * 1e9) - dt64_to_dtstruct(obj.value, &obj.dts) - obj.tzinfo = tz + # Convert the current timezone to the passed timezone + ts = ts.astimezone(tz) + obj.value = pydatetime_to_dt64(ts, &obj.dts) + obj.tzinfo = ts.tzinfo elif not is_utc(tz): ts = _localize_pydatetime(ts, tz) obj.value = pydatetime_to_dt64(ts, &obj.dts) diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index fa589a0aa4817..3956968173070 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -133,3 +133,13 @@ def test_frame_reset_index(self, tz): xp = df.index.tz rs = roundtripped.index.tz assert xp == rs + + @pytest.mark.parametrize('tz', [None, 'America/New_York']) + def test_boolean_compare_transpose_tzindex_with_dst(self, tz): + # GH 19970 + idx = date_range('20161101', '20161130', freq='4H', tz=tz) + df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))}, + index=idx) + result = df.T == df.T + expected = 
DataFrame(True, index=list('ab'), columns=idx) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index dae69a86910af..b138b79caac76 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -469,6 +469,15 @@ def test_constructor_with_non_normalized_pytz(self, tz): result = DatetimeIndex(['2010'], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz + def test_constructor_timestamp_near_dst(self): + # GH 20854 + ts = [Timestamp('2016-10-30 03:00:00+0300', tz='Europe/Helsinki'), + Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')] + result = DatetimeIndex(ts) + expected = DatetimeIndex([ts[0].to_pydatetime(), + ts[1].to_pydatetime()]) + tm.assert_index_equal(result, expected) + class TestTimeSeries(object): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 193804b66395b..ec37bbbcb6c02 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -278,6 +278,20 @@ def test_wom_len(self, periods): res = date_range(start='20110101', periods=periods, freq='WOM-1MON') assert len(res) == periods + def test_construct_over_dst(self): + # GH 20854 + pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', + ambiguous=True) + pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', + ambiguous=False) + expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'), + pre_dst, + pst_dst] + expected = DatetimeIndex(expect_data) + result = date_range(start='2010-11-7', periods=3, + freq='H', tz='US/Pacific') + tm.assert_index_equal(result, expected) + class TestGenRangeGeneration(object): diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index ab87d98fca8eb..4689c7bea626f 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -528,6 +528,14 @@ def test_disallow_setting_tz(self, tz): with pytest.raises(AttributeError): ts.tz = tz + @pytest.mark.parametrize('offset', ['+0300', '+0200']) + def test_construct_timestamp_near_dst(self, offset): + # GH 20854 + expected = Timestamp('2016-10-30 03:00:00{}'.format(offset), + tz='Europe/Helsinki') + result = Timestamp(expected, tz='Europe/Helsinki') + assert result == expected + class TestTimestamp(object): From 29083462a24d10df1dd727112fc362a5cf074c15 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 13 Jun 2018 03:54:55 -0700 Subject: [PATCH 030/113] parametrize tests, unify repeated tests (#21405) --- pandas/tests/tseries/offsets/test_offsets.py | 522 +++++++++---------- 1 file changed, 243 insertions(+), 279 deletions(-) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index a1c5a825054ec..8bf0d9f915d04 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -83,6 +83,7 @@ def test_to_m8(): class Base(object): _offset = None + d = Timestamp(datetime(2008, 1, 2)) timezones = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific'] @@ -142,6 +143,56 @@ def test_apply_out_of_range(self, tz): # so ignore pass + def test_offsets_compare_equal(self): + # root cause of GH#456: __ne__ was not implemented + if self._offset is None: + return + offset1 = 
self._offset() + offset2 = self._offset() + assert not offset1 != offset2 + assert offset1 == offset2 + + def test_rsub(self): + if self._offset is None or not hasattr(self, "offset2"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset2 attr + return + assert self.d - self.offset2 == (-self.offset2).apply(self.d) + + def test_radd(self): + if self._offset is None or not hasattr(self, "offset2"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset2 attr + return + assert self.d + self.offset2 == self.offset2 + self.d + + def test_sub(self): + if self._offset is None or not hasattr(self, "offset2"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset2 attr + return + off = self.offset2 + with pytest.raises(Exception): + off - self.d + + assert 2 * off - off == off + assert self.d - self.offset2 == self.d + self._offset(-2) + assert self.d - self.offset2 == self.d - (2 * off - off) + + def testMult1(self): + if self._offset is None or not hasattr(self, "offset1"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset1 attr + return + assert self.d + 10 * self.offset1 == self.d + self._offset(10) + assert self.d + 5 * self.offset1 == self.d + self._offset(5) + + def testMult2(self): + if self._offset is None: + return + assert self.d + (-5 * self._offset(-10)) == self.d + self._offset(50) + assert self.d + (-3 * self._offset(-2)) == self.d + self._offset(6) + class TestCommon(Base): # exected value created by Base._get_offset @@ -509,6 +560,7 @@ def setup_method(self, method): self.d = datetime(2008, 1, 1) self.offset = BDay() + self.offset1 = self.offset self.offset2 = BDay(2) def test_different_normalize_equals(self): @@ -530,7 +582,7 @@ def test_with_offset(self): assert (self.d + offset) == datetime(2008, 1, 2, 2) - def testEQ(self): + def test_eq(self): assert self.offset2 == self.offset2 def test_mul(self): @@ -539,28 +591,9 @@ def test_mul(self): def test_hash(self): assert hash(self.offset2) == hash(self.offset2) - def testCall(self): + def test_call(self): assert self.offset2(self.d) == datetime(2008, 1, 3) - def testRAdd(self): - assert self.d + self.offset2 == self.offset2 + self.d - - def testSub(self): - off = self.offset2 - pytest.raises(Exception, off.__sub__, self.d) - assert 2 * off - off == off - - assert self.d - self.offset2 == self.d + BDay(-2) - - def testRSub(self): - assert self.d - self.offset2 == (-self.offset2).apply(self.d) - - def testMult1(self): - assert self.d + 10 * self.offset == self.d + BDay(10) - - def testMult2(self): - assert self.d + (-5 * BDay(-10)) == self.d + BDay(50) - def testRollback1(self): assert BDay(10).rollback(self.d) == self.d @@ -672,12 +705,6 @@ def test_apply_large_n(self): def test_apply_corner(self): pytest.raises(TypeError, BDay().apply, BMonthEnd()) - def test_offsets_compare_equal(self): - # root cause of #456 - offset1 = BDay() - offset2 = BDay() - assert not offset1 != offset2 - class TestBusinessHour(Base): _offset = BusinessHour @@ -729,7 +756,7 @@ def test_with_offset(self): assert self.d + BusinessHour() * 3 == expected assert self.d + BusinessHour(n=3) == expected - def testEQ(self): + def test_eq(self): for offset in [self.offset1, self.offset2, self.offset3, self.offset4]: assert offset == offset @@ -743,31 +770,22 @@ def test_hash(self): for offset in [self.offset1, self.offset2, self.offset3, self.offset4]: assert hash(offset) == hash(offset) - def testCall(self): + def test_call(self): assert self.offset1(self.d) == 
datetime(2014, 7, 1, 11)
         assert self.offset2(self.d) == datetime(2014, 7, 1, 13)
         assert self.offset3(self.d) == datetime(2014, 6, 30, 17)
         assert self.offset4(self.d) == datetime(2014, 6, 30, 14)
 
-    def testRAdd(self):
-        assert self.d + self.offset2 == self.offset2 + self.d
-
-    def testSub(self):
+    def test_sub(self):
+        # we have to override test_sub here because self.offset2 is not
+        # defined as self._offset(2)
         off = self.offset2
-        pytest.raises(Exception, off.__sub__, self.d)
+        with pytest.raises(Exception):
+            off - self.d
 
         assert 2 * off - off == off
         assert self.d - self.offset2 == self.d + self._offset(-3)
 
-    def testRSub(self):
-        assert self.d - self.offset2 == (-self.offset2).apply(self.d)
-
-    def testMult1(self):
-        assert self.d + 5 * self.offset1 == self.d + self._offset(5)
-
-    def testMult2(self):
-        assert self.d + (-3 * self._offset(-2)) == self.d + self._offset(6)
-
     def testRollback1(self):
         assert self.offset1.rollback(self.d) == self.d
         assert self.offset2.rollback(self.d) == self.d
@@ -1317,12 +1335,6 @@ def test_apply_nanoseconds(self):
         for base, expected in compat.iteritems(cases):
             assert_offset_equal(offset, base, expected)
 
-    def test_offsets_compare_equal(self):
-        # root cause of #456
-        offset1 = self._offset()
-        offset2 = self._offset()
-        assert not offset1 != offset2
-
     def test_datetimeindex(self):
         idx1 = DatetimeIndex(start='2014-07-04 15:00', end='2014-07-08 10:00',
                              freq='BH')
@@ -1361,6 +1373,8 @@ def test_datetimeindex(self):
 
 class TestCustomBusinessHour(Base):
     _offset = CustomBusinessHour
+    holidays = ['2014-06-27', datetime(2014, 6, 30),
+                np.datetime64('2014-07-02')]
 
     def setup_method(self, method):
         # 2014 Calendar to check custom holidays
@@ -1371,8 +1385,6 @@ def setup_method(self, method):
         self.d = datetime(2014, 7, 1, 10, 00)
 
         self.offset1 = CustomBusinessHour(weekmask='Tue Wed Thu Fri')
-        self.holidays = ['2014-06-27', datetime(2014, 6, 30),
-                         np.datetime64('2014-07-02')]
         self.offset2 = CustomBusinessHour(holidays=self.holidays)
 
     def test_constructor_errors(self):
@@ -1401,7 +1413,7 @@ def test_with_offset(self):
 
         assert self.d + CustomBusinessHour() * 3 == expected
         assert self.d + CustomBusinessHour(n=3) == expected
 
-    def testEQ(self):
+    def test_eq(self):
         for offset in [self.offset1, self.offset2]:
             assert offset == offset
 
@@ -1418,33 +1430,19 @@ def testEQ(self):
         assert (CustomBusinessHour(holidays=['2014-06-27']) !=
                 CustomBusinessHour(holidays=['2014-06-28']))
 
+    def test_sub(self):
+        # override the Base.test_sub implementation because self.offset2 is
+        # defined differently in this class than the test expects
+        pass
+
     def test_hash(self):
         assert hash(self.offset1) == hash(self.offset1)
         assert hash(self.offset2) == hash(self.offset2)
 
-    def testCall(self):
+    def test_call(self):
         assert self.offset1(self.d) == datetime(2014, 7, 1, 11)
         assert self.offset2(self.d) == datetime(2014, 7, 1, 11)
 
-    def testRAdd(self):
-        assert self.d + self.offset2 == self.offset2 + self.d
-
-    def testSub(self):
-        off = self.offset2
-        pytest.raises(Exception, off.__sub__, self.d)
-        assert 2 * off - off == off
-
-        assert self.d - self.offset2 == self.d - (2 * off - off)
-
-    def testRSub(self):
-        assert self.d - self.offset2 == (-self.offset2).apply(self.d)
-
-    def testMult1(self):
-        assert self.d + 5 * self.offset1 == self.d + self._offset(5)
-
-    def testMult2(self):
-        assert self.d + (-3 * self._offset(-2)) == self.d + self._offset(6)
-
     def testRollback1(self):
         assert self.offset1.rollback(self.d) == self.d
         assert self.offset2.rollback(self.d) == self.d
@@ -1484,49 +1482,51 @@ def 
test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2014, 7, 7, 9) - def test_normalize(self): - tests = [] - - tests.append((CustomBusinessHour(normalize=True, - holidays=self.holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3), - datetime(2014, 7, 1, 23): datetime(2014, 7, 3), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - tests.append((CustomBusinessHour(-1, normalize=True, - holidays=self.holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 6, 26), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 26), - datetime(2014, 7, 1, 0): datetime(2014, 6, 26), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - tests.append((CustomBusinessHour(1, normalize=True, start='17:00', - end='04:00', holidays=self.holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 3), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - for offset, cases in tests: - for dt, expected in compat.iteritems(cases): - assert offset.apply(dt) == expected + normalize_cases = [] + normalize_cases.append(( + CustomBusinessHour(normalize=True, holidays=holidays), + {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 3), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) + + normalize_cases.append(( + CustomBusinessHour(-1, normalize=True, holidays=holidays), + {datetime(2014, 7, 1, 8): datetime(2014, 6, 26), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 26), + datetime(2014, 7, 1, 0): datetime(2014, 6, 26), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) + + normalize_cases.append(( + CustomBusinessHour(1, normalize=True, + start='17:00', end='04:00', + holidays=holidays), + {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 3), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 
2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) + + @pytest.mark.parametrize('norm_cases', normalize_cases) + def test_normalize(self, norm_cases): + offset, cases = norm_cases + for dt, expected in compat.iteritems(cases): + assert offset.apply(dt) == expected def test_onOffset(self): tests = [] @@ -1544,75 +1544,75 @@ def test_onOffset(self): for dt, expected in compat.iteritems(cases): assert offset.onOffset(dt) == expected - def test_apply(self): - tests = [] - - tests.append(( - CustomBusinessHour(holidays=self.holidays), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, - 30)})) - - tests.append(( - CustomBusinessHour(4, holidays=self.holidays), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, - 30)})) - - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - def test_apply_nanoseconds(self): - tests = [] - - tests.append((CustomBusinessHour(holidays=self.holidays), - {Timestamp('2014-07-01 15:00') + Nano(5): Timestamp( - '2014-07-01 16:00') + Nano(5), - Timestamp('2014-07-01 16:00') + Nano(5): Timestamp( - '2014-07-03 09:00') + Nano(5), - Timestamp('2014-07-01 16:00') - Nano(5): Timestamp( - '2014-07-01 17:00') - Nano(5)})) - - tests.append((CustomBusinessHour(-1, holidays=self.holidays), - {Timestamp('2014-07-01 15:00') + Nano(5): Timestamp( - '2014-07-01 14:00') + Nano(5), - Timestamp('2014-07-01 10:00') + Nano(5): Timestamp( - '2014-07-01 09:00') + Nano(5), - Timestamp('2014-07-01 10:00') - Nano(5): Timestamp( - '2014-06-26 17:00') - Nano(5), })) + apply_cases = [] + apply_cases.append(( + CustomBusinessHour(holidays=holidays), + {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), + 
datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) + + apply_cases.append(( + CustomBusinessHour(4, holidays=holidays), + {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) + + @pytest.mark.parametrize('apply_case', apply_cases) + def test_apply(self, apply_case): + offset, cases = apply_case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + nano_cases = [] + nano_cases.append( + (CustomBusinessHour(holidays=holidays), + {Timestamp('2014-07-01 15:00') + Nano(5): + Timestamp('2014-07-01 16:00') + Nano(5), + Timestamp('2014-07-01 16:00') + Nano(5): + Timestamp('2014-07-03 09:00') + Nano(5), + Timestamp('2014-07-01 16:00') - Nano(5): + Timestamp('2014-07-01 17:00') - Nano(5)})) + + nano_cases.append( + (CustomBusinessHour(-1, holidays=holidays), + {Timestamp('2014-07-01 15:00') + Nano(5): + Timestamp('2014-07-01 14:00') + Nano(5), + Timestamp('2014-07-01 10:00') + Nano(5): + Timestamp('2014-07-01 09:00') + Nano(5), + Timestamp('2014-07-01 10:00') - Nano(5): + Timestamp('2014-06-26 17:00') - Nano(5)})) + + @pytest.mark.parametrize('nano_case', nano_cases) + def test_apply_nanoseconds(self, nano_case): + offset, cases = nano_case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) class TestCustomBusinessDay(Base): @@ -1623,6 +1623,7 @@ def setup_method(self, method): self.nd = np_datetime64_compat('2008-01-01 00:00:00Z') self.offset = CDay() + self.offset1 = self.offset self.offset2 = CDay(2) def test_different_normalize_equals(self): @@ -1644,7 +1645,7 @@ def test_with_offset(self): assert (self.d + offset) == datetime(2008, 1, 2, 2) - def testEQ(self): + def test_eq(self): assert self.offset2 == self.offset2 def test_mul(self): @@ -1653,29 +1654,10 @@ def test_mul(self): def test_hash(self): assert hash(self.offset2) == hash(self.offset2) - def testCall(self): + def test_call(self): assert self.offset2(self.d) == datetime(2008, 1, 3) assert self.offset2(self.nd) == datetime(2008, 1, 3) - def testRAdd(self): - 
assert self.d + self.offset2 == self.offset2 + self.d - - def testSub(self): - off = self.offset2 - pytest.raises(Exception, off.__sub__, self.d) - assert 2 * off - off == off - - assert self.d - self.offset2 == self.d + CDay(-2) - - def testRSub(self): - assert self.d - self.offset2 == (-self.offset2).apply(self.d) - - def testMult1(self): - assert self.d + 10 * self.offset == self.d + CDay(10) - - def testMult2(self): - assert self.d + (-5 * CDay(-10)) == self.d + CDay(50) - def testRollback1(self): assert CDay(10).rollback(self.d) == self.d @@ -1783,12 +1765,6 @@ def test_apply_large_n(self): def test_apply_corner(self): pytest.raises(Exception, CDay().apply, BMonthEnd()) - def test_offsets_compare_equal(self): - # root cause of #456 - offset1 = CDay() - offset2 = CDay() - assert not offset1 != offset2 - def test_holidays(self): # Define a TradingDay offset holidays = ['2012-05-01', datetime(2013, 5, 1), @@ -1857,10 +1833,11 @@ class CustomBusinessMonthBase(object): def setup_method(self, method): self.d = datetime(2008, 1, 1) - self.offset = self._object() - self.offset2 = self._object(2) + self.offset = self._offset() + self.offset1 = self.offset + self.offset2 = self._offset(2) - def testEQ(self): + def test_eq(self): assert self.offset2 == self.offset2 def test_mul(self): @@ -1869,47 +1846,23 @@ def test_mul(self): def test_hash(self): assert hash(self.offset2) == hash(self.offset2) - def testRAdd(self): - assert self.d + self.offset2 == self.offset2 + self.d - - def testSub(self): - off = self.offset2 - pytest.raises(Exception, off.__sub__, self.d) - assert 2 * off - off == off - - assert self.d - self.offset2 == self.d + self._object(-2) - - def testRSub(self): - assert self.d - self.offset2 == (-self.offset2).apply(self.d) - - def testMult1(self): - assert self.d + 10 * self.offset == self.d + self._object(10) - - def testMult2(self): - assert self.d + (-5 * self._object(-10)) == self.d + self._object(50) - - def test_offsets_compare_equal(self): - offset1 = self._object() - offset2 = self._object() - assert not offset1 != offset2 - def test_roundtrip_pickle(self): def _check_roundtrip(obj): unpickled = tm.round_trip_pickle(obj) assert unpickled == obj - _check_roundtrip(self._object()) - _check_roundtrip(self._object(2)) - _check_roundtrip(self._object() * 2) + _check_roundtrip(self._offset()) + _check_roundtrip(self._offset(2)) + _check_roundtrip(self._offset() * 2) def test_copy(self): # GH 17452 - off = self._object(weekmask='Mon Wed Fri') + off = self._offset(weekmask='Mon Wed Fri') assert off == off.copy() class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): - _object = CBMonthEnd + _offset = CBMonthEnd def test_different_normalize_equals(self): # equivalent in this special case @@ -2026,7 +1979,7 @@ def test_datetimeindex(self): class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): - _object = CBMonthBegin + _offset = CBMonthBegin def test_different_normalize_equals(self): # equivalent in this special case @@ -2144,6 +2097,9 @@ def test_datetimeindex(self): class TestWeek(Base): _offset = Week + d = Timestamp(datetime(2008, 1, 2)) + offset1 = _offset() + offset2 = _offset(2) def test_repr(self): assert repr(Week(weekday=0)) == "" @@ -2151,9 +2107,11 @@ def test_repr(self): assert repr(Week(n=-2, weekday=0)) == "<-2 * Weeks: weekday=0>" def test_corner(self): - pytest.raises(ValueError, Week, weekday=7) - tm.assert_raises_regex( - ValueError, "Day must be", Week, weekday=-1) + with pytest.raises(ValueError): + Week(weekday=7) + + with 
pytest.raises(ValueError, match="Day must be"): + Week(weekday=-1) def test_isAnchored(self): assert Week(weekday=0).isAnchored() @@ -2198,38 +2156,37 @@ def test_offset(self, case): for base, expected in compat.iteritems(cases): assert_offset_equal(offset, base, expected) - def test_onOffset(self): - for weekday in range(7): - offset = Week(weekday=weekday) - - for day in range(1, 8): - date = datetime(2008, 1, day) + @pytest.mark.parametrize('weekday', range(7)) + def test_onOffset(self, weekday): + offset = Week(weekday=weekday) - if day % 7 == weekday: - expected = True - else: - expected = False - assert_onOffset(offset, date, expected) + for day in range(1, 8): + date = datetime(2008, 1, day) - def test_offsets_compare_equal(self): - # root cause of #456 - offset1 = Week() - offset2 = Week() - assert not offset1 != offset2 + if day % 7 == weekday: + expected = True + else: + expected = False + assert_onOffset(offset, date, expected) class TestWeekOfMonth(Base): _offset = WeekOfMonth + offset1 = _offset() + offset2 = _offset(2) def test_constructor(self): - tm.assert_raises_regex(ValueError, "^Week", WeekOfMonth, - n=1, week=4, weekday=0) - tm.assert_raises_regex(ValueError, "^Week", WeekOfMonth, - n=1, week=-1, weekday=0) - tm.assert_raises_regex(ValueError, "^Day", WeekOfMonth, - n=1, week=0, weekday=-1) - tm.assert_raises_regex(ValueError, "^Day", WeekOfMonth, - n=1, week=0, weekday=7) + with pytest.raises(ValueError, match="^Week"): + WeekOfMonth(n=1, week=4, weekday=0) + + with pytest.raises(ValueError, match="^Week"): + WeekOfMonth(n=1, week=-1, weekday=0) + + with pytest.raises(ValueError, match="^Day"): + WeekOfMonth(n=1, week=0, weekday=-1) + + with pytest.raises(ValueError, match="^Day"): + WeekOfMonth(n=1, week=0, weekday=-7) def test_repr(self): assert (repr(WeekOfMonth(weekday=1, week=2)) == @@ -2316,15 +2273,18 @@ def test_onOffset(self, case): class TestLastWeekOfMonth(Base): _offset = LastWeekOfMonth + offset1 = _offset() + offset2 = _offset(2) def test_constructor(self): - tm.assert_raises_regex(ValueError, "^N cannot be 0", - LastWeekOfMonth, n=0, weekday=1) + with pytest.raises(ValueError, match="^N cannot be 0"): + LastWeekOfMonth(n=0, weekday=1) + + with pytest.raises(ValueError, match="^Day"): + LastWeekOfMonth(n=1, weekday=-1) - tm.assert_raises_regex(ValueError, "^Day", LastWeekOfMonth, n=1, - weekday=-1) - tm.assert_raises_regex( - ValueError, "^Day", LastWeekOfMonth, n=1, weekday=7) + with pytest.raises(ValueError, match="^Day"): + LastWeekOfMonth(n=1, weekday=7) def test_offset(self): # Saturday @@ -2390,6 +2350,8 @@ def test_onOffset(self, case): class TestSemiMonthEnd(Base): _offset = SemiMonthEnd + offset1 = _offset() + offset2 = _offset(2) def test_offset_whole_year(self): dates = (datetime(2007, 12, 31), @@ -2560,6 +2522,8 @@ def test_vectorized_offset_addition(self, klass, assert_func): class TestSemiMonthBegin(Base): _offset = SemiMonthBegin + offset1 = _offset() + offset2 = _offset(2) def test_offset_whole_year(self): dates = (datetime(2007, 12, 15), @@ -2767,9 +2731,9 @@ def test_get_offset_name(self): def test_get_offset(): - with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): + with pytest.raises(ValueError, match=_INVALID_FREQ_ERROR): get_offset('gibberish') - with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): + with pytest.raises(ValueError, match=_INVALID_FREQ_ERROR): get_offset('QS-JAN-B') pairs = [ @@ -2787,7 +2751,7 @@ def test_get_offset(): def test_get_offset_legacy(): pairs = [('w@Sat', Week(weekday=5))] for name, 
expected in pairs: - with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): + with pytest.raises(ValueError, match=_INVALID_FREQ_ERROR): get_offset(name) From 0e05de4d4a5b099ef9c6797e3b1376d521965c67 Mon Sep 17 00:00:00 2001 From: Steve Baker Date: Wed, 13 Jun 2018 05:12:38 -0700 Subject: [PATCH 031/113] DOC: isin() docstring change DataFrame to pd.DataFrame (#21403) --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0985de3126c5a..02c86d2f4dcc8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7284,11 +7284,11 @@ def isin(self, values): When ``values`` is a Series or DataFrame: >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) - >>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']}) - >>> df.isin(other) + >>> df2 = pd.DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']}) + >>> df.isin(df2) A B 0 True False - 1 False False # Column A in `other` has a 3, but not at index 1. + 1 False False # Column A in `df2` has a 3, but not at index 1. 2 True True """ if isinstance(values, dict): From 159756ebeba7ee3a03855d8d80adef199dc17d80 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 13 Jun 2018 15:24:01 +0200 Subject: [PATCH 032/113] BUG: fix get_indexer_non_unique with CategoricalIndex key (#21457) closes #21448 --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/core/indexes/base.py | 3 +++ pandas/core/indexes/category.py | 7 ++++++- pandas/tests/categorical/test_indexing.py | 20 +++++++++++++++++++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 1de44ffeb4160..3e4326dea2ecc 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -51,7 +51,7 @@ Bug Fixes **Indexing** -- +- Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - **I/O** diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bf1051332ee19..d9e4ef7db1158 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,6 +31,7 @@ is_dtype_equal, is_dtype_union_equal, is_object_dtype, + is_categorical, is_categorical_dtype, is_interval_dtype, is_period_dtype, @@ -3300,6 +3301,8 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = _ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..587090fa72def 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -598,7 +598,12 @@ def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): - target = target.categories + # Indexing on codes is more efficient if categories are the same: + if target.categories is self.categories: + target = target.codes + indexer, missing = self._engine.get_indexer_non_unique(target) + return _ensure_platform_int(indexer), missing + target = target.values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) diff --git a/pandas/tests/categorical/test_indexing.py 
b/pandas/tests/categorical/test_indexing.py index 9c27b1101e5ca..cf7b5cfa55882 100644 --- a/pandas/tests/categorical/test_indexing.py +++ b/pandas/tests/categorical/test_indexing.py @@ -5,7 +5,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import Categorical, Index, PeriodIndex +from pandas import Categorical, Index, CategoricalIndex, PeriodIndex from pandas.tests.categorical.common import TestCategorical @@ -103,3 +103,21 @@ def f(): s.categories = [1, 2] pytest.raises(ValueError, f) + + # Combinations of sorted/unique: + @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], + [1, 3, 3, 4], [1, 2, 2, 4]]) + # Combinations of missing/unique + @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) + @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) + def test_get_indexer_non_unique(self, idx_values, key_values, key_class): + # GH 21448 + key = key_class(key_values, categories=range(1, 5)) + # Test for flat index and CategoricalIndex with same/different cats: + for dtype in None, 'category', key.dtype: + idx = Index(idx_values, dtype=dtype) + expected, exp_miss = idx.get_indexer_non_unique(key_values) + result, res_miss = idx.get_indexer_non_unique(key) + + tm.assert_numpy_array_equal(expected, result) + tm.assert_numpy_array_equal(exp_miss, res_miss) From 70a3d6da3606a55d9921ef26b3af321f3f693575 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 13 Jun 2018 12:59:20 -0700 Subject: [PATCH 033/113] BUG: Fix DateOffset eq to depend on normalize attr (#21404) Partially addresses gh-18854 --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/tests/tseries/offsets/test_offsets.py | 50 +++++++++----------- pandas/tseries/offsets.py | 2 +- 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 68c1839221508..6d5e40d37c8df 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -91,7 +91,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- +- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) - - diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 8bf0d9f915d04..6fd525f02f55c 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -564,11 +564,10 @@ def setup_method(self, method): self.offset2 = BDay(2) def test_different_normalize_equals(self): - # equivalent in this special case - offset = BDay() - offset2 = BDay() - offset2.normalize = True - assert offset == offset2 + # GH#21404 changed __eq__ to return False when `normalize` doesnt match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 def test_repr(self): assert repr(self.offset) == '' @@ -734,11 +733,10 @@ def test_constructor_errors(self): BusinessHour(start='14:00:05') def test_different_normalize_equals(self): - # equivalent in this special case + # GH#21404 changed __eq__ to return False when `normalize` doesnt match offset = self._offset() - offset2 = self._offset() - offset2.normalize = True - assert offset == offset2 + offset2 = self._offset(normalize=True) + assert offset != offset2 def test_repr(self): assert repr(self.offset1) == '' @@ -1397,11 +1395,10 @@ def test_constructor_errors(self): CustomBusinessHour(start='14:00:05') def test_different_normalize_equals(self): - # equivalent in this special case + # GH#21404 
changed __eq__ to return False when `normalize` doesnt match offset = self._offset() - offset2 = self._offset() - offset2.normalize = True - assert offset == offset2 + offset2 = self._offset(normalize=True) + assert offset != offset2 def test_repr(self): assert repr(self.offset1) == '' @@ -1627,11 +1624,10 @@ def setup_method(self, method): self.offset2 = CDay(2) def test_different_normalize_equals(self): - # equivalent in this special case - offset = CDay() - offset2 = CDay() - offset2.normalize = True - assert offset == offset2 + # GH#21404 changed __eq__ to return False when `normalize` doesnt match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 def test_repr(self): assert repr(self.offset) == '' @@ -1865,11 +1861,10 @@ class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): _offset = CBMonthEnd def test_different_normalize_equals(self): - # equivalent in this special case - offset = CBMonthEnd() - offset2 = CBMonthEnd() - offset2.normalize = True - assert offset == offset2 + # GH#21404 changed __eq__ to return False when `normalize` doesnt match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 def test_repr(self): assert repr(self.offset) == '' @@ -1982,11 +1977,10 @@ class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): _offset = CBMonthBegin def test_different_normalize_equals(self): - # equivalent in this special case - offset = CBMonthBegin() - offset2 = CBMonthBegin() - offset2.normalize = True - assert offset == offset2 + # GH#21404 changed __eq__ to return False when `normalize` doesnt match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 def test_repr(self): assert repr(self.offset) == '' diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a5a983bf94bb8..99f97d8fc7bc0 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -290,7 +290,7 @@ def _params(self): all_paras = self.__dict__.copy() if 'holidays' in all_paras and not all_paras['holidays']: all_paras.pop('holidays') - exclude = ['kwds', 'name', 'normalize', 'calendar'] + exclude = ['kwds', 'name', 'calendar'] attrs = [(k, v) for k, v in all_paras.items() if (k not in exclude) and (k[0] != '_')] attrs = sorted(set(attrs)) From 7d664d5b977feee64d93a23a37ee6b593af2f06b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 14 Jun 2018 03:05:28 -0700 Subject: [PATCH 034/113] API/BUG: DatetimeIndex correctly localizes integer data (#21216) --- doc/source/whatsnew/v0.24.0.txt | 4 +- pandas/core/indexes/base.py | 4 + pandas/core/indexes/datetimes.py | 82 ++++++++----------- pandas/tests/indexes/datetimes/test_astype.py | 10 +++ .../indexes/datetimes/test_construction.py | 60 +++++++++----- pandas/tests/indexes/test_base.py | 27 +++--- 6 files changed, 105 insertions(+), 82 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6d5e40d37c8df..c29197725a2b6 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -36,7 +36,7 @@ Datetimelike API Changes Other API Changes ^^^^^^^^^^^^^^^^^ -- +- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) - - @@ -92,7 +92,7 @@ Datetimelike ^^^^^^^^^^^^ - Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer 
data correctly (:issue:`20964`) - Timedelta diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d9e4ef7db1158..36345a32a3bf7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1175,6 +1175,10 @@ def astype(self, dtype, copy=True): return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) try: + if is_datetime64tz_dtype(dtype): + from pandas.core.indexes.datetimes import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) return Index(self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype) except (TypeError, ValueError): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 66622814f172d..e944df7aa83c6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -395,57 +395,43 @@ def __new__(cls, data=None, # data must be Index or np.ndarray here if not (is_datetime64_dtype(data) or is_datetimetz(data) or - is_integer_dtype(data)): + is_integer_dtype(data) or lib.infer_dtype(data) == 'integer'): data = tools.to_datetime(data, dayfirst=dayfirst, yearfirst=yearfirst) - if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): - - if isinstance(data, DatetimeIndex): - if tz is None: - tz = data.tz - elif data.tz is None: - data = data.tz_localize(tz, ambiguous=ambiguous) - else: - # the tz's must match - if str(tz) != str(data.tz): - msg = ('data is already tz-aware {0}, unable to ' - 'set specified tz: {1}') - raise TypeError(msg.format(data.tz, tz)) + if isinstance(data, DatetimeIndex): + if tz is None: + tz = data.tz + elif data.tz is None: + data = data.tz_localize(tz, ambiguous=ambiguous) + else: + # the tz's must match + if str(tz) != str(data.tz): + msg = ('data is already tz-aware {0}, unable to ' + 'set specified tz: {1}') + raise TypeError(msg.format(data.tz, tz)) - subarr = data.values + subarr = data.values - if freq is None: - freq = data.freq - verify_integrity = False - else: - if data.dtype != _NS_DTYPE: - subarr = conversion.ensure_datetime64ns(data) - else: - subarr = data + if freq is None: + freq = data.freq + verify_integrity = False + elif issubclass(data.dtype.type, np.datetime64): + if data.dtype != _NS_DTYPE: + data = conversion.ensure_datetime64ns(data) + if tz is not None: + # Convert tz-naive to UTC + tz = timezones.maybe_get_tz(tz) + data = conversion.tz_localize_to_utc(data.view('i8'), tz, + ambiguous=ambiguous) + subarr = data.view(_NS_DTYPE) else: # must be integer dtype otherwise - if isinstance(data, Int64Index): - raise TypeError('cannot convert Int64Index->DatetimeIndex') + # assume this data are epoch timestamps if data.dtype != _INT64_DTYPE: - data = data.astype(np.int64) + data = data.astype(np.int64, copy=False) subarr = data.view(_NS_DTYPE) - if isinstance(subarr, DatetimeIndex): - if tz is None: - tz = subarr.tz - else: - if tz is not None: - tz = timezones.maybe_get_tz(tz) - - if (not isinstance(data, DatetimeIndex) or - getattr(data, 'tz', None) is None): - # Convert tz-naive to UTC - ints = subarr.view('i8') - subarr = conversion.tz_localize_to_utc(ints, tz, - ambiguous=ambiguous) - subarr = subarr.view(_NS_DTYPE) - subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) if dtype is not None: if not is_dtype_equal(subarr.dtype, dtype): @@ -807,8 +793,9 @@ def _mpl_repr(self): @cache_readonly def _is_dates_only(self): + """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only - return 
_is_dates_only(self.values) + return _is_dates_only(self.values) and self.tz is None @property def _formatter_func(self): @@ -1244,7 +1231,7 @@ def join(self, other, how='left', level=None, return_indexers=False, See Index.join """ if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'mixed-integer', + other.inferred_type not in ('floating', 'integer', 'mixed-integer', 'mixed-integer-float', 'mixed')): try: other = DatetimeIndex(other) @@ -2100,8 +2087,9 @@ def normalize(self): dtype='datetime64[ns, Asia/Calcutta]', freq=None) """ new_values = conversion.date_normalize(self.asi8, self.tz) - return DatetimeIndex(new_values, freq='infer', name=self.name, - tz=self.tz) + return DatetimeIndex(new_values, + freq='infer', + name=self.name).tz_localize(self.tz) @Substitution(klass='DatetimeIndex') @Appender(_shared_docs['searchsorted']) @@ -2182,8 +2170,6 @@ def insert(self, loc, item): try: new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) - if self.tz is not None: - new_dates = conversion.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) except (AttributeError, TypeError): @@ -2221,8 +2207,6 @@ def delete(self, loc): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq - if self.tz is not None: - new_dates = conversion.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) def tz_convert(self, tz): diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 8acdd301f241a..64b8f48f6a4e1 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -225,6 +225,16 @@ def _check_rng(rng): _check_rng(rng_eastern) _check_rng(rng_utc) + @pytest.mark.parametrize('tz, dtype', [ + ['US/Pacific', 'datetime64[ns, US/Pacific]'], + [None, 'datetime64[ns]']]) + def test_integer_index_astype_datetime(self, tz, dtype): + # GH 20997, 20964 + val = [pd.Timestamp('2018-01-01', tz=tz).value] + result = pd.Index(val).astype(dtype) + expected = pd.DatetimeIndex(['2018-01-01'], tz=tz) + tm.assert_index_equal(result, expected) + class TestToPeriod(object): diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index b138b79caac76..f7682a965c038 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -1,8 +1,10 @@ -import pytest +from datetime import timedelta +from operator import attrgetter +from functools import partial +import pytest import pytz import numpy as np -from datetime import timedelta import pandas as pd from pandas import offsets @@ -26,25 +28,28 @@ def test_construction_caching(self): freq='ns')}) assert df.dttz.dtype.tz.zone == 'US/Eastern' - def test_construction_with_alt(self): - - i = pd.date_range('20130101', periods=5, freq='H', tz='US/Eastern') - i2 = DatetimeIndex(i, dtype=i.dtype) - tm.assert_index_equal(i, i2) - assert i.tz.zone == 'US/Eastern' - - i2 = DatetimeIndex(i.tz_localize(None).asi8, tz=i.dtype.tz) - tm.assert_index_equal(i, i2) - assert i.tz.zone == 'US/Eastern' - - i2 = DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype) - tm.assert_index_equal(i, i2) - assert i.tz.zone == 'US/Eastern' - - i2 = DatetimeIndex( - i.tz_localize(None).asi8, dtype=i.dtype, tz=i.dtype.tz) - tm.assert_index_equal(i, i2) - assert i.tz.zone == 
'US/Eastern' + @pytest.mark.parametrize('kwargs', [ + {'tz': 'dtype.tz'}, + {'dtype': 'dtype'}, + {'dtype': 'dtype', 'tz': 'dtype.tz'}]) + def test_construction_with_alt(self, kwargs, tz_aware_fixture): + tz = tz_aware_fixture + i = pd.date_range('20130101', periods=5, freq='H', tz=tz) + kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} + result = DatetimeIndex(i, **kwargs) + tm.assert_index_equal(i, result) + + @pytest.mark.parametrize('kwargs', [ + {'tz': 'dtype.tz'}, + {'dtype': 'dtype'}, + {'dtype': 'dtype', 'tz': 'dtype.tz'}]) + def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): + tz = tz_aware_fixture + i = pd.date_range('20130101', periods=5, freq='H', tz=tz) + kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} + result = DatetimeIndex(i.tz_localize(None).asi8, **kwargs) + expected = i.tz_localize(None).tz_localize('UTC').tz_convert(tz) + tm.assert_index_equal(result, expected) # localize into the provided tz i2 = DatetimeIndex(i.tz_localize(None).asi8, tz='UTC') @@ -478,6 +483,19 @@ def test_constructor_timestamp_near_dst(self): ts[1].to_pydatetime()]) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('klass', [Index, DatetimeIndex]) + @pytest.mark.parametrize('box', [ + np.array, partial(np.array, dtype=object), list]) + @pytest.mark.parametrize('tz, dtype', [ + ['US/Pacific', 'datetime64[ns, US/Pacific]'], + [None, 'datetime64[ns]']]) + def test_constructor_with_int_tz(self, klass, box, tz, dtype): + # GH 20997, 20964 + ts = Timestamp('2018-01-01', tz=tz) + result = klass(box([ts.value]), dtype=dtype) + expected = klass([ts]) + assert result == expected + class TestTimeSeries(object): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c264f5f79e47e..b8bd218ec25ab 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -402,26 +402,33 @@ def test_constructor_dtypes_to_timedelta(self, cast_index, vals): index = Index(vals) assert isinstance(index, TimedeltaIndex) - @pytest.mark.parametrize("values", [ - # pass values without timezone, as DatetimeIndex localizes it - pd.date_range('2011-01-01', periods=5).values, - pd.date_range('2011-01-01', periods=5).asi8]) + @pytest.mark.parametrize("attr, utc", [ + ['values', False], + ['asi8', True]]) @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex]) - def test_constructor_dtypes_datetime(self, tz_naive_fixture, values, + def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, klass): - index = pd.date_range('2011-01-01', periods=5, tz=tz_naive_fixture) + # Test constructing with a datetimetz dtype + # .values produces numpy datetimes, so these are considered naive + # .asi8 produces integers, so these are considered epoch timestamps + index = pd.date_range('2011-01-01', periods=5) + arg = getattr(index, attr) + if utc: + index = index.tz_localize('UTC').tz_convert(tz_naive_fixture) + else: + index = index.tz_localize(tz_naive_fixture) dtype = index.dtype - result = klass(values, tz=tz_naive_fixture) + result = klass(arg, tz=tz_naive_fixture) tm.assert_index_equal(result, index) - result = klass(values, dtype=dtype) + result = klass(arg, dtype=dtype) tm.assert_index_equal(result, index) - result = klass(list(values), tz=tz_naive_fixture) + result = klass(list(arg), tz=tz_naive_fixture) tm.assert_index_equal(result, index) - result = klass(list(values), dtype=dtype) + result = klass(list(arg), dtype=dtype) tm.assert_index_equal(result, index) 
@pytest.mark.parametrize("attr", ['values', 'asi8']) From 7463576ca94f6ec9f74b1e28ee96600972ed12d3 Mon Sep 17 00:00:00 2001 From: Wenhuan Date: Thu, 14 Jun 2018 18:08:57 +0800 Subject: [PATCH 035/113] PERF: improve performance of groupby rank (#21237) (#21285) --- asv_bench/benchmarks/groupby.py | 21 +++++- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/_libs/groupby_helper.pxi.in | 103 ++++++++++++++--------------- 3 files changed, 71 insertions(+), 54 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7777322071957..0725bbeb6c36d 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,7 +5,7 @@ import numpy as np from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, - TimeGrouper, Categorical) + TimeGrouper, Categorical, Timestamp) import pandas.util.testing as tm from .pandas_vb_common import setup # noqa @@ -385,6 +385,25 @@ def time_dtype_as_field(self, dtype, method, application): self.as_field_method() +class RankWithTies(object): + # GH 21237 + goal_time = 0.2 + param_names = ['dtype', 'tie_method'] + params = [['float64', 'float32', 'int64', 'datetime64'], + ['first', 'average', 'dense', 'min', 'max']] + + def setup(self, dtype, tie_method): + N = 10**4 + if dtype == 'datetime64': + data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) + else: + data = np.array([1] * N, dtype=dtype) + self.df = DataFrame({'values': data, 'key': ['foo'] * N}) + + def time_rank_ties(self, dtype, tie_method): + self.df.groupby('key').rank(method=tie_method) + + class Float32(object): # GH 13335 goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c29197725a2b6..f079f151808cc 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -65,6 +65,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - .. _whatsnew_0240.docs: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b3e9b7c9e69ee..0062a6c8d31ab 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, is_datetimelike : bool, default False unused in this method but provided for call compatibility with other Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default + 'average' * average: average rank of group * min: lowest rank in group * max: highest rank in group @@ -514,26 +515,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - # if keep_na, check for missing values and assign back - # to the result where appropriate - - if keep_na and mask[_as[i]]: - grp_na_count += 1 - out[_as[i], 0] = nan - else: - # this implementation is inefficient because it will - # continue overwriting previously encountered dups - # i.e. 
if 5 duplicated values are encountered it will - # write to the result as follows (assumes avg tiebreaker): - # 1 - # .5 .5 - # .33 .33 .33 - # .25 .25 .25 .25 - # .2 .2 .2 .2 .2 - # - # could potentially be optimized to only write to the - # result once the last duplicate value is encountered - if tiebreak == TIEBREAK_AVERAGE: + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the the starting index of the current group (grp_start) + # and the current index + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]]) or + (labels[_as[i]] != labels[_as[i+1]])): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = nan + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: @@ -552,38 +549,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is coming - # up. the conditional also needs to handle nan equality and the - # end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - val_start = i - grp_vals_seen += 1 - grp_tie_count +=1 - - # Similar to the previous conditional, check now if we are moving - # to a new group. If so, keep track of the index where the new - # group occurs, so the tiebreaker calculations can decrement that - # from their position. fill in the size of each group encountered - # (used by pct calculations later). also be sure to reset any of - # the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]])): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). 
also be + # sure to reset any of the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (i - grp_start + 1 - + grp_na_count) + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 if pct: for i in range(N): From f5043003e87d0aa2af5ea5f490cd9246005bd263 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Jun 2018 03:12:01 -0700 Subject: [PATCH 036/113] PERF: typing and cdefs for tslibs.resolution (#21452) --- pandas/_libs/tslibs/resolution.pyx | 71 ++++++++++++++++++------------ 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 2f185f4142a09..210b201cd08ea 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -5,7 +5,7 @@ from cython cimport Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport ndarray, int64_t, int32_t cnp.import_array() from util cimport is_string_object, get_nat @@ -45,12 +45,12 @@ cdef int RESO_MIN = 4 cdef int RESO_HR = 5 cdef int RESO_DAY = 6 -_ONE_MICRO = 1000L -_ONE_MILLI = _ONE_MICRO * 1000 -_ONE_SECOND = _ONE_MILLI * 1000 -_ONE_MINUTE = 60 * _ONE_SECOND -_ONE_HOUR = 60 * _ONE_MINUTE -_ONE_DAY = 24 * _ONE_HOUR +_ONE_MICRO = 1000L +_ONE_MILLI = (_ONE_MICRO * 1000) +_ONE_SECOND = (_ONE_MILLI * 1000) +_ONE_MINUTE = (60 * _ONE_SECOND) +_ONE_HOUR = (60 * _ONE_MINUTE) +_ONE_DAY = (24 * _ONE_HOUR) # ---------------------------------------------------------------------- @@ -350,7 +350,7 @@ class Resolution(object): # TODO: this is non performant logic here (and duplicative) and this # simply should call unique_1d directly # plus no reason to depend on khash directly -cdef unique_deltas(ndarray[int64_t] arr): +cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): cdef: Py_ssize_t i, n = len(arr) int64_t val @@ -374,21 +374,27 @@ cdef unique_deltas(ndarray[int64_t] arr): return result -def _is_multiple(us, mult): +cdef inline bint _is_multiple(int64_t us, int64_t mult): return us % mult == 0 -def _maybe_add_count(base, count): +cdef inline str _maybe_add_count(str base, int64_t count): if count != 1: - return '{count}{base}'.format(count=int(count), base=base) + return '{count}{base}'.format(count=count, base=base) else: return base -class _FrequencyInferer(object): +cdef class _FrequencyInferer(object): """ Not sure if I can avoid the state machine here """ + cdef public: + object index + object values + bint warn + bint is_monotonic + dict _cache def __init__(self, index, warn=True): self.index = index @@ -476,16 +482,23 @@ class _FrequencyInferer(object): def rep_stamp(self): return Timestamp(self.values[0]) - def month_position_check(self): + cdef month_position_check(self): # TODO: cythonize this, very slow - calendar_end = True - business_end = True - calendar_start = True - business_start = True - - years = self.fields['Y'] - months = self.fields['M'] - days = self.fields['D'] + cdef: + int32_t daysinmonth, y, m, d + bint calendar_end = True + bint business_end = True + bint calendar_start = True + bint business_start = True + bint cal + int32_t[:] years + int32_t[:] months + int32_t[:] days + + fields = self.fields + years = fields['Y'] + months = fields['M'] + days = fields['D'] weekdays 
= self.index.dayofweek for y, m, d, wd in zip(years, months, days, weekdays): @@ -525,7 +538,7 @@ class _FrequencyInferer(object): def ydiffs(self): return unique_deltas(self.fields['Y'].astype('i8')) - def _infer_daily_rule(self): + cdef _infer_daily_rule(self): annual_rule = self._get_annual_rule() if annual_rule: nyears = self.ydiffs[0] @@ -562,7 +575,7 @@ class _FrequencyInferer(object): if wom_rule: return wom_rule - def _get_annual_rule(self): + cdef _get_annual_rule(self): if len(self.ydiffs) > 1: return None @@ -573,7 +586,7 @@ class _FrequencyInferer(object): return {'cs': 'AS', 'bs': 'BAS', 'ce': 'A', 'be': 'BA'}.get(pos_check) - def _get_quarterly_rule(self): + cdef _get_quarterly_rule(self): if len(self.mdiffs) > 1: return None @@ -584,14 +597,14 @@ class _FrequencyInferer(object): return {'cs': 'QS', 'bs': 'BQS', 'ce': 'Q', 'be': 'BQ'}.get(pos_check) - def _get_monthly_rule(self): + cdef _get_monthly_rule(self): if len(self.mdiffs) > 1: return None pos_check = self.month_position_check() return {'cs': 'MS', 'bs': 'BMS', 'ce': 'M', 'be': 'BM'}.get(pos_check) - def _is_business_daily(self): + cdef bint _is_business_daily(self): # quick check: cannot be business daily if self.day_deltas != [1, 3]: return False @@ -604,7 +617,7 @@ class _FrequencyInferer(object): return np.all(((weekdays == 0) & (shifts == 3)) | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) - def _get_wom_rule(self): + cdef _get_wom_rule(self): # wdiffs = unique(np.diff(self.index.week)) # We also need -47, -49, -48 to catch index spanning year boundary # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): @@ -627,9 +640,9 @@ class _FrequencyInferer(object): return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) -class _TimedeltaFrequencyInferer(_FrequencyInferer): +cdef class _TimedeltaFrequencyInferer(_FrequencyInferer): - def _infer_daily_rule(self): + cdef _infer_daily_rule(self): if self.is_unique: days = self.deltas[0] / _ONE_DAY if days % 7 == 0: From 07381bb71fc14c63431e4a19cd5fc019692ad510 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Jun 2018 03:18:23 -0700 Subject: [PATCH 037/113] disallow normalize=True with Tick classes (#21427) --- doc/source/whatsnew/v0.24.0.txt | 35 ++++++++++++++++++++ pandas/tests/tseries/offsets/test_offsets.py | 22 ++++++++++-- pandas/tseries/offsets.py | 4 ++- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index f079f151808cc..cae05446c00e6 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -24,6 +24,41 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0240.api.datetimelike.normalize + +Tick DateOffset Normalize Restrictions +-------------------------------------- + +Creating a ``Tick`` object (:class:``Day``, :class:``Hour``, :class:``Minute``, +:class:``Second``, :class:``Milli``, :class:``Micro``, :class:``Nano``) with +`normalize=True` is no longer supported. This prevents unexpected behavior +where addition could fail to be monotone or associative. (:issue:`21427`) + +.. ipython:: python + + ts = pd.Timestamp('2018-06-11 18:01:14') + ts + tic = pd.offsets.Hour(n=2, normalize=True) + tic + +Previous Behavior: + +.. code-block:: ipython + + In [4]: ts + tic + Out [4]: Timestamp('2018-06-11 00:00:00') + + In [5]: ts + tic + tic + tic == ts + (tic + tic + tic) + Out [5]: False + +Current Behavior: + +.. 
ipython:: python + + tic = pd.offsets.Hour(n=2) + ts + tic + tic + tic == ts + (tic + tic + tic) + + .. _whatsnew_0240.api.datetimelike: Datetimelike API Changes diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6fd525f02f55c..5dd2a199405bf 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -28,7 +28,7 @@ YearEnd, Day, QuarterEnd, BusinessMonthEnd, FY5253, Nano, Easter, FY5253Quarter, - LastWeekOfMonth) + LastWeekOfMonth, Tick) from pandas.core.tools.datetimes import format, ole2datetime import pandas.tseries.offsets as offsets from pandas.io.pickle import read_pickle @@ -270,6 +270,11 @@ def test_offset_freqstr(self, offset_types): def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=False): + + if normalize and issubclass(offset, Tick): + # normalize=True disallowed for Tick subclasses GH#21427 + return + offset_s = self._get_offset(offset, normalize=normalize) func = getattr(offset_s, funcname) @@ -458,6 +463,9 @@ def test_onOffset(self, offset_types): assert offset_s.onOffset(dt) # when normalize=True, onOffset checks time is 00:00:00 + if issubclass(offset_types, Tick): + # normalize=True disallowed for Tick subclasses GH#21427 + return offset_n = self._get_offset(offset_types, normalize=True) assert not offset_n.onOffset(dt) @@ -485,7 +493,9 @@ def test_add(self, offset_types, tz): assert isinstance(result, Timestamp) assert result == expected_localize - # normalize=True + # normalize=True, disallowed for Tick subclasses GH#21427 + if issubclass(offset_types, Tick): + return offset_s = self._get_offset(offset_types, normalize=True) expected = Timestamp(expected.date()) @@ -3092,6 +3102,14 @@ def test_require_integers(offset_types): cls(n=1.5) +def test_tick_normalize_raises(tick_classes): + # check that trying to create a Tick object with normalize=True raises + # GH#21427 + cls = tick_classes + with pytest.raises(ValueError): + cls(n=3, normalize=True) + + def test_weeks_onoffset(): # GH#18510 Week with weekday = None, normalize = False should always # be onOffset diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 99f97d8fc7bc0..2f4989f26b394 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -2217,8 +2217,10 @@ class Tick(SingleConstructorOffset): _attributes = frozenset(['n', 'normalize']) def __init__(self, n=1, normalize=False): - # TODO: do Tick classes with normalize=True make sense? 
self.n = self._validate_n(n) + if normalize: + raise ValueError("Tick offset with `normalize=True` are not " + "allowed.") # GH#21427 self.normalize = normalize __gt__ = _tick_comp(operator.gt) From e2fb27adb691a0191b9bc459604e17cc1c367480 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Thu, 14 Jun 2018 15:53:14 +0530 Subject: [PATCH 038/113] CLN: Comparison methods for MultiIndex should have consistent behaviour for all nlevels (GH21149) (#21195) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/indexes/base.py | 3 ++- pandas/tests/indexes/test_multi.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 3e4326dea2ecc..0d3f9cb8dd3b6 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -52,6 +52,7 @@ Bug Fixes **Indexing** - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) +- Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) - **I/O** diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 36345a32a3bf7..4b32e5d4f5654 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -91,7 +91,8 @@ def cmp_method(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - if is_object_dtype(self) and self.nlevels == 1: + from .multi import MultiIndex + if is_object_dtype(self) and not isinstance(self, MultiIndex): # don't pass MultiIndex with np.errstate(all='ignore'): result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 0ab3447909d9b..ab53002ee1587 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -3307,3 +3307,20 @@ def test_duplicate_multiindex_labels(self): with pytest.raises(ValueError): ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], inplace=True) + + def test_multiindex_compare(self): + # GH 21149 + # Ensure comparison operations for MultiIndex with nlevels == 1 + # behave consistently with those for MultiIndex with nlevels > 1 + + midx = pd.MultiIndex.from_product([[0, 1]]) + + # Equality self-test: MultiIndex object vs self + expected = pd.Series([True, True]) + result = pd.Series(midx == midx) + tm.assert_series_equal(result, expected) + + # Greater than comparison: MultiIndex object vs self + expected = pd.Series([False, False]) + result = pd.Series(midx > midx) + tm.assert_series_equal(result, expected) From f57e0eb854f1a4d2c2de61cf65331a6dac507adf Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 14 Jun 2018 11:38:23 +0100 Subject: [PATCH 039/113] PERF: Add __contains__ to CategoricalIndex (#21369) --- asv_bench/benchmarks/categoricals.py | 13 +++++++++++++ doc/source/whatsnew/v0.23.2.txt | 4 +++- pandas/core/indexes/category.py | 28 ++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 5464e7cba22c3..48f42621d183d 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -193,3 +193,16 @@ def time_categorical_series_is_monotonic_increasing(self): def time_categorical_series_is_monotonic_decreasing(self): 
self.s.is_monotonic_decreasing + + +class Contains(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.ci = tm.makeCategoricalIndex(N) + self.cat = self.ci.categories[0] + + def time_contains(self): + self.cat in self.ci diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 0d3f9cb8dd3b6..79a4c3da2ffa4 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -24,7 +24,9 @@ Fixed Regressions Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Improved performance of membership checks in :class:`CategoricalIndex` + (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` + is likewise much faster (:issue:`21369`) - Documentation Changes diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 587090fa72def..7f2860a963423 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -325,19 +325,31 @@ def _reverse_indexer(self): def __contains__(self, key): hash(key) - if self.categories._defer_to_indexing: - return key in self.categories + if isna(key): # if key is a NaN, check if any NaN is in self. + return self.isna().any() + + # is key in self.categories? Then get its location. + # If not (i.e. KeyError), it logically can't be in self either + try: + loc = self.categories.get_loc(key) + except KeyError: + return False - return key in self.values + # loc is the location of key in self.categories, but also the value + # for key in self.codes and in self._engine. key may be in categories, + # but still not in self, check this. Example: + # 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in self._engine + else: + # if self.categories is IntervalIndex, loc is an array + # check if any scalar of the array is in self._engine + return any(loc_ in self._engine for loc_ in loc) @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): hash(key) - - if self.categories._defer_to_indexing: - return self.categories.contains(key) - - return key in self.values + return key in self def __array__(self, dtype=None): """ the array interface, return my values """ From 0921273dde9d489d30bce199ee7ed8f1078cedd3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jun 2018 05:50:05 -0700 Subject: [PATCH 040/113] CLN: Index imports and 0.23.1 whatsnew (#21490) --- doc/source/whatsnew/v0.23.1.txt | 8 ++++---- pandas/core/indexes/base.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db25bcf8113f5..af4eeffd87d01 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -97,8 +97,8 @@ Bug Fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) -- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) +- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue:`21078`) +- Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) **Sparse** @@ -110,12 +110,12 @@ Bug Fixes - Bug in 
:meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) -- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) +- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) **Plotting** -- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue: `20968`) +- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue:`20968`) **I/O** diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4b32e5d4f5654..6a56278b0da49 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -283,7 +283,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if (is_datetime64_any_dtype(data) or (dtype is not None and is_datetime64_any_dtype(dtype)) or 'tz' in kwargs): - from pandas.core.indexes.datetimes import DatetimeIndex + from pandas import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) if dtype is not None and is_dtype_equal(_o_dtype, dtype): @@ -293,7 +293,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif (is_timedelta64_dtype(data) or (dtype is not None and is_timedelta64_dtype(dtype))): - from pandas.core.indexes.timedeltas import TimedeltaIndex + from pandas import TimedeltaIndex result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: return Index(result.to_pytimedelta(), dtype=_o_dtype) @@ -404,8 +404,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if (lib.is_datetime_with_singletz_array(subarr) or 'tz' in kwargs): # only when subarr has the same tz - from pandas.core.indexes.datetimes import ( - DatetimeIndex) + from pandas import DatetimeIndex try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) @@ -413,8 +412,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass elif inferred.startswith('timedelta'): - from pandas.core.indexes.timedeltas import ( - TimedeltaIndex) + from pandas import TimedeltaIndex return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': @@ -1177,7 +1175,7 @@ def astype(self, dtype, copy=True): copy=copy) try: if is_datetime64tz_dtype(dtype): - from pandas.core.indexes.datetimes import DatetimeIndex + from pandas import DatetimeIndex return DatetimeIndex(self.values, name=self.name, dtype=dtype, copy=copy) return Index(self.values.astype(dtype, copy=copy), name=self.name, @@ -3333,7 +3331,7 @@ def get_indexer_for(self, target, **kwargs): def _maybe_promote(self, other): # A hack, but it works - from pandas.core.indexes.datetimes import DatetimeIndex + from pandas import DatetimeIndex if self.inferred_type == 'date' and 
isinstance(other, DatetimeIndex): return DatetimeIndex(self), other elif self.inferred_type == 'boolean': From d2de5070f9d8d0722a90cb328b98afc40d7c5018 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 15 Jun 2018 13:51:18 +0100 Subject: [PATCH 041/113] improve speed of nans in CategoricalIndex (#21493) --- pandas/core/indexes/category.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7f2860a963423..0093d4940751e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -326,7 +326,7 @@ def __contains__(self, key): hash(key) if isna(key): # if key is a NaN, check if any NaN is in self. - return self.isna().any() + return self.hasnans # is key in self.categories? Then get its location. # If not (i.e. KeyError), it logically can't be in self either From aa5e1f135dec74102fc168b8fb5a71c88badb418 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 15 Jun 2018 10:19:04 -0700 Subject: [PATCH 042/113] perf improvements in tslibs.period (#21447) --- pandas/_libs/src/util.pxd | 15 +++++++++++++++ pandas/_libs/tslib.pyx | 2 +- pandas/_libs/tslibs/period.pyx | 20 ++++++++++++-------- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index d8249ec130f4d..2c1876fad95d2 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -161,3 +161,18 @@ cdef inline bint _checknull(object val): cdef inline bint is_period_object(object val): return getattr(val, '_typ', '_typ') == 'period' + + +cdef inline bint is_offset_object(object val): + """ + Check if an object is a DateOffset object. + + Parameters + ---------- + val : object + + Returns + ------- + is_date_offset : bool + """ + return getattr(val, '_typ', None) == "dateoffset" diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4f73f196b0d9d..6588b5476e2b9 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -125,7 +125,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, elif box == "datetime": func_create = create_datetime_from_ts else: - raise ValueError("box must be one of 'datetime', 'date', 'time' or" + + raise ValueError("box must be one of 'datetime', 'date', 'time' or" " 'timestamp'") if tz is not None: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 008747c0a9e78..cc2fb6e0617cb 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -19,7 +19,8 @@ from pandas.compat import PY2 cimport cython -from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT +from cpython.datetime cimport (PyDateTime_Check, PyDelta_Check, + PyDateTime_IMPORT) # import datetime C API PyDateTime_IMPORT @@ -1058,18 +1059,21 @@ cdef class _Period(object): return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): + cdef: + int64_t nanos, offset_nanos + + if (PyDelta_Check(other) or util.is_timedelta64_object(other) or + isinstance(other, offsets.Tick)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = delta_to_nanoseconds(other) offset_nanos = delta_to_nanoseconds(offset) - if nanos % offset_nanos == 0: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) msg = 'Input cannot be converted to Period(freq={0})' raise IncompatibleFrequency(msg.format(self.freqstr)) - elif isinstance(other, 
offsets.DateOffset): + elif util.is_offset_object(other): freqstr = other.rule_code base = get_base_alias(freqstr) if base == self.freq.rule_code: @@ -1082,8 +1086,8 @@ cdef class _Period(object): def __add__(self, other): if is_period_object(self): - if isinstance(other, (timedelta, np.timedelta64, - offsets.DateOffset)): + if (PyDelta_Check(other) or util.is_timedelta64_object(other) or + util.is_offset_object(other)): return self._add_delta(other) elif other is NaT: return NaT @@ -1109,8 +1113,8 @@ cdef class _Period(object): def __sub__(self, other): if is_period_object(self): - if isinstance(other, (timedelta, np.timedelta64, - offsets.DateOffset)): + if (PyDelta_Check(other) or util.is_timedelta64_object(other) or + util.is_offset_object(other)): neg_other = -other return self + neg_other elif util.is_integer_object(other): From 676ae5950de3bee2aa5916e4db8618a87476cd2f Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 15 Jun 2018 11:21:36 -0600 Subject: [PATCH 043/113] BUG: Fix Series.nlargest for integer boundary values (#21432) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/conftest.py | 71 ++++++++++++++++++++++++ pandas/core/algorithms.py | 5 +- pandas/tests/frame/test_analytics.py | 78 +++++++++++++-------------- pandas/tests/series/test_analytics.py | 35 ++++++++++++ 5 files changed, 147 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 79a4c3da2ffa4..b8d865195cddd 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -82,4 +82,5 @@ Bug Fixes **Other** +- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - diff --git a/pandas/conftest.py b/pandas/conftest.py index d5f399c7cd63d..9d806a91f37f7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -129,6 +129,14 @@ def join_type(request): return request.param +@pytest.fixture(params=['nlargest', 'nsmallest']) +def nselect_method(request): + """ + Fixture for trying all nselect methods + """ + return request.param + + @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) def nulls_fixture(request): """ @@ -170,3 +178,66 @@ def string_dtype(request): * 'U' """ return request.param + + +@pytest.fixture(params=["float32", "float64"]) +def float_dtype(request): + """ + Parameterized fixture for float dtypes. + + * float32 + * float64 + """ + + return request.param + + +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES + + +@pytest.fixture(params=SIGNED_INT_DTYPES) +def sint_dtype(request): + """ + Parameterized fixture for signed integer dtypes. + + * int8 + * int16 + * int32 + * int64 + """ + + return request.param + + +@pytest.fixture(params=UNSIGNED_INT_DTYPES) +def uint_dtype(request): + """ + Parameterized fixture for unsigned integer dtypes. + + * uint8 + * uint16 + * uint32 + * uint64 + """ + + return request.param + + +@pytest.fixture(params=ALL_INT_DTYPES) +def any_int_dtype(request): + """ + Parameterized fixture for any integer dtypes. 
+ + * int8 + * uint8 + * int16 + * uint16 + * int32 + * uint32 + * int64 + * uint64 + """ + + return request.param diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b33c10da7813e..9e34b8eb55ccb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1133,9 +1133,12 @@ def compute(self, method): return dropped[slc].sort_values(ascending=ascending).head(n) # fast method - arr, _, _ = _ensure_data(dropped.values) + arr, pandas_dtype, _ = _ensure_data(dropped.values) if method == 'nlargest': arr = -arr + if is_integer_dtype(pandas_dtype): + # GH 21426: ensure reverse ordering at boundaries + arr -= 1 if self.keep == 'last': arr = arr[::-1] diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b8f1acc2aa679..6dc24ed856017 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas.compat import lrange, product, PY35 +from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, _np_version_under1p12, _np_version_under1p15, @@ -2260,54 +2260,49 @@ class TestNLargestNSmallest(object): # ---------------------------------------------------------------------- # Top / bottom - @pytest.mark.parametrize( - 'method, n, order', - product(['nsmallest', 'nlargest'], range(1, 11), - [['a'], - ['c'], - ['a', 'b'], - ['a', 'c'], - ['b', 'a'], - ['b', 'c'], - ['a', 'b', 'c'], - ['c', 'a', 'b'], - ['c', 'b', 'a'], - ['b', 'c', 'a'], - ['b', 'a', 'c'], - - # dups! - ['b', 'c', 'c'], - - ])) - def test_n(self, df_strings, method, n, order): + @pytest.mark.parametrize('order', [ + ['a'], + ['c'], + ['a', 'b'], + ['a', 'c'], + ['b', 'a'], + ['b', 'c'], + ['a', 'b', 'c'], + ['c', 'a', 'b'], + ['c', 'b', 'a'], + ['b', 'c', 'a'], + ['b', 'a', 'c'], + + # dups! 
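        # [Illustrative aside, not part of the patch] The ``arr -= 1`` added
        # in algorithms.py above works because, combined with ``arr = -arr``,
        # the transform becomes a bitwise NOT (``-x - 1 == ~x``), which
        # reverses integer order without the overflow that plain negation
        # hits at the dtype minimum. A quick sketch:
        #
        #   import numpy as np
        #   x = np.array([-128, 0, 127], dtype=np.int8)
        #   -x       # -> [-128, 0, -127]  (the minimum overflows in place)
        #   -x - 1   # -> [127, -1, -128]  (clean, order-reversing mapping)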
+ ['b', 'c', 'c']]) + @pytest.mark.parametrize('n', range(1, 11)) + def test_n(self, df_strings, nselect_method, n, order): # GH10393 df = df_strings if 'b' in order: error_msg = self.dtype_error_msg_template.format( - column='b', method=method, dtype='object') + column='b', method=nselect_method, dtype='object') with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(n, order) + getattr(df, nselect_method)(n, order) else: - ascending = method == 'nsmallest' - result = getattr(df, method)(n, order) + ascending = nselect_method == 'nsmallest' + result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'method, columns', - product(['nsmallest', 'nlargest'], - product(['group'], ['category_string', 'string']) - )) - def test_n_error(self, df_main_dtypes, method, columns): + @pytest.mark.parametrize('columns', [ + ('group', 'category_string'), ('group', 'string')]) + def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes + col = columns[1] error_msg = self.dtype_error_msg_template.format( - column=columns[1], method=method, dtype=df[columns[1]].dtype) + column=col, method=nselect_method, dtype=df[col].dtype) # escape some characters that may be in the repr error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") .replace("[", "\\[").replace("]", "\\]")) with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(2, columns) + getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes @@ -2328,15 +2323,14 @@ def test_n_identical_values(self): expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'n, order', - product([1, 2, 3, 4, 5], - [['a', 'b', 'c'], - ['c', 'b', 'a'], - ['a'], - ['b'], - ['a', 'b'], - ['c', 'b']])) + @pytest.mark.parametrize('order', [ + ['a', 'b', 'c'], + ['c', 'b', 'a'], + ['a'], + ['b'], + ['a', 'b'], + ['c', 'b']]) + @pytest.mark.parametrize('n', range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index aba472f2ce8f9..b9c7b837b8b81 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1944,6 +1944,15 @@ def test_mode_sortwarning(self): tm.assert_series_equal(result, expected) +def assert_check_nselect_boundary(vals, dtype, method): + # helper function for 'test_boundary_{dtype}' tests + s = Series(vals, dtype=dtype) + result = getattr(s, method)(3) + expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] + expected = s.loc[expected_idxr] + tm.assert_series_equal(result, expected) + + class TestNLargestNSmallest(object): @pytest.mark.parametrize( @@ -2028,6 +2037,32 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) + def test_boundary_integer(self, nselect_method, any_int_dtype): + # GH 21426 + dtype_info = np.iinfo(any_int_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val, min_val + 1, max_val - 1, max_val] + assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) + + def test_boundary_float(self, nselect_method, float_dtype): + # GH 21426 + dtype_info = np.finfo(float_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + min_2nd, max_2nd = np.nextafter( + [min_val, max_val], 0, dtype=float_dtype) + vals = 
[min_val, min_2nd, max_2nd, max_val] + assert_check_nselect_boundary(vals, float_dtype, nselect_method) + + @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]']) + def test_boundary_datetimelike(self, nselect_method, dtype): + # GH 21426 + # use int64 bounds and +1 to min_val since true minimum is NaT + # (include min_val/NaT at end to maintain same expected_idxr) + dtype_info = np.iinfo('int64') + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] + assert_check_nselect_boundary(vals, dtype, nselect_method) + class TestCategoricalSeriesAnalytics(object): From 8ca172d52368914fffa85d4df04fdd06b2db2c6a Mon Sep 17 00:00:00 2001 From: Uddeshya Singh Date: Fri, 15 Jun 2018 22:57:19 +0530 Subject: [PATCH 044/113] Removing SimpleMock test from pandas.util.testing (#21482) --- pandas/util/testing.py | 53 ------------------------------------------ 1 file changed, 53 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 233eba6490937..d26a2116fb3ce 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2263,59 +2263,6 @@ def wrapper(*args, **kwargs): with_connectivity_check = network -class SimpleMock(object): - - """ - Poor man's mocking object - - Note: only works for new-style classes, assumes __getattribute__ exists. - - >>> a = type("Duck",(),{}) - >>> a.attr1,a.attr2 ="fizz","buzz" - >>> b = SimpleMock(a,"attr1","bar") - >>> b.attr1 == "bar" and b.attr2 == "buzz" - True - >>> a.attr1 == "fizz" and a.attr2 == "buzz" - True - """ - - def __init__(self, obj, *args, **kwds): - assert(len(args) % 2 == 0) - attrs = kwds.get("attrs", {}) - for k, v in zip(args[::2], args[1::2]): - # dict comprehensions break 2.6 - attrs[k] = v - self.attrs = attrs - self.obj = obj - - def __getattribute__(self, name): - attrs = object.__getattribute__(self, "attrs") - obj = object.__getattribute__(self, "obj") - return attrs.get(name, type(obj).__getattribute__(obj, name)) - - -@contextmanager -def stdin_encoding(encoding=None): - """ - Context manager for running bits of code while emulating an arbitrary - stdin encoding. 
- - >>> import sys - >>> _encoding = sys.stdin.encoding - >>> with stdin_encoding('AES'): sys.stdin.encoding - 'AES' - >>> sys.stdin.encoding==_encoding - True - - """ - import sys - - _stdin = sys.stdin - sys.stdin = SimpleMock(sys.stdin, "encoding", encoding) - yield - sys.stdin = _stdin - - def assert_raises_regex(_exception, _regexp, _callable=None, *args, **kwargs): r""" From 28780c27eff197cf0897bf643e9f26c81e8d117e Mon Sep 17 00:00:00 2001 From: Uddeshya Singh Date: Fri, 15 Jun 2018 23:03:34 +0530 Subject: [PATCH 045/113] TST: adding test cases for verifying correct values shown by pivot_table() #21378 (#21393) --- pandas/tests/reshape/test_pivot.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3ec60d50f2792..ca95dde1a20c9 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -161,6 +161,24 @@ def test_pivot_with_non_observable_dropna(self, dropna): tm.assert_frame_equal(result, expected) + # gh-21378 + df = pd.DataFrame( + {'A': pd.Categorical(['left', 'low', 'high', 'low', 'high'], + categories=['low', 'high', 'left'], + ordered=True), + 'B': range(5)}) + + result = df.pivot_table(index='A', values='B', dropna=dropna) + expected = pd.DataFrame( + {'B': [2, 3, 0]}, + index=pd.Index( + pd.Categorical.from_codes([0, 1, 2], + categories=['low', 'high', 'left'], + ordered=True), + name='A')) + + tm.assert_frame_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) From a231fb28616b04401ae36efe38ed7ee2759c9c23 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 18 Jun 2018 23:42:59 +0200 Subject: [PATCH 046/113] PERF: remove useless overrides (#21523) closes #21522 --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/indexes/multi.py | 12 ------------ 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index b8d865195cddd..f7e170cca039e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -27,6 +27,7 @@ Performance Improvements - Improved performance of membership checks in :class:`CategoricalIndex` (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`) +- Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`) - Documentation Changes diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 75b6be96feb78..ab23a80acdaae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -852,14 +852,6 @@ def _has_complex_internals(self): # to disable groupby tricks return True - @cache_readonly - def is_monotonic(self): - """ - return if the index is monotonic increasing (only equal or - increasing) values. 
- """ - return self.is_monotonic_increasing - @cache_readonly def is_monotonic_increasing(self): """ @@ -887,10 +879,6 @@ def is_monotonic_decreasing(self): # monotonic decreasing if and only if reverse is monotonic increasing return self[::-1].is_monotonic_increasing - @cache_readonly - def is_unique(self): - return not self.duplicated().any() - @cache_readonly def _have_mixed_levels(self): """ return a boolean list indicated if we have mixed levels """ From 5f44af0b3949b8be00a73b6cba2da9d8cc730e9d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 18 Jun 2018 15:34:28 -0700 Subject: [PATCH 047/113] TST: Add unit tests for older timezone issues (#21491) --- doc/source/whatsnew/v0.23.2.txt | 5 ++++ .../indexes/datetimes/test_arithmetic.py | 24 ++++++++++++++++++- .../tests/scalar/timestamp/test_timestamp.py | 6 +++++ .../tests/scalar/timestamp/test_unary_ops.py | 7 ++++++ .../tests/series/indexing/test_alter_index.py | 9 +++++++ pandas/tests/test_resample.py | 11 +++++++++ 6 files changed, 61 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index f7e170cca039e..70a5dd5817c3c 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -80,6 +80,11 @@ Bug Fixes **Timezones** - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError``(:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) **Other** diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index eff2872a1cff3..0649083a440df 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -4,7 +4,7 @@ import operator import pytest - +import pytz import numpy as np import pandas as pd @@ -476,6 +476,28 @@ def test_dti_shift_localized(self, tzstr): result = dr_tz.shift(1, '10T') assert result.tz == dr_tz.tz + def test_dti_shift_across_dst(self): + # GH 8616 + idx = date_range('2013-11-03', tz='America/Chicago', + periods=7, freq='H') + s = Series(index=idx[:-1]) + result = s.shift(freq='H') + expected = Series(index=idx[1:]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('shift, result_time', [ + [0, '2014-11-14 00:00:00'], + [-1, '2014-11-13 23:00:00'], + [1, '2014-11-14 01:00:00']]) + def test_dti_shift_near_midnight(self, shift, result_time): + # GH 8616 + dt = datetime(2014, 11, 14, 0) + dt_est = pytz.timezone('EST').localize(dt) + s = Series(data=[1], index=[dt_est]) + result = s.shift(shift, freq='H') + expected = Series(1, index=DatetimeIndex([result_time], tz='EST')) + tm.assert_series_equal(result, expected) + # ------------------------------------------------------------- # Binary 
operations DatetimeIndex and timedelta-like diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 4689c7bea626f..8dc9903b7356d 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -420,6 +420,12 @@ def test_constructor_nanosecond(self, result): expected = expected + Timedelta(nanoseconds=1) assert result == expected + @pytest.mark.parametrize('z', ['Z0', 'Z00']) + def test_constructor_invalid_Z0_isostring(self, z): + # GH 8910 + with pytest.raises(ValueError): + Timestamp('2014-11-02 01:00{}'.format(z)) + @pytest.mark.parametrize('arg', ['year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', 'nanosecond']) def test_invalid_date_kwarg_with_string_input(self, arg): diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index dbe31ccb11114..b02fef707a6fe 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -257,6 +257,13 @@ def test_replace_across_dst(self, tz, normalize): ts2b = normalize(ts2) assert ts2 == ts2b + def test_replace_dst_border(self): + # Gh 7825 + t = Timestamp('2013-11-3', tz='America/Chicago') + result = t.replace(hour=3) + expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago') + assert result == expected + # -------------------------------------------------------------- @td.skip_if_windows diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 999ed5f26daee..bcd5a64402c33 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -453,6 +453,15 @@ def test_reindex_fill_value(): assert_series_equal(result, expected) +def test_reindex_datetimeindexes_tz_naive_and_aware(): + # GH 8306 + idx = date_range('20131101', tz='America/Chicago', periods=7) + newidx = date_range('20131103', periods=10, freq='H') + s = Series(range(7), index=idx) + with pytest.raises(TypeError): + s.reindex(newidx, method='ffill') + + def test_rename(): # GH 17407 s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex')) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c1257cce9a9a4..6f0ad0535c6b4 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2084,6 +2084,17 @@ def test_resample_dst_anchor(self): freq='D', tz='Europe/Paris')), 'D Frequency') + def test_downsample_across_dst(self): + # GH 8531 + tz = pytz.timezone('Europe/Berlin') + dt = datetime(2014, 10, 26) + dates = date_range(tz.localize(dt), periods=4, freq='2H') + result = Series(5, index=dates).resample('H').mean() + expected = Series([5., np.nan] * 3 + [5.], + index=date_range(tz.localize(dt), periods=7, + freq='H')) + tm.assert_series_equal(result, expected) + def test_resample_with_nat(self): # GH 13020 index = DatetimeIndex([pd.NaT, From 1427b690f2f3de336e6cc5e9cdcdd006d468322d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 18 Jun 2018 17:39:39 -0500 Subject: [PATCH 048/113] BUG: Timedelta.__bool__ (#21485) Closes #21484 --- doc/source/whatsnew/v0.23.2.txt | 9 ++++++--- pandas/_libs/tslibs/timedeltas.pyx | 3 +++ pandas/tests/scalar/timedelta/test_timedelta.py | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 70a5dd5817c3c..48efc02480e67 100644 --- 
a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -46,10 +46,13 @@ Bug Fixes - - -**Conversion** +**Timedelta** +- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) -- +**Conversion** + +- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - **Indexing** @@ -78,6 +81,7 @@ Bug Fixes - **Timezones** + - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) - Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) @@ -88,5 +92,4 @@ Bug Fixes **Other** -- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 87dc371195b5b..f68dc421a1ee9 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -899,6 +899,9 @@ cdef class _Timedelta(timedelta): def __str__(self): return self._repr_base(format='long') + def __bool__(self): + return self.value != 0 + def isoformat(self): """ Format Timedelta as ISO 8601 Duration like diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 205fdf49d3e91..6472bd4245622 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -588,3 +588,17 @@ def test_components(self): result = s.dt.components assert not result.iloc[0].isna().all() assert result.iloc[1].isna().all() + + +@pytest.mark.parametrize('value, expected', [ + (Timedelta('10S'), True), + (Timedelta('-10S'), True), + (Timedelta(10, unit='ns'), True), + (Timedelta(0, unit='ns'), False), + (Timedelta(-10, unit='ns'), True), + (Timedelta(None), True), + (pd.NaT, True), +]) +def test_truthiness(value, expected): + # https://github.com/pandas-dev/pandas/issues/21484 + assert bool(value) is expected From 2741967a8917769d77c0486cea9b57436f9e17bb Mon Sep 17 00:00:00 2001 From: David Krych Date: Mon, 18 Jun 2018 18:43:27 -0400 Subject: [PATCH 049/113] BUG: Fix Index construction when given empty generator (#21470). 
(#21481) --- doc/source/whatsnew/v0.23.2.txt | 3 ++- pandas/core/arrays/categorical.py | 5 ++--- pandas/core/indexes/base.py | 10 ++++++---- pandas/tests/indexes/test_base.py | 19 +++++++++++-------- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 48efc02480e67..94669c5b02410 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -52,8 +52,9 @@ Bug Fixes **Conversion** +- Bug in constructing :class:`Index` with an iterator or generator (:issue:`21470`) - Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) -- + **Indexing** diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d466198b648ef..e22b0d626a218 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -3,7 +3,6 @@ import numpy as np from warnings import warn import textwrap -import types from pandas import compat from pandas.compat import u, lzip @@ -28,7 +27,7 @@ is_categorical, is_categorical_dtype, is_list_like, is_sequence, - is_scalar, + is_scalar, is_iterator, is_dict_like) from pandas.core.algorithms import factorize, take_1d, unique1d, take @@ -2483,7 +2482,7 @@ def _convert_to_list_like(list_like): if isinstance(list_like, list): return list_like if (is_sequence(list_like) or isinstance(list_like, tuple) or - isinstance(list_like, types.GeneratorType)): + is_iterator(list_like)): return list(list_like) elif is_scalar(list_like): return [list_like] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a56278b0da49..27cc368a696e3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -428,12 +428,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif data is None or is_scalar(data): cls._scalar_data_error(data) else: - if tupleize_cols and is_list_like(data) and data: + if tupleize_cols and is_list_like(data): + # GH21470: convert iterable to list before determining if empty if is_iterator(data): data = list(data) - # we must be all tuples, otherwise don't construct - # 10697 - if all(isinstance(e, tuple) for e in data): + + if data and all(isinstance(e, tuple) for e in data): + # we must be all tuples, otherwise don't construct + # 10697 from .multi import MultiIndex return MultiIndex.from_tuples( data, names=name or kwargs.get('names')) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b8bd218ec25ab..1d8a958c3413f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -445,21 +445,24 @@ def test_constructor_dtypes_timedelta(self, attr, klass): result = klass(list(values), dtype=dtype) tm.assert_index_equal(result, index) - def test_constructor_empty_gen(self): - skip_index_keys = ["repeats", "periodIndex", "rangeIndex", - "tuples"] - for key, index in self.generate_index_types(skip_index_keys): - empty = index.__class__([]) - assert isinstance(empty, index.__class__) - assert not len(empty) + @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])]) + @pytest.mark.parametrize("klass", + [Index, Float64Index, Int64Index, UInt64Index, + CategoricalIndex, DatetimeIndex, TimedeltaIndex]) + def test_constructor_empty(self, value, klass): + empty = klass(value) + assert isinstance(empty, klass) + assert not len(empty) @pytest.mark.parametrize("empty,klass", [ (PeriodIndex([], freq='B'), PeriodIndex), + (PeriodIndex(iter([]), 
freq='B'), PeriodIndex), + (PeriodIndex((x for x in []), freq='B'), PeriodIndex), (RangeIndex(step=1), pd.RangeIndex), (MultiIndex(levels=[[1, 2], ['blue', 'red']], labels=[[], []]), MultiIndex) ]) - def test_constructor_empty(self, empty, klass): + def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) assert not len(empty) From 34b77d12b198f2dd41f5f2699cc5a7d266d8924a Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Mon, 18 Jun 2018 23:45:25 +0100 Subject: [PATCH 050/113] BUG/REG: file-handle object handled incorrectly in to_csv (#21478) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/core/frame.py | 3 +- pandas/core/series.py | 3 +- pandas/io/common.py | 4 +++ pandas/io/formats/csvs.py | 59 ++++++++++++++++++++----------- pandas/tests/frame/test_to_csv.py | 16 +++++---- pandas/tests/series/test_io.py | 18 +++++----- pandas/tests/test_common.py | 34 +++++++++++++----- 8 files changed, 91 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 94669c5b02410..67c7ce150132a 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - .. _whatsnew_0232.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02c86d2f4dcc8..a5dfbcc2a3142 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1690,7 +1690,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional A string representing the compression to use in the output file. - Allowed values are 'gzip', 'bz2', 'zip', 'xz'. + Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only + used when the first argument is a filename. line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file diff --git a/pandas/core/series.py b/pandas/core/series.py index 0450f28087f66..23c4bbe082f28 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3790,7 +3790,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', non-ascii, for python versions prior to 3 compression : string, optional A string representing the compression to use in the output file. - Allowed values are 'gzip', 'bz2', 'zip', 'xz'. + Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only + used when the first argument is a filename. date_format: string, default None Format string for datetime objects. decimal: string, default '.' 
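[Illustrative sketch, not part of the patch] The docstring changes above make
explicit that ``compression`` only takes effect when ``to_csv`` is given a
path; the ``csvs.py`` change below emits a ``RuntimeWarning`` and writes
uncompressed output when a file handle is passed instead. Assuming a writable
local path:

    import pandas as pd

    df = pd.DataFrame({'x': [1, 2, 3]})
    df.to_csv('out.csv.gz', compression='gzip')    # compressed on disk
    with open('out.csv', 'w') as fh:
        df.to_csv(fh, compression='gzip')          # warns; writes plain text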
diff --git a/pandas/io/common.py b/pandas/io/common.py index a492b7c0b8e8e..ac9077f2db50e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) + @property + def closed(self): + return self.fp is None + class MMapWrapper(BaseIterator): """ diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 7f660e2644fa4..60518f596e9af 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,11 +5,13 @@ from __future__ import print_function +import warnings + import csv as csvlib +from zipfile import ZipFile import numpy as np from pandas.core.dtypes.missing import notna -from pandas.core.dtypes.inference import is_file_like from pandas.core.index import Index, MultiIndex from pandas import compat from pandas.compat import (StringIO, range, zip) @@ -128,19 +130,31 @@ def save(self): else: encoding = self.encoding - # PR 21300 uses string buffer to receive csv writing and dump into - # file-like output with compression as option. GH 21241, 21118 - f = StringIO() - if not is_file_like(self.path_or_buf): - # path_or_buf is path - path_or_buf = self.path_or_buf - elif hasattr(self.path_or_buf, 'name'): - # path_or_buf is file handle - path_or_buf = self.path_or_buf.name - else: - # path_or_buf is file-like IO objects. + # GH 21227 internal compression is not used when file-like passed. + if self.compression and hasattr(self.path_or_buf, 'write'): + msg = ("compression has no effect when passing file-like " + "object as input.") + warnings.warn(msg, RuntimeWarning, stacklevel=2) + + # when zip compression is called. + is_zip = isinstance(self.path_or_buf, ZipFile) or ( + not hasattr(self.path_or_buf, 'write') + and self.compression == 'zip') + + if is_zip: + # zipfile doesn't support writing string to archive. uses string + # buffer to receive csv writing and dump into zip compression + # file handle. GH 21241, 21118 + f = StringIO() + close = False + elif hasattr(self.path_or_buf, 'write'): f = self.path_or_buf - path_or_buf = None + close = False + else: + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + close = True try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -157,13 +171,18 @@ def save(self): self._save() finally: - # GH 17778 handles zip compression for byte strings separately. - buf = f.getvalue() - if path_or_buf: - f, handles = _get_handle(path_or_buf, self.mode, - encoding=encoding, - compression=self.compression) - f.write(buf) + if is_zip: + # GH 17778 handles zip compression separately. 
+ buf = f.getvalue() + if hasattr(self.path_or_buf, 'write'): + self.path_or_buf.write(buf) + else: + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + f.write(buf) + close = True + if close: f.close() for _fh in handles: _fh.close() diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 60dc336a85388..3ad25ae73109e 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -9,6 +9,7 @@ import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) +from pandas.io.common import _get_handle import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: df.to_csv(filename, compression=compression, encoding=encoding) - # test the round trip - to_csv -> read_csv result = read_csv(filename, compression=compression, index_col=0, encoding=encoding) + assert_frame_equal(df, result) - with open(filename, 'w') as fh: - df.to_csv(fh, compression=compression, encoding=encoding) - - result_fh = read_csv(filename, compression=compression, - index_col=0, encoding=encoding) + # test the round trip using file handle - to_csv -> read_csv + f, _handles = _get_handle(filename, 'w', compression=compression, + encoding=encoding) + with f: + df.to_csv(f, encoding=encoding) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) assert_frame_equal(df, result) - assert_frame_equal(df, result_fh) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f98962685ad9a..814d794d45c18 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -11,6 +11,7 @@ from pandas import Series, DataFrame from pandas.compat import StringIO, u +from pandas.io.common import _get_handle from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -151,20 +152,19 @@ def test_to_csv_compression(self, s, encoding, compression): s.to_csv(filename, compression=compression, encoding=encoding, header=True) - # test the round trip - to_csv -> read_csv result = pd.read_csv(filename, compression=compression, encoding=encoding, index_col=0, squeeze=True) + assert_series_equal(s, result) - with open(filename, 'w') as fh: - s.to_csv(fh, compression=compression, encoding=encoding, - header=True) - - result_fh = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, - squeeze=True) + # test the round trip using file handle - to_csv -> read_csv + f, _handles = _get_handle(filename, 'w', compression=compression, + encoding=encoding) + with f: + s.to_csv(f, encoding=encoding, header=True) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) assert_series_equal(s, result) - assert_series_equal(s, result_fh) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 7034e9ac2e0c8..ef5f13bfa504a 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -11,6 +11,7 @@ from pandas.compat import range, lmap import pandas.core.common as com from 
pandas.core import ops +from pandas.io.common import _get_handle import pandas.util.testing as tm @@ -246,19 +247,34 @@ def test_compression_size(obj, method, compression_only): [12.32112, 123123.2, 321321.2]], columns=['X', 'Y', 'Z']), Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_csv']) +@pytest.mark.parametrize('method', ['to_csv', 'to_json']) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as filename: - with open(filename, 'w') as fh: - getattr(obj, method)(fh, compression=compression_only) - assert not fh.closed - assert fh.closed + f, _handles = _get_handle(filename, 'w', compression=compression_only) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed compressed = os.path.getsize(filename) with tm.ensure_clean() as filename: - with open(filename, 'w') as fh: - getattr(obj, method)(fh, compression=None) - assert not fh.closed - assert fh.closed + f, _handles = _get_handle(filename, 'w', compression=None) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed uncompressed = os.path.getsize(filename) assert uncompressed > compressed + + +# GH 21227 +def test_compression_warning(compression_only): + df = DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']) + with tm.ensure_clean() as filename: + f, _handles = _get_handle(filename, 'w', compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, + check_stacklevel=False): + with f: + df.to_csv(f, compression=compression_only) From e788e47d6ff8fd3403a8d5411f8fad61509a75da Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 18 Jun 2018 18:03:14 -0700 Subject: [PATCH 051/113] Append Mode for ExcelWriter with openpyxl (#21251) --- doc/source/whatsnew/v0.24.0.txt | 2 ++ pandas/io/excel.py | 51 +++++++++++++++++++++++---------- pandas/tests/io/test_excel.py | 39 +++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index cae05446c00e6..c23ed006ff637 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -8,6 +8,8 @@ v0.24.0 New features ~~~~~~~~~~~~ +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) + .. _whatsnew_0240.enhancements.other: Other Enhancements diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5608c29637447..e86d33742b266 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -804,6 +804,10 @@ class ExcelWriter(object): datetime_format : string, default None Format string for datetime objects written into Excel files (e.g. 'YYYY-MM-DD HH:MM:SS') + mode : {'w' or 'a'}, default 'w' + File mode to use (write or append). + + .. 
versionadded:: 0.24.0 Notes ----- @@ -897,7 +901,8 @@ def save(self): pass def __init__(self, path, engine=None, - date_format=None, datetime_format=None, **engine_kwargs): + date_format=None, datetime_format=None, mode='w', + **engine_kwargs): # validate that this engine can handle the extension if isinstance(path, string_types): ext = os.path.splitext(path)[-1] @@ -919,6 +924,8 @@ def __init__(self, path, engine=None, else: self.datetime_format = datetime_format + self.mode = mode + def __fspath__(self): return _stringify_path(self.path) @@ -993,23 +1000,27 @@ class _OpenpyxlWriter(ExcelWriter): engine = 'openpyxl' supported_extensions = ('.xlsx', '.xlsm') - def __init__(self, path, engine=None, **engine_kwargs): + def __init__(self, path, engine=None, mode='w', **engine_kwargs): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super(_OpenpyxlWriter, self).__init__(path, **engine_kwargs) + super(_OpenpyxlWriter, self).__init__(path, mode=mode, **engine_kwargs) - # Create workbook object with default optimized_write=True. - self.book = Workbook() + if self.mode == 'a': # Load from existing workbook + from openpyxl import load_workbook + book = load_workbook(self.path) + self.book = book + else: + # Create workbook object with default optimized_write=True. + self.book = Workbook() - # Openpyxl 1.6.1 adds a dummy sheet. We remove it. - if self.book.worksheets: - try: - self.book.remove(self.book.worksheets[0]) - except AttributeError: + if self.book.worksheets: + try: + self.book.remove(self.book.worksheets[0]) + except AttributeError: - # compat - self.book.remove_sheet(self.book.worksheets[0]) + # compat - for openpyxl <= 2.4 + self.book.remove_sheet(self.book.worksheets[0]) def save(self): """ @@ -1443,11 +1454,16 @@ class _XlwtWriter(ExcelWriter): engine = 'xlwt' supported_extensions = ('.xls',) - def __init__(self, path, engine=None, encoding=None, **engine_kwargs): + def __init__(self, path, engine=None, encoding=None, mode='w', + **engine_kwargs): # Use the xlwt module as the Excel writer. import xlwt engine_kwargs['engine'] = engine - super(_XlwtWriter, self).__init__(path, **engine_kwargs) + + if mode == 'a': + raise ValueError('Append mode is not supported with xlwt!') + + super(_XlwtWriter, self).__init__(path, mode=mode, **engine_kwargs) if encoding is None: encoding = 'ascii' @@ -1713,13 +1729,18 @@ class _XlsxWriter(ExcelWriter): supported_extensions = ('.xlsx',) def __init__(self, path, engine=None, - date_format=None, datetime_format=None, **engine_kwargs): + date_format=None, datetime_format=None, mode='w', + **engine_kwargs): # Use the xlsxwriter module as the Excel writer. 
import xlsxwriter + if mode == 'a': + raise ValueError('Append mode is not supported with xlsxwriter!') + super(_XlsxWriter, self).__init__(path, engine=engine, date_format=date_format, datetime_format=datetime_format, + mode=mode, **engine_kwargs) self.book = xlsxwriter.Workbook(path, **engine_kwargs) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 05423474f330a..2a225e6fe6a45 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -2006,6 +2006,31 @@ def test_write_cells_merge_styled(self, merge_cells, ext, engine): assert xcell_b1.font == openpyxl_sty_merged assert xcell_a2.font == openpyxl_sty_merged + @pytest.mark.parametrize("mode,expected", [ + ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])]) + def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): + import openpyxl + df = DataFrame([1], columns=['baz']) + + with ensure_clean(ext) as f: + wb = openpyxl.Workbook() + wb.worksheets[0].title = 'foo' + wb.worksheets[0]['A1'].value = 'foo' + wb.create_sheet('bar') + wb.worksheets[1]['A1'].value = 'bar' + wb.save(f) + + writer = ExcelWriter(f, engine=engine, mode=mode) + df.to_excel(writer, sheet_name='baz', index=False) + writer.save() + + wb2 = openpyxl.load_workbook(f) + result = [sheet.title for sheet in wb2.worksheets] + assert result == expected + + for index, cell_value in enumerate(expected): + assert wb2.worksheets[index]['A1'].value == cell_value + @td.skip_if_no('xlwt') @pytest.mark.parametrize("merge_cells,ext,engine", [ @@ -2060,6 +2085,13 @@ def test_to_excel_styleconverter(self, merge_cells, ext, engine): assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert + def test_write_append_mode_raises(self, merge_cells, ext, engine): + msg = "Append mode is not supported with xlwt!" + + with ensure_clean(ext) as f: + with tm.assert_raises_regex(ValueError, msg): + ExcelWriter(f, engine=engine, mode='a') + @td.skip_if_no('xlsxwriter') @pytest.mark.parametrize("merge_cells,ext,engine", [ @@ -2111,6 +2143,13 @@ def test_column_format(self, merge_cells, ext, engine): assert read_num_format == num_format + def test_write_append_mode_raises(self, merge_cells, ext, engine): + msg = "Append mode is not supported with xlsxwriter!" + + with ensure_clean(ext) as f: + with tm.assert_raises_regex(ValueError, msg): + ExcelWriter(f, engine=engine, mode='a') + class TestExcelWriterEngineTests(object): From fb165558e00db6b667b06aeef9ebc53a6589e67f Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 19 Jun 2018 09:20:12 +0100 Subject: [PATCH 052/113] DOC: Improve code example for Index.get_indexer (#21511) --- pandas/core/indexes/base.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 27cc368a696e3..490fd872125ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3148,17 +3148,22 @@ def droplevel(self, level=0): .. versionadded:: 0.21.0 (list-like tolerance) - Examples - -------- - >>> indexer = index.get_indexer(new_index) - >>> new_values = cur_values.take(indexer) - Returns ------- indexer : ndarray of int Integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values. Missing values in the target are marked by -1. 
+ + Examples + -------- + >>> index = pd.Index(['c', 'a', 'b']) + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) + + Notice that the return value is an array of locations in ``index`` + and ``x`` is marked by -1, as it is not in ``index``. + """ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) From 84533b12517a33fa400d8a92cac914907c446f7a Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Tue, 19 Jun 2018 03:30:47 -0500 Subject: [PATCH 053/113] DOC: remove grammar duplication in groupby docs (#21534) --- doc/source/groupby.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 1c4c3f93726a9..47d53c82b86f3 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -680,8 +680,7 @@ match the shape of the input array. data_range = lambda x: x.max() - x.min() ts.groupby(key).transform(data_range) -Alternatively the built-in methods can be could be used to produce the same -outputs +Alternatively, the built-in methods could be used to produce the same outputs. .. ipython:: python From 8cbcafd5031d446de1ab3b1fdd56e133dfa3d8cf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 19 Jun 2018 04:12:52 -0700 Subject: [PATCH 054/113] remove daytime attr, move getstate and setstate to base class (#21533) --- pandas/_libs/tslibs/offsets.pyx | 39 ++++++++++++++++++++++++++++++++ pandas/tseries/offsets.py | 40 --------------------------------- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 8caf9ea0e0389..3ca9bb307da9c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -379,6 +379,45 @@ class _BaseOffset(object): 'got {n}'.format(n=n)) return nint + def __setstate__(self, state): + """Reconstruct an instance from a pickled state""" + if 'offset' in state: + # Older (<0.22.0) versions have offset attribute instead of _offset + if '_offset' in state: # pragma: no cover + raise AssertionError('Unexpected key `_offset`') + state['_offset'] = state.pop('offset') + state['kwds']['offset'] = state['_offset'] + + if '_offset' in state and not isinstance(state['_offset'], timedelta): + # relativedelta, we need to populate using its kwds + offset = state['_offset'] + odict = offset.__dict__ + kwds = {key: odict[key] for key in odict if odict[key]} + state.update(kwds) + + self.__dict__ = state + if 'weekmask' in state and 'holidays' in state: + calendar, holidays = _get_calendar(weekmask=self.weekmask, + holidays=self.holidays, + calendar=None) + self.calendar = calendar + self.holidays = holidays + + def __getstate__(self): + """Return a pickleable state""" + state = self.__dict__.copy() + + # we don't want to actually pickle the calendar object + # as its a np.busyday; we recreate on deserilization + if 'calendar' in state: + del state['calendar'] + try: + state['kwds'].pop('calendar') + except KeyError: + pass + + return state + class BaseOffset(_BaseOffset): # Here we add __rfoo__ methods that don't play well with cdef classes diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 2f4989f26b394..ffa2c0a5e3211 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -423,30 +423,6 @@ def _offset_str(self): def nanos(self): raise ValueError("{name} is a non-fixed frequency".format(name=self)) - def __setstate__(self, state): - """Reconstruct an instance from a pickled state""" - if 'offset' in state: - # Older (<0.22.0) versions have offset 
attribute instead of _offset - if '_offset' in state: # pragma: no cover - raise AssertionError('Unexpected key `_offset`') - state['_offset'] = state.pop('offset') - state['kwds']['offset'] = state['_offset'] - - if '_offset' in state and not isinstance(state['_offset'], timedelta): - # relativedelta, we need to populate using its kwds - offset = state['_offset'] - odict = offset.__dict__ - kwds = {key: odict[key] for key in odict if odict[key]} - state.update(kwds) - - self.__dict__ = state - if 'weekmask' in state and 'holidays' in state: - calendar, holidays = _get_calendar(weekmask=self.weekmask, - holidays=self.holidays, - calendar=None) - self.calendar = calendar - self.holidays = holidays - class SingleConstructorOffset(DateOffset): @classmethod @@ -494,21 +470,6 @@ def _repr_attrs(self): out += ': ' + ', '.join(attrs) return out - def __getstate__(self): - """Return a pickleable state""" - state = self.__dict__.copy() - - # we don't want to actually pickle the calendar object - # as its a np.busyday; we recreate on deserilization - if 'calendar' in state: - del state['calendar'] - try: - state['kwds'].pop('calendar') - except KeyError: - pass - - return state - class BusinessDay(BusinessMixin, SingleConstructorOffset): """ @@ -690,7 +651,6 @@ def _get_business_hours_by_sec(self): until = datetime(2014, 4, 1, self.end.hour, self.end.minute) return (until - dtstart).total_seconds() else: - self.daytime = False dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) until = datetime(2014, 4, 2, self.end.hour, self.end.minute) return (until - dtstart).total_seconds() From c53e001beffb56eca21b3245330e2a1068bf7007 Mon Sep 17 00:00:00 2001 From: Jacopo Rota Date: Tue, 19 Jun 2018 13:26:48 +0200 Subject: [PATCH 055/113] BUG: Handle read_csv corner case (#21176) Closes gh-21141 --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/io/parsers.py | 12 +++++++++++- pandas/tests/io/parser/common.py | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 67c7ce150132a..0f2c9c4756987 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -64,6 +64,7 @@ Bug Fixes **I/O** +- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) - - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2c8f98732c92f..65df2bffb4abf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3209,12 +3209,22 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): col = columns[k] if is_integer(k) else k dtype[col] = v - if index_col is None or index_col is False: + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic emtpy Index. 
+ if (index_col is None or index_col is False) or index_names is None: index = Index([]) else: data = [Series([], dtype=dtype[name]) for name in index_names] index = _ensure_index_from_sequences(data, names=index_names) index_col.sort() + for i, n in enumerate(index_col): columns.pop(n - i) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 2b7ff1f5a9879..b39122e5e7906 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -238,6 +238,21 @@ def test_csv_mixed_type(self): out = self.read_csv(StringIO(data)) tm.assert_frame_equal(out, expected) + def test_read_csv_low_memory_no_rows_with_index(self): + if self.engine == "c" and not self.low_memory: + pytest.skip("This is a low-memory specific test") + + # see gh-21141 + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + out = self.read_csv(StringIO(data), low_memory=True, + index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(out, expected) + def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, From 34d74cec1a3c9ed0a119769832dea3206ed41a85 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 19 Jun 2018 22:36:55 +0200 Subject: [PATCH 056/113] De-duplicate code for indexing with list-likes of keys (#21503) --- pandas/core/frame.py | 3 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexing.py | 214 +++++++++++++++++++----------------- 3 files changed, 116 insertions(+), 103 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a5dfbcc2a3142..74bb2abc27c4b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2724,7 +2724,8 @@ def _getitem_array(self, key): indexer = key.nonzero()[0] return self._take(indexer, axis=0) else: - indexer = self.loc._convert_to_indexer(key, axis=1) + indexer = self.loc._convert_to_indexer(key, axis=1, + raise_missing=True) return self._take(indexer, axis=1) def _getitem_multilevel(self, key): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 490fd872125ff..577b715ca9998 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3634,7 +3634,7 @@ def _reindex_non_unique(self, target): else: # need to retake to have the same size as the indexer - indexer[~check] = 0 + indexer[~check] = -1 # reset the new indexer to account for the new size new_indexer = np.arange(len(self.take(indexer))) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0e4f040253560..d5e81105dd323 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -688,7 +688,8 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): if isinstance(indexer, tuple): # flatten np.ndarray indexers - ravel = lambda i: i.ravel() if isinstance(i, np.ndarray) else i + def ravel(i): + return i.ravel() if isinstance(i, np.ndarray) else i indexer = tuple(map(ravel, indexer)) aligners = [not com.is_null_slice(idx) for idx in indexer] @@ -925,33 +926,10 @@ def _multi_take(self, tup): """ create the reindex map for our objects, raise the _exception if we can't create the indexer """ - try: - o = self.obj - d = {} - for key, axis in zip(tup, o._AXIS_ORDERS): - ax = o._get_axis(axis) - # Have the index compute an indexer or return None - # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key, - kind=self.name) - # We only act on all found values: - if indexer is not None and (indexer != -1).all(): - 
self._validate_read_indexer(key, indexer, axis) - d[axis] = (ax[indexer], indexer) - continue - - # If we are trying to get actual keys from empty Series, we - # patiently wait for a KeyError later on - otherwise, convert - if len(ax) or not len(key): - key = self._convert_for_reindex(key, axis) - indexer = ax.get_indexer_for(key) - keyarr = ax.reindex(keyarr)[0] - self._validate_read_indexer(keyarr, indexer, - o._get_axis_number(axis)) - d[axis] = (keyarr, indexer) - return o._reindex_with_indexers(d, copy=True, allow_dups=True) - except (KeyError, IndexingError) as detail: - raise self._exception(detail) + o = self.obj + d = {axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, o._AXIS_ORDERS)} + return o._reindex_with_indexers(d, copy=True, allow_dups=True) def _convert_for_reindex(self, key, axis=None): return key @@ -1124,7 +1102,88 @@ def _getitem_axis(self, key, axis=None): return self._get_label(key, axis=axis) + def _get_listlike_indexer(self, key, axis, raise_missing=False): + """ + Transform a list-like of keys into a new index and an indexer. + + Parameters + ---------- + key : list-like + Target labels + axis: int + Dimension on which the indexing is being made + raise_missing: bool + Whether to raise a KeyError if some labels are not found. Will be + removed in the future, and then this method will always behave as + if raise_missing=True. + + Raises + ------ + KeyError + If at least one key was requested but none was found, and + raise_missing=True. + + Returns + ------- + keyarr: Index + New index (coinciding with 'key' if the axis is unique) + values : array-like + An indexer for the return object; -1 denotes keys not found + """ + o = self.obj + ax = o._get_axis(axis) + + # Have the index compute an indexer or return None + # if it cannot handle: + indexer, keyarr = ax._convert_listlike_indexer(key, + kind=self.name) + # We only act on all found values: + if indexer is not None and (indexer != -1).all(): + self._validate_read_indexer(key, indexer, axis, + raise_missing=raise_missing) + return ax[indexer], indexer + + if ax.is_unique: + # If we are trying to get actual keys from empty Series, we + # patiently wait for a KeyError later on - otherwise, convert + if len(ax) or not len(key): + key = self._convert_for_reindex(key, axis) + indexer = ax.get_indexer_for(key) + keyarr = ax.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) + + self._validate_read_indexer(keyarr, indexer, + o._get_axis_number(axis), + raise_missing=raise_missing) + return keyarr, indexer + def _getitem_iterable(self, key, axis=None): + """ + Index current object with an an iterable key (which can be a boolean + indexer, or a collection of keys). + + Parameters + ---------- + key : iterable + Target labels, or boolean indexer + axis: int, default None + Dimension on which the indexing is being made + + Raises + ------ + KeyError + If no key was found. Will change in the future to raise if not all + keys were found. + IndexingError + If the boolean indexer is unalignable with the object being + indexed. 
+ + Returns + ------- + scalar, DataFrame, or Series: indexed value(s), + """ + if axis is None: axis = self.axis or 0 @@ -1133,54 +1192,18 @@ def _getitem_iterable(self, key, axis=None): labels = self.obj._get_axis(axis) if com.is_bool_indexer(key): + # A boolean indexer key = check_bool_indexer(labels, key) inds, = key.nonzero() return self.obj._take(inds, axis=axis) else: - # Have the index compute an indexer or return None - # if it cannot handle; we only act on all found values - indexer, keyarr = labels._convert_listlike_indexer( - key, kind=self.name) - if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis) - return self.obj.take(indexer, axis=axis) - - ax = self.obj._get_axis(axis) - # existing labels are unique and indexer are unique - if labels.is_unique and Index(keyarr).is_unique: - indexer = ax.get_indexer_for(key) - self._validate_read_indexer(key, indexer, axis) - - d = {axis: [ax.reindex(keyarr)[0], indexer]} - return self.obj._reindex_with_indexers(d, copy=True, - allow_dups=True) - - # existing labels are non-unique - else: - - # reindex with the specified axis - if axis + 1 > self.obj.ndim: - raise AssertionError("invalid indexing error with " - "non-unique index") - - new_target, indexer, new_indexer = labels._reindex_non_unique( - keyarr) - - if new_indexer is not None: - result = self.obj._take(indexer[indexer != -1], axis=axis) - - self._validate_read_indexer(key, new_indexer, axis) - result = result._reindex_with_indexers( - {axis: [new_target, new_indexer]}, - copy=True, allow_dups=True) + # A collection of keys + keyarr, indexer = self._get_listlike_indexer(key, axis, + raise_missing=False) + return self.obj._reindex_with_indexers({axis: [keyarr, indexer]}, + copy=True, allow_dups=True) - else: - self._validate_read_indexer(key, indexer, axis) - result = self.obj._take(indexer, axis=axis) - - return result - - def _validate_read_indexer(self, key, indexer, axis): + def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): """ Check that indexer can be used to return a result (e.g. at least one element was found, unless the list of keys was actually empty). @@ -1193,11 +1216,16 @@ def _validate_read_indexer(self, key, indexer, axis): Indices corresponding to the key (with -1 indicating not found) axis: int Dimension on which the indexing is being made + raise_missing: bool + Whether to raise a KeyError if some labels are not found. Will be + removed in the future, and then this method will always behave as + if raise_missing=True. Raises ------ KeyError - If at least one key was requested none was found. + If at least one key was requested but none was found, and + raise_missing=True. """ ax = self.obj._get_axis(axis) @@ -1214,6 +1242,12 @@ def _validate_read_indexer(self, key, indexer, axis): u"None of [{key}] are in the [{axis}]".format( key=key, axis=self.obj._get_axis_name(axis))) + # We (temporarily) allow for some missing keys with .loc, except in + # some cases (e.g. 
setting) in which "raise_missing" will be False + if not(self.name == 'loc' and not raise_missing): + not_found = list(set(key) - set(ax)) + raise KeyError("{} not in index".format(not_found)) + # we skip the warning on Categorical/Interval # as this check is actually done (check for # non-missing values), but a bit later in the @@ -1229,9 +1263,10 @@ def _validate_read_indexer(self, key, indexer, axis): if not (ax.is_categorical() or ax.is_interval()): warnings.warn(_missing_key_warning, - FutureWarning, stacklevel=5) + FutureWarning, stacklevel=6) - def _convert_to_indexer(self, obj, axis=None, is_setter=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False, + raise_missing=False): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1310,33 +1345,10 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False): inds, = obj.nonzero() return inds else: - - # Have the index compute an indexer or return None - # if it cannot handle - indexer, objarr = labels._convert_listlike_indexer( - obj, kind=self.name) - if indexer is not None: - return indexer - - # unique index - if labels.is_unique: - indexer = check = labels.get_indexer(objarr) - - # non-unique (dups) - else: - (indexer, - missing) = labels.get_indexer_non_unique(objarr) - # 'indexer' has dupes, create 'check' using 'missing' - check = np.zeros(len(objarr), dtype=np.intp) - check[missing] = -1 - - mask = check == -1 - if mask.any(): - raise KeyError('{mask} not in index' - .format(mask=objarr[mask])) - - return com._values_from_object(indexer) - + # When setting, missing keys are not allowed, even with .loc: + kwargs = {'raise_missing': True if is_setter else + raise_missing} + return self._get_listlike_indexer(obj, axis, **kwargs)[1] else: try: return labels.get_loc(obj) From a10216dac1903dbca00c65387401bafeb4500480 Mon Sep 17 00:00:00 2001 From: Wil Tan Date: Wed, 20 Jun 2018 19:50:21 +1000 Subject: [PATCH 057/113] Update "See Also" section of pandas/core/generic.py (#21550) --- pandas/core/generic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 32f64b1d3e05c..555108a5d9349 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5175,8 +5175,7 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, -------- pandas.to_datetime : Convert argument to datetime. pandas.to_timedelta : Convert argument to timedelta. - pandas.to_numeric : Return a fixed frequency timedelta index, - with day as the default. + pandas.to_numeric : Convert argument to numeric type. Returns ------- @@ -5210,7 +5209,7 @@ def infer_objects(self): -------- pandas.to_datetime : Convert argument to datetime. pandas.to_timedelta : Convert argument to timedelta. - pandas.to_numeric : Convert argument to numeric typeR + pandas.to_numeric : Convert argument to numeric type. 
Returns ------- From 4e6a11f62bb170700ab42af032b2522398f0ebe7 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 20 Jun 2018 11:14:25 +0100 Subject: [PATCH 058/113] Fixing documentation lists indentation (#21519) --- doc/source/api.rst | 6 +- doc/source/basics.rst | 43 ++- doc/source/categorical.rst | 10 +- doc/source/comparison_with_r.rst | 10 +- doc/source/computation.rst | 46 +-- doc/source/contributing.rst | 70 ++--- doc/source/contributing_docstring.rst | 76 ++--- doc/source/developer.rst | 14 +- doc/source/dsintro.rst | 26 +- doc/source/ecosystem.rst | 16 +- doc/source/enhancingperf.rst | 42 +-- doc/source/extending.rst | 6 +- doc/source/gotchas.rst | 4 +- doc/source/groupby.rst | 54 ++-- doc/source/indexing.rst | 82 +++--- doc/source/install.rst | 14 +- doc/source/internals.rst | 38 +-- doc/source/io.rst | 404 +++++++++++++------------- doc/source/merging.rst | 72 ++--- doc/source/options.rst | 8 +- doc/source/overview.rst | 26 +- doc/source/reshaping.rst | 48 +-- doc/source/sparse.rst | 6 +- doc/source/timeseries.rst | 40 +-- doc/source/tutorials.rst | 164 +++++------ doc/source/visualization.rst | 6 +- 26 files changed, 665 insertions(+), 666 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 4faec93490fde..f2c00d5d12031 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1200,9 +1200,9 @@ Attributes and underlying data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Axes** - * **items**: axis 0; each item corresponds to a DataFrame contained inside - * **major_axis**: axis 1; the index (rows) of each of the DataFrames - * **minor_axis**: axis 2; the columns of each of the DataFrames +* **items**: axis 0; each item corresponds to a DataFrame contained inside +* **major_axis**: axis 1; the index (rows) of each of the DataFrames +* **minor_axis**: axis 2; the columns of each of the DataFrames .. autosummary:: :toctree: generated/ diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 74f1d80c6fd3d..c460b19640f46 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -50,9 +50,8 @@ Attributes and the raw ndarray(s) pandas objects have a number of attributes enabling you to access the metadata - * **shape**: gives the axis dimensions of the object, consistent with ndarray - * Axis labels - +* **shape**: gives the axis dimensions of the object, consistent with ndarray +* Axis labels * **Series**: *index* (only axis) * **DataFrame**: *index* (rows) and *columns* * **Panel**: *items*, *major_axis*, and *minor_axis* @@ -131,9 +130,9 @@ Flexible binary operations With binary operations between pandas data structures, there are two key points of interest: - * Broadcasting behavior between higher- (e.g. DataFrame) and - lower-dimensional (e.g. Series) objects. - * Missing data in computations. +* Broadcasting behavior between higher- (e.g. DataFrame) and + lower-dimensional (e.g. Series) objects. +* Missing data in computations. We will demonstrate how to manage these issues independently, though they can be handled simultaneously. @@ -462,10 +461,10 @@ produce an object of the same size. 
Generally speaking, these methods take an **axis** argument, just like *ndarray.{sum, std, ...}*, but the axis can be specified by name or integer: - - **Series**: no axis argument needed - - **DataFrame**: "index" (axis=0, default), "columns" (axis=1) - - **Panel**: "items" (axis=0), "major" (axis=1, default), "minor" - (axis=2) +* **Series**: no axis argument needed +* **DataFrame**: "index" (axis=0, default), "columns" (axis=1) +* **Panel**: "items" (axis=0), "major" (axis=1, default), "minor" + (axis=2) For example: @@ -1187,11 +1186,11 @@ It is used to implement nearly all other features relying on label-alignment functionality. To *reindex* means to conform the data to match a given set of labels along a particular axis. This accomplishes several things: - * Reorders the existing data to match a new set of labels - * Inserts missing value (NA) markers in label locations where no data for - that label existed - * If specified, **fill** data for missing labels using logic (highly relevant - to working with time series data) +* Reorders the existing data to match a new set of labels +* Inserts missing value (NA) markers in label locations where no data for + that label existed +* If specified, **fill** data for missing labels using logic (highly relevant + to working with time series data) Here is a simple example: @@ -1911,10 +1910,10 @@ the axis indexes, since they are immutable) and returns a new object. Note that **it is seldom necessary to copy objects**. For example, there are only a handful of ways to alter a DataFrame *in-place*: - * Inserting, deleting, or modifying a column. - * Assigning to the ``index`` or ``columns`` attributes. - * For homogeneous data, directly modifying the values via the ``values`` - attribute or advanced indexing. +* Inserting, deleting, or modifying a column. +* Assigning to the ``index`` or ``columns`` attributes. +* For homogeneous data, directly modifying the values via the ``values`` + attribute or advanced indexing. To be clear, no pandas method has the side effect of modifying your data; almost every method returns a new object, leaving the original object @@ -2112,14 +2111,14 @@ Because the data was transposed the original inference stored all columns as obj The following functions are available for one dimensional object arrays or scalars to perform hard conversion of objects to a specified type: -- :meth:`~pandas.to_numeric` (conversion to numeric dtypes) +* :meth:`~pandas.to_numeric` (conversion to numeric dtypes) .. ipython:: python m = ['1.1', 2, 3] pd.to_numeric(m) -- :meth:`~pandas.to_datetime` (conversion to datetime objects) +* :meth:`~pandas.to_datetime` (conversion to datetime objects) .. ipython:: python @@ -2127,7 +2126,7 @@ hard conversion of objects to a specified type: m = ['2016-07-09', datetime.datetime(2016, 3, 2)] pd.to_datetime(m) -- :meth:`~pandas.to_timedelta` (conversion to timedelta objects) +* :meth:`~pandas.to_timedelta` (conversion to timedelta objects) .. ipython:: python diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c6827f67a390b..acab9de905540 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -542,11 +542,11 @@ Comparisons Comparing categorical data with other objects is possible in three cases: - * Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array, - ...) of the same length as the categorical data. 
- * All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to - another categorical Series, when ``ordered==True`` and the `categories` are the same. - * All comparisons of a categorical data to a scalar. +* Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array, + ...) of the same length as the categorical data. +* All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to + another categorical Series, when ``ordered==True`` and the `categories` are the same. +* All comparisons of a categorical data to a scalar. All other comparisons, especially "non-equality" comparisons of two categoricals with different categories or a categorical with any list-like object, will raise a ``TypeError``. diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index a7586f623a160..eecacde8ad14e 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -18,11 +18,11 @@ was started to provide a more detailed look at the `R language party libraries as they relate to ``pandas``. In comparisons with R and CRAN libraries, we care about the following things: - - **Functionality / flexibility**: what can/cannot be done with each tool - - **Performance**: how fast are operations. Hard numbers/benchmarks are - preferable - - **Ease-of-use**: Is one tool easier/harder to use (you may have to be - the judge of this, given side-by-side code comparisons) +* **Functionality / flexibility**: what can/cannot be done with each tool +* **Performance**: how fast are operations. Hard numbers/benchmarks are + preferable +* **Ease-of-use**: Is one tool easier/harder to use (you may have to be + the judge of this, given side-by-side code comparisons) This page is also here to offer a bit of a translation guide for users of these R packages. diff --git a/doc/source/computation.rst b/doc/source/computation.rst index ff06c369e1897..5e7b8be5f8af0 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -344,20 +344,20 @@ The weights used in the window are specified by the ``win_type`` keyword. The list of recognized types are the `scipy.signal window functions `__: -- ``boxcar`` -- ``triang`` -- ``blackman`` -- ``hamming`` -- ``bartlett`` -- ``parzen`` -- ``bohman`` -- ``blackmanharris`` -- ``nuttall`` -- ``barthann`` -- ``kaiser`` (needs beta) -- ``gaussian`` (needs std) -- ``general_gaussian`` (needs power, width) -- ``slepian`` (needs width). +* ``boxcar`` +* ``triang`` +* ``blackman`` +* ``hamming`` +* ``bartlett`` +* ``parzen`` +* ``bohman`` +* ``blackmanharris`` +* ``nuttall`` +* ``barthann`` +* ``kaiser`` (needs beta) +* ``gaussian`` (needs std) +* ``general_gaussian`` (needs power, width) +* ``slepian`` (needs width). .. ipython:: python @@ -537,10 +537,10 @@ Binary Window Functions two ``Series`` or any combination of ``DataFrame/Series`` or ``DataFrame/DataFrame``. Here is the behavior in each case: -- two ``Series``: compute the statistic for the pairing. -- ``DataFrame/Series``: compute the statistics for each column of the DataFrame +* two ``Series``: compute the statistic for the pairing. +* ``DataFrame/Series``: compute the statistics for each column of the DataFrame with the passed Series, thus returning a DataFrame. -- ``DataFrame/DataFrame``: by default compute the statistic for matching column +* ``DataFrame/DataFrame``: by default compute the statistic for matching column names, returning a DataFrame. 
If the keyword argument ``pairwise=True`` is passed then computes the statistic for each pair of columns, returning a ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section @@ -741,10 +741,10 @@ Aside from not having a ``window`` parameter, these functions have the same interfaces as their ``.rolling`` counterparts. Like above, the parameters they all accept are: -- ``min_periods``: threshold of non-null data points to require. Defaults to +* ``min_periods``: threshold of non-null data points to require. Defaults to minimum needed to compute statistic. No ``NaNs`` will be output once ``min_periods`` non-null data points have been seen. -- ``center``: boolean, whether to set the labels at the center (default is False). +* ``center``: boolean, whether to set the labels at the center (default is False). .. _stats.moments.expanding.note: .. note:: @@ -903,12 +903,12 @@ of an EW moment: One must specify precisely one of **span**, **center of mass**, **half-life** and **alpha** to the EW functions: -- **Span** corresponds to what is commonly called an "N-day EW moving average". -- **Center of mass** has a more physical interpretation and can be thought of +* **Span** corresponds to what is commonly called an "N-day EW moving average". +* **Center of mass** has a more physical interpretation and can be thought of in terms of span: :math:`c = (s - 1) / 2`. -- **Half-life** is the period of time for the exponential weight to reduce to +* **Half-life** is the period of time for the exponential weight to reduce to one half. -- **Alpha** specifies the smoothing factor directly. +* **Alpha** specifies the smoothing factor directly. Here is an example for a univariate time series: diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 6ae93ba46fa5c..ff06d024740bf 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -138,11 +138,11 @@ steps; you only need to install the compiler. For Windows developers, the following links may be helpful. -- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ -- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit -- https://cowboyprogrammer.org/building-python-wheels-for-windows/ -- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ -- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy +* https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ +* https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit +* https://cowboyprogrammer.org/building-python-wheels-for-windows/ +* https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ +* https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy Let us know if you have any difficulties by opening an issue or reaching out on `Gitter`_. 
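The computation.rst hunks earlier in this patch only re-indent the bullet lists describing the exponentially weighted parameters (``span``, ``com``, ``halflife``, ``alpha``), which keep the relationship ``c = (s - 1) / 2`` between span and center of mass. A minimal sketch of that equivalence (the series below is invented purely for illustration)::

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(10, dtype=float))

    # span=5 corresponds to com=(5 - 1) / 2 == 2, so both parameterizations
    # should produce the same exponentially weighted mean.
    assert np.allclose(s.ewm(span=5).mean(), s.ewm(com=2).mean())

Exactly one of the four keywords must be supplied to ``.ewm()``; the sketch just shows that two of them can describe the same decay.
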
@@ -155,11 +155,11 @@ Creating a Python Environment Now that you have a C compiler, create an isolated pandas development environment: -- Install either `Anaconda `_ or `miniconda +* Install either `Anaconda `_ or `miniconda `_ -- Make sure your conda is up to date (``conda update conda``) -- Make sure that you have :ref:`cloned the repository ` -- ``cd`` to the *pandas* source directory +* Make sure your conda is up to date (``conda update conda``) +* Make sure that you have :ref:`cloned the repository ` +* ``cd`` to the *pandas* source directory We'll now kick off a three-step process: @@ -286,7 +286,7 @@ complex changes to the documentation as well. Some other important things to know about the docs: -- The *pandas* documentation consists of two parts: the docstrings in the code +* The *pandas* documentation consists of two parts: the docstrings in the code itself and the docs in this folder ``pandas/doc/``. The docstrings provide a clear explanation of the usage of the individual @@ -294,7 +294,7 @@ Some other important things to know about the docs: overviews per topic together with some other information (what's new, installation, etc). -- The docstrings follow a pandas convention, based on the **Numpy Docstring +* The docstrings follow a pandas convention, based on the **Numpy Docstring Standard**. Follow the :ref:`pandas docstring guide ` for detailed instructions on how to write a correct docstring. @@ -303,7 +303,7 @@ Some other important things to know about the docs: contributing_docstring.rst -- The tutorials make heavy use of the `ipython directive +* The tutorials make heavy use of the `ipython directive `_ sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example:: @@ -324,7 +324,7 @@ Some other important things to know about the docs: doc build. This approach means that code examples will always be up to date, but it does make the doc building a bit more complex. -- Our API documentation in ``doc/source/api.rst`` houses the auto-generated +* Our API documentation in ``doc/source/api.rst`` houses the auto-generated documentation from the docstrings. For classes, there are a few subtleties around controlling which methods and attributes have pages auto-generated. @@ -488,8 +488,8 @@ standard. Google provides an open source style checker called ``cpplint``, but w use a fork of it that can be found `here `__. Here are *some* of the more common ``cpplint`` issues: - - we restrict line-length to 80 characters to promote readability - - every header file must include a header guard to avoid name collisions if re-included +* we restrict line-length to 80 characters to promote readability +* every header file must include a header guard to avoid name collisions if re-included :ref:`Continuous Integration ` will run the `cpplint `_ tool @@ -536,8 +536,8 @@ Python (PEP8) There are several tools to ensure you abide by this standard. Here are *some* of the more common ``PEP8`` issues: - - we restrict line-length to 79 characters to promote readability - - passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')`` +* we restrict line-length to 79 characters to promote readability +* passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')`` :ref:`Continuous Integration ` will run the `flake8 `_ tool @@ -715,14 +715,14 @@ Using ``pytest`` Here is an example of a self-contained set of tests that illustrate multiple features that we like to use. 
-- functional style: tests are like ``test_*`` and *only* take arguments that are either fixtures or parameters -- ``pytest.mark`` can be used to set metadata on test functions, e.g. ``skip`` or ``xfail``. -- using ``parametrize``: allow testing of multiple cases -- to set a mark on a parameter, ``pytest.param(..., marks=...)`` syntax should be used -- ``fixture``, code for object construction, on a per-test basis -- using bare ``assert`` for scalars and truth-testing -- ``tm.assert_series_equal`` (and its counter part ``tm.assert_frame_equal``), for pandas object comparisons. -- the typical pattern of constructing an ``expected`` and comparing versus the ``result`` +* functional style: tests are like ``test_*`` and *only* take arguments that are either fixtures or parameters +* ``pytest.mark`` can be used to set metadata on test functions, e.g. ``skip`` or ``xfail``. +* using ``parametrize``: allow testing of multiple cases +* to set a mark on a parameter, ``pytest.param(..., marks=...)`` syntax should be used +* ``fixture``, code for object construction, on a per-test basis +* using bare ``assert`` for scalars and truth-testing +* ``tm.assert_series_equal`` (and its counter part ``tm.assert_frame_equal``), for pandas object comparisons. +* the typical pattern of constructing an ``expected`` and comparing versus the ``result`` We would name this file ``test_cool_feature.py`` and put in an appropriate place in the ``pandas/tests/`` structure. @@ -969,21 +969,21 @@ Finally, commit your changes to your local repository with an explanatory messag uses a convention for commit message prefixes and layout. Here are some common prefixes along with general guidelines for when to use them: - * ENH: Enhancement, new functionality - * BUG: Bug fix - * DOC: Additions/updates to documentation - * TST: Additions/updates to tests - * BLD: Updates to the build process/scripts - * PERF: Performance improvement - * CLN: Code cleanup +* ENH: Enhancement, new functionality +* BUG: Bug fix +* DOC: Additions/updates to documentation +* TST: Additions/updates to tests +* BLD: Updates to the build process/scripts +* PERF: Performance improvement +* CLN: Code cleanup The following defines how a commit message should be structured. Please reference the relevant GitHub issues in your commit message using GH1234 or #1234. Either style is fine, but the former is generally preferred: - * a subject line with `< 80` chars. - * One blank line. - * Optionally, a commit message body. +* a subject line with `< 80` chars. +* One blank line. +* Optionally, a commit message body. Now you can commit your changes in your local repository:: diff --git a/doc/source/contributing_docstring.rst b/doc/source/contributing_docstring.rst index 4dec2a23facca..afb554aeffbc3 100644 --- a/doc/source/contributing_docstring.rst +++ b/doc/source/contributing_docstring.rst @@ -68,7 +68,7 @@ As PEP-257 is quite open, and some other standards exist on top of it. In the case of pandas, the numpy docstring convention is followed. The conventions is explained in this document: -- `numpydoc docstring guide `_ +* `numpydoc docstring guide `_ (which is based in the original `Guide to NumPy/SciPy documentation `_) @@ -78,9 +78,9 @@ The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. 
Documentation about reStructuredText can be found in: -- `Sphinx reStructuredText primer `_ -- `Quick reStructuredText reference `_ -- `Full reStructuredText specification `_ +* `Sphinx reStructuredText primer `_ +* `Quick reStructuredText reference `_ +* `Full reStructuredText specification `_ Pandas has some helpers for sharing docstrings between related classes, see :ref:`docstring.sharing`. @@ -107,12 +107,12 @@ In rare occasions reST styles like bold text or italics will be used in docstrings, but is it common to have inline code, which is presented between backticks. It is considered inline code: -- The name of a parameter -- Python code, a module, function, built-in, type, literal... (e.g. ``os``, +* The name of a parameter +* Python code, a module, function, built-in, type, literal... (e.g. ``os``, ``list``, ``numpy.abs``, ``datetime.date``, ``True``) -- A pandas class (in the form ``:class:`pandas.Series```) -- A pandas method (in the form ``:meth:`pandas.Series.sum```) -- A pandas function (in the form ``:func:`pandas.to_datetime```) +* A pandas class (in the form ``:class:`pandas.Series```) +* A pandas method (in the form ``:meth:`pandas.Series.sum```) +* A pandas function (in the form ``:func:`pandas.to_datetime```) .. note:: To display only the last component of the linked class, method or @@ -352,71 +352,71 @@ When specifying the parameter types, Python built-in data types can be used directly (the Python type is preferred to the more verbose string, integer, boolean, etc): -- int -- float -- str -- bool +* int +* float +* str +* bool For complex types, define the subtypes. For `dict` and `tuple`, as more than one type is present, we use the brackets to help read the type (curly brackets for `dict` and normal brackets for `tuple`): -- list of int -- dict of {str : int} -- tuple of (str, int, int) -- tuple of (str,) -- set of str +* list of int +* dict of {str : int} +* tuple of (str, int, int) +* tuple of (str,) +* set of str In case where there are just a set of values allowed, list them in curly brackets and separated by commas (followed by a space). If the values are ordinal and they have an order, list them in this order. Otherwise, list the default value first, if there is one: -- {0, 10, 25} -- {'simple', 'advanced'} -- {'low', 'medium', 'high'} -- {'cat', 'dog', 'bird'} +* {0, 10, 25} +* {'simple', 'advanced'} +* {'low', 'medium', 'high'} +* {'cat', 'dog', 'bird'} If the type is defined in a Python module, the module must be specified: -- datetime.date -- datetime.datetime -- decimal.Decimal +* datetime.date +* datetime.datetime +* decimal.Decimal If the type is in a package, the module must be also specified: -- numpy.ndarray -- scipy.sparse.coo_matrix +* numpy.ndarray +* scipy.sparse.coo_matrix If the type is a pandas type, also specify pandas except for Series and DataFrame: -- Series -- DataFrame -- pandas.Index -- pandas.Categorical -- pandas.SparseArray +* Series +* DataFrame +* pandas.Index +* pandas.Categorical +* pandas.SparseArray If the exact type is not relevant, but must be compatible with a numpy array, array-like can be specified. 
If Any type that can be iterated is accepted, iterable can be used: -- array-like -- iterable +* array-like +* iterable If more than one type is accepted, separate them by commas, except the last two types, that need to be separated by the word 'or': -- int or float -- float, decimal.Decimal or None -- str or list of str +* int or float +* float, decimal.Decimal or None +* str or list of str If ``None`` is one of the accepted values, it always needs to be the last in the list. For axis, the convention is to use something like: -- axis : {0 or 'index', 1 or 'columns', None}, default None +* axis : {0 or 'index', 1 or 'columns', None}, default None .. _docstring.returns: diff --git a/doc/source/developer.rst b/doc/source/developer.rst index b8bb2b2fcbe2f..f76af394abc48 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -81,20 +81,20 @@ The ``metadata`` field is ``None`` except for: omitted it is assumed to be nanoseconds. * ``categorical``: ``{'num_categories': K, 'ordered': is_ordered, 'type': $TYPE}`` - * Here ``'type'`` is optional, and can be a nested pandas type specification - here (but not categorical) + * Here ``'type'`` is optional, and can be a nested pandas type specification + here (but not categorical) * ``unicode``: ``{'encoding': encoding}`` - * The encoding is optional, and if not present is UTF-8 + * The encoding is optional, and if not present is UTF-8 * ``object``: ``{'encoding': encoding}``. Objects can be serialized and stored in ``BYTE_ARRAY`` Parquet columns. The encoding can be one of: - * ``'pickle'`` - * ``'msgpack'`` - * ``'bson'`` - * ``'json'`` + * ``'pickle'`` + * ``'msgpack'`` + * ``'bson'`` + * ``'json'`` * ``timedelta``: ``{'unit': 'ns'}``. The ``'unit'`` is optional, and if omitted it is assumed to be nanoseconds. This metadata is optional altogether diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 4d8e7979060f4..efa52a6f7cfe2 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -51,9 +51,9 @@ labels are collectively referred to as the **index**. The basic method to create Here, ``data`` can be many different things: - - a Python dict - - an ndarray - - a scalar value (like 5) +* a Python dict +* an ndarray +* a scalar value (like 5) The passed **index** is a list of axis labels. Thus, this separates into a few cases depending on what **data is**: @@ -246,12 +246,12 @@ potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. Like Series, DataFrame accepts many different kinds of input: - - Dict of 1D ndarrays, lists, dicts, or Series - - 2-D numpy.ndarray - - `Structured or record - `__ ndarray - - A ``Series`` - - Another ``DataFrame`` +* Dict of 1D ndarrays, lists, dicts, or Series +* 2-D numpy.ndarray +* `Structured or record + `__ ndarray +* A ``Series`` +* Another ``DataFrame`` Along with the data, you can optionally pass **index** (row labels) and **columns** (column labels) arguments. If you pass an index and / or columns, @@ -842,10 +842,10 @@ econometric analysis of panel data. 
However, for the strict purposes of slicing and dicing a collection of DataFrame objects, you may find the axis names slightly arbitrary: - - **items**: axis 0, each item corresponds to a DataFrame contained inside - - **major_axis**: axis 1, it is the **index** (rows) of each of the - DataFrames - - **minor_axis**: axis 2, it is the **columns** of each of the DataFrames +* **items**: axis 0, each item corresponds to a DataFrame contained inside +* **major_axis**: axis 1, it is the **index** (rows) of each of the + DataFrames +* **minor_axis**: axis 2, it is the **columns** of each of the DataFrames Construction of Panels works about like you would expect: diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index f683fd6892ea5..4e15f9069de67 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -159,14 +159,14 @@ See more in the `pandas-datareader docs `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 979d025111df1..8f8a9fe3e50e0 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -461,15 +461,15 @@ Supported Syntax These operations are supported by :func:`pandas.eval`: -- Arithmetic operations except for the left shift (``<<``) and right shift +* Arithmetic operations except for the left shift (``<<``) and right shift (``>>``) operators, e.g., ``df + 2 * pi / s ** 4 % 42 - the_golden_ratio`` -- Comparison operations, including chained comparisons, e.g., ``2 < df < df2`` -- Boolean operations, e.g., ``df < df2 and df3 < df4 or not df_bool`` -- ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)`` -- Attribute access, e.g., ``df.a`` -- Subscript expressions, e.g., ``df[0]`` -- Simple variable evaluation, e.g., ``pd.eval('df')`` (this is not very useful) -- Math functions: `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, +* Comparison operations, including chained comparisons, e.g., ``2 < df < df2`` +* Boolean operations, e.g., ``df < df2 and df3 < df4 or not df_bool`` +* ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)`` +* Attribute access, e.g., ``df.a`` +* Subscript expressions, e.g., ``df[0]`` +* Simple variable evaluation, e.g., ``pd.eval('df')`` (this is not very useful) +* Math functions: `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, `sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`, `arccos`, `arctan`, `arccosh`, `arcsinh`, `arctanh`, `abs` and `arctan2`. @@ -477,22 +477,22 @@ This Python syntax is **not** allowed: * Expressions - - Function calls other than math functions. - - ``is``/``is not`` operations - - ``if`` expressions - - ``lambda`` expressions - - ``list``/``set``/``dict`` comprehensions - - Literal ``dict`` and ``set`` expressions - - ``yield`` expressions - - Generator expressions - - Boolean expressions consisting of only scalar values + * Function calls other than math functions. + * ``is``/``is not`` operations + * ``if`` expressions + * ``lambda`` expressions + * ``list``/``set``/``dict`` comprehensions + * Literal ``dict`` and ``set`` expressions + * ``yield`` expressions + * Generator expressions + * Boolean expressions consisting of only scalar values * Statements - - Neither `simple `__ - nor `compound `__ - statements are allowed. This includes things like ``for``, ``while``, and - ``if``. + * Neither `simple `__ + nor `compound `__ + statements are allowed. This includes things like ``for``, ``while``, and + ``if``. 
diff --git a/doc/source/extending.rst b/doc/source/extending.rst index 431c69bc0b6b5..8018d35770924 100644 --- a/doc/source/extending.rst +++ b/doc/source/extending.rst @@ -167,9 +167,9 @@ you can retain subclasses through ``pandas`` data manipulations. There are 3 constructor properties to be defined: -- ``_constructor``: Used when a manipulation result has the same dimensions as the original. -- ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. +* ``_constructor``: Used when a manipulation result has the same dimensions as the original. +* ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. +* ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. Following table shows how ``pandas`` data structures define constructor properties by default. diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index b7042ef390018..79e312ca12833 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -193,9 +193,9 @@ Choice of ``NA`` representation For lack of ``NA`` (missing) support from the ground up in NumPy and Python in general, we were given the difficult choice between either: -- A *masked array* solution: an array of data and an array of boolean values +* A *masked array* solution: an array of data and an array of boolean values indicating whether a value is there or is missing. -- Using a special sentinel value, bit pattern, or set of sentinel values to +* Using a special sentinel value, bit pattern, or set of sentinel values to denote ``NA`` across the dtypes. For many reasons we chose the latter. After years of production use it has diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 47d53c82b86f3..45e449d081fb0 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -22,36 +22,36 @@ Group By: split-apply-combine By "group by" we are referring to a process involving one or more of the following steps: - - **Splitting** the data into groups based on some criteria. - - **Applying** a function to each group independently. - - **Combining** the results into a data structure. +* **Splitting** the data into groups based on some criteria. +* **Applying** a function to each group independently. +* **Combining** the results into a data structure. Out of these, the split step is the most straightforward. In fact, in many situations we may wish to split the data set into groups and do something with those groups. In the apply step, we might wish to one of the following: - - **Aggregation**: compute a summary statistic (or statistics) for each - group. Some examples: +* **Aggregation**: compute a summary statistic (or statistics) for each + group. Some examples: - - Compute group sums or means. - - Compute group sizes / counts. + * Compute group sums or means. + * Compute group sizes / counts. - - **Transformation**: perform some group-specific computations and return a - like-indexed object. Some examples: +* **Transformation**: perform some group-specific computations and return a + like-indexed object. Some examples: - - Standardize data (zscore) within a group. 
- - Filling NAs within groups with a value derived from each group. + * Standardize data (zscore) within a group. + * Filling NAs within groups with a value derived from each group. - - **Filtration**: discard some groups, according to a group-wise computation - that evaluates True or False. Some examples: +* **Filtration**: discard some groups, according to a group-wise computation + that evaluates True or False. Some examples: - - Discard data that belongs to groups with only a few members. - - Filter out data based on the group sum or mean. + * Discard data that belongs to groups with only a few members. + * Filter out data based on the group sum or mean. - - Some combination of the above: GroupBy will examine the results of the apply - step and try to return a sensibly combined result if it doesn't fit into - either of the above two categories. +* Some combination of the above: GroupBy will examine the results of the apply + step and try to return a sensibly combined result if it doesn't fit into + either of the above two categories. Since the set of object instance methods on pandas data structures are generally rich and expressive, we often simply want to invoke, say, a DataFrame function @@ -88,15 +88,15 @@ object (more on what the GroupBy object is later), you may do the following: The mapping can be specified many different ways: - - A Python function, to be called on each of the axis labels. - - A list or NumPy array of the same length as the selected axis. - - A dict or ``Series``, providing a ``label -> group name`` mapping. - - For ``DataFrame`` objects, a string indicating a column to be used to group. - Of course ``df.groupby('A')`` is just syntactic sugar for - ``df.groupby(df['A'])``, but it makes life simpler. - - For ``DataFrame`` objects, a string indicating an index level to be used to - group. - - A list of any of the above things. +* A Python function, to be called on each of the axis labels. +* A list or NumPy array of the same length as the selected axis. +* A dict or ``Series``, providing a ``label -> group name`` mapping. +* For ``DataFrame`` objects, a string indicating a column to be used to group. + Of course ``df.groupby('A')`` is just syntactic sugar for + ``df.groupby(df['A'])``, but it makes life simpler. +* For ``DataFrame`` objects, a string indicating an index level to be used to + group. +* A list of any of the above things. Collectively we refer to the grouping objects as the **keys**. For example, consider the following ``DataFrame``: diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 2b9fcf874ef22..1c63acce6e3fa 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -17,10 +17,10 @@ Indexing and Selecting Data The axis labeling information in pandas objects serves many purposes: - - Identifies data (i.e. provides *metadata*) using known indicators, - important for analysis, visualization, and interactive console display. - - Enables automatic and explicit data alignment. - - Allows intuitive getting and setting of subsets of the data set. +* Identifies data (i.e. provides *metadata*) using known indicators, + important for analysis, visualization, and interactive console display. +* Enables automatic and explicit data alignment. +* Allows intuitive getting and setting of subsets of the data set. In this section, we will focus on the final point: namely, how to slice, dice, and generally get and set subsets of pandas objects. 
The primary focus will be @@ -62,37 +62,37 @@ Object selection has had a number of user-requested additions in order to support more explicit location based indexing. Pandas now supports three types of multi-axis indexing. -- ``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found. Allowed inputs are: +* ``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found. Allowed inputs are: - - A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a - *label* of the index. This use is **not** an integer position along the - index.). - - A list or array of labels ``['a', 'b', 'c']``. - - A slice object with labels ``'a':'f'`` (Note that contrary to usual python - slices, **both** the start and the stop are included, when present in the - index! See :ref:`Slicing with labels - `.). - - A boolean array - - A ``callable`` function with one argument (the calling Series, DataFrame or Panel) and - that returns valid output for indexing (one of the above). + * A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a + *label* of the index. This use is **not** an integer position along the + index.). + * A list or array of labels ``['a', 'b', 'c']``. + * A slice object with labels ``'a':'f'`` (Note that contrary to usual python + slices, **both** the start and the stop are included, when present in the + index! See :ref:`Slicing with labels + `.). + * A boolean array + * A ``callable`` function with one argument (the calling Series, DataFrame or Panel) and + that returns valid output for indexing (one of the above). .. versionadded:: 0.18.1 See more at :ref:`Selection by Label `. -- ``.iloc`` is primarily integer position based (from ``0`` to +* ``.iloc`` is primarily integer position based (from ``0`` to ``length-1`` of the axis), but may also be used with a boolean array. ``.iloc`` will raise ``IndexError`` if a requested indexer is out-of-bounds, except *slice* indexers which allow out-of-bounds indexing. (this conforms with Python/NumPy *slice* semantics). Allowed inputs are: - - An integer e.g. ``5``. - - A list or array of integers ``[4, 3, 0]``. - - A slice object with ints ``1:7``. - - A boolean array. - - A ``callable`` function with one argument (the calling Series, DataFrame or Panel) and - that returns valid output for indexing (one of the above). + * An integer e.g. ``5``. + * A list or array of integers ``[4, 3, 0]``. + * A slice object with ints ``1:7``. + * A boolean array. + * A ``callable`` function with one argument (the calling Series, DataFrame or Panel) and + that returns valid output for indexing (one of the above). .. versionadded:: 0.18.1 @@ -100,7 +100,7 @@ of multi-axis indexing. :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical `. -- ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. See more at :ref:`Selection By Callable `. +* ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. See more at :ref:`Selection By Callable `. Getting values from an object with multi-axes selection uses the following notation (using ``.loc`` as an example, but the following applies to ``.iloc`` as @@ -343,14 +343,14 @@ Integers are valid labels, but they refer to the label **and not the position**. The ``.loc`` attribute is the primary access method. The following are valid inputs: -- A single label, e.g. 
``5`` or ``'a'`` (Note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index.). -- A list or array of labels ``['a', 'b', 'c']``. -- A slice object with labels ``'a':'f'`` (Note that contrary to usual python +* A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index.). +* A list or array of labels ``['a', 'b', 'c']``. +* A slice object with labels ``'a':'f'`` (Note that contrary to usual python slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels `.). -- A boolean array. -- A ``callable``, see :ref:`Selection By Callable `. +* A boolean array. +* A ``callable``, see :ref:`Selection By Callable `. .. ipython:: python @@ -445,11 +445,11 @@ Pandas provides a suite of methods in order to get **purely integer based indexi The ``.iloc`` attribute is the primary access method. The following are valid inputs: -- An integer e.g. ``5``. -- A list or array of integers ``[4, 3, 0]``. -- A slice object with ints ``1:7``. -- A boolean array. -- A ``callable``, see :ref:`Selection By Callable `. +* An integer e.g. ``5``. +* A list or array of integers ``[4, 3, 0]``. +* A slice object with ints ``1:7``. +* A boolean array. +* A ``callable``, see :ref:`Selection By Callable `. .. ipython:: python @@ -599,8 +599,8 @@ bit of user confusion over the years. The recommended methods of indexing are: -- ``.loc`` if you want to *label* index. -- ``.iloc`` if you want to *positionally* index. +* ``.loc`` if you want to *label* index. +* ``.iloc`` if you want to *positionally* index. .. ipython:: python @@ -1455,15 +1455,15 @@ If you want to identify and remove duplicate rows in a DataFrame, there are two methods that will help: ``duplicated`` and ``drop_duplicates``. Each takes as an argument the columns to use to identify duplicated rows. -- ``duplicated`` returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated. -- ``drop_duplicates`` removes duplicate rows. +* ``duplicated`` returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated. +* ``drop_duplicates`` removes duplicate rows. By default, the first observed row of a duplicate set is considered unique, but each method has a ``keep`` parameter to specify targets to be kept. -- ``keep='first'`` (default): mark / drop duplicates except for the first occurrence. -- ``keep='last'``: mark / drop duplicates except for the last occurrence. -- ``keep=False``: mark / drop all duplicates. +* ``keep='first'`` (default): mark / drop duplicates except for the first occurrence. +* ``keep='last'``: mark / drop duplicates except for the last occurrence. +* ``keep=False``: mark / drop all duplicates. .. ipython:: python diff --git a/doc/source/install.rst b/doc/source/install.rst index e655136904920..87d1b63914635 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -261,17 +261,17 @@ Optional Dependencies * `Apache Parquet `__, either `pyarrow `__ (>= 0.4.1) or `fastparquet `__ (>= 0.0.6) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. 
Some common drivers are: - * `psycopg2 `__: for PostgreSQL - * `pymysql `__: for MySQL. - * `SQLite `__: for SQLite, this is included in Python's standard library by default. + * `psycopg2 `__: for PostgreSQL + * `pymysql `__: for MySQL. + * `SQLite `__: for SQLite, this is included in Python's standard library by default. * `matplotlib `__: for plotting, Version 1.4.3 or higher. * For Excel I/O: - * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) - * `openpyxl `__: openpyxl version 2.4.0 - for writing .xlsx files (xlrd >= 0.9.0) - * `XlsxWriter `__: Alternative Excel writer + * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) + * `openpyxl `__: openpyxl version 2.4.0 + for writing .xlsx files (xlrd >= 0.9.0) + * `XlsxWriter `__: Alternative Excel writer * `Jinja2 `__: Template engine for conditional HTML formatting. * `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7). diff --git a/doc/source/internals.rst b/doc/source/internals.rst index caf5790fb24c6..fce99fc633440 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -24,23 +24,23 @@ Indexing In pandas there are a few objects implemented which can serve as valid containers for the axis labels: -- ``Index``: the generic "ordered set" object, an ndarray of object dtype +* ``Index``: the generic "ordered set" object, an ndarray of object dtype assuming nothing about its contents. The labels must be hashable (and likely immutable) and unique. Populates a dict of label to location in Cython to do ``O(1)`` lookups. -- ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer +* ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer data, such as time stamps -- ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data -- ``MultiIndex``: the standard hierarchical index object -- ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values) -- ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values) -- ``PeriodIndex``: An Index object with Period elements +* ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data +* ``MultiIndex``: the standard hierarchical index object +* ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values) +* ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values) +* ``PeriodIndex``: An Index object with Period elements There are functions that make the creation of a regular index easy: -- ``date_range``: fixed frequency date range generated from a time rule or +* ``date_range``: fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects -- ``period_range``: fixed frequency date range generated from a time rule or +* ``period_range``: fixed frequency date range generated from a time rule or DateOffset. 
An ndarray of ``Period`` objects, representing timespans The motivation for having an ``Index`` class in the first place was to enable @@ -52,22 +52,22 @@ From an internal implementation point of view, the relevant methods that an ``Index`` must define are one or more of the following (depending on how incompatible the new object internals are with the ``Index`` functions): -- ``get_loc``: returns an "indexer" (an integer, or in some cases a +* ``get_loc``: returns an "indexer" (an integer, or in some cases a slice object) for a label -- ``slice_locs``: returns the "range" to slice between two labels -- ``get_indexer``: Computes the indexing vector for reindexing / data +* ``slice_locs``: returns the "range" to slice between two labels +* ``get_indexer``: Computes the indexing vector for reindexing / data alignment purposes. See the source / docstrings for more on this -- ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data +* ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data alignment purposes when the index is non-unique. See the source / docstrings for more on this -- ``reindex``: Does any pre-conversion of the input index then calls +* ``reindex``: Does any pre-conversion of the input index then calls ``get_indexer`` -- ``union``, ``intersection``: computes the union or intersection of two +* ``union``, ``intersection``: computes the union or intersection of two Index objects -- ``insert``: Inserts a new label into an Index, yielding a new object -- ``delete``: Delete a label, yielding a new object -- ``drop``: Deletes a set of labels -- ``take``: Analogous to ndarray.take +* ``insert``: Inserts a new label into an Index, yielding a new object +* ``delete``: Delete a label, yielding a new object +* ``drop``: Deletes a set of labels +* ``take``: Analogous to ndarray.take MultiIndex ~~~~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 658b9ff15783d..ae6c4f12f04f7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -252,12 +252,12 @@ Datetime Handling +++++++++++++++++ parse_dates : boolean or list of ints or names or list of lists or dict, default ``False``. - - If ``True`` -> try parsing the index. - - If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date + * If ``True`` -> try parsing the index. + * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - - If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date + * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date column. - - If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. + * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. A fast-path exists for iso8601-formatted dates. infer_datetime_format : boolean, default ``False`` If ``True`` and parse_dates is enabled for a column, attempt to infer the @@ -961,12 +961,12 @@ negative consequences if enabled. Here are some examples of datetime strings that can be guessed (All representing December 30th, 2011 at 00:00:00): -- "20111230" -- "2011/12/30" -- "20111230 00:00:00" -- "12/30/2011 00:00:00" -- "30/Dec/2011 00:00:00" -- "30/December/2011 00:00:00" +* "20111230" +* "2011/12/30" +* "20111230 00:00:00" +* "12/30/2011 00:00:00" +* "30/Dec/2011 00:00:00" +* "30/December/2011 00:00:00" Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. 
With @@ -1303,16 +1303,16 @@ with data files that have known and fixed column widths. The function parameters to ``read_fwf`` are largely the same as `read_csv` with two extra parameters, and a different usage of the ``delimiter`` parameter: - - ``colspecs``: A list of pairs (tuples) giving the extents of the - fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). - String value 'infer' can be used to instruct the parser to try detecting - the column specifications from the first 100 rows of the data. Default - behavior, if not specified, is to infer. - - ``widths``: A list of field widths which can be used instead of 'colspecs' - if the intervals are contiguous. - - ``delimiter``: Characters to consider as filler characters in the fixed-width file. - Can be used to specify the filler character of the fields - if it is not spaces (e.g., '~'). +* ``colspecs``: A list of pairs (tuples) giving the extents of the + fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try detecting + the column specifications from the first 100 rows of the data. Default + behavior, if not specified, is to infer. +* ``widths``: A list of field widths which can be used instead of 'colspecs' + if the intervals are contiguous. +* ``delimiter``: Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). .. ipython:: python :suppress: @@ -1566,9 +1566,9 @@ possible pandas uses the C parser (specified as ``engine='c'``), but may fall back to Python if C-unsupported options are specified. Currently, C-unsupported options include: -- ``sep`` other than a single character (e.g. regex separators) -- ``skipfooter`` -- ``sep=None`` with ``delim_whitespace=False`` +* ``sep`` other than a single character (e.g. regex separators) +* ``skipfooter`` +* ``sep=None`` with ``delim_whitespace=False`` Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. @@ -1602,29 +1602,29 @@ The ``Series`` and ``DataFrame`` objects have an instance method ``to_csv`` whic allows storing the contents of the object as a comma-separated-values file. The function takes a number of arguments. Only the first is required. - - ``path_or_buf``: A string path to the file to write or a StringIO - - ``sep`` : Field delimiter for the output file (default ",") - - ``na_rep``: A string representation of a missing value (default '') - - ``float_format``: Format string for floating point numbers - - ``cols``: Columns to write (default None) - - ``header``: Whether to write out the column names (default True) - - ``index``: whether to write row (index) names (default True) - - ``index_label``: Column label(s) for index column(s) if desired. If None - (default), and `header` and `index` are True, then the index names are - used. (A sequence should be given if the ``DataFrame`` uses MultiIndex). - - ``mode`` : Python write mode, default 'w' - - ``encoding``: a string representing the encoding to use if the contents are - non-ASCII, for Python versions prior to 3 - - ``line_terminator``: Character sequence denoting line end (default '\\n') - - ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). 
Note that if you have set a `float_format` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric - - ``quotechar``: Character used to quote fields (default '"') - - ``doublequote``: Control quoting of ``quotechar`` in fields (default True) - - ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when - appropriate (default None) - - ``chunksize``: Number of rows to write at a time - - ``tupleize_cols``: If False (default), write as a list of tuples, otherwise - write in an expanded line format suitable for ``read_csv`` - - ``date_format``: Format string for datetime objects +* ``path_or_buf``: A string path to the file to write or a StringIO +* ``sep`` : Field delimiter for the output file (default ",") +* ``na_rep``: A string representation of a missing value (default '') +* ``float_format``: Format string for floating point numbers +* ``cols``: Columns to write (default None) +* ``header``: Whether to write out the column names (default True) +* ``index``: whether to write row (index) names (default True) +* ``index_label``: Column label(s) for index column(s) if desired. If None + (default), and `header` and `index` are True, then the index names are + used. (A sequence should be given if the ``DataFrame`` uses MultiIndex). +* ``mode`` : Python write mode, default 'w' +* ``encoding``: a string representing the encoding to use if the contents are + non-ASCII, for Python versions prior to 3 +* ``line_terminator``: Character sequence denoting line end (default '\\n') +* ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). Note that if you have set a `float_format` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric +* ``quotechar``: Character used to quote fields (default '"') +* ``doublequote``: Control quoting of ``quotechar`` in fields (default True) +* ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when + appropriate (default None) +* ``chunksize``: Number of rows to write at a time +* ``tupleize_cols``: If False (default), write as a list of tuples, otherwise + write in an expanded line format suitable for ``read_csv`` +* ``date_format``: Format string for datetime objects Writing a formatted string ++++++++++++++++++++++++++ @@ -1634,22 +1634,22 @@ Writing a formatted string The ``DataFrame`` object has an instance method ``to_string`` which allows control over the string representation of the object. All arguments are optional: - - ``buf`` default None, for example a StringIO object - - ``columns`` default None, which columns to write - - ``col_space`` default None, minimum width of each column. - - ``na_rep`` default ``NaN``, representation of NA value - - ``formatters`` default None, a dictionary (by column) of functions each of - which takes a single argument and returns a formatted string - - ``float_format`` default None, a function which takes a single (float) - argument and returns a formatted string; to be applied to floats in the - ``DataFrame``. - - ``sparsify`` default True, set to False for a ``DataFrame`` with a hierarchical - index to print every MultiIndex key at each row. 
- - ``index_names`` default True, will print the names of the indices - - ``index`` default True, will print the index (ie, row labels) - - ``header`` default True, will print the column labels - - ``justify`` default ``left``, will print column headers left- or - right-justified +* ``buf`` default None, for example a StringIO object +* ``columns`` default None, which columns to write +* ``col_space`` default None, minimum width of each column. +* ``na_rep`` default ``NaN``, representation of NA value +* ``formatters`` default None, a dictionary (by column) of functions each of + which takes a single argument and returns a formatted string +* ``float_format`` default None, a function which takes a single (float) + argument and returns a formatted string; to be applied to floats in the + ``DataFrame``. +* ``sparsify`` default True, set to False for a ``DataFrame`` with a hierarchical + index to print every MultiIndex key at each row. +* ``index_names`` default True, will print the names of the indices +* ``index`` default True, will print the index (ie, row labels) +* ``header`` default True, will print the column labels +* ``justify`` default ``left``, will print column headers left- or + right-justified The ``Series`` object also has a ``to_string`` method, but with only the ``buf``, ``na_rep``, ``float_format`` arguments. There is also a ``length`` argument @@ -1670,17 +1670,17 @@ Writing JSON A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json`` with optional parameters: -- ``path_or_buf`` : the pathname or buffer to write the output +* ``path_or_buf`` : the pathname or buffer to write the output This can be ``None`` in which case a JSON string is returned -- ``orient`` : +* ``orient`` : ``Series``: - - default is ``index`` - - allowed values are {``split``, ``records``, ``index``} + * default is ``index`` + * allowed values are {``split``, ``records``, ``index``} ``DataFrame``: - - default is ``columns`` - - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} + * default is ``columns`` + * allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} The format of the JSON string @@ -1694,12 +1694,12 @@ with optional parameters: ``columns``; dict like {column -> {index -> value}} ``values``; just the values array -- ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. -- ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. -- ``force_ascii`` : force encoded string to be ASCII, default True. -- ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. -- ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. -- ``lines`` : If ``records`` orient, then will write each record per line as json. +* ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. +* ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. +* ``force_ascii`` : force encoded string to be ASCII, default True. +* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. 
One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. +* ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. +* ``lines`` : If ``records`` orient, then will write each record per line as json. Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters. @@ -1818,19 +1818,19 @@ Fallback Behavior If the JSON serializer cannot handle the container contents directly it will fall back in the following manner: -- if the dtype is unsupported (e.g. ``np.complex``) then the ``default_handler``, if provided, will be called +* if the dtype is unsupported (e.g. ``np.complex``) then the ``default_handler``, if provided, will be called for each value, otherwise an exception is raised. -- if an object is unsupported it will attempt the following: +* if an object is unsupported it will attempt the following: - * check if the object has defined a ``toDict`` method and call it. - A ``toDict`` method should return a ``dict`` which will then be JSON serialized. + * check if the object has defined a ``toDict`` method and call it. + A ``toDict`` method should return a ``dict`` which will then be JSON serialized. - * invoke the ``default_handler`` if one was provided. + * invoke the ``default_handler`` if one was provided. - * convert the object to a ``dict`` by traversing its contents. However this will often fail - with an ``OverflowError`` or give unexpected results. + * convert the object to a ``dict`` by traversing its contents. However this will often fail + with an ``OverflowError`` or give unexpected results. In general the best approach for unsupported objects or dtypes is to provide a ``default_handler``. For example: @@ -1856,20 +1856,20 @@ Reading a JSON string to pandas object can take a number of parameters. The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` -- ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be +* ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be a URL. Valid URL schemes include http, ftp, S3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json -- ``typ`` : type of object to recover (series or frame), default 'frame' -- ``orient`` : +* ``typ`` : type of object to recover (series or frame), default 'frame' +* ``orient`` : Series : - - default is ``index`` - - allowed values are {``split``, ``records``, ``index``} + * default is ``index`` + * allowed values are {``split``, ``records``, ``index``} DataFrame - - default is ``columns`` - - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} + * default is ``columns`` + * allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} The format of the JSON string @@ -1885,20 +1885,20 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` ``table``; adhering to the JSON `Table Schema`_ -- ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. 
-- ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is ``True`` -- ``convert_dates`` : a list of columns to parse for dates; If ``True``, then try to parse date-like columns, default is ``True``. -- ``keep_default_dates`` : boolean, default ``True``. If parsing dates, then parse the default date-like columns. -- ``numpy`` : direct decoding to NumPy arrays. default is ``False``; +* ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. +* ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is ``True`` +* ``convert_dates`` : a list of columns to parse for dates; If ``True``, then try to parse date-like columns, default is ``True``. +* ``keep_default_dates`` : boolean, default ``True``. If parsing dates, then parse the default date-like columns. +* ``numpy`` : direct decoding to NumPy arrays. default is ``False``; Supports numeric data only, although labels may be non-numeric. Also note that the JSON ordering **MUST** be the same for each term if ``numpy=True``. -- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality. -- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default +* ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality. +* ``date_unit`` : string, the timestamp unit to detect if converting dates. Default None. By default the timestamp precision will be detected, if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to seconds, milliseconds, microseconds or nanoseconds respectively. -- ``lines`` : reads file as one json object per line. -- ``encoding`` : The encoding to use to decode py3 bytes. -- ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration. +* ``lines`` : reads file as one json object per line. +* ``encoding`` : The encoding to use to decode py3 bytes. +* ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -2175,10 +2175,10 @@ object str A few notes on the generated table schema: -- The ``schema`` object contains a ``pandas_version`` field. This contains +* The ``schema`` object contains a ``pandas_version`` field. This contains the version of pandas' dialect of the schema, and will be incremented with each revision. -- All dates are converted to UTC when serializing. Even timezone naive values, +* All dates are converted to UTC when serializing. Even timezone naive values, which are treated as UTC with an offset of 0. .. ipython:: python @@ -2187,7 +2187,7 @@ A few notes on the generated table schema: s = pd.Series(pd.date_range('2016', periods=4)) build_table_schema(s) -- datetimes with a timezone (before serializing), include an additional field +* datetimes with a timezone (before serializing), include an additional field ``tz`` with the time zone name (e.g. ``'US/Central'``). .. 
ipython:: python @@ -2196,7 +2196,7 @@ A few notes on the generated table schema: tz='US/Central')) build_table_schema(s_tz) -- Periods are converted to timestamps before serialization, and so have the +* Periods are converted to timestamps before serialization, and so have the same behavior of being converted to UTC. In addition, periods will contain and additional field ``freq`` with the period's frequency, e.g. ``'A-DEC'``. @@ -2206,7 +2206,7 @@ A few notes on the generated table schema: periods=4)) build_table_schema(s_per) -- Categoricals use the ``any`` type and an ``enum`` constraint listing +* Categoricals use the ``any`` type and an ``enum`` constraint listing the set of possible values. Additionally, an ``ordered`` field is included: .. ipython:: python @@ -2214,7 +2214,7 @@ A few notes on the generated table schema: s_cat = pd.Series(pd.Categorical(['a', 'b', 'a'])) build_table_schema(s_cat) -- A ``primaryKey`` field, containing an array of labels, is included +* A ``primaryKey`` field, containing an array of labels, is included *if the index is unique*: .. ipython:: python @@ -2222,7 +2222,7 @@ A few notes on the generated table schema: s_dupe = pd.Series([1, 2], index=[1, 1]) build_table_schema(s_dupe) -- The ``primaryKey`` behavior is the same with MultiIndexes, but in this +* The ``primaryKey`` behavior is the same with MultiIndexes, but in this case the ``primaryKey`` is an array: .. ipython:: python @@ -2231,15 +2231,15 @@ A few notes on the generated table schema: (0, 1)])) build_table_schema(s_multi) -- The default naming roughly follows these rules: +* The default naming roughly follows these rules: - + For series, the ``object.name`` is used. If that's none, then the - name is ``values`` - + For ``DataFrames``, the stringified version of the column name is used - + For ``Index`` (not ``MultiIndex``), ``index.name`` is used, with a - fallback to ``index`` if that is None. - + For ``MultiIndex``, ``mi.names`` is used. If any level has no name, - then ``level_`` is used. + * For series, the ``object.name`` is used. If that's none, then the + name is ``values`` + * For ``DataFrames``, the stringified version of the column name is used + * For ``Index`` (not ``MultiIndex``), ``index.name`` is used, with a + fallback to ``index`` if that is None. + * For ``MultiIndex``, ``mi.names`` is used. If any level has no name, + then ``level_`` is used. .. versionadded:: 0.23.0 @@ -2601,55 +2601,55 @@ parse HTML tables in the top-level pandas io function ``read_html``. **Issues with** |lxml|_ - * Benefits +* Benefits - * |lxml|_ is very fast. + * |lxml|_ is very fast. - * |lxml|_ requires Cython to install correctly. + * |lxml|_ requires Cython to install correctly. - * Drawbacks +* Drawbacks - * |lxml|_ does *not* make any guarantees about the results of its parse - *unless* it is given |svm|_. + * |lxml|_ does *not* make any guarantees about the results of its parse + *unless* it is given |svm|_. - * In light of the above, we have chosen to allow you, the user, to use the - |lxml|_ backend, but **this backend will use** |html5lib|_ if |lxml|_ - fails to parse + * In light of the above, we have chosen to allow you, the user, to use the + |lxml|_ backend, but **this backend will use** |html5lib|_ if |lxml|_ + fails to parse - * It is therefore *highly recommended* that you install both - |BeautifulSoup4|_ and |html5lib|_, so that you will still get a valid - result (provided everything else is valid) even if |lxml|_ fails. 
+ * It is therefore *highly recommended* that you install both + |BeautifulSoup4|_ and |html5lib|_, so that you will still get a valid + result (provided everything else is valid) even if |lxml|_ fails. **Issues with** |BeautifulSoup4|_ **using** |lxml|_ **as a backend** - * The above issues hold here as well since |BeautifulSoup4|_ is essentially - just a wrapper around a parser backend. +* The above issues hold here as well since |BeautifulSoup4|_ is essentially + just a wrapper around a parser backend. **Issues with** |BeautifulSoup4|_ **using** |html5lib|_ **as a backend** - * Benefits +* Benefits - * |html5lib|_ is far more lenient than |lxml|_ and consequently deals - with *real-life markup* in a much saner way rather than just, e.g., - dropping an element without notifying you. + * |html5lib|_ is far more lenient than |lxml|_ and consequently deals + with *real-life markup* in a much saner way rather than just, e.g., + dropping an element without notifying you. - * |html5lib|_ *generates valid HTML5 markup from invalid markup - automatically*. This is extremely important for parsing HTML tables, - since it guarantees a valid document. However, that does NOT mean that - it is "correct", since the process of fixing markup does not have a - single definition. + * |html5lib|_ *generates valid HTML5 markup from invalid markup + automatically*. This is extremely important for parsing HTML tables, + since it guarantees a valid document. However, that does NOT mean that + it is "correct", since the process of fixing markup does not have a + single definition. - * |html5lib|_ is pure Python and requires no additional build steps beyond - its own installation. + * |html5lib|_ is pure Python and requires no additional build steps beyond + its own installation. - * Drawbacks +* Drawbacks - * The biggest drawback to using |html5lib|_ is that it is slow as - molasses. However consider the fact that many tables on the web are not - big enough for the parsing algorithm runtime to matter. It is more - likely that the bottleneck will be in the process of reading the raw - text from the URL over the web, i.e., IO (input-output). For very large - tables, this might not be true. + * The biggest drawback to using |html5lib|_ is that it is slow as + molasses. However consider the fact that many tables on the web are not + big enough for the parsing algorithm runtime to matter. It is more + likely that the bottleneck will be in the process of reading the raw + text from the URL over the web, i.e., IO (input-output). For very large + tables, this might not be true. .. |svm| replace:: **strictly valid markup** @@ -2753,13 +2753,13 @@ Specifying Sheets .. note :: An ExcelFile's attribute ``sheet_names`` provides access to a list of sheets. -- The arguments ``sheet_name`` allows specifying the sheet or sheets to read. -- The default value for ``sheet_name`` is 0, indicating to read the first sheet -- Pass a string to refer to the name of a particular sheet in the workbook. -- Pass an integer to refer to the index of a sheet. Indices follow Python +* The arguments ``sheet_name`` allows specifying the sheet or sheets to read. +* The default value for ``sheet_name`` is 0, indicating to read the first sheet +* Pass a string to refer to the name of a particular sheet in the workbook. +* Pass an integer to refer to the index of a sheet. Indices follow Python convention, beginning at 0. -- Pass a list of either strings or integers, to return a dictionary of specified sheets. 
-- Pass a ``None`` to return a dictionary of all available sheets. +* Pass a list of either strings or integers, to return a dictionary of specified sheets. +* Pass a ``None`` to return a dictionary of all available sheets. .. code-block:: python @@ -3030,9 +3030,9 @@ files if `Xlsxwriter`_ is not available. To specify which writer you want to use, you can pass an engine keyword argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: -- ``openpyxl``: version 2.4 or higher is required -- ``xlsxwriter`` -- ``xlwt`` +* ``openpyxl``: version 2.4 or higher is required +* ``xlsxwriter`` +* ``xlwt`` .. code-block:: python @@ -3055,8 +3055,8 @@ Style and Formatting The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method. -- ``float_format`` : Format string for floating point numbers (default ``None``). -- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). +* ``float_format`` : Format string for floating point numbers (default ``None``). +* ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). @@ -3654,10 +3654,10 @@ data. A query is specified using the ``Term`` class under the hood, as a boolean expression. -- ``index`` and ``columns`` are supported indexers of a ``DataFrames``. -- ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of +* ``index`` and ``columns`` are supported indexers of a ``DataFrames``. +* ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of the Panel. -- if ``data_columns`` are specified, these can be used as additional indexers. +* if ``data_columns`` are specified, these can be used as additional indexers. Valid comparison operators are: @@ -3665,9 +3665,9 @@ Valid comparison operators are: Valid boolean expressions are combined with: -- ``|`` : or -- ``&`` : and -- ``(`` and ``)`` : for grouping +* ``|`` : or +* ``&`` : and +* ``(`` and ``)`` : for grouping These rules are similar to how boolean expressions are used in pandas for indexing. @@ -3680,16 +3680,16 @@ These rules are similar to how boolean expressions are used in pandas for indexi The following are valid expressions: -- ``'index >= date'`` -- ``"columns = ['A', 'D']"`` -- ``"columns in ['A', 'D']"`` -- ``'columns = A'`` -- ``'columns == A'`` -- ``"~(columns = ['A', 'B'])"`` -- ``'index > df.index[3] & string = "bar"'`` -- ``'(index > df.index[3] & index <= df.index[6]) | string = "bar"'`` -- ``"ts >= Timestamp('2012-02-01')"`` -- ``"major_axis>=20130101"`` +* ``'index >= date'`` +* ``"columns = ['A', 'D']"`` +* ``"columns in ['A', 'D']"`` +* ``'columns = A'`` +* ``'columns == A'`` +* ``"~(columns = ['A', 'B'])"`` +* ``'index > df.index[3] & string = "bar"'`` +* ``'(index > df.index[3] & index <= df.index[6]) | string = "bar"'`` +* ``"ts >= Timestamp('2012-02-01')"`` +* ``"major_axis>=20130101"`` The ``indexers`` are on the left-hand side of the sub-expression: @@ -3697,11 +3697,11 @@ The ``indexers`` are on the left-hand side of the sub-expression: The right-hand side of the sub-expression (after a comparison operator) can be: -- functions that will be evaluated, e.g. ``Timestamp('2012-02-01')`` -- strings, e.g. ``"bar"`` -- date-like, e.g. 
``20130101``, or ``"20130101"`` -- lists, e.g. ``"['A', 'B']"`` -- variables that are defined in the local names space, e.g. ``date`` +* functions that will be evaluated, e.g. ``Timestamp('2012-02-01')`` +* strings, e.g. ``"bar"`` +* date-like, e.g. ``20130101``, or ``"20130101"`` +* lists, e.g. ``"['A', 'B']"`` +* variables that are defined in the local names space, e.g. ``date`` .. note:: @@ -4080,15 +4080,15 @@ simple use case. You store panel-type data, with dates in the ``major_axis`` and ids in the ``minor_axis``. The data is then interleaved like this: -- date_1 - - id_1 - - id_2 - - . - - id_n -- date_2 - - id_1 - - . - - id_n +* date_1 + * id_1 + * id_2 + * . + * id_n +* date_2 + * id_1 + * . + * id_n It should be clear that a delete operation on the ``major_axis`` will be fairly quick, as one chunk is removed, then the following data moved. On @@ -4216,12 +4216,12 @@ Caveats need to serialize these operations in a single thread in a single process. You will corrupt your data otherwise. See the (:issue:`2397`) for more information. -- If you use locks to manage write access between multiple processes, you +* If you use locks to manage write access between multiple processes, you may want to use :py:func:`~os.fsync` before releasing write locks. For convenience you can use ``store.flush(fsync=True)`` to do this for you. -- Once a ``table`` is created its items (Panel) / columns (DataFrame) +* Once a ``table`` is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended -- Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) +* Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) are not necessarily equal across timezone versions. So if data is localized to a specific timezone in the HDFStore using one version of a timezone library and that data is updated with another version, the data @@ -4438,21 +4438,21 @@ Now you can import the ``DataFrame`` into R: Performance ''''''''''' -- ``tables`` format come with a writing performance penalty as compared to +* ``tables`` format come with a writing performance penalty as compared to ``fixed`` stores. The benefit is the ability to append/delete and query (potentially very large amounts of data). Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis. -- You can pass ``chunksize=`` to ``append``, specifying the +* You can pass ``chunksize=`` to ``append``, specifying the write chunksize (default is 50000). This will significantly lower your memory usage on writing. -- You can pass ``expectedrows=`` to the first ``append``, +* You can pass ``expectedrows=`` to the first ``append``, to set the TOTAL number of expected rows that ``PyTables`` will expected. This will optimize read/write performance. -- Duplicate rows can be written to tables, but are filtered out in +* Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) -- A ``PerformanceWarning`` will be raised if you are attempting to +* A ``PerformanceWarning`` will be raised if you are attempting to store types that will be pickled by PyTables (rather than stored as endemic types). See `Here `__ @@ -4482,14 +4482,14 @@ dtypes, including extension dtypes such as categorical and datetime with tz. Several caveats. 
-- This is a newer library, and the format, though stable, is not guaranteed to be backward compatible +* This is a newer library, and the format, though stable, is not guaranteed to be backward compatible to the earlier versions. -- The format will NOT write an ``Index``, or ``MultiIndex`` for the +* The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an error if a non-default one is provided. You can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to ignore it. -- Duplicate column names and non-string columns names are not supported -- Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message +* Duplicate column names and non-string columns names are not supported +* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. See the `Full Documentation `__. @@ -4550,10 +4550,10 @@ dtypes, including extension dtypes such as datetime with tz. Several caveats. -- Duplicate column names and non-string columns names are not supported. -- Index level names, if specified, must be strings. -- Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. -- Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message +* Duplicate column names and non-string columns names are not supported. +* Index level names, if specified, must be strings. +* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. +* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 45944ba56d4e7..b2cb388e3cd03 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -81,33 +81,33 @@ some configurable handling of "what to do with the other axes": keys=None, levels=None, names=None, verify_integrity=False, copy=True) -- ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a +* ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a dict is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. -- ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. -- ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on +* ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. +* ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on other axis(es). Outer for union and inner for intersection. -- ``ignore_index`` : boolean, default False. If True, do not use the index +* ``ignore_index`` : boolean, default False. If True, do not use the index values on the concatenation axis. The resulting axis will be labeled 0, ..., n - 1. This is useful if you are concatenating objects where the concatenation axis does not have meaningful indexing information. Note the index values on the other axes are still respected in the join. -- ``join_axes`` : list of Index objects. Specific indexes to use for the other +* ``join_axes`` : list of Index objects. 
Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic. -- ``keys`` : sequence, default None. Construct hierarchical index using the +* ``keys`` : sequence, default None. Construct hierarchical index using the passed keys as the outermost level. If multiple levels passed, should contain tuples. -- ``levels`` : list of sequences, default None. Specific levels (unique values) +* ``levels`` : list of sequences, default None. Specific levels (unique values) to use for constructing a MultiIndex. Otherwise they will be inferred from the keys. -- ``names`` : list, default None. Names for the levels in the resulting +* ``names`` : list, default None. Names for the levels in the resulting hierarchical index. -- ``verify_integrity`` : boolean, default False. Check whether the new +* ``verify_integrity`` : boolean, default False. Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. -- ``copy`` : boolean, default True. If False, do not copy data unnecessarily. +* ``copy`` : boolean, default True. If False, do not copy data unnecessarily. Without a little bit of context many of these arguments don't make much sense. Let's revisit the above example. Suppose we wanted to associate specific keys @@ -156,10 +156,10 @@ When gluing together multiple DataFrames, you have a choice of how to handle the other axes (other than the one being concatenated). This can be done in the following three ways: -- Take the union of them all, ``join='outer'``. This is the default +* Take the union of them all, ``join='outer'``. This is the default option as it results in zero information loss. -- Take the intersection, ``join='inner'``. -- Use a specific index, as passed to the ``join_axes`` argument. +* Take the intersection, ``join='inner'``. +* Use a specific index, as passed to the ``join_axes`` argument. Here is an example of each of these methods. First, the default ``join='outer'`` behavior: @@ -531,52 +531,52 @@ all standard database join operations between ``DataFrame`` objects: suffixes=('_x', '_y'), copy=True, indicator=False, validate=None) -- ``left``: A DataFrame object. -- ``right``: Another DataFrame object. -- ``on``: Column or index level names to join on. Must be found in both the left +* ``left``: A DataFrame object. +* ``right``: Another DataFrame object. +* ``on``: Column or index level names to join on. Must be found in both the left and right DataFrame objects. If not passed and ``left_index`` and ``right_index`` are ``False``, the intersection of the columns in the DataFrames will be inferred to be the join keys. -- ``left_on``: Columns or index levels from the left DataFrame to use as +* ``left_on``: Columns or index levels from the left DataFrame to use as keys. Can either be column names, index level names, or arrays with length equal to the length of the DataFrame. -- ``right_on``: Columns or index levels from the right DataFrame to use as +* ``right_on``: Columns or index levels from the right DataFrame to use as keys. Can either be column names, index level names, or arrays with length equal to the length of the DataFrame. -- ``left_index``: If ``True``, use the index (row labels) from the left +* ``left_index``: If ``True``, use the index (row labels) from the left DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys from the right DataFrame. 
-- ``right_index``: Same usage as ``left_index`` for the right DataFrame -- ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults +* ``right_index``: Same usage as ``left_index`` for the right DataFrame +* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults to ``inner``. See below for more detailed description of each method. -- ``sort``: Sort the result DataFrame by the join keys in lexicographical +* ``sort``: Sort the result DataFrame by the join keys in lexicographical order. Defaults to ``True``, setting to ``False`` will improve performance substantially in many cases. -- ``suffixes``: A tuple of string suffixes to apply to overlapping +* ``suffixes``: A tuple of string suffixes to apply to overlapping columns. Defaults to ``('_x', '_y')``. -- ``copy``: Always copy data (default ``True``) from the passed DataFrame +* ``copy``: Always copy data (default ``True``) from the passed DataFrame objects, even when reindexing is not necessary. Cannot be avoided in many cases but may improve performance / memory usage. The cases where copying can be avoided are somewhat pathological but this option is provided nonetheless. -- ``indicator``: Add a column to the output DataFrame called ``_merge`` +* ``indicator``: Add a column to the output DataFrame called ``_merge`` with information on the source of each row. ``_merge`` is Categorical-type and takes on a value of ``left_only`` for observations whose merge key only appears in ``'left'`` DataFrame, ``right_only`` for observations whose merge key only appears in ``'right'`` DataFrame, and ``both`` if the observation's merge key is found in both. -- ``validate`` : string, default None. +* ``validate`` : string, default None. If specified, checks if merge is of specified type. - * "one_to_one" or "1:1": checks if merge keys are unique in both - left and right datasets. - * "one_to_many" or "1:m": checks if merge keys are unique in left - dataset. - * "many_to_one" or "m:1": checks if merge keys are unique in right - dataset. - * "many_to_many" or "m:m": allowed, but does not result in checks. + * "one_to_one" or "1:1": checks if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": checks if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": checks if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. .. versionadded:: 0.21.0 @@ -605,11 +605,11 @@ terminology used to describe join operations between two SQL-table like structures (``DataFrame`` objects). There are several cases to consider which are very important to understand: -- **one-to-one** joins: for example when joining two ``DataFrame`` objects on +* **one-to-one** joins: for example when joining two ``DataFrame`` objects on their indexes (which must contain unique values). -- **many-to-one** joins: for example when joining an index (unique) to one or +* **many-to-one** joins: for example when joining an index (unique) to one or more columns in a different ``DataFrame``. -- **many-to-many** joins: joining columns on columns. +* **many-to-many** joins: joining columns on columns. .. 
note:: diff --git a/doc/source/options.rst b/doc/source/options.rst index 697cc0682e39a..cbe0264f442bc 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -31,10 +31,10 @@ You can get/set options directly as attributes of the top-level ``options`` attr The API is composed of 5 relevant functions, available directly from the ``pandas`` namespace: -- :func:`~pandas.get_option` / :func:`~pandas.set_option` - get/set the value of a single option. -- :func:`~pandas.reset_option` - reset one or more options to their default value. -- :func:`~pandas.describe_option` - print the descriptions of one or more options. -- :func:`~pandas.option_context` - execute a codeblock with a set of options +* :func:`~pandas.get_option` / :func:`~pandas.set_option` - get/set the value of a single option. +* :func:`~pandas.reset_option` - reset one or more options to their default value. +* :func:`~pandas.describe_option` - print the descriptions of one or more options. +* :func:`~pandas.option_context` - execute a codeblock with a set of options that revert to prior settings after execution. **Note:** Developers can check out `pandas/core/config.py `_ for more information. diff --git a/doc/source/overview.rst b/doc/source/overview.rst index f86b1c67e6843..6ba9501ba0b5e 100644 --- a/doc/source/overview.rst +++ b/doc/source/overview.rst @@ -12,19 +12,19 @@ programming language. :mod:`pandas` consists of the following elements: - * A set of labeled array data structures, the primary of which are - Series and DataFrame. - * Index objects enabling both simple axis indexing and multi-level / - hierarchical axis indexing. - * An integrated group by engine for aggregating and transforming data sets. - * Date range generation (date_range) and custom date offsets enabling the - implementation of customized frequencies. - * Input/Output tools: loading tabular data from flat files (CSV, delimited, - Excel 2003), and saving and loading pandas objects from the fast and - efficient PyTables/HDF5 format. - * Memory-efficient "sparse" versions of the standard data structures for storing - data that is mostly missing or mostly constant (some fixed value). - * Moving window statistics (rolling mean, rolling standard deviation, etc.). +* A set of labeled array data structures, the primary of which are + Series and DataFrame. +* Index objects enabling both simple axis indexing and multi-level / + hierarchical axis indexing. +* An integrated group by engine for aggregating and transforming data sets. +* Date range generation (date_range) and custom date offsets enabling the + implementation of customized frequencies. +* Input/Output tools: loading tabular data from flat files (CSV, delimited, + Excel 2003), and saving and loading pandas objects from the fast and + efficient PyTables/HDF5 format. +* Memory-efficient "sparse" versions of the standard data structures for storing + data that is mostly missing or mostly constant (some fixed value). +* Moving window statistics (rolling mean, rolling standard deviation, etc.). Data Structures --------------- diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 250a1808e496e..88b7114cf4101 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -106,12 +106,12 @@ Closely related to the :meth:`~DataFrame.pivot` method are the related ``MultiIndex`` objects (see the section on :ref:`hierarchical indexing `). 
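Before the detailed description of ``stack`` and ``unstack`` that follows, a minimal sketch of the round trip (the frame is invented purely for illustration):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["x", "y"])

    stacked = df.stack()   # column labels pivot into a new inner index level
    stacked.unstack()      # pivots that level back out, recovering df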
Here are essentially what these methods do: - - ``stack``: "pivot" a level of the (possibly hierarchical) column labels, - returning a ``DataFrame`` with an index with a new inner-most level of row - labels. - - ``unstack``: (inverse operation of ``stack``) "pivot" a level of the - (possibly hierarchical) row index to the column axis, producing a reshaped - ``DataFrame`` with a new inner-most level of column labels. +* ``stack``: "pivot" a level of the (possibly hierarchical) column labels, + returning a ``DataFrame`` with an index with a new inner-most level of row + labels. +* ``unstack``: (inverse operation of ``stack``) "pivot" a level of the + (possibly hierarchical) row index to the column axis, producing a reshaped + ``DataFrame`` with a new inner-most level of column labels. .. image:: _static/reshaping_unstack.png @@ -132,8 +132,8 @@ from the hierarchical indexing section: The ``stack`` function "compresses" a level in the ``DataFrame``'s columns to produce either: - - A ``Series``, in the case of a simple column Index. - - A ``DataFrame``, in the case of a ``MultiIndex`` in the columns. +* A ``Series``, in the case of a simple column Index. +* A ``DataFrame``, in the case of a ``MultiIndex`` in the columns. If the columns have a ``MultiIndex``, you can choose which level to stack. The stacked level becomes the new lowest level in a ``MultiIndex`` on the columns: @@ -351,13 +351,13 @@ strategies. It takes a number of arguments: -- ``data``: a DataFrame object. -- ``values``: a column or a list of columns to aggregate. -- ``index``: a column, Grouper, array which has the same length as data, or list of them. +* ``data``: a DataFrame object. +* ``values``: a column or a list of columns to aggregate. +* ``index``: a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. -- ``columns``: a column, Grouper, array which has the same length as data, or list of them. +* ``columns``: a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. -- ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean``. +* ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean``. Consider a data set like this: @@ -431,17 +431,17 @@ unless an array of values and an aggregation function are passed. It takes a number of arguments -- ``index``: array-like, values to group by in the rows. -- ``columns``: array-like, values to group by in the columns. -- ``values``: array-like, optional, array of values to aggregate according to +* ``index``: array-like, values to group by in the rows. +* ``columns``: array-like, values to group by in the columns. +* ``values``: array-like, optional, array of values to aggregate according to the factors. -- ``aggfunc``: function, optional, If no values array is passed, computes a +* ``aggfunc``: function, optional, If no values array is passed, computes a frequency table. -- ``rownames``: sequence, default ``None``, must match number of row arrays passed. -- ``colnames``: sequence, default ``None``, if passed, must match number of column +* ``rownames``: sequence, default ``None``, must match number of row arrays passed. +* ``colnames``: sequence, default ``None``, if passed, must match number of column arrays passed. 
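As a rough sketch of how the ``crosstab`` arguments listed here combine (the arrays are invented for the example; ``margins`` and ``normalize`` are described next):

.. code-block:: python

    import pandas as pd

    a = ["foo", "foo", "bar", "bar"]
    b = ["one", "two", "one", "two"]

    # with no values/aggfunc passed, crosstab builds a frequency table
    pd.crosstab(index=a, columns=b, rownames=["a"], colnames=["b"])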
-- ``margins``: boolean, default ``False``, Add row/column margins (subtotals) -- ``normalize``: boolean, {'all', 'index', 'columns'}, or {0,1}, default ``False``. +* ``margins``: boolean, default ``False``, Add row/column margins (subtotals) +* ``normalize``: boolean, {'all', 'index', 'columns'}, or {0,1}, default ``False``. Normalize by dividing all values by the sum of values. @@ -615,10 +615,10 @@ As with the ``Series`` version, you can pass values for the ``prefix`` and ``prefix_sep``. By default the column name is used as the prefix, and '_' as the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: -- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column +* string: Use the same value for ``prefix`` or ``prefix_sep`` for each column to be encoded. -- list: Must be the same length as the number of columns being encoded. -- dict: Mapping column name to prefix. +* list: Must be the same length as the number of columns being encoded. +* dict: Mapping column name to prefix. .. ipython:: python diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 260d8aa32ef52..2bb99dd1822b6 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -104,9 +104,9 @@ Sparse data should have the same dtype as its dense representation. Currently, ``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original dtype, ``fill_value`` default changes: -- ``float64``: ``np.nan`` -- ``int64``: ``0`` -- ``bool``: ``False`` +* ``float64``: ``np.nan`` +* ``int64``: ``0`` +* ``bool``: ``False`` .. ipython:: python diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index ded54d2d355f1..ba58d65b00714 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -28,11 +28,11 @@ a tremendous amount of new functionality for manipulating time series data. In working with time series data, we will frequently seek to: - - generate sequences of fixed-frequency dates and time spans - - conform or convert time series to a particular frequency - - compute "relative" dates based on various non-standard time increments - (e.g. 5 business days before the last business day of the year), or "roll" - dates forward or backward +* generate sequences of fixed-frequency dates and time spans +* conform or convert time series to a particular frequency +* compute "relative" dates based on various non-standard time increments + (e.g. 5 business days before the last business day of the year), or "roll" + dates forward or backward pandas provides a relatively compact and self-contained set of tools for performing the above tasks. @@ -226,8 +226,8 @@ You can pass only the columns that you need to assemble. ``pd.to_datetime`` looks for standard designations of the datetime component in the column names, including: -- required: ``year``, ``month``, ``day`` -- optional: ``hour``, ``minute``, ``second``, ``millisecond``, ``microsecond``, ``nanosecond`` +* required: ``year``, ``month``, ``day`` +* optional: ``hour``, ``minute``, ``second``, ``millisecond``, ``microsecond``, ``nanosecond`` Invalid Data ~~~~~~~~~~~~ @@ -463,14 +463,14 @@ Indexing One of the main uses for ``DatetimeIndex`` is as an index for pandas objects. The ``DatetimeIndex`` class contains many time series related optimizations: - - A large range of dates for various offsets are pre-computed and cached - under the hood in order to make generating subsequent date ranges very fast - (just have to grab a slice). 
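For instance, assembling datetimes from the component columns mentioned above might look like the following (a hypothetical frame, shown only as a sketch):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"year": [2018, 2018],
                       "month": [1, 2],
                       "day": [4, 5],
                       "hour": [10, 11]})

    # the required year/month/day columns plus an optional hour component
    pd.to_datetime(df)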
- - Fast shifting using the ``shift`` and ``tshift`` method on pandas objects. - - Unioning of overlapping ``DatetimeIndex`` objects with the same frequency is - very fast (important for fast data alignment). - - Quick access to date fields via properties such as ``year``, ``month``, etc. - - Regularization functions like ``snap`` and very fast ``asof`` logic. +* A large range of dates for various offsets are pre-computed and cached + under the hood in order to make generating subsequent date ranges very fast + (just have to grab a slice). +* Fast shifting using the ``shift`` and ``tshift`` method on pandas objects. +* Unioning of overlapping ``DatetimeIndex`` objects with the same frequency is + very fast (important for fast data alignment). +* Quick access to date fields via properties such as ``year``, ``month``, etc. +* Regularization functions like ``snap`` and very fast ``asof`` logic. ``DatetimeIndex`` objects have all the basic functionality of regular ``Index`` objects, and a smorgasbord of advanced time series specific methods for easy @@ -797,11 +797,11 @@ We could have done the same thing with ``DateOffset``: The key features of a ``DateOffset`` object are: -- It can be added / subtracted to/from a datetime object to obtain a +* It can be added / subtracted to/from a datetime object to obtain a shifted date. -- It can be multiplied by an integer (positive or negative) so that the +* It can be multiplied by an integer (positive or negative) so that the increment will be applied multiple times. -- It has :meth:`~pandas.DateOffset.rollforward` and +* It has :meth:`~pandas.DateOffset.rollforward` and :meth:`~pandas.DateOffset.rollback` methods for moving a date forward or backward to the next or previous "offset date". @@ -2064,9 +2064,9 @@ To supply the time zone, you can use the ``tz`` keyword to ``date_range`` and other functions. Dateutil time zone strings are distinguished from ``pytz`` time zones by starting with ``dateutil/``. -- In ``pytz`` you can find a list of common (and less common) time zones using +* In ``pytz`` you can find a list of common (and less common) time zones using ``from pytz import common_timezones, all_timezones``. -- ``dateutil`` uses the OS timezones so there isn't a fixed list available. For +* ``dateutil`` uses the OS timezones so there isn't a fixed list available. For common zones, the names are the same as ``pytz``. .. ipython:: python diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 895fe595de205..381031fa128e6 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -28,33 +28,33 @@ repository `_. To run the examples in th clone the GitHub repository and get IPython Notebook running. See `How to use this cookbook `_. -- `A quick tour of the IPython Notebook: `_ +* `A quick tour of the IPython Notebook: `_ Shows off IPython's awesome tab completion and magic functions. -- `Chapter 1: `_ +* `Chapter 1: `_ Reading your data into pandas is pretty much the easiest thing. Even when the encoding is wrong! -- `Chapter 2: `_ +* `Chapter 2: `_ It's not totally obvious how to select data from a pandas dataframe. Here we explain the basics (how to take slices and get columns) -- `Chapter 3: `_ +* `Chapter 3: `_ Here we get into serious slicing and dicing and learn how to filter dataframes in complicated ways, really fast. -- `Chapter 4: `_ +* `Chapter 4: `_ Groupby/aggregate is seriously my favorite thing about pandas and I use it all the time. You should probably read this. 
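To ground the ``DateOffset`` behaviour described earlier (addition, integer multiplication, and the roll methods), a brief sketch using an arbitrary date:

.. code-block:: python

    import pandas as pd
    from pandas.tseries.offsets import BMonthEnd

    ts = pd.Timestamp("2018-01-05")

    ts + 2 * pd.DateOffset(months=1)   # shift forward by two months
    BMonthEnd().rollforward(ts)        # roll to the next business month end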
-- `Chapter 5: `_ +* `Chapter 5: `_ Here you get to find out if it's cold in Montreal in the winter (spoiler: yes). Web scraping with pandas is fun! Here we combine dataframes. -- `Chapter 6: `_ +* `Chapter 6: `_ Strings with pandas are great. It has all these vectorized string operations and they're the best. We will turn a bunch of strings containing "Snow" into vectors of numbers in a trice. -- `Chapter 7: `_ +* `Chapter 7: `_ Cleaning up messy data is never a joy, but with pandas it's easier. -- `Chapter 8: `_ +* `Chapter 8: `_ Parsing Unix timestamps is confusing at first but it turns out to be really easy. -- `Chapter 9: `_ +* `Chapter 9: `_ Reading data from SQL databases. @@ -63,54 +63,54 @@ Lessons for new pandas users For more resources, please visit the main `repository `__. -- `01 - Lesson: `_ - - Importing libraries - - Creating data sets - - Creating data frames - - Reading from CSV - - Exporting to CSV - - Finding maximums - - Plotting data +* `01 - Lesson: `_ + * Importing libraries + * Creating data sets + * Creating data frames + * Reading from CSV + * Exporting to CSV + * Finding maximums + * Plotting data -- `02 - Lesson: `_ - - Reading from TXT - - Exporting to TXT - - Selecting top/bottom records - - Descriptive statistics - - Grouping/sorting data +* `02 - Lesson: `_ + * Reading from TXT + * Exporting to TXT + * Selecting top/bottom records + * Descriptive statistics + * Grouping/sorting data -- `03 - Lesson: `_ - - Creating functions - - Reading from EXCEL - - Exporting to EXCEL - - Outliers - - Lambda functions - - Slice and dice data +* `03 - Lesson: `_ + * Creating functions + * Reading from EXCEL + * Exporting to EXCEL + * Outliers + * Lambda functions + * Slice and dice data -- `04 - Lesson: `_ - - Adding/deleting columns - - Index operations +* `04 - Lesson: `_ + * Adding/deleting columns + * Index operations -- `05 - Lesson: `_ - - Stack/Unstack/Transpose functions +* `05 - Lesson: `_ + * Stack/Unstack/Transpose functions -- `06 - Lesson: `_ - - GroupBy function +* `06 - Lesson: `_ + * GroupBy function -- `07 - Lesson: `_ - - Ways to calculate outliers +* `07 - Lesson: `_ + * Ways to calculate outliers -- `08 - Lesson: `_ - - Read from Microsoft SQL databases +* `08 - Lesson: `_ + * Read from Microsoft SQL databases -- `09 - Lesson: `_ - - Export to CSV/EXCEL/TXT +* `09 - Lesson: `_ + * Export to CSV/EXCEL/TXT -- `10 - Lesson: `_ - - Converting between different kinds of formats +* `10 - Lesson: `_ + * Converting between different kinds of formats -- `11 - Lesson: `_ - - Combining data from various sources +* `11 - Lesson: `_ + * Combining data from various sources Practical data analysis with Python @@ -119,13 +119,13 @@ Practical data analysis with Python This `guide `_ is a comprehensive introduction to the data analysis process using the Python data ecosystem and an interesting open dataset. There are four sections covering selected topics as follows: -- `Munging Data `_ +* `Munging Data `_ -- `Aggregating Data `_ +* `Aggregating Data `_ -- `Visualizing Data `_ +* `Visualizing Data `_ -- `Time Series `_ +* `Time Series `_ .. _tutorial-exercises-new-users: @@ -134,25 +134,25 @@ Exercises for new users Practice your skills with real data sets and exercises. For more resources, please visit the main `repository `__. 
-- `01 - Getting & Knowing Your Data `_ +* `01 - Getting & Knowing Your Data `_ -- `02 - Filtering & Sorting `_ +* `02 - Filtering & Sorting `_ -- `03 - Grouping `_ +* `03 - Grouping `_ -- `04 - Apply `_ +* `04 - Apply `_ -- `05 - Merge `_ +* `05 - Merge `_ -- `06 - Stats `_ +* `06 - Stats `_ -- `07 - Visualization `_ +* `07 - Visualization `_ -- `08 - Creating Series and DataFrames `_ +* `08 - Creating Series and DataFrames `_ -- `09 - Time Series `_ +* `09 - Time Series `_ -- `10 - Deleting `_ +* `10 - Deleting `_ .. _tutorial-modern: @@ -164,29 +164,29 @@ Tutorial series written in 2016 by The source may be found in the GitHub repository `TomAugspurger/effective-pandas `_. -- `Modern Pandas `_ -- `Method Chaining `_ -- `Indexes `_ -- `Performance `_ -- `Tidy Data `_ -- `Visualization `_ -- `Timeseries `_ +* `Modern Pandas `_ +* `Method Chaining `_ +* `Indexes `_ +* `Performance `_ +* `Tidy Data `_ +* `Visualization `_ +* `Timeseries `_ Excel charts with pandas, vincent and xlsxwriter ------------------------------------------------ -- `Using Pandas and XlsxWriter to create Excel charts `_ +* `Using Pandas and XlsxWriter to create Excel charts `_ Video Tutorials --------------- -- `Pandas From The Ground Up `_ +* `Pandas From The Ground Up `_ (2015) (2:24) `GitHub repo `__ -- `Introduction Into Pandas `_ +* `Introduction Into Pandas `_ (2016) (1:28) `GitHub repo `__ -- `Pandas: .head() to .tail() `_ +* `Pandas: .head() to .tail() `_ (2016) (1:26) `GitHub repo `__ @@ -194,12 +194,12 @@ Video Tutorials Various Tutorials ----------------- -- `Wes McKinney's (pandas BDFL) blog `_ -- `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ -- `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ -- `Financial analysis in Python, by Thomas Wiecki `_ -- `Intro to pandas data structures, by Greg Reda `_ -- `Pandas and Python: Top 10, by Manish Amde `_ -- `Pandas Tutorial, by Mikhail Semeniuk `_ -- `Pandas DataFrames Tutorial, by Karlijn Willems `_ -- `A concise tutorial with real life examples `_ +* `Wes McKinney's (pandas BDFL) blog `_ +* `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ +* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ +* `Financial analysis in Python, by Thomas Wiecki `_ +* `Intro to pandas data structures, by Greg Reda `_ +* `Pandas and Python: Top 10, by Manish Amde `_ +* `Pandas Tutorial, by Mikhail Semeniuk `_ +* `Pandas DataFrames Tutorial, by Karlijn Willems `_ +* `A concise tutorial with real life examples `_ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 17197b805e86a..569a6fb7b7a0d 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -1381,9 +1381,9 @@ Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Serie Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats: -- As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series`. -- As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values. -- As raw values (``list``, ``tuple``, or ``np.ndarray``). 
Must be the same length as the plotting :class:`DataFrame`/:class:`Series`. +* As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series`. +* As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values. +* As raw values (``list``, ``tuple``, or ``np.ndarray``). Must be the same length as the plotting :class:`DataFrame`/:class:`Series`. Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``M`` length :class:`Series`, a ``Mx2`` array should be provided indicating lower and upper (or left and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. From ca9ce8d70465c02ee1593f95971d1bf433b058b3 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 20 Jun 2018 11:29:49 +0100 Subject: [PATCH 059/113] PERF: add method Categorical.__contains__ (#21508) --- asv_bench/benchmarks/categoricals.py | 10 ++-- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/core/arrays/categorical.py | 59 ++++++++++++++++++++++ pandas/core/indexes/category.py | 29 ++--------- pandas/tests/categorical/test_operators.py | 17 +++++++ 5 files changed, 88 insertions(+), 29 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 48f42621d183d..73e3933122628 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -202,7 +202,11 @@ class Contains(object): def setup(self): N = 10**5 self.ci = tm.makeCategoricalIndex(N) - self.cat = self.ci.categories[0] + self.c = self.ci.values + self.key = self.ci.categories[0] - def time_contains(self): - self.cat in self.ci + def time_categorical_index_contains(self): + self.key in self.ci + + def time_categorical_contains(self): + self.key in self.c diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 0f2c9c4756987..5454dc9eca360 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -26,7 +26,7 @@ Performance Improvements - Improved performance of membership checks in :class:`CategoricalIndex` (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`) + is likewise much faster (:issue:`21369`, :issue:`21508`) - Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e22b0d626a218..7b3cce0f2585d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -157,6 +157,57 @@ def _maybe_to_categorical(array): return array +def contains(cat, key, container): + """ + Helper for membership check for ``key`` in ``cat``. + + This is a helper method for :method:`__contains__` + and :class:`CategoricalIndex.__contains__`. + + Returns True if ``key`` is in ``cat.categories`` and the + location of ``key`` in ``categories`` is in ``container``. + + Parameters + ---------- + cat : :class:`Categorical`or :class:`categoricalIndex` + key : a hashable object + The key to check membership for. + container : Container (e.g. list-like or mapping) + The container to check for membership in. + + Returns + ------- + is_in : bool + True if ``key`` is in ``self.categories`` and location of + ``key`` in ``categories`` is in ``container``, else False. 
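A rough sketch of the membership semantics this helper encodes (actual values are what count, not merely declared categories; the example data is invented):

.. code-block:: python

    import numpy as np
    import pandas as pd

    c = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])

    'a' in c       # True: 'a' occurs as a value
    'b' in c       # False: 'b' is only a declared category
    np.nan in c    # False: no missing values are present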
+ + Notes + ----- + This method does not check for NaN values. Do that separately + before calling this method. + """ + hash(key) + + # get location of key in categories. + # If a KeyError, the key isn't in categories, so logically + # can't be in container either. + try: + loc = cat.categories.get_loc(key) + except KeyError: + return False + + # loc is the location of key in categories, but also the *value* + # for key in container. So, `key` may be in categories, + # but still not in `container`. Example ('b' in categories, + # but not in values): + # 'b' in Categorical(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in container + else: + # if categories is an IntervalIndex, loc is an array. + return any(loc_ in container for loc_ in loc) + + _codes_doc = """The category codes of this categorical. Level codes are an array if integer which are the positions of the real @@ -1846,6 +1897,14 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(self.get_values().tolist()) + def __contains__(self, key): + """Returns True if `key` is in this Categorical.""" + # if key is a NaN, check if any NaN is in self. + if isna(key): + return self.isna().any() + + return contains(self, key, container=self._codes) + def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default footer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0093d4940751e..fc669074758da 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -24,6 +24,7 @@ import pandas.core.common as com import pandas.core.missing as missing import pandas.core.indexes.base as ibase +from pandas.core.arrays.categorical import Categorical, contains _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -125,7 +126,6 @@ def _create_from_codes(self, codes, categories=None, ordered=None, CategoricalIndex """ - from pandas.core.arrays import Categorical if categories is None: categories = self.categories if ordered is None: @@ -162,7 +162,6 @@ def _create_categorical(self, data, categories=None, ordered=None, if not isinstance(data, ABCCategorical): if ordered is None and dtype is None: ordered = False - from pandas.core.arrays import Categorical data = Categorical(data, categories=categories, ordered=ordered, dtype=dtype) else: @@ -323,32 +322,14 @@ def _reverse_indexer(self): @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): - hash(key) - - if isna(key): # if key is a NaN, check if any NaN is in self. + # if key is a NaN, check if any NaN is in self. + if isna(key): return self.hasnans - # is key in self.categories? Then get its location. - # If not (i.e. KeyError), it logically can't be in self either - try: - loc = self.categories.get_loc(key) - except KeyError: - return False - - # loc is the location of key in self.categories, but also the value - # for key in self.codes and in self._engine. key may be in categories, - # but still not in self, check this. 
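Missing values take a separate path from the codes-based check; roughly, and with invented data:

.. code-block:: python

    import numpy as np
    import pandas as pd

    np.nan in pd.Categorical(['a'], categories=['a', 'b'])          # False: no NaN present
    np.nan in pd.Categorical(['a', np.nan], categories=['a', 'b'])  # True

    ci = pd.CategoricalIndex(['a', np.nan], categories=['a', 'b'])
    np.nan in ci                                                    # True, via ci.hasnans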
Example: - # 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False - if is_scalar(loc): - return loc in self._engine - else: - # if self.categories is IntervalIndex, loc is an array - # check if any scalar of the array is in self._engine - return any(loc_ in self._engine for loc_ in loc) + return contains(self, key, container=self._engine) @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): - hash(key) return key in self def __array__(self, dtype=None): @@ -479,7 +460,6 @@ def where(self, cond, other=None): other = self._na_value values = np.where(cond, self.values, other) - from pandas.core.arrays import Categorical cat = Categorical(values, categories=self.categories, ordered=self.ordered) @@ -862,7 +842,6 @@ def _delegate_method(self, name, *args, **kwargs): def _add_accessors(cls): """ add in Categorical accessor methods """ - from pandas.core.arrays import Categorical CategoricalIndex._add_delegate_accessors( delegate=Categorical, accessors=["rename_categories", "reorder_categories", diff --git a/pandas/tests/categorical/test_operators.py b/pandas/tests/categorical/test_operators.py index fa8bb817616e4..a26de32d7446c 100644 --- a/pandas/tests/categorical/test_operators.py +++ b/pandas/tests/categorical/test_operators.py @@ -291,3 +291,20 @@ def test_numeric_like_ops(self): # invalid ufunc pytest.raises(TypeError, lambda: np.log(s)) + + def test_contains(self): + # GH21508 + c = pd.Categorical(list('aabbca'), categories=list('cab')) + + assert 'b' in c + assert 'z' not in c + assert np.nan not in c + with pytest.raises(TypeError): + assert [1] in c + + # assert codes NOT in index + assert 0 not in c + assert 1 not in c + + c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) + assert np.nan in c From 6289c76085bab7f2d713b7c6acc21dd329f6dfbe Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Wed, 20 Jun 2018 16:03:07 +0530 Subject: [PATCH 060/113] REGR: Fixes first_valid_index when DataFrame or Series has duplicate row index (GH21441) (#21497) --- doc/source/whatsnew/v0.23.2.txt | 3 ++- pandas/core/generic.py | 23 +++++++++++------------ pandas/tests/frame/test_timeseries.py | 15 ++++++++++++++- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 5454dc9eca360..5b3e607956f7a 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -17,7 +17,8 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) -- +- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- .. 
_whatsnew_0232.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 555108a5d9349..1780e359164e2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8968,18 +8968,17 @@ def _find_valid_index(self, how): is_valid = is_valid.any(1) # reduce axis 1 if how == 'first': - # First valid value case - i = is_valid.idxmax() - if not is_valid[i]: - return None - return i - - elif how == 'last': - # Last valid value case - i = is_valid.values[::-1].argmax() - if not is_valid.iat[len(self) - i - 1]: - return None - return self.index[len(self) - i - 1] + idxpos = is_valid.values[::].argmax() + + if how == 'last': + idxpos = len(self) - 1 - is_valid.values[::-1].argmax() + + chk_notna = is_valid.iat[idxpos] + idx = self.index[idxpos] + + if not chk_notna: + return None + return idx @Appender(_shared_docs['valid_index'] % {'position': 'first', 'klass': 'NDFrame'}) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 90fbc6e628369..fb9bd74d9876d 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -506,7 +506,15 @@ def test_asfreq_fillvalue(self): actual_series = ts.asfreq(freq='1S', fill_value=9.0) assert_series_equal(expected_series, actual_series) - def test_first_last_valid(self): + @pytest.mark.parametrize("data,idx,expected_first,expected_last", [ + ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2), + ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2), + ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'), + ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)]) + def test_first_last_valid(self, data, idx, + expected_first, expected_last): N = len(self.frame.index) mat = randn(N) mat[:5] = nan @@ -539,6 +547,11 @@ def test_first_last_valid(self): assert frame.first_valid_index().freq == frame.index.freq assert frame.last_valid_index().freq == frame.index.freq + # GH 21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + def test_first_subset(self): ts = tm.makeTimeDataFrame(freq='12h') result = ts.first('10d') From b19219df541a9b6a3cbcf4608751218f79ee0e89 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 20 Jun 2018 03:35:09 -0700 Subject: [PATCH 061/113] API/BUG: Raise when int-dtype coercions fail (#21456) Closes gh-15832. --- doc/source/whatsnew/v0.24.0.txt | 28 ++++++++- pandas/core/dtypes/cast.py | 72 ++++++++++++++++++++++++ pandas/core/indexes/base.py | 17 +++--- pandas/core/series.py | 8 ++- pandas/tests/generic/test_generic.py | 10 ++-- pandas/tests/indexes/test_base.py | 9 ++- pandas/tests/indexes/test_numeric.py | 20 +++++++ pandas/tests/io/test_pytables.py | 2 +- pandas/tests/series/test_constructors.py | 26 +++++++-- 9 files changed, 170 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c23ed006ff637..15c5cc97b8426 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -26,7 +26,7 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_0240.api.datetimelike.normalize +.. _whatsnew_0240.api.datetimelike.normalize: Tick DateOffset Normalize Restrictions -------------------------------------- @@ -73,6 +73,32 @@ Datetimelike API Changes Other API Changes ^^^^^^^^^^^^^^^^^ +.. 
_whatsnew_0240.api.other.incompatibilities: + +Series and Index Data-Dtype Incompatibilities +--------------------------------------------- + +``Series`` and ``Index`` constructors now raise when the +data is incompatible with a passed ``dtype=`` (:issue:`15832`) + +Previous Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + 0 18446744073709551615 + dtype: uint64 + +Current Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + ... + OverflowError: Trying to coerce negative values to unsigned integers + - :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) - - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ebc7a13234a98..65328dfc7347e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,6 +20,7 @@ is_dtype_equal, is_float_dtype, is_complex_dtype, is_integer_dtype, + is_unsigned_integer_dtype, is_datetime_or_timedelta_dtype, is_bool_dtype, is_scalar, is_string_dtype, _string_dtypes, @@ -1269,3 +1270,74 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): subarr = subarr2 return subarr + + +def maybe_cast_to_integer_array(arr, dtype, copy=False): + """ + Takes any dtype and returns the casted version, raising for when data is + incompatible with integer/unsigned integer dtypes. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + arr : array-like + The array to cast. + dtype : str, np.dtype + The integer dtype to cast the array to. + copy: boolean, default False + Whether to make a copy of the array before returning. + + Returns + ------- + int_arr : ndarray + An array of integer or unsigned integer dtype + + Raises + ------ + OverflowError : the dtype is incompatible with the data + ValueError : loss of precision has occurred during casting + + Examples + -------- + If you try to coerce negative values to unsigned integers, it raises: + + >>> Series([-1], dtype="uint64") + Traceback (most recent call last): + ... + OverflowError: Trying to coerce negative values to unsigned integers + + Also, if you try to coerce float values to integers, it raises: + + >>> Series([1, 2, 3.5], dtype="int64") + Traceback (most recent call last): + ... + ValueError: Trying to coerce float values to integers + """ + + try: + if not hasattr(arr, "astype"): + casted = np.array(arr, dtype=dtype, copy=copy) + else: + casted = arr.astype(dtype, copy=copy) + except OverflowError: + raise OverflowError("The elements provided in the data cannot all be " + "casted to the dtype {dtype}".format(dtype=dtype)) + + if np.array_equal(arr, casted): + return casted + + # We do this casting to allow for proper + # data and dtype checking. + # + # We didn't do this earlier because NumPy + # doesn't handle `uint64` correctly. 
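A sketch of the stricter ``Series`` constructor behaviour this change introduces (exception messages paraphrased in the comments):

.. code-block:: python

    import pandas as pd

    pd.Series([1, 2, 3], dtype="uint64")       # unchanged: the values fit the dtype

    try:
        pd.Series([-1], dtype="uint64")        # negative values cannot be coerced
    except OverflowError as err:
        print(err)

    try:
        pd.Series([1, 2, 3.5], dtype="int64")  # silent truncation is no longer allowed
    except ValueError as err:
        print(err)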
+ arr = np.asarray(arr) + + if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): + raise OverflowError("Trying to coerce negative values " + "to unsigned integers") + + if is_integer_dtype(dtype) and (is_float_dtype(arr) or + is_object_dtype(arr)): + raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 577b715ca9998..ac33ffad762cd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,6 +21,7 @@ ABCPeriodIndex, ABCTimedeltaIndex, ABCDateOffset) from pandas.core.dtypes.missing import isna, array_equivalent +from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( _ensure_int64, _ensure_object, @@ -311,19 +312,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if is_integer_dtype(dtype): inferred = lib.infer_dtype(data) if inferred == 'integer': - try: - data = np.array(data, copy=copy, dtype=dtype) - except OverflowError: - # gh-15823: a more user-friendly error message - raise OverflowError( - "the elements provided in the data cannot " - "all be casted to the dtype {dtype}" - .format(dtype=dtype)) + data = maybe_cast_to_integer_array(data, dtype, + copy=copy) elif inferred in ['floating', 'mixed-integer-float']: if isna(data).any(): raise ValueError('cannot convert float ' 'NaN to integer') + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + # If we are actually all equal to integers, # then coerce to integer. try: @@ -352,7 +350,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, except (TypeError, ValueError) as e: msg = str(e) - if 'cannot convert float' in msg: + if ("cannot convert float" in msg or + "Trying to coerce float values to integer" in msg): raise # maybe coerce to a sub-class diff --git a/pandas/core/series.py b/pandas/core/series.py index 23c4bbe082f28..2f762dff4aeab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -41,7 +41,8 @@ maybe_cast_to_datetime, maybe_castable, construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - construct_1d_object_array_from_listlike) + construct_1d_object_array_from_listlike, + maybe_cast_to_integer_array) from pandas.core.dtypes.missing import ( isna, notna, @@ -4068,6 +4069,11 @@ def _try_cast(arr, take_fast_path): return arr try: + # gh-15832: Check if we are requesting a numeric dype and + # that we can convert the data to the requested dtype. + if is_float_dtype(dtype) or is_integer_dtype(dtype): + subarr = maybe_cast_to_integer_array(arr, dtype) + subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 311c71f734945..533bff0384ad9 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -199,11 +199,11 @@ def test_downcast(self): self._compare(result, expected) def test_constructor_compound_dtypes(self): - # GH 5191 - # compound dtypes should raise not-implementederror + # see gh-5191 + # Compound dtypes should raise NotImplementedError. 
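The ``Index`` constructor gains the same guards; roughly, with invented data:

.. code-block:: python

    import pandas as pd

    pd.Index([1, 2, 3], dtype="int64")         # fine: the data is already integral

    try:
        pd.Index([1, 2, 3.5], dtype="int64")   # lossy float-to-int coercion now raises
    except ValueError as err:
        print(err)

    try:
        pd.Index([-1], dtype="uint64")         # negative values cannot become unsigned
    except OverflowError as err:
        print(err)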
def f(dtype): - return self._construct(shape=3, dtype=dtype) + return self._construct(shape=3, value=1, dtype=dtype) pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"), ("B", "str"), @@ -534,14 +534,14 @@ def test_truncate_out_of_bounds(self): # small shape = [int(2e3)] + ([1] * (self._ndim - 1)) - small = self._construct(shape, dtype='int8') + small = self._construct(shape, dtype='int8', value=1) self._compare(small.truncate(), small) self._compare(small.truncate(before=0, after=3e3), small) self._compare(small.truncate(before=-1, after=2e3), small) # big shape = [int(2e6)] + ([1] * (self._ndim - 1)) - big = self._construct(shape, dtype='int8') + big = self._construct(shape, dtype='int8', value=1) self._compare(big.truncate(), big) self._compare(big.truncate(before=0, after=3e6), big) self._compare(big.truncate(before=-1, after=2e6), big) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1d8a958c3413f..daba56e0c1e29 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -486,11 +486,18 @@ def test_constructor_nonhashable_name(self, indices): def test_constructor_overflow_int64(self): # see gh-15832 - msg = ("the elements provided in the data cannot " + msg = ("The elements provided in the data cannot " "all be casted to the dtype int64") with tm.assert_raises_regex(OverflowError, msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") + @pytest.mark.xfail(reason="see gh-21311: Index " + "doesn't enforce dtype argument") + def test_constructor_cast(self): + msg = "could not convert string to float" + with tm.assert_raises_regex(ValueError, msg): + Index(["a", "b", "c"], dtype=float) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 49322d9b7abd6..166af4c89877d 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -451,6 +451,18 @@ def test_astype(self): i = Float64Index([0, 1.1, np.NAN]) pytest.raises(ValueError, lambda: i.astype(dtype)) + def test_type_coercion_fail(self, any_int_dtype): + # see gh-15832 + msg = "Trying to coerce float values to integers" + with tm.assert_raises_regex(ValueError, msg): + Index([1, 2, 3.5], dtype=any_int_dtype) + + def test_type_coercion_valid(self, float_dtype): + # There is no Float32Index, so we always + # generate Float64Index. 
+ i = Index([1, 2, 3.5], dtype=float_dtype) + tm.assert_index_equal(i, Index([1, 2, 3.5])) + def test_equals_numeric(self): i = Float64Index([1.0, 2.0]) @@ -862,6 +874,14 @@ def test_constructor_corner(self): with tm.assert_raises_regex(TypeError, 'casting'): Int64Index(arr_with_floats) + def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): + + # see gh-15832 + msg = "Trying to coerce negative values to unsigned integers" + + with tm.assert_raises_regex(OverflowError, msg): + Index([-1], dtype=uint_dtype) + def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index d590cfd6b6c64..f96e7eeb40ea2 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2047,7 +2047,7 @@ def test_table_values_dtypes_roundtrip(self): assert df1.dtypes[0] == 'float32' # check with mixed dtypes - df1 = DataFrame(dict((c, Series(np.random.randn(5), dtype=c)) + df1 = DataFrame(dict((c, Series(np.random.randint(5), dtype=c)) for c in ['float32', 'float64', 'int32', 'int64', 'int16', 'int8'])) df1['string'] = 'foo' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 906d2aacd5586..27cfec0dbf20d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -542,12 +542,30 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): - pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float) + msg = "could not convert string to float" + with tm.assert_raises_regex(ValueError, msg): + Series(["a", "b", "c"], dtype=float) + + def test_constructor_unsigned_dtype_overflow(self, uint_dtype): + # see gh-15832 + msg = 'Trying to coerce negative values to unsigned integers' + with tm.assert_raises_regex(OverflowError, msg): + Series([-1], dtype=uint_dtype) + + def test_constructor_coerce_float_fail(self, any_int_dtype): + # see gh-15832 + msg = "Trying to coerce float values to integers" + with tm.assert_raises_regex(ValueError, msg): + Series([1, 2, 3.5], dtype=any_int_dtype) + + def test_constructor_coerce_float_valid(self, float_dtype): + s = Series([1, 2, 3.5], dtype=float_dtype) + expected = Series([1, 2, 3.5]).astype(float_dtype) + assert_series_equal(s, expected) - def test_constructor_dtype_nocast(self): - # 1572 + def test_constructor_dtype_no_cast(self): + # see gh-1572 s = Series([1, 2, 3]) - s2 = Series(s, dtype=np.int64) s2[1] = 5 From 3814d0c3f77868809e2b343310a1534ea4b87b1d Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 20 Jun 2018 18:09:15 -0600 Subject: [PATCH 062/113] DOC: Add documentation for freq='infer' option of DatetimeIndex and TimedeltaIndex constructors (#21566) --- doc/source/timedeltas.rst | 7 +++++++ doc/source/timeseries.rst | 13 +++++++++++++ pandas/core/indexes/datetimes.py | 5 ++++- pandas/core/indexes/timedeltas.py | 5 ++++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 745810704f665..e602e45784f4a 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -363,6 +363,13 @@ or ``np.timedelta64`` objects. 
Passing ``np.nan/pd.NaT/nat`` will represent miss pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2,'D'), datetime.timedelta(days=2,seconds=2)]) +The string 'infer' can be passed in order to set the frequency of the index as the +inferred frequency upon creation: + +.. ipython:: python + + pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer') + Generating Ranges of Time Deltas ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index ba58d65b00714..11157264304b0 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -185,6 +185,19 @@ options like ``dayfirst`` or ``format``, so use ``to_datetime`` if these are req pd.Timestamp('2010/11/12') +You can also use the ``DatetimeIndex`` constructor directly: + +.. ipython:: python + + pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05']) + +The string 'infer' can be passed in order to set the frequency of the index as the +inferred frequency upon creation: + +.. ipython:: python + + pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer') + Providing a Format Argument ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e944df7aa83c6..9515d41080f87 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -187,7 +187,10 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, copy : bool Make a copy of input ndarray freq : string or pandas offset object, optional - One of pandas date offset strings or corresponding objects + One of pandas date offset strings or corresponding objects. The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation + start : starting value, datetime-like, optional If data is None, start is used as the start point in generating regular timestamp data. diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 9707d19953418..e90e1264638b0 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -107,7 +107,10 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, TimelikeOps, Int64Index): Optional timedelta-like data to construct index with unit: unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional which is an integer/float number - freq: a frequency for the index, optional + freq : string or pandas offset object, optional + One of pandas date offset strings or corresponding objects. 
The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation copy : bool Make a copy of input ndarray start : starting value, timedelta-like, optional From ea205c0c32c8e857a1e60fe2cd093627a9153ff0 Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Thu, 21 Jun 2018 05:54:23 +0300 Subject: [PATCH 063/113] BUG: Fix group index calculation to prevent hitting maximum recursion depth (#21541) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/sorting.py | 29 ++++++++++++++++------------ pandas/tests/frame/test_analytics.py | 17 ++++++++++++++++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 5b3e607956f7a..9271f58947f95 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -61,6 +61,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). - **I/O** diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e550976d1deeb..212f44e55c489 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -52,7 +52,21 @@ def _int64_cut_off(shape): return i return len(shape) - def loop(labels, shape): + def maybe_lift(lab, size): + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(_ensure_int64, labels) + if not xnull: + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + labels = list(labels) + shape = list(shape) + + # Iteratively process all the labels in chunks sized so less + # than _INT64_MAX unique int ids will be required for each chunk + while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) @@ -74,7 +88,7 @@ def loop(labels, shape): out[mask] = -1 if nlev == len(shape): # all levels done! - return out + break # compress what has been done so far in order to avoid overflow # to retain lexical ranks, obs_ids should be sorted @@ -83,16 +97,7 @@ def loop(labels, shape): labels = [comp_ids] + labels[nlev:] shape = [len(obs_ids)] + shape[nlev:] - return loop(labels, shape) - - def maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - - labels = map(_ensure_int64, labels) - if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) - - return loop(list(labels), list(shape)) + return out def get_compressed_ids(labels, sizes): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6dc24ed856017..12ebdbe0fd3c7 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1527,6 +1527,23 @@ def test_duplicated_with_misspelled_column_name(self, subset): with pytest.raises(KeyError): df.drop_duplicates(subset) + @pytest.mark.slow + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) 
values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool pd.Series as a result + # and don't fail during calculation. + # Actual values doesn't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ From fd8d6bcd82bfbaccc8b22ec0ecf7e049ff35fa93 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 21 Jun 2018 09:13:01 +0100 Subject: [PATCH 064/113] BUG: Fix passing empty label to df drop (#21515) Closes #21494 --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/generic.py | 21 ++-- pandas/core/indexes/base.py | 4 +- pandas/core/indexes/multi.py | 1 - .../tests/frame/test_axis_select_reindex.py | 15 +++ .../tests/series/indexing/test_alter_index.py | 106 ++++++++++++------ 6 files changed, 98 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9271f58947f95..cae0d1a754d89 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -61,6 +61,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in :meth:`DataFrame.drop` behaviour is not consistent for unique and non-unique indexes (:issue:`21494`) - Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). - diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1780e359164e2..9902da4094404 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3129,7 +3129,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): """ axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) - axis, axis_ = self._get_axis(axis), axis + axis = self._get_axis(axis) if axis.is_unique: if level is not None: @@ -3138,24 +3138,25 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): new_axis = axis.drop(labels, level=level, errors=errors) else: new_axis = axis.drop(labels, errors=errors) - dropped = self.reindex(**{axis_name: new_axis}) - try: - dropped.axes[axis_].set_names(axis.names, inplace=True) - except AttributeError: - pass - result = dropped + result = self.reindex(**{axis_name: new_axis}) + # Case for non-unique axis else: labels = _ensure_object(com._index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') indexer = ~axis.get_level_values(level).isin(labels) + + # GH 18561 MultiIndex.drop should raise if label is absent + if errors == 'raise' and indexer.all(): + raise KeyError('{} not found in axis'.format(labels)) else: indexer = ~axis.isin(labels) - - if errors == 'raise' and indexer.all(): - raise KeyError('{} not found in axis'.format(labels)) + # Check if label doesn't exist along axis + labels_missing = (axis.get_indexer_for(labels) == -1).any() + if errors == 'raise' and labels_missing: + raise KeyError('{} not found in axis'.format(labels)) slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 
ac33ffad762cd..4f140a6e77b2f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4341,7 +4341,7 @@ def drop(self, labels, errors='raise'): Raises ------ KeyError - If none of the labels are found in the selected axis + If not all of the labels are found in the selected axis """ arr_dtype = 'object' if self.dtype == 'object' else None labels = com._index_labels_to_array(labels, dtype=arr_dtype) @@ -4350,7 +4350,7 @@ def drop(self, labels, errors='raise'): if mask.any(): if errors != 'ignore': raise KeyError( - 'labels %s not contained in axis' % labels[mask]) + '{} not found in axis'.format(labels[mask])) indexer = indexer[~mask] return self.delete(indexer) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ab23a80acdaae..61b50f139dd10 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1707,7 +1707,6 @@ def drop(self, labels, level=None, errors='raise'): if errors != 'ignore': raise ValueError('labels %s not contained in axis' % labels[mask]) - indexer = indexer[~mask] except Exception: pass diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 28e82f7585850..0e0d6598f5101 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -1151,3 +1151,18 @@ def test_raise_on_drop_duplicate_index(self, actual): expected_no_err = actual.T.drop('c', axis=1, level=level, errors='ignore') assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + frame = pd.DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH 21494 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.DataFrame(index=index).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index bcd5a64402c33..561d6a9b42508 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -472,54 +472,86 @@ def test_rename(): assert result.name == expected.name -def test_drop(): - # unique - s = Series([1, 2], index=['one', 'two']) - expected = Series([1], index=['one']) - result = s.drop(['two']) - assert_series_equal(result, expected) - result = s.drop('two', axis='rows') - assert_series_equal(result, expected) - - # non-unique - # GH 5248 - s = Series([1, 1, 2], index=['one', 'two', 'one']) - expected = Series([1, 2], index=['one', 'one']) - result = s.drop(['two'], axis=0) - assert_series_equal(result, expected) - result = s.drop('two') - assert_series_equal(result, expected) - - expected = Series([1], index=['two']) - result = s.drop(['one']) - assert_series_equal(result, expected) - result = s.drop('one') - assert_series_equal(result, expected) +@pytest.mark.parametrize( + 'data, index, drop_labels,' + ' axis, expected_data, expected_index', + [ + # Unique Index + ([1, 2], ['one', 'two'], ['two'], + 0, [1], ['one']), + ([1, 2], ['one', 'two'], ['two'], + 'rows', [1], ['one']), + ([1, 1, 2], ['one', 'two', 'one'], ['two'], + 0, [1, 
2], ['one', 'one']), + + # GH 5248 Non-Unique Index + ([1, 1, 2], ['one', 'two', 'one'], 'two', + 0, [1, 2], ['one', 'one']), + ([1, 1, 2], ['one', 'two', 'one'], ['one'], + 0, [1], ['two']), + ([1, 1, 2], ['one', 'two', 'one'], 'one', + 0, [1], ['two'])]) +def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, + expected_data, expected_index): + + s = Series(data=data, index=index) + result = s.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) - # single string/tuple-like - s = Series(range(3), index=list('abc')) - pytest.raises(KeyError, s.drop, 'bc') - pytest.raises(KeyError, s.drop, ('a',)) +@pytest.mark.parametrize( + 'data, index, drop_labels,' + ' axis, error_type, error_desc', + [ + # single string/tuple-like + (range(3), list('abc'), 'bc', + 0, KeyError, 'not found in axis'), + + # bad axis + (range(3), list('abc'), ('a',), + 0, KeyError, 'not found in axis'), + (range(3), list('abc'), 'one', + 'columns', ValueError, 'No axis named columns')]) +def test_drop_exception_raised(data, index, drop_labels, + axis, error_type, error_desc): + + with tm.assert_raises_regex(error_type, error_desc): + Series(data, index=index).drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): # errors='ignore' s = Series(range(3), index=list('abc')) result = s.drop('bc', errors='ignore') - assert_series_equal(result, s) + tm.assert_series_equal(result, s) result = s.drop(['a', 'd'], errors='ignore') expected = s.iloc[1:] - assert_series_equal(result, expected) - - # bad axis - pytest.raises(ValueError, s.drop, 'one', axis='columns') + tm.assert_series_equal(result, expected) # GH 8522 s = Series([2, 3], index=[True, False]) assert s.index.is_object() result = s.drop(True) expected = Series([3], index=[False]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + - # GH 16877 - s = Series([2, 3], index=[0, 1]) - with tm.assert_raises_regex(KeyError, 'not contained in axis'): - s.drop([False, True]) +@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize('drop_labels', [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = pd.Series(index=index).drop(drop_labels) + tm.assert_series_equal(series, pd.Series(index=expected_index)) + + +@pytest.mark.parametrize('data, index, drop_labels', [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]) +]) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.Series(data=data, index=index).drop(drop_labels) From f3a89f319d34c4155d71ee180de61e8a7800d8d1 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 21 Jun 2018 11:42:41 +0200 Subject: [PATCH 065/113] ERR: Raise a simpler backtrace for missing key (#21558) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/indexing.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 15c5cc97b8426..a9c49b7476fa6 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -197,7 +197,7 @@ Strings Indexing ^^^^^^^^ -- +- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) - - diff --git a/pandas/core/indexing.py 
b/pandas/core/indexing.py index d5e81105dd323..38b6aaa2230fb 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1807,8 +1807,6 @@ def error(): try: key = self._convert_scalar_indexer(key, axis) - if not ax.contains(key): - error() except TypeError as e: # python 3 type errors should be raised @@ -1818,6 +1816,9 @@ def error(): except: error() + if not ax.contains(key): + error() + def _is_scalar_access(self, key): # this is a shortcut accessor to both .loc and .iloc # that provide the equivalent access of .at and .iat From eb47287453dc33bc745485504c999d4bc39beb64 Mon Sep 17 00:00:00 2001 From: Andrew Spott Date: Thu, 21 Jun 2018 03:47:12 -0600 Subject: [PATCH 066/113] Fixed HDFSTore.groups() performance. (#21543) --- doc/source/whatsnew/v0.23.2.txt | 5 ++++- pandas/io/pytables.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index cae0d1a754d89..54ddea9a25254 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -18,7 +18,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) -- +- .. _whatsnew_0232.performance: @@ -28,6 +28,9 @@ Performance Improvements - Improved performance of membership checks in :class:`CategoricalIndex` (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) +- Improved performance of :meth:`HDFStore.groups` (and dependent functions like + :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) + (:issue:`21372`) - Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`) - diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index aa39e341792c7..aad387e0cdd58 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1098,7 +1098,7 @@ def groups(self): _tables() self._check_if_open() return [ - g for g in self._handle.walk_nodes() + g for g in self._handle.walk_groups() if (not isinstance(g, _table_mod.link.Link) and (getattr(g._v_attrs, 'pandas_type', None) or getattr(g, 'table', None) or From c815e0b626db9b6056ce85ccd5065506ac402039 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 21 Jun 2018 11:10:31 +0100 Subject: [PATCH 067/113] DOC: Fixing spaces around backticks, and linting (#21570) --- ci/lint.sh | 8 ++++++++ doc/source/merging.rst | 8 ++++---- doc/source/release.rst | 4 ++-- doc/source/reshaping.rst | 2 +- doc/source/timeseries.rst | 4 ++-- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 2cbf6f7ae52a9..9bcee55e1344c 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -174,6 +174,14 @@ if [ "$LINT" ]; then fi echo "Check for old-style classes DONE" + echo "Check for backticks incorrectly rendering because of missing spaces" + grep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ + + if [ $? 
= "0" ]; then + RET=1 + fi + echo "Check for backticks incorrectly rendering because of missing spaces DONE" + else echo "NOT Linting" fi diff --git a/doc/source/merging.rst b/doc/source/merging.rst index b2cb388e3cd03..2eb5962ead986 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -279,9 +279,9 @@ need to be: Ignoring indexes on the concatenation axis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For ``DataFrame``s which don't have a meaningful index, you may wish to append -them and ignore the fact that they may have overlapping indexes. To do this, use -the ``ignore_index`` argument: +For ``DataFrame`` objects which don't have a meaningful index, you may wish +to append them and ignore the fact that they may have overlapping indexes. To +do this, use the ``ignore_index`` argument: .. ipython:: python @@ -314,7 +314,7 @@ This is also a valid argument to :meth:`DataFrame.append`: Concatenating with mixed ndims ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can concatenate a mix of ``Series`` and ``DataFrame``s. The +You can concatenate a mix of ``Series`` and ``DataFrame`` objects. The ``Series`` will be transformed to ``DataFrame`` with the column name as the name of the ``Series``. diff --git a/doc/source/release.rst b/doc/source/release.rst index 7bbd4ba43e66f..16fe896d9f58f 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -2641,7 +2641,7 @@ Improvements to existing features option it is no longer possible to round trip Excel files with merged MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to restore the previous behaviour. (:issue:`5254`) -- The FRED DataReader now accepts multiple series (:issue`3413`) +- The FRED DataReader now accepts multiple series (:issue:`3413`) - StataWriter adjusts variable names to Stata's limitations (:issue:`5709`) API Changes @@ -2837,7 +2837,7 @@ API Changes copy through chained assignment is detected, settable via option ``mode.chained_assignment`` - test the list of ``NA`` values in the csv parser. add ``N/A``, ``#NA`` as independent default na values (:issue:`5521`) -- The refactoring involving``Series`` deriving from ``NDFrame`` breaks ``rpy2<=2.3.8``. an Issue +- The refactoring involving ``Series`` deriving from ``NDFrame`` breaks ``rpy2<=2.3.8``. an Issue has been opened against rpy2 and a workaround is detailed in :issue:`5698`. Thanks @JanSchulz. - ``Series.argmin`` and ``Series.argmax`` are now aliased to ``Series.idxmin`` and ``Series.idxmax``. These return the *index* of the min or max element respectively. Prior to 0.13.0 these would return diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 88b7114cf4101..7d9925d800441 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -654,7 +654,7 @@ When a column contains only one level, it will be omitted in the result. pd.get_dummies(df, drop_first=True) By default new columns will have ``np.uint8`` dtype. -To choose another dtype, use the``dtype`` argument: +To choose another dtype, use the ``dtype`` argument: .. 
ipython:: python diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 11157264304b0..9e01296d9c9c7 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -2169,8 +2169,8 @@ still considered to be equal even if they are in different time zones: rng_berlin[5] rng_eastern[5] == rng_berlin[5] -Like ``Series``, ``DataFrame``, and ``DatetimeIndex``, ``Timestamp``s can be converted to other -time zones using ``tz_convert``: +Like ``Series``, ``DataFrame``, and ``DatetimeIndex``; ``Timestamp`` objects +can be converted to other time zones using ``tz_convert``: .. ipython:: python From 91c9ec3b6741f88d29769fa3cd1c352db09a0119 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Jun 2018 03:18:53 -0700 Subject: [PATCH 068/113] fix hashing string-casting error (#21187) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/_libs/hashing.pyx | 7 ++----- pandas/tests/series/test_repr.py | 30 ++++++++++++++++++++++++++++++ pandas/util/testing.py | 22 ++++++++++++++++++++++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 54ddea9a25254..c781f45715bd4 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -86,6 +86,7 @@ Bug Fixes **Categorical** +- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) - **Timezones** diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index c6f182ac5003f..4489847518a1d 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -8,8 +8,7 @@ import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t from util cimport _checknull -from cpython cimport (PyString_Check, - PyBytes_Check, +from cpython cimport (PyBytes_Check, PyUnicode_Check) from libc.stdlib cimport malloc, free @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): cdef list datas = [] for i in range(n): val = arr[i] - if PyString_Check(val): - data = val.encode(encoding) - elif PyBytes_Check(val): + if PyBytes_Check(val): data = val elif PyUnicode_Check(val): data = val.encode(encoding) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 97236f028b1c4..730c2b7865f1f 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -11,6 +11,7 @@ from pandas import (Index, Series, DataFrame, date_range, option_context, Categorical, period_range, timedelta_range) from pandas.core.index import MultiIndex +from pandas.core.base import StringMixin from pandas.compat import lrange, range, u from pandas import compat @@ -202,6 +203,35 @@ def test_latex_repr(self): class TestCategoricalRepr(object): + def test_categorical_repr_unicode(self): + # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', + # and we are working in PY2, then rendering a Categorical could raise + # UnicodeDecodeError by trying to decode when it shouldn't + + class County(StringMixin): + name = u'San Sebastián' + state = u'PR' + + def __unicode__(self): + return self.name + u', ' + self.state + + cat = pd.Categorical([County() for n in range(61)]) + idx = pd.Index(cat) + ser = idx.to_series() + + if compat.PY3: + # no reloading of sys, just check that the default (utf8) works + # as expected + repr(ser) + str(ser) + + else: + # set sys.defaultencoding to ascii, then change it back after + # the test + with tm.set_defaultencoding('ascii'): + repr(ser) + str(ser) + def 
test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d26a2116fb3ce..b9e53dfc80020 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -553,6 +553,28 @@ def _valid_locales(locales, normalize): # Stdout / stderr decorators +@contextmanager +def set_defaultencoding(encoding): + """ + Set default encoding (as given by sys.getdefaultencoding()) to the given + encoding; restore on exit. + + Parameters + ---------- + encoding : str + """ + if not PY2: + raise ValueError("set_defaultencoding context is only available " + "in Python 2.") + orig = sys.getdefaultencoding() + reload(sys) # noqa:F821 + sys.setdefaultencoding(encoding) + try: + yield + finally: + sys.setdefaultencoding(orig) + + def capture_stdout(f): """ Decorator to capture stdout in a buffer so that it can be checked From c019f6dc1c639e0c0630cb855252d93c6e23fe6d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Jun 2018 03:24:20 -0700 Subject: [PATCH 069/113] make DateOffset immutable (#21341) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/_libs/tslibs/offsets.pyx | 16 ++- pandas/tests/tseries/offsets/test_offsets.py | 8 ++ pandas/tseries/offsets.py | 118 +++++++++---------- 4 files changed, 75 insertions(+), 69 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a9c49b7476fa6..fd34424dedc52 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -67,6 +67,7 @@ Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) +- :class:`DateOffset` objects are now immutable. Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) .. 
_whatsnew_0240.api.other: @@ -176,7 +177,6 @@ Timezones Offsets ^^^^^^^ -- - - diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 3ca9bb307da9c..a9ef9166e4d33 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -304,6 +304,15 @@ class _BaseOffset(object): _day_opt = None _attributes = frozenset(['n', 'normalize']) + def __init__(self, n=1, normalize=False): + n = self._validate_n(n) + object.__setattr__(self, "n", n) + object.__setattr__(self, "normalize", normalize) + object.__setattr__(self, "_cache", {}) + + def __setattr__(self, name, value): + raise AttributeError("DateOffset objects are immutable.") + @property def kwds(self): # for backwards-compatibility @@ -395,13 +404,14 @@ class _BaseOffset(object): kwds = {key: odict[key] for key in odict if odict[key]} state.update(kwds) - self.__dict__ = state + self.__dict__.update(state) + if 'weekmask' in state and 'holidays' in state: calendar, holidays = _get_calendar(weekmask=self.weekmask, holidays=self.holidays, calendar=None) - self.calendar = calendar - self.holidays = holidays + object.__setattr__(self, "calendar", calendar) + object.__setattr__(self, "holidays", holidays) def __getstate__(self): """Return a pickleable state""" diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 5dd2a199405bf..66cb9baeb9357 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -234,6 +234,14 @@ class TestCommon(Base): 'Nano': Timestamp(np_datetime64_compat( '2011-01-01T09:00:00.000000001Z'))} + def test_immutable(self, offset_types): + # GH#21341 check that __setattr__ raises + offset = self._get_offset(offset_types) + with pytest.raises(AttributeError): + offset.normalize = True + with pytest.raises(AttributeError): + offset.n = 91 + def test_return_type(self, offset_types): offset = self._get_offset(offset_types) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ffa2c0a5e3211..da8fdb4d79e34 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -23,7 +23,6 @@ ApplyTypeError, as_datetime, _is_normalized, _get_calendar, _to_dt64, - _determine_offset, apply_index_wraps, roll_yearday, shift_month, @@ -192,11 +191,14 @@ def __add__(date): normalize = False def __init__(self, n=1, normalize=False, **kwds): - self.n = self._validate_n(n) - self.normalize = normalize + BaseOffset.__init__(self, n, normalize) - self._offset, self._use_relativedelta = _determine_offset(kwds) - self.__dict__.update(kwds) + off, use_rd = liboffsets._determine_offset(kwds) + object.__setattr__(self, "_offset", off) + object.__setattr__(self, "_use_relativedelta", use_rd) + for key in kwds: + val = kwds[key] + object.__setattr__(self, key, val) @apply_wraps def apply(self, other): @@ -446,9 +448,9 @@ def __init__(self, weekmask, holidays, calendar): # following two attributes. 
See DateOffset._params() # holidays, weekmask - self.weekmask = weekmask - self.holidays = holidays - self.calendar = calendar + object.__setattr__(self, "weekmask", weekmask) + object.__setattr__(self, "holidays", holidays) + object.__setattr__(self, "calendar", calendar) class BusinessMixin(object): @@ -480,9 +482,8 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): _attributes = frozenset(['n', 'normalize', 'offset']) def __init__(self, n=1, normalize=False, offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize - self._offset = offset + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) def _offset_str(self): def get_str(td): @@ -578,9 +579,11 @@ class BusinessHourMixin(BusinessMixin): def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): # must be validated here to equality check - self.start = liboffsets._validate_business_time(start) - self.end = liboffsets._validate_business_time(end) - self._offset = offset + start = liboffsets._validate_business_time(start) + object.__setattr__(self, "start", start) + end = liboffsets._validate_business_time(end) + object.__setattr__(self, "end", end) + object.__setattr__(self, "_offset", offset) @cache_readonly def next_bday(self): @@ -807,8 +810,7 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): def __init__(self, n=1, normalize=False, start='09:00', end='17:00', offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize + BaseOffset.__init__(self, n, normalize) super(BusinessHour, self).__init__(start=start, end=end, offset=offset) @@ -837,9 +839,8 @@ class CustomBusinessDay(_CustomMixin, BusinessDay): def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize - self._offset = offset + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) _CustomMixin.__init__(self, weekmask, holidays, calendar) @@ -898,9 +899,8 @@ class CustomBusinessHour(_CustomMixin, BusinessHourMixin, def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, start='09:00', end='17:00', offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize - self._offset = offset + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) _CustomMixin.__init__(self, weekmask, holidays, calendar) BusinessHourMixin.__init__(self, start=start, end=end, offset=offset) @@ -914,9 +914,7 @@ class MonthOffset(SingleConstructorOffset): _adjust_dst = True _attributes = frozenset(['n', 'normalize']) - def __init__(self, n=1, normalize=False): - self.n = self._validate_n(n) - self.normalize = normalize + __init__ = BaseOffset.__init__ @property def name(self): @@ -995,9 +993,8 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize - self._offset = offset + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) _CustomMixin.__init__(self, weekmask, holidays, calendar) @@ -1074,18 +1071,18 @@ class SemiMonthOffset(DateOffset): _attributes = frozenset(['n', 'normalize', 'day_of_month']) def __init__(self, n=1, normalize=False, day_of_month=None): + BaseOffset.__init__(self, n, normalize) + if 
day_of_month is None: - self.day_of_month = self._default_day_of_month + object.__setattr__(self, "day_of_month", + self._default_day_of_month) else: - self.day_of_month = int(day_of_month) + object.__setattr__(self, "day_of_month", int(day_of_month)) if not self._min_day_of_month <= self.day_of_month <= 27: msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}' raise ValueError(msg.format(min=self._min_day_of_month, day=self.day_of_month)) - self.n = self._validate_n(n) - self.normalize = normalize - @classmethod def _from_name(cls, suffix=None): return cls(day_of_month=suffix) @@ -1291,9 +1288,8 @@ class Week(DateOffset): _attributes = frozenset(['n', 'normalize', 'weekday']) def __init__(self, n=1, normalize=False, weekday=None): - self.n = self._validate_n(n) - self.normalize = normalize - self.weekday = weekday + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "weekday", weekday) if self.weekday is not None: if self.weekday < 0 or self.weekday > 6: @@ -1421,10 +1417,9 @@ class WeekOfMonth(_WeekOfMonthMixin, DateOffset): _attributes = frozenset(['n', 'normalize', 'week', 'weekday']) def __init__(self, n=1, normalize=False, week=0, weekday=0): - self.n = self._validate_n(n) - self.normalize = normalize - self.weekday = weekday - self.week = week + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "weekday", weekday) + object.__setattr__(self, "week", week) if self.weekday < 0 or self.weekday > 6: raise ValueError('Day must be 0<=day<=6, got {day}' @@ -1493,9 +1488,8 @@ class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): _attributes = frozenset(['n', 'normalize', 'weekday']) def __init__(self, n=1, normalize=False, weekday=0): - self.n = self._validate_n(n) - self.normalize = normalize - self.weekday = weekday + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "weekday", weekday) if self.n == 0: raise ValueError('N cannot be 0') @@ -1553,11 +1547,11 @@ class QuarterOffset(DateOffset): # startingMonth vs month attr names are resolved def __init__(self, n=1, normalize=False, startingMonth=None): - self.n = self._validate_n(n) - self.normalize = normalize + BaseOffset.__init__(self, n, normalize) + if startingMonth is None: startingMonth = self._default_startingMonth - self.startingMonth = startingMonth + object.__setattr__(self, "startingMonth", startingMonth) def isAnchored(self): return (self.n == 1 and self.startingMonth is not None) @@ -1679,11 +1673,10 @@ def onOffset(self, dt): return dt.month == self.month and dt.day == self._get_offset_day(dt) def __init__(self, n=1, normalize=False, month=None): - self.n = self._validate_n(n) - self.normalize = normalize + BaseOffset.__init__(self, n, normalize) month = month if month is not None else self._default_month - self.month = month + object.__setattr__(self, "month", month) if self.month < 1 or self.month > 12: raise ValueError('Month must go from 1 to 12') @@ -1776,12 +1769,11 @@ class FY5253(DateOffset): def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest"): - self.n = self._validate_n(n) - self.normalize = normalize - self.startingMonth = startingMonth - self.weekday = weekday + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "startingMonth", startingMonth) + object.__setattr__(self, "weekday", weekday) - self.variation = variation + object.__setattr__(self, "variation", variation) if self.n == 0: raise ValueError('N cannot be 0') @@ -1976,13 +1968,12 @@ class FY5253Quarter(DateOffset): def __init__(self, n=1, 
normalize=False, weekday=0, startingMonth=1, qtr_with_extra_week=1, variation="nearest"): - self.n = self._validate_n(n) - self.normalize = normalize + BaseOffset.__init__(self, n, normalize) - self.weekday = weekday - self.startingMonth = startingMonth - self.qtr_with_extra_week = qtr_with_extra_week - self.variation = variation + object.__setattr__(self, "startingMonth", startingMonth) + object.__setattr__(self, "weekday", weekday) + object.__setattr__(self, "qtr_with_extra_week", qtr_with_extra_week) + object.__setattr__(self, "variation", variation) if self.n == 0: raise ValueError('N cannot be 0') @@ -2129,9 +2120,7 @@ class Easter(DateOffset): _adjust_dst = True _attributes = frozenset(['n', 'normalize']) - def __init__(self, n=1, normalize=False): - self.n = self._validate_n(n) - self.normalize = normalize + __init__ = BaseOffset.__init__ @apply_wraps def apply(self, other): @@ -2177,11 +2166,10 @@ class Tick(SingleConstructorOffset): _attributes = frozenset(['n', 'normalize']) def __init__(self, n=1, normalize=False): - self.n = self._validate_n(n) + BaseOffset.__init__(self, n, normalize) if normalize: raise ValueError("Tick offset with `normalize=True` are not " "allowed.") # GH#21427 - self.normalize = normalize __gt__ = _tick_comp(operator.gt) __ge__ = _tick_comp(operator.ge) From c03ed3857715d5f43750f3c142c5dd000da59d27 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 21 Jun 2018 15:05:43 +0200 Subject: [PATCH 070/113] REF: multi_take is now able to tackle all list-like (non-bool) cases (#21569) --- pandas/core/indexing.py | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 38b6aaa2230fb..1f9fe5f947d0c 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -902,30 +902,45 @@ def _getitem_tuple(self, tup): return retval def _multi_take_opportunity(self, tup): - from pandas.core.generic import NDFrame + """ + Check whether there is the possibility to use ``_multi_take``. + Currently the limit is that all axes being indexed must be indexed with + list-likes. - # ugly hack for GH #836 - if not isinstance(self.obj, NDFrame): - return False + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis + Returns + ------- + boolean: Whether the current indexing can be passed through _multi_take + """ if not all(is_list_like_indexer(x) for x in tup): return False # just too complicated - for indexer, ax in zip(tup, self.obj._data.axes): - if isinstance(ax, MultiIndex): - return False - elif com.is_bool_indexer(indexer): - return False - elif not ax.is_unique: - return False + if any(com.is_bool_indexer(x) for x in tup): + return False return True def _multi_take(self, tup): - """ create the reindex map for our objects, raise the _exception if we - can't create the indexer """ + Create the indexers for the passed tuple of keys, and execute the take + operation. This allows the take operation to be executed all at once - + rather than once for each dimension - improving efficiency. 
+ + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis + + Returns + ------- + values: same type as the object being indexed + """ + # GH 836 o = self.obj d = {axis: self._get_listlike_indexer(key, axis) for (key, axis) in zip(tup, o._AXIS_ORDERS)} From 4668dba95cf905cd449e5cadcdbaad75e3afe220 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 22 Jun 2018 02:58:40 -0700 Subject: [PATCH 071/113] TST: Use int fixtures in test_construction.py (#21588) Partially addresses gh-21500. --- pandas/tests/indexes/datetimes/test_construction.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index f7682a965c038..ae98510951845 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -524,14 +524,13 @@ def test_dti_constructor_years_only(self, tz_naive_fixture): (rng3, expected3), (rng4, expected4)]: tm.assert_index_equal(rng, expected) - @pytest.mark.parametrize('dtype', [np.int64, np.int32, np.int16, np.int8]) - def test_dti_constructor_small_int(self, dtype): - # GH 13721 + def test_dti_constructor_small_int(self, any_int_dtype): + # see gh-13721 exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', '1970-01-01 00:00:00.00000001', '1970-01-01 00:00:00.00000002']) - arr = np.array([0, 10, 20], dtype=dtype) + arr = np.array([0, 10, 20], dtype=any_int_dtype) tm.assert_index_equal(DatetimeIndex(arr), exp) def test_ctor_str_intraday(self): From 07e161dca7dbe211efb5df50ff4ba5a0edfbc576 Mon Sep 17 00:00:00 2001 From: Uddeshya Singh Date: Fri, 22 Jun 2018 15:35:45 +0530 Subject: [PATCH 072/113] DOC: Adding clarification on return dtype of to_numeric (#21585) --- pandas/core/tools/numeric.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index c584e29f682dd..ebe135dfb184c 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -16,6 +16,10 @@ def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. + The default return dtype is `float64` or `int64` + depending on the data supplied. Use the `downcast` parameter + to obtain other dtypes. + Parameters ---------- arg : list, tuple, 1-d array, or Series From 15b040c220909c2106c1cbdecbc39ec2955fe19c Mon Sep 17 00:00:00 2001 From: Uddeshya Singh Date: Fri, 22 Jun 2018 15:37:07 +0530 Subject: [PATCH 073/113] Update v0.24.0.txt (#21586) --- doc/source/whatsnew/v0.24.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fd34424dedc52..4bfae7de01b8f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -31,8 +31,8 @@ Backwards incompatible API changes Tick DateOffset Normalize Restrictions -------------------------------------- -Creating a ``Tick`` object (:class:``Day``, :class:``Hour``, :class:``Minute``, -:class:``Second``, :class:``Milli``, :class:``Micro``, :class:``Nano``) with +Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, +:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with `normalize=True` is no longer supported. This prevents unexpected behavior where addition could fail to be monotone or associative. 
(:issue:`21427`) From b21efccbce9f66663932faec1c80e0d4bd4d20f5 Mon Sep 17 00:00:00 2001 From: Mitch Negus <21086604+mitchnegus@users.noreply.github.com> Date: Fri, 22 Jun 2018 03:11:20 -0700 Subject: [PATCH 074/113] clarifying regex pipe behavior (#21589) --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 9632df46d3bbf..08239ae4dae20 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -335,11 +335,11 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): 4 False dtype: bool - Returning 'house' and 'parrot' within same string. + Returning 'house' or 'dog' when either expression occurs in a string. - >>> s1.str.contains('house|parrot', regex=True) + >>> s1.str.contains('house|dog', regex=True) 0 False - 1 False + 1 True 2 True 3 False 4 NaN From a7d5bb73643855fc2c7990b8ec13961b9d5d1d72 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 22 Jun 2018 03:21:13 -0700 Subject: [PATCH 075/113] DOC: Note assert_almost_equal impl. detail (#21580) Note the hard-coded switch between absolute and relative tolerance during checking. Closes gh-21528. --- pandas/util/testing.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index b9e53dfc80020..675dd94d49750 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -224,9 +224,15 @@ def assert_almost_equal(left, right, check_exact=False, check_dtype: bool, default True check dtype if both a and b are the same type check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. + Specify comparison precision. Only used when `check_exact` is False. 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. """ if isinstance(left, pd.Index): return assert_index_equal(left, right, check_exact=check_exact, From 8944d3510699c8f229b6c4d574c529d27fce777f Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Fri, 22 Jun 2018 11:33:13 +0100 Subject: [PATCH 076/113] DOC: update the Series.any / Dataframe.any docstring (#21579) --- pandas/core/generic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9902da4094404..04ba0b5de3f7f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -463,7 +463,7 @@ def ndim(self): See Also -------- - ndarray.ndim + ndarray.ndim : Number of array dimensions. Examples -------- @@ -487,7 +487,7 @@ def size(self): See Also -------- - ndarray.size + ndarray.size : Number of elements in the array. Examples -------- @@ -9420,7 +9420,11 @@ def _doc_parms(cls): _any_see_also = """\ See Also -------- -pandas.DataFrame.all : Return whether all elements are True. +numpy.any : Numpy version of this method. +Series.any : Return whether any element is True. +Series.all : Return whether all elements are True. +DataFrame.any : Return whether any element is True over requested axis. 
+DataFrame.all : Return whether all elements are True over requested axis. """ _any_desc = """\ From 828008426a026377e156acd693b25acd59153fb6 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 22 Jun 2018 09:52:20 -0700 Subject: [PATCH 077/113] TST: Clean up tests in test_take.py (#21591) Utilizes pytest's fixtures to cleanup the code significantly. --- pandas/tests/test_take.py | 525 +++++++++++++++++--------------------- 1 file changed, 239 insertions(+), 286 deletions(-) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 9ab147edb8d1b..ade847923c083 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -10,315 +10,268 @@ from pandas._libs.tslib import iNaT +@pytest.fixture(params=[True, False]) +def writeable(request): + return request.param + + +# Check that take_nd works both with writeable arrays +# (in which case fast typed memory-views implementation) +# and read-only arrays alike. +@pytest.fixture(params=[ + (np.float64, True), + (np.float32, True), + (np.uint64, False), + (np.uint32, False), + (np.uint16, False), + (np.uint8, False), + (np.int64, False), + (np.int32, False), + (np.int16, False), + (np.int8, False), + (np.object_, True), + (np.bool, False), +]) +def dtype_can_hold_na(request): + return request.param + + +@pytest.fixture(params=[ + (np.int8, np.int16(127), np.int8), + (np.int8, np.int16(128), np.int16), + (np.int32, 1, np.int32), + (np.int32, 2.0, np.float64), + (np.int32, 3.0 + 4.0j, np.complex128), + (np.int32, True, np.object_), + (np.int32, "", np.object_), + (np.float64, 1, np.float64), + (np.float64, 2.0, np.float64), + (np.float64, 3.0 + 4.0j, np.complex128), + (np.float64, True, np.object_), + (np.float64, "", np.object_), + (np.complex128, 1, np.complex128), + (np.complex128, 2.0, np.complex128), + (np.complex128, 3.0 + 4.0j, np.complex128), + (np.complex128, True, np.object_), + (np.complex128, "", np.object_), + (np.bool_, 1, np.object_), + (np.bool_, 2.0, np.object_), + (np.bool_, 3.0 + 4.0j, np.object_), + (np.bool_, True, np.bool_), + (np.bool_, '', np.object_), +]) +def dtype_fill_out_dtype(request): + return request.param + + class TestTake(object): - # standard incompatible fill error + # Standard incompatible fill error. 
fill_error = re.compile("Incompatible type for fill_value") - def test_1d_with_out(self): - def _test_dtype(dtype, can_hold_na, writeable=True): - data = np.random.randint(0, 2, 4).astype(dtype) - data.flags.writeable = writeable + def test_1d_with_out(self, dtype_can_hold_na, writeable): + dtype, can_hold_na = dtype_can_hold_na + + data = np.random.randint(0, 2, 4).astype(dtype) + data.flags.writeable = writeable + + indexer = [2, 1, 0, 1] + out = np.empty(4, dtype=dtype) + algos.take_1d(data, indexer, out=out) - indexer = [2, 1, 0, 1] - out = np.empty(4, dtype=dtype) + expected = data.take(indexer) + tm.assert_almost_equal(out, expected) + + indexer = [2, 1, 0, -1] + out = np.empty(4, dtype=dtype) + + if can_hold_na: algos.take_1d(data, indexer, out=out) expected = data.take(indexer) + expected[3] = np.nan tm.assert_almost_equal(out, expected) - - indexer = [2, 1, 0, -1] - out = np.empty(4, dtype=dtype) - if can_hold_na: + else: + with tm.assert_raises_regex(TypeError, self.fill_error): algos.take_1d(data, indexer, out=out) - expected = data.take(indexer) - expected[3] = np.nan - tm.assert_almost_equal(out, expected) - else: - with tm.assert_raises_regex(TypeError, self.fill_error): - algos.take_1d(data, indexer, out=out) - # no exception o/w - data.take(indexer, out=out) - - for writeable in [True, False]: - # Check that take_nd works both with writeable arrays (in which - # case fast typed memoryviews implementation) and read-only - # arrays alike. - _test_dtype(np.float64, True, writeable=writeable) - _test_dtype(np.float32, True, writeable=writeable) - _test_dtype(np.uint64, False, writeable=writeable) - _test_dtype(np.uint32, False, writeable=writeable) - _test_dtype(np.uint16, False, writeable=writeable) - _test_dtype(np.uint8, False, writeable=writeable) - _test_dtype(np.int64, False, writeable=writeable) - _test_dtype(np.int32, False, writeable=writeable) - _test_dtype(np.int16, False, writeable=writeable) - _test_dtype(np.int8, False, writeable=writeable) - _test_dtype(np.object_, True, writeable=writeable) - _test_dtype(np.bool, False, writeable=writeable) - - def test_1d_fill_nonna(self): - def _test_dtype(dtype, fill_value, out_dtype): - data = np.random.randint(0, 2, 4).astype(dtype) - - indexer = [2, 1, 0, -1] - - result = algos.take_1d(data, indexer, fill_value=fill_value) - assert ((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) - assert (result[3] == fill_value) - assert (result.dtype == out_dtype) - - indexer = [2, 1, 0, 1] - - result = algos.take_1d(data, indexer, fill_value=fill_value) - assert ((result[[0, 1, 2, 3]] == data[indexer]).all()) - assert (result.dtype == dtype) - - _test_dtype(np.int8, np.int16(127), np.int8) - _test_dtype(np.int8, np.int16(128), np.int16) - _test_dtype(np.int32, 1, np.int32) - _test_dtype(np.int32, 2.0, np.float64) - _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) - _test_dtype(np.int32, True, np.object_) - _test_dtype(np.int32, '', np.object_) - _test_dtype(np.float64, 1, np.float64) - _test_dtype(np.float64, 2.0, np.float64) - _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) - _test_dtype(np.float64, True, np.object_) - _test_dtype(np.float64, '', np.object_) - _test_dtype(np.complex128, 1, np.complex128) - _test_dtype(np.complex128, 2.0, np.complex128) - _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) - _test_dtype(np.complex128, True, np.object_) - _test_dtype(np.complex128, '', np.object_) - _test_dtype(np.bool_, 1, np.object_) - _test_dtype(np.bool_, 2.0, np.object_) - _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) - 
_test_dtype(np.bool_, True, np.bool_) - _test_dtype(np.bool_, '', np.object_) - - def test_2d_with_out(self): - def _test_dtype(dtype, can_hold_na, writeable=True): - data = np.random.randint(0, 2, (5, 3)).astype(dtype) - data.flags.writeable = writeable - - indexer = [2, 1, 0, 1] - out0 = np.empty((4, 3), dtype=dtype) - out1 = np.empty((5, 4), dtype=dtype) + + # No Exception otherwise. + data.take(indexer, out=out) + + def test_1d_fill_nonna(self, dtype_fill_out_dtype): + dtype, fill_value, out_dtype = dtype_fill_out_dtype + data = np.random.randint(0, 2, 4).astype(dtype) + indexer = [2, 1, 0, -1] + + result = algos.take_1d(data, indexer, fill_value=fill_value) + assert ((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) + assert (result[3] == fill_value) + assert (result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = algos.take_1d(data, indexer, fill_value=fill_value) + assert ((result[[0, 1, 2, 3]] == data[indexer]).all()) + assert (result.dtype == dtype) + + def test_2d_with_out(self, dtype_can_hold_na, writeable): + dtype, can_hold_na = dtype_can_hold_na + + data = np.random.randint(0, 2, (5, 3)).astype(dtype) + data.flags.writeable = writeable + + indexer = [2, 1, 0, 1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = np.empty((5, 4), dtype=dtype) + algos.take_nd(data, indexer, out=out0, axis=0) + algos.take_nd(data, indexer, out=out1, axis=1) + + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = np.empty((5, 4), dtype=dtype) + + if can_hold_na: algos.take_nd(data, indexer, out=out0, axis=0) algos.take_nd(data, indexer, out=out1, axis=1) + expected0 = data.take(indexer, axis=0) expected1 = data.take(indexer, axis=1) + expected0[3, :] = np.nan + expected1[:, 3] = np.nan + tm.assert_almost_equal(out0, expected0) tm.assert_almost_equal(out1, expected1) - - indexer = [2, 1, 0, -1] - out0 = np.empty((4, 3), dtype=dtype) - out1 = np.empty((5, 4), dtype=dtype) - if can_hold_na: - algos.take_nd(data, indexer, out=out0, axis=0) - algos.take_nd(data, indexer, out=out1, axis=1) - expected0 = data.take(indexer, axis=0) - expected1 = data.take(indexer, axis=1) - expected0[3, :] = np.nan - expected1[:, 3] = np.nan - tm.assert_almost_equal(out0, expected0) - tm.assert_almost_equal(out1, expected1) - else: - for i, out in enumerate([out0, out1]): - with tm.assert_raises_regex(TypeError, - self.fill_error): - algos.take_nd(data, indexer, out=out, axis=i) - # no exception o/w - data.take(indexer, out=out, axis=i) - - for writeable in [True, False]: - # Check that take_nd works both with writeable arrays (in which - # case fast typed memoryviews implementation) and read-only - # arrays alike. 
- _test_dtype(np.float64, True, writeable=writeable) - _test_dtype(np.float32, True, writeable=writeable) - _test_dtype(np.uint64, False, writeable=writeable) - _test_dtype(np.uint32, False, writeable=writeable) - _test_dtype(np.uint16, False, writeable=writeable) - _test_dtype(np.uint8, False, writeable=writeable) - _test_dtype(np.int64, False, writeable=writeable) - _test_dtype(np.int32, False, writeable=writeable) - _test_dtype(np.int16, False, writeable=writeable) - _test_dtype(np.int8, False, writeable=writeable) - _test_dtype(np.object_, True, writeable=writeable) - _test_dtype(np.bool, False, writeable=writeable) - - def test_2d_fill_nonna(self): - def _test_dtype(dtype, fill_value, out_dtype): - data = np.random.randint(0, 2, (5, 3)).astype(dtype) - - indexer = [2, 1, 0, -1] - - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()) - assert ((result[3, :] == fill_value).all()) - assert (result.dtype == out_dtype) - - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()) - assert ((result[:, 3] == fill_value).all()) - assert (result.dtype == out_dtype) - - indexer = [2, 1, 0, 1] - - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2, 3], :] == data[indexer, :]).all()) - assert (result.dtype == dtype) - - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2, 3]] == data[:, indexer]).all()) - assert (result.dtype == dtype) - - _test_dtype(np.int8, np.int16(127), np.int8) - _test_dtype(np.int8, np.int16(128), np.int16) - _test_dtype(np.int32, 1, np.int32) - _test_dtype(np.int32, 2.0, np.float64) - _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) - _test_dtype(np.int32, True, np.object_) - _test_dtype(np.int32, '', np.object_) - _test_dtype(np.float64, 1, np.float64) - _test_dtype(np.float64, 2.0, np.float64) - _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) - _test_dtype(np.float64, True, np.object_) - _test_dtype(np.float64, '', np.object_) - _test_dtype(np.complex128, 1, np.complex128) - _test_dtype(np.complex128, 2.0, np.complex128) - _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) - _test_dtype(np.complex128, True, np.object_) - _test_dtype(np.complex128, '', np.object_) - _test_dtype(np.bool_, 1, np.object_) - _test_dtype(np.bool_, 2.0, np.object_) - _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) - _test_dtype(np.bool_, True, np.bool_) - _test_dtype(np.bool_, '', np.object_) - - def test_3d_with_out(self): - def _test_dtype(dtype, can_hold_na): - data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) - - indexer = [2, 1, 0, 1] - out0 = np.empty((4, 4, 3), dtype=dtype) - out1 = np.empty((5, 4, 3), dtype=dtype) - out2 = np.empty((5, 4, 4), dtype=dtype) + else: + for i, out in enumerate([out0, out1]): + with tm.assert_raises_regex(TypeError, + self.fill_error): + algos.take_nd(data, indexer, out=out, axis=i) + + # No Exception otherwise. 
+ data.take(indexer, out=out, axis=i) + + def test_2d_fill_nonna(self, dtype_fill_out_dtype): + dtype, fill_value, out_dtype = dtype_fill_out_dtype + data = np.random.randint(0, 2, (5, 3)).astype(dtype) + indexer = [2, 1, 0, -1] + + result = algos.take_nd(data, indexer, axis=0, + fill_value=fill_value) + assert ((result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()) + assert ((result[3, :] == fill_value).all()) + assert (result.dtype == out_dtype) + + result = algos.take_nd(data, indexer, axis=1, + fill_value=fill_value) + assert ((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()) + assert ((result[:, 3] == fill_value).all()) + assert (result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + result = algos.take_nd(data, indexer, axis=0, + fill_value=fill_value) + assert ((result[[0, 1, 2, 3], :] == data[indexer, :]).all()) + assert (result.dtype == dtype) + + result = algos.take_nd(data, indexer, axis=1, + fill_value=fill_value) + assert ((result[:, [0, 1, 2, 3]] == data[:, indexer]).all()) + assert (result.dtype == dtype) + + def test_3d_with_out(self, dtype_can_hold_na): + dtype, can_hold_na = dtype_can_hold_na + + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + indexer = [2, 1, 0, 1] + + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + + algos.take_nd(data, indexer, out=out0, axis=0) + algos.take_nd(data, indexer, out=out1, axis=1) + algos.take_nd(data, indexer, out=out2, axis=2) + + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) + + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + + if can_hold_na: algos.take_nd(data, indexer, out=out0, axis=0) algos.take_nd(data, indexer, out=out1, axis=1) algos.take_nd(data, indexer, out=out2, axis=2) + expected0 = data.take(indexer, axis=0) expected1 = data.take(indexer, axis=1) expected2 = data.take(indexer, axis=2) + + expected0[3, :, :] = np.nan + expected1[:, 3, :] = np.nan + expected2[:, :, 3] = np.nan + tm.assert_almost_equal(out0, expected0) tm.assert_almost_equal(out1, expected1) tm.assert_almost_equal(out2, expected2) - - indexer = [2, 1, 0, -1] - out0 = np.empty((4, 4, 3), dtype=dtype) - out1 = np.empty((5, 4, 3), dtype=dtype) - out2 = np.empty((5, 4, 4), dtype=dtype) - if can_hold_na: - algos.take_nd(data, indexer, out=out0, axis=0) - algos.take_nd(data, indexer, out=out1, axis=1) - algos.take_nd(data, indexer, out=out2, axis=2) - expected0 = data.take(indexer, axis=0) - expected1 = data.take(indexer, axis=1) - expected2 = data.take(indexer, axis=2) - expected0[3, :, :] = np.nan - expected1[:, 3, :] = np.nan - expected2[:, :, 3] = np.nan - tm.assert_almost_equal(out0, expected0) - tm.assert_almost_equal(out1, expected1) - tm.assert_almost_equal(out2, expected2) - else: - for i, out in enumerate([out0, out1, out2]): - with tm.assert_raises_regex(TypeError, - self.fill_error): - algos.take_nd(data, indexer, out=out, axis=i) - # no exception o/w - data.take(indexer, out=out, axis=i) - - _test_dtype(np.float64, True) - _test_dtype(np.float32, True) - _test_dtype(np.uint64, False) - _test_dtype(np.uint32, False) - _test_dtype(np.uint16, False) - _test_dtype(np.uint8, False) - _test_dtype(np.int64, False) - _test_dtype(np.int32, False) - _test_dtype(np.int16, 
False) - _test_dtype(np.int8, False) - _test_dtype(np.object_, True) - _test_dtype(np.bool, False) - - def test_3d_fill_nonna(self): - def _test_dtype(dtype, fill_value, out_dtype): - data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) - - indexer = [2, 1, 0, -1] - - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()) - assert ((result[3, :, :] == fill_value).all()) - assert (result.dtype == out_dtype) - - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()) - assert ((result[:, 3, :] == fill_value).all()) - assert (result.dtype == out_dtype) - - result = algos.take_nd(data, indexer, axis=2, - fill_value=fill_value) - assert ((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()) - assert ((result[:, :, 3] == fill_value).all()) - assert (result.dtype == out_dtype) - - indexer = [2, 1, 0, 1] - - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()) - assert (result.dtype == dtype) - - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()) - assert (result.dtype == dtype) - - result = algos.take_nd(data, indexer, axis=2, - fill_value=fill_value) - assert ((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()) - assert (result.dtype == dtype) - - _test_dtype(np.int8, np.int16(127), np.int8) - _test_dtype(np.int8, np.int16(128), np.int16) - _test_dtype(np.int32, 1, np.int32) - _test_dtype(np.int32, 2.0, np.float64) - _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) - _test_dtype(np.int32, True, np.object_) - _test_dtype(np.int32, '', np.object_) - _test_dtype(np.float64, 1, np.float64) - _test_dtype(np.float64, 2.0, np.float64) - _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) - _test_dtype(np.float64, True, np.object_) - _test_dtype(np.float64, '', np.object_) - _test_dtype(np.complex128, 1, np.complex128) - _test_dtype(np.complex128, 2.0, np.complex128) - _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) - _test_dtype(np.complex128, True, np.object_) - _test_dtype(np.complex128, '', np.object_) - _test_dtype(np.bool_, 1, np.object_) - _test_dtype(np.bool_, 2.0, np.object_) - _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) - _test_dtype(np.bool_, True, np.bool_) - _test_dtype(np.bool_, '', np.object_) + else: + for i, out in enumerate([out0, out1, out2]): + with tm.assert_raises_regex(TypeError, + self.fill_error): + algos.take_nd(data, indexer, out=out, axis=i) + + # No Exception otherwise. 
+ data.take(indexer, out=out, axis=i) + + def test_3d_fill_nonna(self, dtype_fill_out_dtype): + dtype, fill_value, out_dtype = dtype_fill_out_dtype + + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + indexer = [2, 1, 0, -1] + + result = algos.take_nd(data, indexer, axis=0, + fill_value=fill_value) + assert ((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()) + assert ((result[3, :, :] == fill_value).all()) + assert (result.dtype == out_dtype) + + result = algos.take_nd(data, indexer, axis=1, + fill_value=fill_value) + assert ((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()) + assert ((result[:, 3, :] == fill_value).all()) + assert (result.dtype == out_dtype) + + result = algos.take_nd(data, indexer, axis=2, + fill_value=fill_value) + assert ((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()) + assert ((result[:, :, 3] == fill_value).all()) + assert (result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + result = algos.take_nd(data, indexer, axis=0, + fill_value=fill_value) + assert ((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()) + assert (result.dtype == dtype) + + result = algos.take_nd(data, indexer, axis=1, + fill_value=fill_value) + assert ((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()) + assert (result.dtype == dtype) + + result = algos.take_nd(data, indexer, axis=2, + fill_value=fill_value) + assert ((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()) + assert (result.dtype == dtype) def test_1d_other_dtypes(self): arr = np.random.randn(10).astype(np.float32) From 5f956cea01fefe7ba53a5b57b819c7ad98db5c9f Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 22 Jun 2018 16:45:26 -0600 Subject: [PATCH 078/113] TST: Add interval closed fixture to top-level conftest (#21595) --- pandas/conftest.py | 8 ++++++++ pandas/tests/indexes/interval/test_construction.py | 5 ----- pandas/tests/indexes/interval/test_interval.py | 5 ----- pandas/tests/indexes/interval/test_interval_range.py | 5 ----- pandas/tests/indexes/interval/test_interval_tree.py | 5 ----- pandas/tests/indexing/interval/test_interval.py | 5 +---- 6 files changed, 9 insertions(+), 24 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 9d806a91f37f7..d6b18db4e71f2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -137,6 +137,14 @@ def nselect_method(request): return request.param +@pytest.fixture(params=['left', 'right', 'both', 'neither']) +def closed(request): + """ + Fixture for trying all interval closed parameters + """ + return request.param + + @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) def nulls_fixture(request): """ diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index b1711c3444586..ac946a3421e53 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -14,11 +14,6 @@ import pandas.util.testing as tm -@pytest.fixture(params=['left', 'right', 'both', 'neither']) -def closed(request): - return request.param - - @pytest.fixture(params=[None, 'foo']) def name(request): return request.param diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 9920809a18a24..6a7330f8cfb68 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -12,11 +12,6 @@ import pandas as pd -@pytest.fixture(scope='class', params=['left', 'right', 'both', 'neither']) -def 
closed(request): - return request.param - - @pytest.fixture(scope='class', params=[None, 'foo']) def name(request): return request.param diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 29fe2b0185662..447856e7e9d51 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -11,11 +11,6 @@ import pandas.util.testing as tm -@pytest.fixture(scope='class', params=['left', 'right', 'both', 'neither']) -def closed(request): - return request.param - - @pytest.fixture(scope='class', params=[None, 'foo']) def name(request): return request.param diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 056d3e1087a2e..5f248bf7725e5 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -7,11 +7,6 @@ import pandas.util.testing as tm -@pytest.fixture(scope='class', params=['left', 'right', 'both', 'neither']) -def closed(request): - return request.param - - @pytest.fixture( scope='class', params=['int32', 'int64', 'float32', 'float64', 'uint64']) def dtype(request): diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 233fbd2c8d7be..f2f59159032a2 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -3,7 +3,6 @@ import pandas as pd from pandas import Series, DataFrame, IntervalIndex, Interval -from pandas.compat import product import pandas.util.testing as tm @@ -51,9 +50,7 @@ def test_getitem_with_scalar(self): tm.assert_series_equal(expected, s[s >= 2]) # TODO: check this behavior is consistent with test_interval_new.py - @pytest.mark.parametrize('direction, closed', - product(('increasing', 'decreasing'), - ('left', 'right', 'neither', 'both'))) + @pytest.mark.parametrize('direction', ['increasing', 'decreasing']) def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] if direction == 'decreasing': From 19b78ca10df56e90082aad1f84347476f5dc4248 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Jun 2018 15:57:40 -0700 Subject: [PATCH 079/113] cache DateOffset attrs now that they are immutable (#21582) --- asv_bench/benchmarks/period.py | 5 +++++ doc/source/whatsnew/v0.24.0.txt | 1 + pandas/_libs/tslibs/offsets.pyx | 3 +++ pandas/tseries/offsets.py | 28 ++++++++++++---------------- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 897a3338c164c..c34f9a737473e 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -64,6 +64,11 @@ def setup(self): def time_setitem_period_column(self): self.df['col'] = self.rng + def time_set_index(self): + # GH#21582 limited by comparisons of Period objects + self.df['col2'] = self.rng + self.df.set_index('col2', append=True) + class Algorithms(object): diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 4bfae7de01b8f..5f05bbdfdb948 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -130,6 +130,7 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) +- Improved 
performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`) - .. _whatsnew_0240.docs: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a9ef9166e4d33..63add06db17b4 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -404,6 +404,9 @@ class _BaseOffset(object): kwds = {key: odict[key] for key in odict if odict[key]} state.update(kwds) + if '_cache' not in state: + state['_cache'] = {} + self.__dict__.update(state) if 'weekmask' in state and 'holidays' in state: diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index da8fdb4d79e34..a3f82c1a0902e 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -288,6 +288,7 @@ def isAnchored(self): # if there were a canonical docstring for what isAnchored means. return (self.n == 1) + @cache_readonly def _params(self): all_paras = self.__dict__.copy() if 'holidays' in all_paras and not all_paras['holidays']: @@ -322,8 +323,6 @@ def name(self): return self.rule_code def __eq__(self, other): - if other is None: - return False if isinstance(other, compat.string_types): from pandas.tseries.frequencies import to_offset @@ -333,13 +332,13 @@ def __eq__(self, other): if not isinstance(other, DateOffset): return False - return self._params() == other._params() + return self._params == other._params def __ne__(self, other): return not self == other def __hash__(self): - return hash(self._params()) + return hash(self._params) def __add__(self, other): if isinstance(other, (ABCDatetimeIndex, ABCSeries)): @@ -397,7 +396,7 @@ def _prefix(self): def rule_code(self): return self._prefix - @property + @cache_readonly def freqstr(self): try: code = self.rule_code @@ -601,7 +600,7 @@ def next_bday(self): else: return BusinessDay(n=nb_offset) - # TODO: Cache this once offsets are immutable + @cache_readonly def _get_daytime_flag(self): if self.start == self.end: raise ValueError('start and end must not be the same') @@ -643,12 +642,12 @@ def _prev_opening_time(self, other): return datetime(other.year, other.month, other.day, self.start.hour, self.start.minute) - # TODO: cache this once offsets are immutable + @cache_readonly def _get_business_hours_by_sec(self): """ Return business hours in a day by seconds. 
""" - if self._get_daytime_flag(): + if self._get_daytime_flag: # create dummy datetime to calculate businesshours in a day dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) until = datetime(2014, 4, 1, self.end.hour, self.end.minute) @@ -662,7 +661,7 @@ def _get_business_hours_by_sec(self): def rollback(self, dt): """Roll provided date backward to next offset only if not on offset""" if not self.onOffset(dt): - businesshours = self._get_business_hours_by_sec() + businesshours = self._get_business_hours_by_sec if self.n >= 0: dt = self._prev_opening_time( dt) + timedelta(seconds=businesshours) @@ -683,9 +682,8 @@ def rollforward(self, dt): @apply_wraps def apply(self, other): - # calculate here because offset is not immutable - daytime = self._get_daytime_flag() - businesshours = self._get_business_hours_by_sec() + daytime = self._get_daytime_flag + businesshours = self._get_business_hours_by_sec bhdelta = timedelta(seconds=businesshours) if isinstance(other, datetime): @@ -766,7 +764,7 @@ def onOffset(self, dt): dt.minute, dt.second, dt.microsecond) # Valid BH can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time - businesshours = self._get_business_hours_by_sec() + businesshours = self._get_business_hours_by_sec return self._onOffset(dt, businesshours) def _onOffset(self, dt, businesshours): @@ -2203,13 +2201,12 @@ def __eq__(self, other): if isinstance(other, Tick): return self.delta == other.delta else: - # TODO: Are there cases where this should raise TypeError? return False # This is identical to DateOffset.__hash__, but has to be redefined here # for Python 3, because we've redefined __eq__. def __hash__(self): - return hash(self._params()) + return hash(self._params) def __ne__(self, other): if isinstance(other, compat.string_types): @@ -2220,7 +2217,6 @@ def __ne__(self, other): if isinstance(other, Tick): return self.delta != other.delta else: - # TODO: Are there cases where this should raise TypeError? 
return True @property From 303a27974a1f553ae73775464cb8c5864fa28098 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Fri, 22 Jun 2018 23:59:27 +0100 Subject: [PATCH 080/113] BUG: Series dot product __rmatmul__ doesn't allow matrix vector multiplication (#21578) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/series.py | 2 +- pandas/tests/series/test_analytics.py | 21 ++++++++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5f05bbdfdb948..00d358ad8b522 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -184,7 +184,7 @@ Offsets Numeric ^^^^^^^ -- +- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) - - diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f762dff4aeab..a608db806d20b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2066,7 +2066,7 @@ def __matmul__(self, other): def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5 """ - return self.dot(other) + return self.dot(np.transpose(other)) @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b9c7b837b8b81..36342b5ba4ee1 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -849,11 +849,30 @@ def test_matmul(self): expected = np.dot(a.values, a.values) assert_almost_equal(result, expected) - # np.array @ Series (__rmatmul__) + # GH 21530 + # vector (1D np.array) @ Series (__rmatmul__) result = operator.matmul(a.values, a) expected = np.dot(a.values, a.values) assert_almost_equal(result, expected) + # GH 21530 + # vector (1D list) @ Series (__rmatmul__) + result = operator.matmul(a.values.tolist(), a) + expected = np.dot(a.values, a.values) + assert_almost_equal(result, expected) + + # GH 21530 + # matrix (2D np.array) @ Series (__rmatmul__) + result = operator.matmul(b.T.values, a) + expected = np.dot(b.T.values, a.values) + assert_almost_equal(result, expected) + + # GH 21530 + # matrix (2D nested lists) @ Series (__rmatmul__) + result = operator.matmul(b.T.values.tolist(), a) + expected = np.dot(b.T.values, a.values) + assert_almost_equal(result, expected) + # mixed dtype DataFrame @ Series a['p'] = int(a.p) result = operator.matmul(b.T, a) From a4798c3abce3040ea50966561251f31f62f3706c Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 23 Jun 2018 00:01:39 +0100 Subject: [PATCH 081/113] BUG: first/last lose timezone in groupby with as_index=False (#21573) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_nth.py | 61 +++++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 00d358ad8b522..90fc579ae69e5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -226,7 +226,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) - - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3bc59157055ce..0bbdfbbe52ac4 100644 --- a/pandas/core/groupby/groupby.py +++ 
b/pandas/core/groupby/groupby.py @@ -4740,7 +4740,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_agged_blocks(self, items, blocks): if not self.as_index: - index = np.arange(blocks[0].values.shape[1]) + index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, [items, index]) result = DataFrame(mgr) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index a32ba9ad76f14..a1b748cd50e8f 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -1,11 +1,12 @@ import numpy as np import pandas as pd -from pandas import DataFrame, MultiIndex, Index, Series, isna +from pandas import DataFrame, MultiIndex, Index, Series, isna, Timestamp from pandas.compat import lrange from pandas.util.testing import ( assert_frame_equal, assert_produces_warning, assert_series_equal) +import pytest def test_first_last_nth(df): @@ -219,6 +220,64 @@ def test_nth_multi_index(three_group): assert_frame_equal(result, expected) +@pytest.mark.parametrize('data, expected_first, expected_last', [ + ({'id': ['A'], + 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + 'foo': [1]}, + {'id': ['A'], + 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + 'foo': [1]}, + {'id': ['A'], + 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + 'foo': [1]}), + ({'id': ['A', 'B', 'A'], + 'time': [Timestamp('2012-01-01 13:00:00', + tz='America/New_York'), + Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + Timestamp('2012-03-01 12:00:00', + tz='Europe/London')], + 'foo': [1, 2, 3]}, + {'id': ['A', 'B'], + 'time': [Timestamp('2012-01-01 13:00:00', + tz='America/New_York'), + Timestamp('2012-02-01 14:00:00', + tz='US/Central')], + 'foo': [1, 2]}, + {'id': ['A', 'B'], + 'time': [Timestamp('2012-03-01 12:00:00', + tz='Europe/London'), + Timestamp('2012-02-01 14:00:00', + tz='US/Central')], + 'foo': [3, 2]}) +]) +def test_first_last_tz(data, expected_first, expected_last): + # GH15884 + # Test that the timezone is retained when calling first + # or last on groupby with as_index=False + + df = DataFrame(data) + + result = df.groupby('id', as_index=False).first() + expected = DataFrame(expected_first) + cols = ['id', 'time', 'foo'] + assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby('id', as_index=False)['time'].first() + assert_frame_equal(result, expected[['id', 'time']]) + + result = df.groupby('id', as_index=False).last() + expected = DataFrame(expected_last) + cols = ['id', 'time', 'foo'] + assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby('id', as_index=False)['time'].last() + assert_frame_equal(result, expected[['id', 'time']]) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex From f901431f6faf1d829cfd33969bc4813f9add8505 Mon Sep 17 00:00:00 2001 From: Jacopo Rota Date: Sat, 23 Jun 2018 01:04:38 +0200 Subject: [PATCH 082/113] add test case when to_csv argument is sys.stdout (#21572) --- pandas/tests/io/formats/test_to_csv.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index dfa3751bff57a..36c4ae547ad4e 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -285,3 +285,18 @@ def test_to_csv_string_array_utf8(self): df.to_csv(path, encoding='utf-8') with open(path, 'r') as f: assert f.read() == expected_utf8 + + @tm.capture_stdout + def test_to_csv_stdout_file(self): 
+ # GH 21561 + df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']], + columns=['name_1', 'name_2']) + expected_ascii = '''\ +,name_1,name_2 +0,foo,bar +1,baz,qux +''' + df.to_csv(sys.stdout, encoding='ascii') + output = sys.stdout.getvalue() + assert output == expected_ascii + assert not sys.stdout.closed From 8794ef2c44f8659ace718ad5d93fd4943f708116 Mon Sep 17 00:00:00 2001 From: Vu Le Date: Sat, 23 Jun 2018 06:07:21 +0700 Subject: [PATCH 083/113] BUG: Fix json_normalize throwing TypeError (#21536) (#21540) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/io/json/normalize.py | 8 +++++++- pandas/tests/io/json/test_normalize.py | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index c781f45715bd4..ff872cfc6b3ef 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -71,7 +71,7 @@ Bug Fixes **I/O** - Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) -- +- Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`) - **Plotting** diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index b845a43b9ca9e..2004a24c2ec5a 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -170,6 +170,11 @@ def json_normalize(data, record_path=None, meta=None, 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH + >>> data = {'A': [1, 2]} + >>> json_normalize(data, 'A', record_prefix='Prefix.') + Prefix.0 + 0 1 + 1 2 """ def _pull_field(js, spec): result = js @@ -259,7 +264,8 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result.rename(columns=lambda x: record_prefix + x, inplace=True) + result = result.rename( + columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 395c2c90767d3..200a853c48900 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -123,6 +123,12 @@ def test_simple_normalize_with_separator(self, deep_nested): 'country', 'states_name']).sort_values() assert result.columns.sort_values().equals(expected) + def test_value_array_record_prefix(self): + # GH 21536 + result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.') + expected = DataFrame([[1], [2]], columns=['Prefix.0']) + tm.assert_frame_equal(result, expected) + def test_more_deeply_nested(self, deep_nested): result = json_normalize(deep_nested, ['states', 'cities'], From 6eb0341173759e12499ec1ebe763dcdea4ab3a0d Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 23 Jun 2018 00:36:44 +0100 Subject: [PATCH 084/113] DOC: updated the Series.str.rsplit and Series.str.split docstrings (#21026) --- pandas/core/strings.py | 244 +++++++++++++++++++++-------------------- 1 file changed, 123 insertions(+), 121 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 08239ae4dae20..b27cfdfe3f1bd 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1343,108 +1343,7 @@ def str_pad(arr, width, side='left', fillchar=' '): def str_split(arr, pat=None, n=None): - """ - Split strings around given separator/delimiter. 
- - Split each string in the caller's values by given - pattern, propagating NaN values. Equivalent to :meth:`str.split`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. - n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - See Also - -------- - str.split : Standard library version of this method. - Series.str.get_dummies : Split each string into dummy variables. - Series.str.partition : Split string on a separator, returning - the before, separator, and after components. - - Examples - -------- - >>> s = pd.Series(["this is good text", "but this is even better"]) - - By default, split will return an object of the same size - having lists containing the split elements - - >>> s.str.split() - 0 [this, is, good, text] - 1 [but, this, is, even, better] - dtype: object - >>> s.str.split("random") - 0 [this is good text] - 1 [but this is even better] - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. - - For Series object, output return type is DataFrame. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is good text None - 1 but this is even better - >>> s.str.split(" is ", expand=True) - 0 1 - 0 this good text - 1 but this even better - - For Index object, output return type is MultiIndex. - - >>> i = pd.Index(["ba 100 001", "ba 101 002", "ba 102 003"]) - >>> i.str.split(expand=True) - MultiIndex(levels=[['ba'], ['100', '101', '102'], ['001', '002', '003']], - labels=[[0, 0, 0], [0, 1, 2], [0, 1, 2]]) - - Parameter `n` can be used to limit the number of splits in the output. - - >>> s.str.split("is", n=1) - 0 [th, is good text] - 1 [but th, is even better] - dtype: object - >>> s.str.split("is", n=1, expand=True) - 0 1 - 0 th is good text - 1 but th is even better - - If NaN is present, it is propagated throughout the columns - during the split. - - >>> s = pd.Series(["this is good text", "but this is even better", np.nan]) - >>> s.str.split(n=3, expand=True) - 0 1 2 3 - 0 this is good text - 1 but this is even better - 2 NaN NaN NaN NaN - """ if pat is None: if n is None or n == 0: n = -1 @@ -1464,25 +1363,7 @@ def str_split(arr, pat=None, n=None): def str_rsplit(arr, pat=None, n=None): - """ - Split each string in the Series/Index by the given delimiter - string, starting at the end of the string and working to the front. - Equivalent to :meth:`str.rsplit`. - Parameters - ---------- - pat : string, default None - Separator to split on. 
If None, splits on whitespace - n : int, default -1 (all) - None, 0 and -1 will be interpreted as return all splits - expand : bool, default False - * If True, return DataFrame/MultiIndex expanding dimensionality. - * If False, return Series/Index. - - Returns - ------- - split : Series/Index or DataFrame/MultiIndex of objects - """ if n is None or n == 0: n = -1 f = lambda x: x.rsplit(pat, n) @@ -2325,12 +2206,133 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): res = Series(res, index=data.index, name=self._orig.name) return res - @copy(str_split) + _shared_docs['str_split'] = (""" + Split strings around given separator/delimiter. + + Splits the string in the Series/Index from the %(side)s, + at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the splitted strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + Examples + -------- + >>> s = pd.Series(["this is a regular sentence", + "https://docs.python.org/3/tutorial/index.html", np.nan]) + + In the default setting, the string is split by whitespace. + + >>> s.str.split() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. + + >>> s.str.rsplit() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat = "/") + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. 
+ + >>> s.str.split(expand=True) + 0 1 2 3 + 0 this is a regular + 1 https://docs.python.org/3/tutorial/index.html None None None + 2 NaN NaN NaN NaN \ + + 4 + 0 sentence + 1 None + 2 NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. + + >>> s.str.rsplit("/", n=1, expand=True) + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + """) + + @Appender(_shared_docs['str_split'] % { + 'side': 'beginning', + 'method': 'split'}) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) return self._wrap_result(result, expand=expand) - @copy(str_rsplit) + @Appender(_shared_docs['str_split'] % { + 'side': 'end', + 'method': 'rsplit'}) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._data, pat, n=n) return self._wrap_result(result, expand=expand) From 289f3adaaa91a8cf491b11fd72ddda4852a23611 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 23 Jun 2018 05:27:58 -0600 Subject: [PATCH 085/113] TST: Use multiple instances of parametrize instead of product (#21602) --- pandas/tests/dtypes/test_dtypes.py | 14 +-- pandas/tests/frame/test_rank.py | 124 +++++++++++----------- pandas/tests/groupby/test_function.py | 7 +- pandas/tests/groupby/test_whitelist.py | 12 +-- pandas/tests/reshape/test_concat.py | 11 +- pandas/tests/sparse/series/test_series.py | 20 ++-- pandas/tests/test_multilevel.py | 80 +++++++------- pandas/tests/test_resample.py | 98 +++++++++-------- pandas/tests/test_window.py | 7 +- 9 files changed, 185 insertions(+), 188 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index cc833af03ae66..eee53a2fcac6a 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -2,8 +2,6 @@ import re import pytest -from itertools import product - import numpy as np import pandas as pd from pandas import ( @@ -233,12 +231,14 @@ def test_dst(self): assert is_datetimetz(s2) assert s1.dtype == s2.dtype - def test_parser(self): + @pytest.mark.parametrize('tz', ['UTC', 'US/Eastern']) + @pytest.mark.parametrize('constructor', ['M8', 'datetime64']) + def test_parser(self, tz, constructor): # pr #11245 - for tz, constructor in product(('UTC', 'US/Eastern'), - ('M8', 'datetime64')): - assert (DatetimeTZDtype('%s[ns, %s]' % (constructor, tz)) == - DatetimeTZDtype('ns', tz)) + dtz_str = '{con}[ns, {tz}]'.format(con=constructor, tz=tz) + result = DatetimeTZDtype(dtz_str) + expected = DatetimeTZDtype('ns', tz) + assert result == expected def test_empty(self): dt = DatetimeTZDtype() diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index b8ba408b54715..a1210f1ed54e4 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -10,7 +10,6 @@ from pandas.util.testing import assert_frame_equal from pandas.tests.frame.common import TestData from pandas import Series, DataFrame -from pandas.compat import product class TestRank(TestData): @@ -26,6 +25,13 @@ class TestRank(TestData): 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), } + @pytest.fixture(params=['average', 'min', 'max', 'first', 'dense']) + def method(self, request): + """ + Fixture for trying all rank methods + """ + return request.param + def test_rank(self): rankdata = pytest.importorskip('scipy.stats.rankdata') @@ -217,34 +223,35 @@ def test_rank_methods_frame(self): expected = expected.astype('float64') 
tm.assert_frame_equal(result, expected) - def test_rank_descending(self): - dtypes = ['O', 'f8', 'i8'] + @pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) + def test_rank_descending(self, method, dtype): - for dtype, method in product(dtypes, self.results): - if 'i' in dtype: - df = self.df.dropna() - else: - df = self.df.astype(dtype) + if 'i' in dtype: + df = self.df.dropna() + else: + df = self.df.astype(dtype) - res = df.rank(ascending=False) - expected = (df.max() - df).rank() - assert_frame_equal(res, expected) + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + assert_frame_equal(res, expected) - if method == 'first' and dtype == 'O': - continue + if method == 'first' and dtype == 'O': + return - expected = (df.max() - df).rank(method=method) + expected = (df.max() - df).rank(method=method) - if dtype != 'O': - res2 = df.rank(method=method, ascending=False, - numeric_only=True) - assert_frame_equal(res2, expected) + if dtype != 'O': + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) - res3 = df.rank(method=method, ascending=False, - numeric_only=False) - assert_frame_equal(res3, expected) + res3 = df.rank(method=method, ascending=False, + numeric_only=False) + assert_frame_equal(res3, expected) - def test_rank_2d_tie_methods(self): + @pytest.mark.parametrize('axis', [0, 1]) + @pytest.mark.parametrize('dtype', [None, object]) + def test_rank_2d_tie_methods(self, method, axis, dtype): df = self.df def _check2d(df, expected, method='average', axis=0): @@ -257,43 +264,38 @@ def _check2d(df, expected, method='average', axis=0): result = df.rank(method=method, axis=axis) assert_frame_equal(result, exp_df) - dtypes = [None, object] disabled = set([(object, 'first')]) - results = self.results - - for method, axis, dtype in product(results, [0, 1], dtypes): - if (dtype, method) in disabled: - continue - frame = df if dtype is None else df.astype(dtype) - _check2d(frame, results[method], method=method, axis=axis) - - -@pytest.mark.parametrize( - "method,exp", [("dense", - [[1., 1., 1.], - [1., 0.5, 2. / 3], - [1., 0.5, 1. / 3]]), - ("min", - [[1. / 3, 1., 1.], - [1. / 3, 1. / 3, 2. / 3], - [1. / 3, 1. / 3, 1. / 3]]), - ("max", - [[1., 1., 1.], - [1., 2. / 3, 2. / 3], - [1., 2. / 3, 1. / 3]]), - ("average", - [[2. / 3, 1., 1.], - [2. / 3, 0.5, 2. / 3], - [2. / 3, 0.5, 1. / 3]]), - ("first", - [[1. / 3, 1., 1.], - [2. / 3, 1. / 3, 2. / 3], - [3. / 3, 2. / 3, 1. / 3]])]) -def test_rank_pct_true(method, exp): - # see gh-15630. - - df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) - result = df.rank(method=method, pct=True) - - expected = DataFrame(exp) - tm.assert_frame_equal(result, expected) + if (dtype, method) in disabled: + return + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, self.results[method], method=method, axis=axis) + + @pytest.mark.parametrize( + "method,exp", [("dense", + [[1., 1., 1.], + [1., 0.5, 2. / 3], + [1., 0.5, 1. / 3]]), + ("min", + [[1. / 3, 1., 1.], + [1. / 3, 1. / 3, 2. / 3], + [1. / 3, 1. / 3, 1. / 3]]), + ("max", + [[1., 1., 1.], + [1., 2. / 3, 2. / 3], + [1., 2. / 3, 1. / 3]]), + ("average", + [[2. / 3, 1., 1.], + [2. / 3, 0.5, 2. / 3], + [2. / 3, 0.5, 1. / 3]]), + ("first", + [[1. / 3, 1., 1.], + [2. / 3, 1. / 3, 2. / 3], + [3. / 3, 2. / 3, 1. / 3]])]) + def test_rank_pct_true(self, method, exp): + # see gh-15630. 
+ + df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) + result = df.rank(method=method, pct=True) + + expected = DataFrame(exp) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index f1d678db4ff7f..9df362a8e132f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -778,9 +778,10 @@ def test_frame_describe_unstacked_format(): # nunique # -------------------------------- -@pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6), - (10, 100, 1000))) -@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2)) +@pytest.mark.parametrize('n', 10 ** np.arange(2, 6)) +@pytest.mark.parametrize('m', [10, 100, 1000]) +@pytest.mark.parametrize('sort', [False, True]) +@pytest.mark.parametrize('dropna', [False, True]) def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 8d6e074881cbb..f4a58b9cbe61b 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -8,7 +8,6 @@ import numpy as np from pandas import DataFrame, Series, compat, date_range, Index, MultiIndex from pandas.util import testing as tm -from pandas.compat import lrange, product AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] @@ -175,12 +174,11 @@ def raw_frame(): return raw_frame -@pytest.mark.parametrize( - "op, level, axis, skipna, sort", - product(AGG_FUNCTIONS, - lrange(2), lrange(2), - [True, False], - [True, False])) +@pytest.mark.parametrize('op', AGG_FUNCTIONS) +@pytest.mark.parametrize('level', [0, 1]) +@pytest.mark.parametrize('axis', [0, 1]) +@pytest.mark.parametrize('skipna', [True, False]) +@pytest.mark.parametrize('sort', [True, False]) def test_regression_whitelist_methods( raw_frame, op, level, axis, skipna, sort): diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index dea305d4b3fee..8d819f9926abb 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1,5 +1,5 @@ from warnings import catch_warnings -from itertools import combinations, product +from itertools import combinations import datetime as dt import dateutil @@ -941,10 +941,11 @@ def test_append_different_columns_types(self, df_columns, series_index): columns=combined_columns) assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "index_can_append, index_cannot_append_with_other", - product(indexes_can_append, indexes_cannot_append_with_other), - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize('index_can_append', indexes_can_append, + ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize('index_cannot_append_with_other', + indexes_cannot_append_with_other, + ids=lambda x: x.__class__.__name__) def test_append_different_columns_types_raises( self, index_can_append, index_cannot_append_with_other): # GH18359 diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index eb63c87820070..921c30234660f 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -23,8 +23,6 @@ from pandas.core.sparse.api import SparseSeries from pandas.tests.series.test_api import SharedWithSparse -from itertools import product - def _test_data1(): # nan-based @@ -985,16 +983,16 @@ def 
test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) - @pytest.mark.parametrize('deep,fill_values', [([True, False], - [0, 1, np.nan, None])]) - def test_memory_usage_deep(self, deep, fill_values): - for deep, fill_value in product(deep, fill_values): - sparse_series = SparseSeries(fill_values, fill_value=fill_value) - dense_series = Series(fill_values) - sparse_usage = sparse_series.memory_usage(deep=deep) - dense_usage = dense_series.memory_usage(deep=deep) + @pytest.mark.parametrize('deep', [True, False]) + @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + def test_memory_usage_deep(self, deep, fill_value): + values = [0, 1, np.nan, None] + sparse_series = SparseSeries(values, fill_value=fill_value) + dense_series = Series(values) + sparse_usage = sparse_series.memory_usage(deep=deep) + dense_usage = dense_series.memory_usage(deep=deep) - assert sparse_usage < dense_usage + assert sparse_usage < dense_usage class TestSparseHandlingMultiIndexes(object): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 79e05c90a21b0..3caee2b44c579 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -20,6 +20,9 @@ import pandas as pd import pandas._libs.index as _index +AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', + 'std', 'var', 'sem'] + class Base(object): @@ -1389,60 +1392,57 @@ def test_count(self): pytest.raises(KeyError, series.count, 'x') pytest.raises(KeyError, frame.count, level='x') - AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', - 'mad', 'std', 'var', 'sem'] - + @pytest.mark.parametrize('op', AGG_FUNCTIONS) + @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize('skipna', [True, False]) @pytest.mark.parametrize('sort', [True, False]) - def test_series_group_min_max(self, sort): + def test_series_group_min_max(self, op, level, skipna, sort): # GH 17537 - for op, level, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2), - [False, True]): - grouped = self.series.groupby(level=level, sort=sort) - aggf = lambda x: getattr(x, op)(skipna=skipna) - # skipna=True - leftside = grouped.agg(aggf) - rightside = getattr(self.series, op)(level=level, skipna=skipna) - if sort: - rightside = rightside.sort_index(level=level) - tm.assert_series_equal(leftside, rightside) - + grouped = self.series.groupby(level=level, sort=sort) + # skipna=True + leftside = grouped.agg(lambda x: getattr(x, op)(skipna=skipna)) + rightside = getattr(self.series, op)(level=level, skipna=skipna) + if sort: + rightside = rightside.sort_index(level=level) + tm.assert_series_equal(leftside, rightside) + + @pytest.mark.parametrize('op', AGG_FUNCTIONS) + @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize('axis', [0, 1]) + @pytest.mark.parametrize('skipna', [True, False]) @pytest.mark.parametrize('sort', [True, False]) - def test_frame_group_ops(self, sort): + def test_frame_group_ops(self, op, level, axis, skipna, sort): # GH 17537 self.frame.iloc[1, [1, 2]] = np.nan self.frame.iloc[7, [0, 1]] = np.nan - for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, - lrange(2), lrange(2), - [False, True]): - - if axis == 0: - frame = self.frame - else: - frame = self.frame.T + if axis == 0: + frame = self.frame + else: + frame = self.frame.T - grouped = frame.groupby(level=level, axis=axis, sort=sort) + grouped = frame.groupby(level=level, axis=axis, sort=sort) - pieces = [] + pieces = [] - def aggf(x): 
- pieces.append(x) - return getattr(x, op)(skipna=skipna, axis=axis) + def aggf(x): + pieces.append(x) + return getattr(x, op)(skipna=skipna, axis=axis) - leftside = grouped.agg(aggf) - rightside = getattr(frame, op)(level=level, axis=axis, - skipna=skipna) - if sort: - rightside = rightside.sort_index(level=level, axis=axis) - frame = frame.sort_index(level=level, axis=axis) + leftside = grouped.agg(aggf) + rightside = getattr(frame, op)(level=level, axis=axis, + skipna=skipna) + if sort: + rightside = rightside.sort_index(level=level, axis=axis) + frame = frame.sort_index(level=level, axis=axis) - # for good measure, groupby detail - level_index = frame._get_axis(axis).levels[level] + # for good measure, groupby detail + level_index = frame._get_axis(axis).levels[level] - tm.assert_index_equal(leftside._get_axis(axis), level_index) - tm.assert_index_equal(rightside._get_axis(axis), level_index) + tm.assert_index_equal(leftside._get_axis(axis), level_index) + tm.assert_index_equal(rightside._get_axis(axis), level_index) - tm.assert_frame_equal(leftside, rightside) + tm.assert_frame_equal(leftside, rightside) def test_stat_op_corner(self): obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 6f0ad0535c6b4..60f23309b11d9 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -17,7 +17,7 @@ from pandas import (Series, DataFrame, Panel, Index, isna, notna, Timestamp) -from pandas.compat import range, lrange, zip, product, OrderedDict +from pandas.compat import range, lrange, zip, OrderedDict from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby.groupby import DataError import pandas.core.common as com @@ -1951,30 +1951,32 @@ def test_resample_nunique_with_date_gap(self): assert_series_equal(results[0], results[2]) assert_series_equal(results[0], results[3]) - def test_resample_group_info(self): # GH10914 - for n, k in product((10000, 100000), (10, 100, 1000)): - dr = date_range(start='2015-08-27', periods=n // 10, freq='T') - ts = Series(np.random.randint(0, n // k, n).astype('int64'), - index=np.random.choice(dr, n)) + @pytest.mark.parametrize('n', [10000, 100000]) + @pytest.mark.parametrize('k', [10, 100, 1000]) + def test_resample_group_info(self, n, k): + # GH10914 + dr = date_range(start='2015-08-27', periods=n // 10, freq='T') + ts = Series(np.random.randint(0, n // k, n).astype('int64'), + index=np.random.choice(dr, n)) - left = ts.resample('30T').nunique() - ix = date_range(start=ts.index.min(), end=ts.index.max(), - freq='30T') + left = ts.resample('30T').nunique() + ix = date_range(start=ts.index.min(), end=ts.index.max(), + freq='30T') - vals = ts.values - bins = np.searchsorted(ix.values, ts.index, side='right') + vals = ts.values + bins = np.searchsorted(ix.values, ts.index, side='right') - sorter = np.lexsort((vals, bins)) - vals, bins = vals[sorter], bins[sorter] + sorter = np.lexsort((vals, bins)) + vals, bins = vals[sorter], bins[sorter] - mask = np.r_[True, vals[1:] != vals[:-1]] - mask |= np.r_[True, bins[1:] != bins[:-1]] + mask = np.r_[True, vals[1:] != vals[:-1]] + mask |= np.r_[True, bins[1:] != bins[:-1]] - arr = np.bincount(bins[mask] - 1, - minlength=len(ix)).astype('int64', copy=False) - right = Series(arr, index=ix) + arr = np.bincount(bins[mask] - 1, + minlength=len(ix)).astype('int64', copy=False) + right = Series(arr, index=ix) - assert_series_equal(left, right) + assert_series_equal(left, right) def test_resample_size(self): n 
= 10000 @@ -2323,28 +2325,25 @@ def test_annual_upsample(self): method='ffill') assert_series_equal(result, expected) - def test_quarterly_upsample(self): - targets = ['D', 'B', 'M'] - - for month in MONTHS: - ts = _simple_pts('1/1/1990', '12/31/1995', freq='Q-%s' % month) - - for targ, conv in product(targets, ['start', 'end']): - result = ts.resample(targ, convention=conv).ffill() - expected = result.to_timestamp(targ, how=conv) - expected = expected.asfreq(targ, 'ffill').to_period() - assert_series_equal(result, expected) - - def test_monthly_upsample(self): - targets = ['D', 'B'] + @pytest.mark.parametrize('month', MONTHS) + @pytest.mark.parametrize('target', ['D', 'B', 'M']) + @pytest.mark.parametrize('convention', ['start', 'end']) + def test_quarterly_upsample(self, month, target, convention): + freq = 'Q-{month}'.format(month=month) + ts = _simple_pts('1/1/1990', '12/31/1995', freq=freq) + result = ts.resample(target, convention=convention).ffill() + expected = result.to_timestamp(target, how=convention) + expected = expected.asfreq(target, 'ffill').to_period() + assert_series_equal(result, expected) + @pytest.mark.parametrize('target', ['D', 'B']) + @pytest.mark.parametrize('convention', ['start', 'end']) + def test_monthly_upsample(self, target, convention): ts = _simple_pts('1/1/1990', '12/31/1995', freq='M') - - for targ, conv in product(targets, ['start', 'end']): - result = ts.resample(targ, convention=conv).ffill() - expected = result.to_timestamp(targ, how=conv) - expected = expected.asfreq(targ, 'ffill').to_period() - assert_series_equal(result, expected) + result = ts.resample(target, convention=convention).ffill() + expected = result.to_timestamp(target, how=convention) + expected = expected.asfreq(target, 'ffill').to_period() + assert_series_equal(result, expected) def test_resample_basic(self): # GH3609 @@ -2455,17 +2454,16 @@ def test_fill_method_and_how_upsample(self): both = s.resample('M').ffill().resample('M').last().astype('int64') assert_series_equal(last, both) - def test_weekly_upsample(self): - targets = ['D', 'B'] - - for day in DAYS: - ts = _simple_pts('1/1/1990', '12/31/1995', freq='W-%s' % day) - - for targ, conv in product(targets, ['start', 'end']): - result = ts.resample(targ, convention=conv).ffill() - expected = result.to_timestamp(targ, how=conv) - expected = expected.asfreq(targ, 'ffill').to_period() - assert_series_equal(result, expected) + @pytest.mark.parametrize('day', DAYS) + @pytest.mark.parametrize('target', ['D', 'B']) + @pytest.mark.parametrize('convention', ['start', 'end']) + def test_weekly_upsample(self, day, target, convention): + freq = 'W-{day}'.format(day=day) + ts = _simple_pts('1/1/1990', '12/31/1995', freq=freq) + result = ts.resample(target, convention=convention).ffill() + expected = result.to_timestamp(target, how=convention) + expected = expected.asfreq(target, 'ffill').to_period() + assert_series_equal(result, expected) def test_resample_to_timestamps(self): ts = _simple_pts('1/1/1990', '12/31/1995', freq='M') diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index cfd88f41f855e..78d1fa84cc5db 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2105,10 +2105,9 @@ def _non_null_values(x): (mean_x * mean_y)) @pytest.mark.slow - @pytest.mark.parametrize( - 'min_periods, adjust, ignore_na', product([0, 1, 2, 3, 4], - [True, False], - [False, True])) + @pytest.mark.parametrize('min_periods', [0, 1, 2, 3, 4]) + @pytest.mark.parametrize('adjust', [True, False]) + 
@pytest.mark.parametrize('ignore_na', [True, False]) def test_ewm_consistency(self, min_periods, adjust, ignore_na): def _weights(s, com, adjust, ignore_na): if isinstance(s, DataFrame): From a65ea90cc8ad974f556b76ee229c58cdd051c49d Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 23 Jun 2018 12:12:18 -0400 Subject: [PATCH 086/113] MyPy cleanup and absolute imports in pandas.core.dtypes.common (#21008) --- pandas/core/arrays/base.py | 2 +- pandas/core/base.py | 2 +- pandas/core/dtypes/common.py | 24 +++++++++++++----------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ce87c0a8b0c5a..30949ca6d1d6b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -195,13 +195,13 @@ def __setitem__(self, key, value): ) def __len__(self): + # type: () -> int """Length of this array Returns ------- length : int """ - # type: () -> int raise AbstractMethodError(self) def __iter__(self): diff --git a/pandas/core/base.py b/pandas/core/base.py index c331ead8d2fef..6625a3bbe97d7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -788,6 +788,7 @@ def base(self): @property def _ndarray_values(self): + # type: () -> np.ndarray """The data as an ndarray, possibly losing information. The expectation is that this is cheap to compute, and is primarily @@ -795,7 +796,6 @@ def _ndarray_values(self): - categorical -> codes """ - # type: () -> np.ndarray if is_extension_array_dtype(self): return self.values._ndarray_values return self.values diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c45838e6040a9..05f82c67ddb8b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -5,17 +5,19 @@ PY3, PY36) from pandas._libs import algos, lib from pandas._libs.tslibs import conversion -from .dtypes import (CategoricalDtype, CategoricalDtypeType, - DatetimeTZDtype, DatetimeTZDtypeType, - PeriodDtype, PeriodDtypeType, - IntervalDtype, IntervalDtypeType, - ExtensionDtype, PandasExtensionDtype) -from .generic import (ABCCategorical, ABCPeriodIndex, - ABCDatetimeIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, - ABCIndexClass, ABCDateOffset) -from .inference import is_string_like, is_list_like -from .inference import * # noqa +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, + DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, IntervalDtype, + IntervalDtypeType, ExtensionDtype, PandasExtensionDtype) +from pandas.core.dtypes.generic import ( + ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, + ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass, + ABCDateOffset) +from pandas.core.dtypes.inference import ( # noqa:F401 + is_bool, is_integer, is_hashable, is_iterator, is_float, + is_dict_like, is_scalar, is_string_like, is_list_like, is_number, + is_file_like, is_re, is_re_compilable, is_sequence, is_nested_list_like, + is_named_tuple, is_array_like, is_decimal, is_complex, is_interval) _POSSIBLY_CAST_DTYPES = set([np.dtype(t).name From 13a3d5abc479426558a53f4a27e84b4365880ca0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 25 Jun 2018 03:56:55 -0700 Subject: [PATCH 087/113] remove unused cimport (#21619) --- pandas/_libs/hashtable_class_helper.pxi.in | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b92eb0e651276..4d2b6f845eb71 100644 --- 
a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,8 +4,6 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -from missing cimport is_null_datetimelike - #---------------------------------------------------------------------- # VectorData From afdcac1411f697c8c0cd4b879dc5b3eef9cd8c26 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Jun 2018 09:57:43 -0500 Subject: [PATCH 088/113] CI: Test against Python 3.7 (#21604) --- .travis.yml | 5 +++++ ci/travis-37.yaml | 14 ++++++++++++++ doc/source/install.rst | 2 +- doc/source/whatsnew/v0.23.2.txt | 6 ++++++ pandas/compat/__init__.py | 9 +++++---- pandas/tests/tseries/offsets/test_offsets.py | 10 ++++++++-- setup.py | 1 + 7 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 ci/travis-37.yaml diff --git a/.travis.yml b/.travis.yml index 4e25380a7d941..2d2a0bc019c80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,11 @@ matrix: language: generic env: - JOB="3.5, OSX" ENV_FILE="ci/travis-35-osx.yaml" TEST_ARGS="--skip-slow --skip-network" + + - dist: trusty + env: + - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - dist: trusty env: - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml new file mode 100644 index 0000000000000..8b255c9e6ec72 --- /dev/null +++ b/ci/travis-37.yaml @@ -0,0 +1,14 @@ +name: pandas +channels: + - defaults + - conda-forge + - c3i_test +dependencies: + - python=3.7 + - cython + - numpy + - python-dateutil + - nomkl + - pytz + - pytest + - pytest-xdist diff --git a/doc/source/install.rst b/doc/source/install.rst index 87d1b63914635..fa6b9f4fc7f4d 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -43,7 +43,7 @@ For more information, see the `Python 3 statement`_ and the `Porting to Python 3 Python version support ---------------------- -Officially Python 2.7, 3.5, and 3.6. +Officially Python 2.7, 3.5, 3.6, and 3.7. Installing pandas ----------------- diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ff872cfc6b3ef..d163ad8564efb 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -6,6 +6,12 @@ v0.23.2 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. +.. note:: + + Pandas 0.23.2 is first pandas release that's compatible with + Python 3.7 (:issue:`20552`) + + .. 
contents:: What's new in v0.23.2
    :local:
    :backlinks: none
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 5ae22694d0da7..28a55133e68aa 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -40,10 +40,11 @@ from collections import namedtuple
 PY2 = sys.version_info[0] == 2
-PY3 = (sys.version_info[0] >= 3)
-PY35 = (sys.version_info >= (3, 5))
-PY36 = (sys.version_info >= (3, 6))
-PYPY = (platform.python_implementation() == 'PyPy')
+PY3 = sys.version_info[0] >= 3
+PY35 = sys.version_info >= (3, 5)
+PY36 = sys.version_info >= (3, 6)
+PY37 = sys.version_info >= (3, 7)
+PYPY = platform.python_implementation() == 'PyPy'
 
 try:
     import __builtin__ as builtins
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 66cb9baeb9357..74bc08ee9649b 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -591,7 +591,10 @@ def test_repr(self):
         assert repr(self.offset) == '<BusinessDay>'
         assert repr(self.offset2) == '<2 * BusinessDays>'
 
-        expected = '<BusinessDay: offset=datetime.timedelta(1)>'
+        if compat.PY37:
+            expected = '<BusinessDay: offset=datetime.timedelta(days=1)>'
+        else:
+            expected = '<BusinessDay: offset=datetime.timedelta(1)>'
         assert repr(self.offset + timedelta(1)) == expected
 
     def test_with_offset(self):
@@ -1651,7 +1654,10 @@ def test_repr(self):
         assert repr(self.offset) == '<CustomBusinessDay>'
         assert repr(self.offset2) == '<2 * CustomBusinessDays>'
 
-        expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>'
+        if compat.PY37:
+            expected = '<CustomBusinessDay: offset=datetime.timedelta(days=1)>'
+        else:
+            expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>'
         assert repr(self.offset + timedelta(1)) == expected
 
     def test_with_offset(self):
diff --git a/setup.py b/setup.py
index d6890a08b09d0..dd026bd611727 100755
--- a/setup.py
+++ b/setup.py
@@ -217,6 +217,7 @@ def build_extensions(self):
     'Programming Language :: Python :: 2.7',
     'Programming Language :: Python :: 3.5',
     'Programming Language :: Python :: 3.6',
+    'Programming Language :: Python :: 3.7',
     'Programming Language :: Cython',
     'Topic :: Scientific/Engineering']

From 9f6c1540f63a92efb8880fe220216cc0d5474c22 Mon Sep 17 00:00:00 2001
From: aberres
Date: Tue, 26 Jun 2018 00:22:23 +0200
Subject: [PATCH 089/113] DOC: Do no use 'type' as first word when specifying a return type (#21622) (#21623)

---
 pandas/core/frame.py | 2 +-
 pandas/core/generic.py | 28 ++++++++++++++--------------
 pandas/core/groupby/groupby.py | 2 +-
 pandas/core/sparse/series.py | 2 +-
 pandas/core/window.py | 4 ++--
 pandas/io/packers.py | 2 +-
 pandas/io/pickle.py | 2 +-
 pandas/io/pytables.py | 2 +-
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 74bb2abc27c4b..34d3eb0a6db73 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4675,7 +4675,7 @@ def swaplevel(self, i=-2, j=-1, axis=0):
         Returns
         -------
-        swapped : type of caller (new object)
+        swapped : same type as caller (new object)
 
         .. versionchanged:: 0.18.1
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 04ba0b5de3f7f..4efdd3812accd 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -800,7 +800,7 @@ def swaplevel(self, i=-2, j=-1, axis=0):
         Returns
         -------
-        swapped : type of caller (new object)
+        swapped : same type as caller (new object)
 
         .. 
versionchanged:: 0.18.1 @@ -1073,7 +1073,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): Returns ------- - renamed : type of caller or None if inplace=True + renamed : same type as caller or None if inplace=True See Also -------- @@ -2468,7 +2468,7 @@ def get(self, key, default=None): Returns ------- - value : type of items contained in object + value : same type as items contained in object """ try: return self[key] @@ -2768,7 +2768,7 @@ def __delitem__(self, key): Returns ------- - taken : type of caller + taken : same type as caller An array-like containing the elements taken from the object. See Also @@ -2824,7 +2824,7 @@ def _take(self, indices, axis=0, is_copy=True): Returns ------- - taken : type of caller + taken : same type as caller An array-like containing the elements taken from the object. See Also @@ -3033,7 +3033,7 @@ def select(self, crit, axis=0): Returns ------- - selection : type of caller + selection : same type as caller """ warnings.warn("'select' is deprecated and will be removed in a " "future release. You can use " @@ -3924,7 +3924,7 @@ def head(self, n=5): Returns ------- - obj_head : type of caller + obj_head : same type as caller The first `n` rows of the caller object. See Also @@ -4447,7 +4447,7 @@ def _consolidate(self, inplace=False): Returns ------- - consolidated : type of caller + consolidated : same type as caller """ inplace = validate_bool_kwarg(inplace, 'inplace') if inplace: @@ -4916,7 +4916,7 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): Returns ------- - casted : type of caller + casted : same type as caller Examples -------- @@ -6691,7 +6691,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, Returns ------- - converted : type of caller + converted : same type as caller Examples -------- @@ -6772,7 +6772,7 @@ def at_time(self, time, asof=False): Returns ------- - values_at_time : type of caller + values_at_time : same type as caller Examples -------- @@ -6826,7 +6826,7 @@ def between_time(self, start_time, end_time, include_start=True, Returns ------- - values_between_time : type of caller + values_between_time : same type as caller Examples -------- @@ -7145,7 +7145,7 @@ def first(self, offset): Returns ------- - subset : type of caller + subset : same type as caller See Also -------- @@ -7209,7 +7209,7 @@ def last(self, offset): Returns ------- - subset : type of caller + subset : same type as caller See Also -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0bbdfbbe52ac4..c69d7f43de8ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -867,7 +867,7 @@ def get_group(self, name, obj=None): Returns ------- - group : type of obj + group : same type as obj """ if obj is None: obj = self._selected_obj diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 714cd09a27294..09d958059d355 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -398,7 +398,7 @@ def abs(self): Returns ------- - abs: type of caller + abs: same type as caller """ return self._constructor(np.abs(self.values), index=self.index).__finalize__(self) diff --git a/pandas/core/window.py b/pandas/core/window.py index 9d0f9dc4f75f9..f089e402261db 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -665,7 +665,7 @@ def _apply_window(self, mean=True, **kwargs): Returns ------- - y : type of input argument + y : same type as input argument """ window = self._prep_window(**kwargs) @@ -2139,7 +2139,7 @@ def 
_apply(self, func, **kwargs): Returns ------- - y : type of input argument + y : same type as input argument """ blocks, obj, index = self._create_blocks() diff --git a/pandas/io/packers.py b/pandas/io/packers.py index f9b1d1574d45c..03a5e8528f72d 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -178,7 +178,7 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): Returns ------- - obj : type of object stored in file + obj : same type as object stored in file """ path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index d27735fbca318..d347d76c33e0f 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -103,7 +103,7 @@ def read_pickle(path, compression='infer'): Returns ------- - unpickled : type of object stored in file + unpickled : same type as object stored in file See Also -------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index aad387e0cdd58..580c7923017e5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -687,7 +687,7 @@ def get(self, key): Returns ------- - obj : type of object stored in file + obj : same type as object stored in file """ group = self.get_node(key) if group is None: From c19017b465f4ef8e681535fcf6c84bb3217179b1 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 25 Jun 2018 23:24:30 +0100 Subject: [PATCH 090/113] CLN: make CategoricalIndex._create_categorical a classmethod (#21618) --- pandas/core/indexes/base.py | 3 ++- pandas/core/indexes/category.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4f140a6e77b2f..122f8662abb61 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1130,7 +1130,8 @@ def to_frame(self, index=True): """ from pandas import DataFrame - result = DataFrame(self._shallow_copy(), columns=[self.name or 0]) + name = self.name or 0 + result = DataFrame({name: self.values.copy()}) if index: result.index = self diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index fc669074758da..a2efe2c49c747 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -85,11 +85,11 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, name = data.name if isinstance(data, ABCCategorical): - data = cls._create_categorical(cls, data, categories, ordered, + data = cls._create_categorical(data, categories, ordered, dtype) elif isinstance(data, CategoricalIndex): data = data._data - data = cls._create_categorical(cls, data, categories, ordered, + data = cls._create_categorical(data, categories, ordered, dtype) else: @@ -99,7 +99,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if data is not None or categories is None: cls._scalar_data_error(data) data = [] - data = cls._create_categorical(cls, data, categories, ordered, + data = cls._create_categorical(data, categories, ordered, dtype) if copy: @@ -136,8 +136,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None, ordered=self.ordered) return CategoricalIndex(cat, name=name) - @staticmethod - def _create_categorical(self, data, categories=None, ordered=None, + @classmethod + def _create_categorical(cls, data, categories=None, ordered=None, dtype=None): """ *this is an internal non-public method* @@ -155,7 +155,7 @@ def _create_categorical(self, data, categories=None, ordered=None, ------- Categorical """ - if 
(isinstance(data, (ABCSeries, type(self))) and + if (isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data)): data = data.values @@ -179,7 +179,7 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, dtype=None, **kwargs): result = object.__new__(cls) - values = cls._create_categorical(cls, values, categories, ordered, + values = cls._create_categorical(values, categories, ordered, dtype=dtype) result._data = values result.name = name @@ -236,7 +236,7 @@ def _is_dtype_compat(self, other): if not is_list_like(values): values = [values] other = CategoricalIndex(self._create_categorical( - self, other, categories=self.categories, ordered=self.ordered)) + other, categories=self.categories, ordered=self.ordered)) if not other.isin(values).all(): raise TypeError("cannot append a non-category item to a " "CategoricalIndex") @@ -798,7 +798,7 @@ def _evaluate_compare(self, other): other = other._values elif isinstance(other, Index): other = self._create_categorical( - self, other._values, categories=self.categories, + other._values, categories=self.categories, ordered=self.ordered) if isinstance(other, (ABCCategorical, np.ndarray, From 5e4882ebfa31488fd431b6a2b7e3413afd0c6708 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 26 Jun 2018 00:29:57 +0200 Subject: [PATCH 091/113] PERF: do not check for label presence preventively (#21594) closes #21593 --- doc/source/whatsnew/v0.24.0.txt | 3 +- pandas/core/indexing.py | 36 +++---------------- .../indexes/datetimes/test_partial_slicing.py | 8 +++-- pandas/tests/indexing/test_multiindex.py | 9 ++++- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 90fc579ae69e5..a63276efc5b7c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,7 +17,7 @@ Other Enhancements - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`) +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) .. 
_whatsnew_0240.api_breaking: @@ -199,6 +199,7 @@ Indexing ^^^^^^^^ - The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) +- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError`` - consistently with the case of a flat :class:`Int64Index` - rather than falling back to positional indexing (:issue:`21593`) - - diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1f9fe5f947d0c..a69313a2d4a43 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,7 +13,6 @@ is_iterator, is_scalar, is_sparse, - _is_unorderable_exception, _ensure_platform_int) from pandas.core.dtypes.missing import isna, _infer_fill_value from pandas.errors import AbstractMethodError @@ -139,10 +138,7 @@ def _get_label(self, label, axis=None): # as its basically direct indexing # but will fail when the index is not present # see GH5667 - try: - return self.obj._xs(label, axis=axis) - except: - return self.obj[label] + return self.obj._xs(label, axis=axis) elif isinstance(label, tuple) and isinstance(label[axis], slice): raise IndexingError('no slices here, handle elsewhere') @@ -1797,9 +1793,8 @@ class _LocIndexer(_LocationIndexer): @Appender(_NDFrameIndexer._validate_key.__doc__) def _validate_key(self, key, axis): - ax = self.obj._get_axis(axis) - # valid for a label where all labels are in the index + # valid for a collection of labels (we check their presence later) # slice of labels (where start-end in labels) # slice of integers (only if in the labels) # boolean @@ -1807,32 +1802,11 @@ def _validate_key(self, key, axis): if isinstance(key, slice): return - elif com.is_bool_indexer(key): + if com.is_bool_indexer(key): return - elif not is_list_like_indexer(key): - - def error(): - if isna(key): - raise TypeError("cannot use label indexing with a null " - "key") - raise KeyError(u"the label [{key}] is not in the [{axis}]" - .format(key=key, - axis=self.obj._get_axis_name(axis))) - - try: - key = self._convert_scalar_indexer(key, axis) - except TypeError as e: - - # python 3 type errors should be raised - if _is_unorderable_exception(e): - error() - raise - except: - error() - - if not ax.contains(key): - error() + if not is_list_like_indexer(key): + self._convert_scalar_indexer(key, axis) def _is_scalar_access(self, key): # this is a shortcut accessor to both .loc and .iloc diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 4580d9fff31d5..e1e80e50e31f0 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -11,6 +11,8 @@ date_range, Index, Timedelta, Timestamp) from pandas.util import testing as tm +from pandas.core.indexing import IndexingError + class TestSlicing(object): def test_dti_slicing(self): @@ -313,12 +315,12 @@ def test_partial_slicing_with_multiindex(self): result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] tm.assert_series_equal(result, expected) - # this is a KeyError as we don't do partial string selection on - # multi-levels + # this is an IndexingError as we don't do partial string selection on + # multi-levels. 
def f(): df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] - pytest.raises(KeyError, f) + pytest.raises(IndexingError, f) # GH 4294 # partial slice on a series mi diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 43656a392e582..d2c4c8f5e149b 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -230,7 +230,8 @@ def test_iloc_getitem_multiindex(self): # corner column rs = mi_int.iloc[2, 2] with catch_warnings(record=True): - xp = mi_int.ix[:, 2].ix[2] + # First level is int - so use .loc rather than .ix (GH 21593) + xp = mi_int.loc[(8, 12), (4, 10)] assert rs == xp # this is basically regular indexing @@ -278,6 +279,12 @@ def test_loc_multiindex(self): xp = mi_int.ix[4] tm.assert_frame_equal(rs, xp) + # missing label + pytest.raises(KeyError, lambda: mi_int.loc[2]) + with catch_warnings(record=True): + # GH 21593 + pytest.raises(KeyError, lambda: mi_int.ix[2]) + def test_getitem_partial_int(self): # GH 12416 # with single item From b3443daadd2f3971f33dddd93f2b38dcaa1a9748 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Mon, 25 Jun 2018 23:31:08 +0100 Subject: [PATCH 092/113] TST: Refactor test_maybe_match_name and test_hash_pandas_object (#21600) --- pandas/tests/test_common.py | 36 +++++------------- pandas/tests/util/test_hashing.py | 62 +++++++++++++++---------------- 2 files changed, 39 insertions(+), 59 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index ef5f13bfa504a..61f838eeeeb30 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -25,7 +25,6 @@ def test_mut_exclusive(): def test_get_callable_name(): - from functools import partial getname = com._get_callable_name def fn(x): @@ -154,8 +153,7 @@ def test_random_state(): # Check with random state object state2 = npr.RandomState(10) - assert (com._random_state(state2).uniform() == - npr.RandomState(10).uniform()) + assert com._random_state(state2).uniform() == npr.RandomState(10).uniform() # check with no arg random state assert com._random_state() is np.random @@ -168,29 +166,15 @@ def test_random_state(): com._random_state(5.5) -def test_maybe_match_name(): - - matched = ops._maybe_match_name( - Series([1], name='x'), Series( - [2], name='x')) - assert (matched == 'x') - - matched = ops._maybe_match_name( - Series([1], name='x'), Series( - [2], name='y')) - assert (matched is None) - - matched = ops._maybe_match_name(Series([1]), Series([2], name='x')) - assert (matched is None) - - matched = ops._maybe_match_name(Series([1], name='x'), Series([2])) - assert (matched is None) - - matched = ops._maybe_match_name(Series([1], name='x'), [2]) - assert (matched == 'x') - - matched = ops._maybe_match_name([1], Series([2], name='y')) - assert (matched == 'y') +@pytest.mark.parametrize('left, right, expected', [ + (Series([1], name='x'), Series([2], name='x'), 'x'), + (Series([1], name='x'), Series([2], name='y'), None), + (Series([1]), Series([2], name='x'), None), + (Series([1], name='x'), Series([2]), None), + (Series([1], name='x'), [2], 'x'), + ([1], Series([2], name='y'), 'y')]) +def test_maybe_match_name(left, right, expected): + assert ops._maybe_match_name(left, right) == expected def test_dict_compat(): diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index fe8d75539879e..82b870c156cc8 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -142,39 +142,35 @@ def 
test_multiindex_objects(self): tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) - def test_hash_pandas_object(self): - - for obj in [Series([1, 2, 3]), - Series([1.0, 1.5, 3.2]), - Series([1.0, 1.5, np.nan]), - Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), - Series(['a', 'b', 'c']), - Series(['a', np.nan, 'c']), - Series(['a', None, 'c']), - Series([True, False, True]), - Series(), - Index([1, 2, 3]), - Index([True, False, True]), - DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), - DataFrame(), - tm.makeMissingDataframe(), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), - Series(tm.makePeriodIndex()), - Series(pd.date_range('20130101', - periods=3, tz='US/Eastern')), - MultiIndex.from_product( - [range(5), - ['foo', 'bar', 'baz'], - pd.date_range('20130101', periods=2)]), - MultiIndex.from_product( - [pd.CategoricalIndex(list('aabc')), - range(3)])]: - self.check_equal(obj) - self.check_not_equal_with_index(obj) + @pytest.mark.parametrize('obj', [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(['a', 'b', 'c']), + Series(['a', np.nan, 'c']), + Series(['a', None, 'c']), + Series([True, False, True]), + Series(), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), + DataFrame(), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range('20130101', periods=3, tz='US/Eastern')), + MultiIndex.from_product([range(5), ['foo', 'bar', 'baz'], + pd.date_range('20130101', periods=2)]), + MultiIndex.from_product([pd.CategoricalIndex(list('aabc')), range(3)]) + ]) + def test_hash_pandas_object(self, obj): + self.check_equal(obj) + self.check_not_equal_with_index(obj) def test_hash_pandas_object2(self): for name, s in self.df.iteritems(): From ebeccfce0318252ddec15b21df5e26013a77baf5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Jun 2018 02:34:15 -0500 Subject: [PATCH 093/113] API/COMPAT: support axis=None for logical reduction (reduce over all axes) (#21486) * Compat with NumPy 1.15 logical func * Accepts axis=None as reduce all dims --- doc/source/whatsnew/v0.23.2.txt | 30 +++++++ pandas/core/frame.py | 22 ++++- pandas/core/generic.py | 44 ++++++---- pandas/core/panel.py | 17 +++- pandas/core/series.py | 3 +- pandas/tests/frame/test_analytics.py | 119 +++++++++++++++++++++++++-- pandas/tests/test_panel.py | 7 ++ pandas/util/_test_decorators.py | 4 + 8 files changed, 215 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index d163ad8564efb..c5de6f0a61720 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -16,6 +16,36 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none +.. _whatsnew_0232.enhancements: + +Logical Reductions over Entire DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2], "B": [True, False]}) + df.all(axis=None) + + +This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``. 
+With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` will no longer reduce over every axis: + +.. code-block:: python + + >>> # NumPy 1.15, pandas 0.23.1 + >>> np.any(pd.DataFrame({"A": [False], "B": [False]})) + A False + B False + dtype: bool + +With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15. + +.. ipython:: python + + np.any(pd.DataFrame({"A": [False], "B": [False]})) + .. _whatsnew_0232.fixed_regressions: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 34d3eb0a6db73..0bf5acf14294a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6846,13 +6846,18 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): - axis = self._get_axis_number(axis) + if axis is None and filter_type == 'bool': + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - labels = self._get_agg_axis(axis) - # exclude timedelta/datetime unless we are uniform types if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: numeric_only = True @@ -6861,6 +6866,13 @@ def f(x): try: values = self.values result = f(values) + + if (filter_type == 'bool' and is_object_dtype(values) and + axis is None): + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) except Exception as e: # try by-column first @@ -6927,7 +6939,9 @@ def f(x): if axis == 0: result = coerce_to_dtypes(result, self.dtypes) - return Series(result, index=labels) + if constructor is not None: + result = Series(result, index=labels) + return result def nunique(self, axis=0, dropna=True): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4efdd3812accd..8fa79a130d1f8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8728,6 +8728,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + if axis is None: + raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) @@ -9054,8 +9056,15 @@ def _doc_parms(cls): Parameters ---------- -axis : int, default 0 - Select the axis which can be 0 for indices and 1 for columns. +axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. + * None : reduce all axes, return a scalar. + skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -9077,9 +9086,9 @@ def _doc_parms(cls): %(examples)s""" _all_doc = """\ -Return whether all elements are True over series or dataframe axis. +Return whether all elements are True, potentially over an axis. 
-Returns True if all elements within a series or along a dataframe +Returns True if all elements within a series or along a Dataframe axis are non-zero, not-empty or not-False.""" _all_examples = """\ @@ -9092,7 +9101,7 @@ def _doc_parms(cls): >>> pd.Series([True, False]).all() False -Dataframes +DataFrames Create a dataframe from a dictionary. @@ -9109,12 +9118,17 @@ def _doc_parms(cls): col2 False dtype: bool -Adding axis=1 argument will check if row-wise values all return True. +Specify ``axis='columns'`` to check if row-wise values all return True. ->>> df.all(axis=1) +>>> df.all(axis='columns') 0 True 1 False dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False """ _all_see_also = """\ @@ -9484,6 +9498,11 @@ def _doc_parms(cls): 1 False dtype: bool +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + `any` for an empty DataFrame is an empty Series. >>> pd.DataFrame([]).any() @@ -9654,22 +9673,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) - def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number if level is not None: if bool_only is not None: raise NotImplementedError("Option bool_only is not " "implemented with option level.") return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce(f, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool', - name=name) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') return set_function_name(logical_func, name, cls) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c4aa471b8b944..4f7400ad8388b 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1143,13 +1143,26 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, raise NotImplementedError('Panel.{0} does not implement ' 'numeric_only.'.format(name)) - axis_name = self._get_axis_name(axis) - axis_number = self._get_axis_number(axis_name) + if axis is None and filter_type == 'bool': + # labels = None + # constructor = None + axis_number = None + axis_name = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + # labels = self._get_agg_axis(axis) + # constructor = self._constructor + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) with np.errstate(all='ignore'): result = f(self.values) + if axis is None and filter_type == 'bool': + return np.bool_(result) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: result = result.T diff --git a/pandas/core/series.py b/pandas/core/series.py index a608db806d20b..cdb901d18767c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3241,7 +3241,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, delegate = self._values if isinstance(delegate, np.ndarray): # Validate that 'axis' is consistent with Series's single axis. 
- self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 12ebdbe0fd3c7..84873659ac931 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -15,7 +15,7 @@ from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, - _np_version_under1p12, _np_version_under1p15, + _np_version_under1p12, to_datetime, to_timedelta) import pandas as pd import pandas.core.nanops as nanops @@ -1159,11 +1159,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) - df = DataFrame(randn(10, 4)) > 0 - df.any(1) - df.all(1) - df.any(1, bool_only=True) - df.all(1, bool_only=True) + def test_any_all_extra(self): + df = DataFrame({ + 'A': [True, False, False], + 'B': [True, True, False], + 'C': [True, True, True], + }, index=['a', 'b', 'c']) + result = df[['A', 'B']].any(1) + expected = Series([True, True, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df[['A', 'B']].any(1, bool_only=True) + tm.assert_series_equal(result, expected) + + result = df.all(1) + expected = Series([True, False, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df.all(1, bool_only=True) + tm.assert_series_equal(result, expected) + + # Axis is None + result = df.all(axis=None).item() + assert result is False + + result = df.any(axis=None).item() + assert result is True + + result = df[['C']].all(axis=None).item() + assert result is True # skip pathological failure cases # class CantNonzero(object): @@ -1185,6 +1209,86 @@ def test_any_all(self): # df.any(1, bool_only=True) # df.all(1, bool_only=True) + @pytest.mark.parametrize('func, data, expected', [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {'A': []}, False), + (np.all, {'A': []}, True), + (np.any, {'A': [False, False]}, False), + (np.all, {'A': [False, False]}, False), + (np.any, {'A': [True, False]}, True), + (np.all, {'A': [True, False]}, False), + (np.any, {'A': [True, True]}, True), + (np.all, {'A': [True, True]}, True), + + (np.any, {'A': [False], 'B': [False]}, False), + (np.all, {'A': [False], 'B': [False]}, False), + + (np.any, {'A': [False, False], 'B': [False, True]}, True), + (np.all, {'A': [False, False], 'B': [False, True]}, False), + + # other types + (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), + (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), + (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), + (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, 
{'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), + (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), + (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), + (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), + + # # Mix + # GH-21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ]) + def test_any_all_np_func(self, func, data, expected): + # https://github.com/pandas-dev/pandas/issues/19976 + data = DataFrame(data) + result = func(data) + assert isinstance(result, np.bool_) + assert result.item() is expected + + # method version + result = getattr(DataFrame(data), func.__name__)(axis=None) + assert isinstance(result, np.bool_) + assert result.item() is expected + + def test_any_all_object(self): + # https://github.com/pandas-dev/pandas/issues/19976 + result = np.all(DataFrame(columns=['a', 'b'])).item() + assert result is True + + result = np.any(DataFrame(columns=['a', 'b'])).item() + assert result is False + + @pytest.mark.parametrize('method', ['any', 'all']) + def test_any_all_level_axis_none_raises(self, method): + df = DataFrame( + {"A": 1}, + index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], + names=['out', 'in']) + ) + xpr = "Must specify 'axis' when aggregating by level." + with tm.assert_raises_regex(ValueError, xpr): + getattr(df, method)(axis=None, level='out') + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: @@ -2091,9 +2195,6 @@ def test_clip_against_list_like(self, inplace, lower, axis, res): result = original tm.assert_frame_equal(result, expected, check_exact=True) - @pytest.mark.xfail( - not _np_version_under1p15, - reason="failing under numpy-dev gh-19976") @pytest.mark.parametrize("axis", [0, 1, None]) def test_clip_against_frame(self, axis): df = DataFrame(np.random.randn(1000, 2)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index d95a2ad2d7f76..2f8bc228cf86e 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2707,3 +2707,10 @@ def test_panel_index(): np.repeat([1, 2, 3], 4)], names=['time', 'panel']) tm.assert_index_equal(index, expected) + + +def test_panel_np_all(): + with catch_warnings(record=True): + wp = Panel({"A": DataFrame({'b': [1, 2]})}) + result = np.all(wp) + assert result == np.bool_(True) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 89d90258f58e0..27c24e3a68079 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -30,6 +30,7 @@ def test_foo(): from pandas.compat import (is_platform_windows, is_platform_32bit, PY3, import_lzma) +from pandas.compat.numpy import _np_version_under1p15 from pandas.core.computation.expressions import (_USE_NUMEXPR, _NUMEXPR_INSTALLED) @@ -160,6 +161,9 @@ def decorated_func(func): skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), reason="Missing matplotlib dependency") + +skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15, + reason="NumPy 1.15 or greater required") skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(), From db233f371e790d3f96b643fbdbdb985f6753409a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 26 
Jun 2018 00:44:05 -0700 Subject: [PATCH 094/113] DOC: Move tz cleanup whatsnew entries to v0.24 (#21631) --- doc/source/whatsnew/v0.23.2.txt | 6 +----- doc/source/whatsnew/v0.24.0.txt | 6 +++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index c5de6f0a61720..a603bf9f7e9e0 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -129,11 +129,7 @@ Bug Fixes - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError``(:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) + **Other** diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a63276efc5b7c..8e38171e93bc2 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -159,7 +159,11 @@ Datetimelike - Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) - Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) -- +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError``(:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) Timedelta ^^^^^^^^^ From 79d982aae39b552e3919d801d509ba1cf81f240b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 26 Jun 2018 12:01:10 +0200 Subject: [PATCH 095/113] DOC: fixup old whatsnew for dtype coercing change (#21456) (#21634) --- doc/source/whatsnew/v0.11.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.11.0.txt b/doc/source/whatsnew/v0.11.0.txt index 3c9cfda49aebd..f39e6c9ff459b 100644 --- a/doc/source/whatsnew/v0.11.0.txt +++ b/doc/source/whatsnew/v0.11.0.txt @@ -76,7 +76,7 @@ Numeric dtypes will propagate and can coexist in DataFrames. 
If a dtype is passe df1.dtypes df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), B = Series(randn(8)), - C = Series(randn(8),dtype='uint8') )) + C = Series(range(8),dtype='uint8') )) df2 df2.dtypes From 367ce07dfd4e085033d26d7776a027d65890dc44 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Tue, 26 Jun 2018 15:37:54 +0530 Subject: [PATCH 096/113] DEPR: MultiIndex.to_hierarchical (#21613) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/indexes/multi.py | 6 +++++- pandas/core/panel.py | 12 ++++++++---- pandas/tests/indexes/test_multi.py | 11 +++++++---- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 8e38171e93bc2..fd92958930e55 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -110,7 +110,7 @@ Deprecations ~~~~~~~~~~~~ - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`). -- +- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) - .. _whatsnew_0240.prior_deprecations: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 61b50f139dd10..f9f3041bef073 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -189,7 +189,6 @@ class MultiIndex(Index): from_product set_levels set_labels - to_hierarchical to_frame is_lexsorted sortlevel @@ -1182,6 +1181,8 @@ def to_frame(self, index=True): def to_hierarchical(self, n_repeat, n_shuffle=1): """ + .. deprecated:: 0.24.0 + Return a MultiIndex reshaped to conform to the shapes given by n_repeat and n_shuffle. 
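The ``panel.py`` hunk below inlines the same reshaping that ``to_hierarchical`` performed, so code losing the method can reproduce it directly from ``levels``/``labels``. A rough standalone sketch of that replacement, assuming the pre-0.24 ``MultiIndex`` constructor that still accepts ``labels`` (the helper name is illustrative, not part of the patch):

    import numpy as np
    from pandas import MultiIndex

    def repeat_and_shuffle(idx, n_repeat, n_shuffle=1):
        # same label manipulation as Panel.to_frame's construct_multi_parts
        labels = [np.repeat(x, n_repeat) for x in idx.labels]
        # assumes each label array length is divisible by n_shuffle
        labels = [x.reshape(n_shuffle, -1).ravel(order='F') for x in labels]
        return MultiIndex(levels=idx.levels, labels=labels, names=idx.names)
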
@@ -1216,6 +1217,9 @@ def to_hierarchical(self, n_repeat, n_shuffle=1): # Assumes that each label is divisible by n_shuffle labels = [x.reshape(n_shuffle, -1).ravel(order='F') for x in labels] names = self.names + warnings.warn("Method .to_hierarchical is deprecated and will " + "be removed in a future version", + FutureWarning, stacklevel=2) return MultiIndex(levels=levels, labels=labels, names=names) @property diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 4f7400ad8388b..e012819812f6b 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -948,10 +948,14 @@ def to_frame(self, filter_observations=True): data[item] = self[item].values.ravel()[selector] def construct_multi_parts(idx, n_repeat, n_shuffle=1): - axis_idx = idx.to_hierarchical(n_repeat, n_shuffle) - labels = [x[selector] for x in axis_idx.labels] - levels = axis_idx.levels - names = axis_idx.names + # Replicates and shuffles MultiIndex, returns individual attributes + labels = [np.repeat(x, n_repeat) for x in idx.labels] + # Assumes that each label is divisible by n_shuffle + labels = [x.reshape(n_shuffle, -1).ravel(order='F') + for x in labels] + labels = [x[selector] for x in labels] + levels = idx.levels + names = idx.names return labels, levels, names def construct_index_parts(idx, major=True): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index ab53002ee1587..362f917e74972 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1673,9 +1673,11 @@ def test_to_frame(self): tm.assert_frame_equal(result, expected) def test_to_hierarchical(self): + # GH21613 index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two')]) - result = index.to_hierarchical(3) + with tm.assert_produces_warning(FutureWarning): + result = index.to_hierarchical(3) expected = MultiIndex(levels=[[1, 2], ['one', 'two']], labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) @@ -1683,7 +1685,8 @@ def test_to_hierarchical(self): assert result.names == index.names # K > 1 - result = index.to_hierarchical(3, 2) + with tm.assert_produces_warning(FutureWarning): + result = index.to_hierarchical(3, 2) expected = MultiIndex(levels=[[1, 2], ['one', 'two']], labels=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) @@ -1694,8 +1697,8 @@ def test_to_hierarchical(self): index = MultiIndex.from_tuples([(2, 'c'), (1, 'b'), (2, 'a'), (2, 'b')], names=['N1', 'N2']) - - result = index.to_hierarchical(2) + with tm.assert_produces_warning(FutureWarning): + result = index.to_hierarchical(2) expected = MultiIndex.from_tuples([(2, 'c'), (2, 'c'), (1, 'b'), (1, 'b'), (2, 'a'), (2, 'a'), From 8db9303ac3effe1dded9c6bf9012ed171864620b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 26 Jun 2018 08:26:21 -0400 Subject: [PATCH 097/113] TST: xfail flaky 3.7 test, xref #21636 (#21637) --- pandas/tests/groupby/test_categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0793b8e1bd64..0fec6a8f96a24 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.compat import PY37 from pandas import (Index, MultiIndex, CategoricalIndex, DataFrame, Categorical, Series, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -205,6 +206,7 @@ def test_level_get_group(observed): 
assert_frame_equal(result, expected) +@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636") @pytest.mark.parametrize('ordered', [True, False]) def test_apply(ordered): # GH 10138 From e8f5ede0946c236ef42e16096a5b8091f8d4b774 Mon Sep 17 00:00:00 2001 From: Brett Naul Date: Tue, 26 Jun 2018 07:59:23 -0700 Subject: [PATCH 098/113] [ENH] Add read support for Google Cloud Storage (#20729) * Google Cloud Storage support using gcsfs --- ci/appveyor-27.yaml | 1 + ci/check_imports.py | 1 + ci/circle-36-locale_slow.yaml | 1 + ci/requirements-optional-conda.txt | 1 + ci/requirements-optional-pip.txt | 1 + ci/travis-27.yaml | 1 + ci/travis-36.yaml | 1 + doc/source/install.rst | 1 + doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/conftest.py | 16 +++++++++ pandas/io/common.py | 19 +++++++++-- pandas/io/excel.py | 2 +- pandas/io/gcs.py | 16 +++++++++ pandas/io/json/json.py | 6 ++-- pandas/tests/dtypes/test_inference.py | 9 ++--- pandas/tests/io/parser/common.py | 9 ++--- pandas/tests/io/test_gcs.py | 47 +++++++++++++++++++++++++++ pandas/util/_print_versions.py | 1 + 18 files changed, 116 insertions(+), 19 deletions(-) create mode 100644 pandas/io/gcs.py create mode 100644 pandas/tests/io/test_gcs.py diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml index cfc6a796bd77e..10511ac0e00ca 100644 --- a/ci/appveyor-27.yaml +++ b/ci/appveyor-27.yaml @@ -6,6 +6,7 @@ dependencies: - beautifulsoup4 - bottleneck - dateutil + - gcsfs - html5lib - jinja2=2.8 - lxml diff --git a/ci/check_imports.py b/ci/check_imports.py index d6f24ebcc4d3e..3f09290f8c375 100644 --- a/ci/check_imports.py +++ b/ci/check_imports.py @@ -5,6 +5,7 @@ blacklist = { 'bs4', + 'gcsfs', 'html5lib', 'ipython', 'jinja2' diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml index cc852c1e2aeeb..f44e98e1ee09d 100644 --- a/ci/circle-36-locale_slow.yaml +++ b/ci/circle-36-locale_slow.yaml @@ -5,6 +5,7 @@ channels: dependencies: - beautifulsoup4 - cython + - gcsfs - html5lib - ipython - jinja2 diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index e8cfcdf80f2e8..9e4e8e99b5205 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -3,6 +3,7 @@ blosc bottleneck fastparquet feather-format +gcsfs html5lib ipython>=5.6.0 ipykernel diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 877c52fa0b4fd..3cce3f5339883 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -5,6 +5,7 @@ blosc bottleneck fastparquet feather-format +gcsfs html5lib ipython>=5.6.0 ipykernel diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 22b993a2da886..482b888b88062 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -9,6 +9,7 @@ dependencies: - fastparquet - feather-format - flake8=3.4.1 + - gcsfs - html5lib - ipython - jemalloc=4.5.0.post diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index 006276ba1a65f..ff4f1a4a86f99 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -8,6 +8,7 @@ dependencies: - dask - fastparquet - feather-format + - gcsfs - geopandas - html5lib - ipython diff --git a/doc/source/install.rst b/doc/source/install.rst index fa6b9f4fc7f4d..a8c5194124829 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -276,6 +276,7 @@ Optional Dependencies * `Jinja2 `__: Template engine for conditional HTML formatting. * `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7). 
* `blosc `__: for msgpack compression using ``blosc`` +* `gcsfs `__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0). * One of `qtpy `__ (requires PyQt or PySide), `PyQt5 `__, diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fd92958930e55..72e7373d0dd33 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -18,7 +18,7 @@ Other Enhancements - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) - +- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`) .. _whatsnew_0240.api_breaking: diff --git a/pandas/conftest.py b/pandas/conftest.py index d6b18db4e71f2..b4a599758417c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,5 @@ +import importlib + import pytest import numpy as np @@ -249,3 +251,17 @@ def any_int_dtype(request): """ return request.param + + +@pytest.fixture +def mock(): + """ + Fixture providing the 'mock' module. + + Uses 'unittest.mock' for Python 3. Attempts to import the 3rd party 'mock' + package for Python 2, skipping if not present. + """ + if PY3: + return importlib.import_module("unittest.mock") + else: + return pytest.importorskip("mock") diff --git a/pandas/io/common.py b/pandas/io/common.py index ac9077f2db50e..6d579fc8a8a09 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -88,7 +88,7 @@ def _is_url(url): """ try: return parse_url(url).scheme in _VALID_URLS - except: + except Exception: return False @@ -165,7 +165,15 @@ def is_s3_url(url): """Check for an s3, s3n, or s3a url""" try: return parse_url(url).scheme in ['s3', 's3n', 's3a'] - except: # noqa + except Exception: + return False + + +def is_gcs_url(url): + """Check for a gcs url""" + try: + return parse_url(url).scheme in ['gcs', 'gs'] + except Exception: return False @@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=compression, mode=mode) + if is_gcs_url(filepath_or_buffer): + from pandas.io import gcs + return gcs.get_filepath_or_buffer(filepath_or_buffer, + encoding=encoding, + compression=compression, + mode=mode) + if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index e86d33742b266..793a95ffb0ee7 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -46,7 +46,7 @@ io : string, path object (pathlib.Path or py._path.local.LocalPath), file-like object, pandas ExcelFile, or xlrd workbook. The string could be a URL. Valid URL schemes include http, ftp, s3, - and file. For file URLs, a host is expected. For instance, a local + gcs, and file. For file URLs, a host is expected. 
For instance, a local file could be file://localhost/path/to/workbook.xlsx sheet_name : string, int, mixed list of strings/ints, or None, default 0 diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py new file mode 100644 index 0000000000000..aa1cb648f05d1 --- /dev/null +++ b/pandas/io/gcs.py @@ -0,0 +1,16 @@ +""" GCS support for remote file interactivity """ +try: + import gcsfs +except ImportError: + raise ImportError("The gcsfs library is required to handle GCS files") + + +def get_filepath_or_buffer(filepath_or_buffer, encoding=None, + compression=None, mode=None): + + if mode is None: + mode = 'rb' + + fs = gcsfs.GCSFileSystem() + filepath_or_buffer = fs.open(filepath_or_buffer, mode) + return filepath_or_buffer, None, compression, True diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 1627b2f4d3ec3..9992be521d61f 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, Parameters ---------- path_or_buf : a valid JSON string or file-like, default: None - The string could be a URL. Valid URL schemes include http, ftp, s3, and - file. For file URLs, a host is expected. For instance, a local file - could be ``file://localhost/path/to/table.json`` + The string could be a URL. Valid URL schemes include http, ftp, s3, + gcs, and file. For file URLs, a host is expected. For instance, a local + file could be ``file://localhost/path/to/table.json`` orient : string, Indication of expected JSON string format. diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b4f5d67530fbd..65527ac1b278f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -20,7 +20,7 @@ DatetimeIndex, TimedeltaIndex, Timestamp, Panel, Period, Categorical, isna, Interval, DateOffset) -from pandas.compat import u, PY2, PY3, StringIO, lrange +from pandas.compat import u, PY2, StringIO, lrange from pandas.core.dtypes import inference from pandas.core.dtypes.common import ( is_timedelta64_dtype, @@ -128,7 +128,7 @@ def test_is_dict_like_fails(ll): assert not inference.is_dict_like(ll) -def test_is_file_like(): +def test_is_file_like(mock): class MockFile(object): pass @@ -166,10 +166,7 @@ class MockFile(object): # Iterator but no read / write attributes data = [1, 2, 3] assert not is_file(data) - - if PY3: - from unittest import mock - assert not is_file(mock.Mock()) + assert not is_file(mock.Mock()) @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index b39122e5e7906..6e1d3575a1481 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1546,7 +1546,7 @@ def test_file_handles(self): assert not m.closed m.close() - def test_invalid_file_buffer(self): + def test_invalid_file_buffer(self, mock): # see gh-15337 class InvalidBuffer(object): @@ -1577,11 +1577,8 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) - if PY3: - from unittest import mock - - with tm.assert_raises_regex(ValueError, msg): - self.read_csv(mock.Mock()) + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(mock.Mock()) @tm.capture_stderr def test_skip_bad_lines(self): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py new file mode 100644 index 0000000000000..251c93df0733d --- /dev/null +++ b/pandas/tests/io/test_gcs.py @@ -0,0 +1,47 @@ +import numpy as np +import pytest + +from pandas import DataFrame, date_range, read_csv 
+from pandas.compat import StringIO +from pandas.io.common import is_gcs_url +from pandas.util import _test_decorators as td +from pandas.util.testing import assert_frame_equal + + +def test_is_gcs_url(): + assert is_gcs_url("gcs://pandas/somethingelse.com") + assert is_gcs_url("gs://pandas/somethingelse.com") + assert not is_gcs_url("s3://pandas/somethingelse.com") + + +@td.skip_if_no('gcsfs') +def test_read_csv_gcs(mock): + df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], + 'dt': date_range('2018-06-18', periods=2)}) + with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem: + instance = MockFileSystem.return_value + instance.open.return_value = StringIO(df1.to_csv(index=False)) + df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + + assert_frame_equal(df1, df2) + + +@td.skip_if_no('gcsfs') +def test_gcs_get_filepath_or_buffer(mock): + df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], + 'dt': date_range('2018-06-18', periods=2)}) + with mock.patch('pandas.io.gcs.get_filepath_or_buffer') as MockGetFilepath: + MockGetFilepath.return_value = (StringIO(df1.to_csv(index=False)), + None, None, False) + df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + + assert_frame_equal(df1, df2) + assert MockGetFilepath.called + + +@pytest.mark.skipif(td.safe_import('gcsfs'), + reason='Only check when gcsfs not installed') +def test_gcs_not_present_exception(): + with pytest.raises(ImportError) as e: + read_csv('gs://test/test.csv') + assert 'gcsfs library is required' in str(e.value) diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 83c1433bf5c39..01198fc541e0c 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -96,6 +96,7 @@ def show_versions(as_json=False): ("fastparquet", lambda mod: mod.__version__), ("pandas_gbq", lambda mod: mod.__version__), ("pandas_datareader", lambda mod: mod.__version__), + ("gcsfs", lambda mod: mod.__version__), ] deps_blob = list() From 58a1a08554bf222968f78a5cff2a7257c36178f8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Jun 2018 10:02:17 -0500 Subject: [PATCH 099/113] PKG: Exclude data test files. 
(#19535) --- MANIFEST.in | 34 ++++--- ci/script_single.sh | 8 +- doc/source/whatsnew/v0.23.2.txt | 5 + pandas/conftest.py | 41 +++++++++ pandas/tests/indexes/test_multi.py | 8 +- pandas/tests/io/conftest.py | 21 ++--- pandas/tests/io/formats/test_format.py | 4 +- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 8 +- pandas/tests/io/parser/common.py | 25 +++-- pandas/tests/io/parser/compression.py | 4 +- pandas/tests/io/parser/dtypes.py | 6 +- pandas/tests/io/parser/test_network.py | 53 +++++------ pandas/tests/io/parser/test_parsers.py | 6 +- pandas/tests/io/parser/test_textreader.py | 5 +- pandas/tests/io/sas/test_sas7bdat.py | 43 ++++----- pandas/tests/io/sas/test_xport.py | 6 +- pandas/tests/io/test_common.py | 54 +++++------ pandas/tests/io/test_excel.py | 12 +-- pandas/tests/io/test_html.py | 92 +++++++++++-------- pandas/tests/io/test_packers.py | 51 +++++----- pandas/tests/io/test_pickle.py | 38 ++++---- pandas/tests/io/test_pytables.py | 23 +++-- pandas/tests/io/test_sql.py | 63 +++++++------ pandas/tests/io/test_stata.py | 9 +- pandas/tests/plotting/common.py | 5 - pandas/tests/plotting/test_deprecated.py | 5 +- pandas/tests/plotting/test_misc.py | 16 ++-- pandas/tests/reshape/merge/test_merge_asof.py | 33 +++---- pandas/tests/reshape/test_tile.py | 6 +- pandas/tests/tseries/offsets/test_offsets.py | 16 ++-- pandas/tests/util/test_testing.py | 13 +++ pandas/util/_test_decorators.py | 1 - pandas/util/testing.py | 10 -- setup.cfg | 3 +- setup.py | 6 +- 36 files changed, 392 insertions(+), 347 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 9773019c6e6e0..b417b8890fa24 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,27 +3,39 @@ include LICENSE include RELEASE.md include README.md include setup.py -include pyproject.toml graft doc prune doc/build +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl diff --git a/ci/script_single.sh b/ci/script_single.sh index f376c920ac71b..60e2fbb33ee5d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -25,12 +25,12 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml 
$TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index a603bf9f7e9e0..a41a6c31b0678 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -76,6 +76,11 @@ Documentation Changes - - +Build Changes +------------- + +- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) + .. _whatsnew_0232.bug_fixes: Bug Fixes diff --git a/pandas/conftest.py b/pandas/conftest.py index b4a599758417c..82d860b091b82 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,7 +1,9 @@ +import os import importlib import pytest +import pandas import numpy as np import pandas as pd from pandas.compat import PY3 @@ -17,6 +19,8 @@ def pytest_addoption(parser): help="run high memory tests") parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption("--strict-data-files", action="store_true", + help="Fail if a test is skipped for missing data file.") def pytest_runtest_setup(item): @@ -131,6 +135,43 @@ def join_type(request): return request.param +@pytest.fixture +def datapath(request): + """Get the path to a data file. + + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + + Returns + ------- + path : path including ``pandas/tests``. + + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. + """ + def deco(*args): + path = os.path.join('pandas', 'tests', *args) + if not os.path.exists(path): + if request.config.getoption("--strict-data-files"): + msg = "Could not find file {} and --strict-data-files is set." + raise ValueError(msg.format(path)) + else: + msg = "Could not find {}." 
+ pytest.skip(msg.format(path)) + return path + return deco + + +@pytest.fixture +def iris(datapath): + """The iris dataset as a DataFrame.""" + return pandas.read_csv(datapath('data', 'iris.csv')) + + @pytest.fixture(params=['nlargest', 'nsmallest']) def nselect_method(request): """ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 362f917e74972..c925c4c403960 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1182,12 +1182,12 @@ def test_iter(self): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] assert result == expected - def test_legacy_pickle(self): + def test_legacy_pickle(self, datapath): if PY3: pytest.skip("testing for legacy pickles not " "support on py3") - path = tm.get_data_path('multiindex_v1.pickle') + path = datapath('indexes', 'data', 'multiindex_v1.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) @@ -1203,10 +1203,10 @@ def test_legacy_pickle(self): assert_almost_equal(res, exp) assert_almost_equal(exp, exp2) - def test_legacy_v2_unpickle(self): + def test_legacy_v2_unpickle(self, datapath): # 0.7.3 -> 0.8.0 format manage - path = tm.get_data_path('mindex_073.pickle') + path = datapath('indexes', 'data', 'mindex_073.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8deb51e190bab..7623587803b41 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,32 +1,23 @@ -import os - import pytest from pandas.io.parsers import read_table -from pandas.util import testing as tm - - -@pytest.fixture -def parser_data(request): - return os.path.join(tm.get_data_path(), '..', 'parser', 'data') @pytest.fixture -def tips_file(parser_data): +def tips_file(datapath): """Path to the tips dataset""" - return os.path.join(parser_data, 'tips.csv') + return datapath('io', 'parser', 'data', 'tips.csv') @pytest.fixture -def jsonl_file(parser_data): +def jsonl_file(datapath): """Path a JSONL dataset""" - return os.path.join(parser_data, 'items.jsonl') + return datapath('io', 'parser', 'data', 'items.jsonl') @pytest.fixture -def salaries_table(parser_data): +def salaries_table(datapath): """DataFrame with the salaries dataset""" - path = os.path.join(parser_data, 'salaries.csv') - return read_table(path) + return read_table(datapath('io', 'parser', 'data', 'salaries.csv')) @pytest.fixture diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f221df93dd412..63b7cb3459069 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self): dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})}) compat.text_type(dm.to_string()) - def test_string_repr_encoding(self): - filepath = tm.get_data_path('unicode_series.csv') + def test_string_repr_encoding(self, datapath): + filepath = datapath('io', 'formats', 'data', 'unicode_series.csv') df = pd.read_csv(filepath, header=None, encoding='latin1') repr(df) repr(df[1]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c9074ca49e5be..05ceace20f5a4 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,11 +21,11 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_read_zipped_json(): - uncompressed_path = 
tm.get_data_path("tsframe_v012.json") +def test_read_zipped_json(datapath): + uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) - compressed_path = tm.get_data_path("tsframe_v012.json.zip") + compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") compressed_df = pd.read_json(compressed_path, compression='zip') assert_frame_equal(uncompressed_df, compressed_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7e497c395266f..bcbac4400c953 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -37,8 +37,9 @@ class TestPandasContainer(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(scope="function", autouse=True) + def setup(self, datapath): + self.dirpath = datapath("io", "json", "data") self.ts = tm.makeTimeSeries() self.ts.name = 'ts' @@ -59,7 +60,8 @@ def setup_method(self, method): self.mixed_frame = _mixed_frame.copy() self.categorical = _cat_frame.copy() - def teardown_method(self, method): + yield + del self.dirpath del self.ts diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 6e1d3575a1481..9e871d27f0ce8 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -77,7 +77,7 @@ def test_read_csv(self): else: prefix = u("file://") - fname = prefix + compat.text_type(self.csv1) + fname = prefix + compat.text_type(os.path.abspath(self.csv1)) self.read_csv(fname, index_col=0, parse_dates=True) def test_1000_sep(self): @@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self): tm.assert_frame_equal(df, expected) @tm.network - def test_url(self): + def test_url(self, datapath): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @pytest.mark.slow - def test_file(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + def test_file(self, datapath): + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) try: @@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self): tm.assert_frame_equal(result, expected) - def test_utf16_example(self): - path = tm.get_data_path('utf16_ex.txt') + def test_utf16_example(self, datapath): + path = datapath('io', 'parser', 'data', 'utf16_ex.txt') # it works! 
and is the right length result = self.read_table(path, encoding='utf-16') @@ -767,8 +765,8 @@ def test_utf16_example(self): result = self.read_table(buf, encoding='utf-16') assert len(result) == 50 - def test_unicode_encoding(self): - pth = tm.get_data_path('unicode_series.csv') + def test_unicode_encoding(self, datapath): + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') result = self.read_csv(pth, header=None, encoding='latin-1') result = result.set_index(0) @@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self): result = self.read_csv(path) tm.assert_frame_equal(result, expected) - def test_sub_character(self): + def test_sub_character(self, datapath): # see gh-16893 - dirpath = tm.get_data_path() - filename = os.path.join(dirpath, "sub_char.csv") + filename = datapath('io', 'parser', 'data', 'sub_char.csv') expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) result = self.read_csv(filename) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index e84db66561c49..e4950af19ea95 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self): tm.assert_frame_equal(expected, df) - def test_read_csv_compressed_utf16_example(self): + def test_read_csv_compressed_utf16_example(self, datapath): # GH18071 - path = tm.get_data_path('utf16_ex_small.zip') + path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip') result = self.read_csv(path, encoding='utf-16', compression='zip', sep='\t') diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index b91ce04673e29..8060ebf2fbcd4 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -125,9 +125,9 @@ def test_categorical_dtype_high_cardinality_numeric(self): np.sort(actual.a.cat.categories), ordered=True) tm.assert_frame_equal(actual, expected) - def test_categorical_dtype_encoding(self): + def test_categorical_dtype_encoding(self, datapath): # GH 10153 - pth = tm.get_data_path('unicode_series.csv') + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') encoding = 'latin-1' expected = self.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) @@ -135,7 +135,7 @@ def test_categorical_dtype_encoding(self): dtype={1: 'category'}) tm.assert_frame_equal(actual, expected) - pth = tm.get_data_path('utf16_ex.txt') + pth = datapath('io', 'parser', 'data', 'utf16_ex.txt') encoding = 'utf-16' expected = self.read_table(pth, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index fdf45f307e953..e2243b8087a5b 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -48,10 +48,16 @@ def check_compressed_urls(salaries_table, compression, extension, mode, tm.assert_frame_equal(url_table, salaries_table) +@pytest.fixture +def tips_df(datapath): + """DataFrame with the tips dataset.""" + return read_csv(datapath('io', 'parser', 'data', 'tips.csv')) + + @pytest.mark.usefixtures("s3_resource") class TestS3(object): - def test_parse_public_s3_bucket(self): + def test_parse_public_s3_bucket(self, tips_df): pytest.importorskip('s3fs') # more of an integration test due to the not-public contents portion # can probably mock this though. 
@@ -60,45 +66,40 @@ def test_parse_public_s3_bucket(self): ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self): + def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self): + def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self): + def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self): + def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp) @@ -109,14 +110,13 @@ def test_parse_public_s3_bucket_chunked(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self): + def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp, @@ -127,36 +127,33 @@ def test_parse_public_s3_bucket_chunked_python(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self): + def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self): + def test_infer_s3_compression(self, tips_df): for ext in ['', '.gz', '.bz2']: df = 
read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self): + def test_parse_public_s3_bucket_nrows_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 7717102b64fc5..b6f13039641a2 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import pytest import pandas.util.testing as tm from pandas import read_csv, read_table, DataFrame @@ -45,8 +46,9 @@ def read_table(self, *args, **kwargs): def float_precision_choices(self): raise com.AbstractMethodError(self) - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index e8d9d8b52164b..c7026e3e0fc88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -28,8 +28,9 @@ class TestTextReader(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b80263021c269..101ee3e619f5b 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -11,8 +11,9 @@ class TestSAS7BDAT(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: @@ -123,9 +124,8 @@ def test_iterator_read_too_much(self): rdr.close() -def test_encoding_options(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test1.sas7bdat") +def test_encoding_options(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) df2 = pd.read_sas(fname, encoding='utf-8') for col in df1.columns: @@ -143,43 +143,39 @@ def test_encoding_options(): assert(x == y.decode()) -def test_productsales(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "productsales.sas7bdat") +def test_productsales(datapath): + fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding='utf-8') - fname = os.path.join(dirpath, "productsales.csv") + fname = datapath("io", "sas", "data", 
"productsales.csv") df0 = pd.read_csv(fname, parse_dates=['MONTH']) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) -def test_12659(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test_12659.sas7bdat") +def test_12659(datapath): + fname = datapath("io", "sas", "data", "test_12659.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "test_12659.csv") + fname = datapath("io", "sas", "data", "test_12659.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0) -def test_airline(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "airline.sas7bdat") +def test_airline(datapath): + fname = datapath("io", "sas", "data", "airline.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "airline.csv") + fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0, check_exact=False) -def test_date_time(): +def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "datetime.sas7bdat") + fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "datetime.csv") + fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) # GH 19732: Timestamps imported from sas will incur floating point errors @@ -187,9 +183,8 @@ def test_date_time(): tm.assert_frame_equal(df, df0) -def test_zero_variables(): +def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "zero_variables.sas7bdat") + fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index de31c3e36a8d5..6e5b2ab067aa5 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -1,3 +1,4 @@ +import pytest import pandas as pd import pandas.util.testing as tm from pandas.io.sas.sasreader import read_sas @@ -18,8 +19,9 @@ def numeric_as_float(data): class TestXport(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a89156db38ae3..5c9739be73393 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -149,27 +149,22 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): reader(path) @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_fwf, 'os', os.path.join(HERE, 'data', - 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', os.path.join(HERE, 'data', - 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - 
(pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', os.path.join(HERE, 'sas', 'data', - 'test1.sas7bdat')), - (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', - 'tsframe_v012.json')), - (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', - 'frame.mp')), - (pd.read_pickle, 'os', os.path.join(HERE, 'data', - 'categorical_0_14_1.pickle')), + (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), + (pd.read_table, 'os', ('io', 'data', 'iris.csv')), + (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), + (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), + (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), + (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', + 'datetimetz_object.h5')), + (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), + (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), + (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), + (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), + (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), ]) - def test_read_fspath_all(self, reader, module, path): + def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) + path = datapath(*path) mypath = CustomFSPath(path) result = reader(mypath) @@ -232,13 +227,14 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) -class TestMMapWrapper(object): +@pytest.fixture +def mmap_file(datapath): + return datapath('io', 'data', 'test_mmap.csv') + - def setup_method(self, method): - self.mmap_file = os.path.join(tm.get_data_path(), - 'test_mmap.csv') +class TestMMapWrapper(object): - def test_constructor_bad_file(self): + def test_constructor_bad_file(self, mmap_file): non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 @@ -252,15 +248,15 @@ def test_constructor_bad_file(self): tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) - target = open(self.mmap_file, 'r') + target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( ValueError, msg, common.MMapWrapper, target) - def test_get_attr(self): - with open(self.mmap_file, 'r') as target: + def test_get_attr(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) @@ -273,8 +269,8 @@ def test_get_attr(self): assert not hasattr(wrapper, 'foo') - def test_next(self): - with open(self.mmap_file, 'r') as target: + def test_next(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) lines = target.readlines() diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 2a225e6fe6a45..1fda56dbff772 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -39,8 +39,9 @@ @td.skip_if_no('xlrd', '0.9') class SharedItems(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -49,7 +50,6 @@ def setup_method(self, method): def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. 
- Test data path is defined by pandas.util.testing.get_data_path() Parameters ---------- @@ -68,8 +68,7 @@ def get_csv_refdf(self, basename): def get_excelfile(self, basename, ext): """ - Return test data ExcelFile instance. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data ExcelFile instance. Parameters ---------- @@ -86,8 +85,7 @@ def get_excelfile(self, basename, ext): def get_exceldf(self, basename, ext, *args, **kwds): """ - Return test data DataFrame. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data DataFrame. Parameters ---------- diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a56946b82b027..9c6a8de7ed446 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1,6 +1,5 @@ from __future__ import print_function -import glob import os import re import threading @@ -25,8 +24,18 @@ import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network +HERE = os.path.dirname(__file__) -DATA_PATH = tm.get_data_path() + +@pytest.fixture(params=[ + 'chinese_utf-16.html', + 'chinese_utf-32.html', + 'chinese_utf-8.html', + 'letz_latin1.html', +]) +def html_encoding_file(request, datapath): + """Parametrized fixture for HTML encoding test filenames.""" + return datapath('io', 'data', 'html_encoding', request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -44,11 +53,11 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): @td.skip_if_no('bs4') -def test_bs4_version_fails(monkeypatch): +def test_bs4_version_fails(monkeypatch, datapath): import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') with tm.assert_raises_regex(ValueError, "minimum version"): - read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor='bs4') def test_invalid_flavor(): @@ -59,8 +68,8 @@ def test_invalid_flavor(): @td.skip_if_no('bs4') @td.skip_if_no('lxml') -def test_same_ordering(): - filename = os.path.join(DATA_PATH, 'valid_markup.html') +def test_same_ordering(datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -72,11 +81,14 @@ def test_same_ordering(): pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): - spam_data = os.path.join(DATA_PATH, 'spam.html') - spam_data_kwargs = {} - if PY3: - spam_data_kwargs['encoding'] = 'UTF-8' - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + @pytest.fixture(autouse=True) + def set_files(self, datapath): + self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data_kwargs = {} + if PY3: + self.spam_data_kwargs['encoding'] = 'UTF-8' + self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -272,7 +284,8 @@ def test_invalid_url(self): @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), 'First', + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), + 'First', attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: @@ -326,7 +339,7 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = 
self.read_html(file_path_to_url(url), + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) assert isinstance(dfs, list) @@ -352,9 +365,9 @@ def test_python_docs_table(self): assert sorted(zz) == sorted(['Repo', 'What']) @pytest.mark.slow - def test_thousands_macau_stats(self): + def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] @@ -362,9 +375,9 @@ def test_thousands_macau_stats(self): assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow - def test_thousands_macau_index_col(self): + def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath('io', 'data', 'macau.html') dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -518,8 +531,8 @@ def test_countries_municipalities(self): res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) - def test_nyse_wsj_commas_table(self): - data = os.path.join(DATA_PATH, 'nyse_wsj.html') + def test_nyse_wsj_commas_table(self, datapath): + data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] @@ -530,7 +543,7 @@ def test_nyse_wsj_commas_table(self): tm.assert_index_equal(df.columns, columns) @pytest.mark.slow - def test_banklist_header(self): + def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -541,7 +554,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] - ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), + ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) assert df.shape == ground_truth.shape @@ -658,19 +671,19 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + def test_computer_sales_page(self, datapath): + data = datapath('io', 'data', 'computer_sales_page.html') with tm.assert_raises_regex(ParserError, r"Passed header=\[0,1\] are " r"too many rows for this " r"multi_index of columns"): self.read_html(data, header=[0, 1]) - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + data = datapath('io', 'data', 'computer_sales_page.html') assert self.read_html(data, header=[1, 2]) - def test_wikipedia_states_table(self): - data = os.path.join(DATA_PATH, 'wikipedia_states.html') + def test_wikipedia_states_table(self, datapath): + data = datapath('io', 'data', 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] @@ -784,15 +797,15 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self): - filename = os.path.join(DATA_PATH, 'valid_markup.html') + def test_works_on_valid_markup(self, datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs = self.read_html(filename, index_col=0) 
assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self): - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + def test_fallback_success(self, datapath): + banklist_data = datapath('io', 'data', 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) def test_to_html_timestamp(self): @@ -835,22 +848,23 @@ def test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table - @pytest.mark.parametrize("f", glob.glob( - os.path.join(DATA_PATH, 'html_encoding', '*.html'))) - def test_encode(self, f): - _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') + def test_encode(self, html_encoding_file): + _, encoding = os.path.splitext( + os.path.basename(html_encoding_file) + )[0].split('_') try: - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_string = self.read_html(fobj.read(), encoding=encoding, index_col=0).pop() - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_file_like = self.read_html(BytesIO(fobj.read()), encoding=encoding, index_col=0).pop() - from_filename = self.read_html(f, encoding=encoding, + from_filename = self.read_html(html_encoding_file, + encoding=encoding, index_col=0).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) @@ -906,7 +920,7 @@ def seekable(self): assert self.read_html(bad) @pytest.mark.slow - def test_importcheck_thread_safety(self): + def test_importcheck_thread_safety(self, datapath): # see gh-16928 class ErrorThread(threading.Thread): @@ -921,7 +935,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = os.path.join(DATA_PATH, 'valid_markup.html') + filename = datapath('io', 'data', 'valid_markup.html') helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 0b1c1ca178762..412e218f95c6f 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import os import datetime +import glob import numpy as np from distutils.version import LooseVersion @@ -836,13 +837,13 @@ def test_default_encoding(self): assert_frame_equal(result, frame) -def legacy_packers_versions(): - # yield the packers versions - path = tm.get_data_path('legacy_msgpack') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - yield v +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_msgpack", "*", "*.msgpack")) + + +@pytest.fixture(params=files) +def legacy_packer(request, datapath): + return datapath(request.param) class TestMsgpack(object): @@ -919,24 +920,20 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('version', legacy_packers_versions()) def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - version): - - pth = tm.get_data_path('legacy_msgpack/{0}'.format(version)) - n = 0 - for f in os.listdir(pth): - # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): - continue - vf = os.path.join(pth, f) - try: - with catch_warnings(record=True): - 
self.compare(current_packers_data, all_packers_data, - vf, version) - except ImportError: - # blosc not installed - continue - n += 1 - assert n > 0, 'Msgpack files are not tested' + legacy_packer, datapath): + + version = os.path.basename(os.path.dirname(legacy_packer)) + + # GH12142 0.17 files packed in P2 can't be read in P3 + if (compat.PY3 and version.startswith('0.17.') and + legacy_packer.split('.')[-4][-1] == '2'): + msg = "Files packed in Py2 can't be read in Py3 ({})" + pytest.skip(msg.format(version)) + try: + with catch_warnings(record=True): + self.compare(current_packers_data, all_packers_data, + legacy_packer, version) + except ImportError: + # blosc not installed + pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index fbe2174e603e2..45cbbd43cd6a8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,7 +12,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ - +import glob import pytest from warnings import catch_warnings @@ -184,27 +184,25 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_pickle", "*", "*.pickle")) + + +@pytest.fixture(params=files) +def legacy_pickle(request, datapath): + return datapath(request.param) + + # --------------------- # tests # --------------------- -def legacy_pickle_versions(): - # yield the pickle versions - path = tm.get_data_path('legacy_pickle') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - for f in os.listdir(p): - yield (v, f) - - -@pytest.mark.parametrize('version, f', legacy_pickle_versions()) -def test_pickles(current_pickle_data, version, f): +def test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") - vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f)) + version = os.path.basename(os.path.dirname(legacy_pickle)) with catch_warnings(record=True): - compare(current_pickle_data, vf, version) + compare(current_pickle_data, legacy_pickle, version) def test_round_trip_current(current_pickle_data): @@ -260,12 +258,11 @@ def python_unpickler(path): compare_element(result, expected, typ) -def test_pickle_v0_14_1(): +def test_pickle_v0_14_1(datapath): cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -275,14 +272,13 @@ def test_pickle_v0_14_1(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) -def test_pickle_v0_15_2(): +def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index f96e7eeb40ea2..b95df3840b6c5 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ 
-4449,28 +4449,27 @@ def f(): store.select('df') tm.assert_raises_regex(ClosedFileError, 'file is not open', f) - def test_pytables_native_read(self): - + def test_pytables_native_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native.h5'), + datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), mode='r') as store: d2 = store['detector/readout'] assert isinstance(d2, DataFrame) @pytest.mark.skipif(PY35 and is_platform_windows(), reason="native2 read fails oddly on windows / 3.5") - def test_pytables_native2_read(self): + def test_pytables_native2_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native2.h5'), + datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), mode='r') as store: str(store) d1 = store['detector'] assert isinstance(d1, DataFrame) - def test_legacy_table_read(self): + def test_legacy_table_read(self, datapath): # legacy table types with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_table.h5'), + datapath('io', 'data', 'legacy_hdf', 'legacy_table.h5'), mode='r') as store: with catch_warnings(record=True): @@ -5117,7 +5116,7 @@ def test_fspath(self): with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) - def test_read_py2_hdf_file_in_py3(self): + def test_read_py2_hdf_file_in_py3(self, datapath): # GH 16781 # tests reading a PeriodIndex DataFrame written in Python2 in Python3 @@ -5132,8 +5131,8 @@ def test_read_py2_hdf_file_in_py3(self): ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) with ensure_clean_store( - tm.get_data_path( - 'legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), + datapath('io', 'data', 'legacy_hdf', + 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), mode='r') as store: result = store['p'] assert_frame_equal(result, expected) @@ -5530,14 +5529,14 @@ def test_store_timezone(self): assert_frame_equal(result, df) - def test_legacy_datetimetz_object(self): + def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) with ensure_clean_store( - tm.get_data_path('legacy_hdf/datetimetz_object.h5'), + datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), mode='r') as store: result = store['df'] assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f3ab74d37a2bc..f8f742c5980ac 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -22,7 +22,6 @@ import pytest import sqlite3 import csv -import os import warnings import numpy as np @@ -184,9 +183,11 @@ class MixInBase(object): def teardown_method(self, method): - for tbl in self._get_all_tables(): - self.drop_table(tbl) - self._close_conn() + # if setup fails, there may not be a connection to close. 
+ if hasattr(self, 'conn'): + for tbl in self._get_all_tables(): + self.drop_table(tbl) + self._close_conn() class MySQLMixIn(MixInBase): @@ -253,9 +254,9 @@ def _get_exec(self): else: return self.conn.cursor() - def _load_iris_data(self): + def _load_iris_data(self, datapath): import io - iris_csv_file = os.path.join(tm.get_data_path(), 'iris.csv') + iris_csv_file = datapath('io', 'data', 'iris.csv') self.drop_table('iris') self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) @@ -503,9 +504,10 @@ class _TestSQLApi(PandasSQLTest): flavor = 'sqlite' mode = None - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_iris_view() self._load_test1_data() self._load_test2_data() @@ -1025,8 +1027,9 @@ class _EngineToConnMixin(object): A mixin that causes setup_connect to create a conn rather than an engine. """ - def setup_method(self, method): - super(_EngineToConnMixin, self).setup_method(method) + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + super(_EngineToConnMixin, self).setup_method(datapath) engine = self.conn conn = engine.connect() self.__tx = conn.begin() @@ -1034,12 +1037,14 @@ def setup_method(self, method): self.__engine = engine self.conn = conn - def teardown_method(self, method): + yield + self.__tx.rollback() self.conn.close() self.conn = self.__engine self.pandasSQL = sql.SQLDatabase(self.__engine) - super(_EngineToConnMixin, self).teardown_method(method) + # XXX: + # super(_EngineToConnMixin, self).teardown_method(method) @pytest.mark.single @@ -1136,7 +1141,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ flavor = None - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1149,10 +1154,11 @@ def setup_class(cls): msg = "{0} - can't connect to {1} server".format(cls, cls.flavor) pytest.skip(msg) - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.setup_connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_raw_sql() self._load_test1_data() @@ -1920,11 +1926,12 @@ class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): def connect(cls): return sqlite3.connect(':memory:') - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() self.pandasSQL = sql.SQLiteDatabase(self.conn) - self._load_iris_data() + self._load_iris_data(datapath) self._load_test1_data() @@ -2135,8 +2142,9 @@ def _skip_if_no_pymysql(): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - def setup_method(self, method): - self.method = method + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + self.method = request.function self.conn = sqlite3.connect(':memory:') def test_basic(self): @@ -2215,8 +2223,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): create_sql = """ CREATE TABLE test ( @@ -2236,7 +2243,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): pass @@ -2341,7 +2348,7 @@ def 
clean_up(test_table_to_drop): "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn): - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): _skip_if_no_pymysql() @@ -2370,7 +2377,8 @@ def setup_class(cls): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): _skip_if_no_pymysql() import pymysql try: @@ -2396,7 +2404,7 @@ def setup_method(self, method): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - self.method = method + self.method = request.function def test_basic(self): _skip_if_no_pymysql() @@ -2501,8 +2509,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): _skip_if_no_pymysql() drop_sql = "DROP TABLE IF EXISTS test" create_sql = """ @@ -2525,7 +2532,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): _skip_if_no_pymysql() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index bfb72be80400e..cfe47cae7e5e1 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -25,8 +25,8 @@ @pytest.fixture -def dirpath(): - return tm.get_data_path() +def dirpath(datapath): + return datapath("io", "data") @pytest.fixture @@ -39,8 +39,9 @@ def parsed_114(dirpath): class TestStata(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f65791329f2f1..09687dd97bd43 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -74,11 +74,6 @@ def setup_method(self, method): else: self.default_figsize = (8.0, 6.0) self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' - # common test data - from pandas import read_csv - base = os.path.join(os.path.dirname(curpath()), os.pardir) - path = os.path.join(base, 'tests', 'data', 'iris.csv') - self.iris = read_csv(path) n = 100 with tm.RNGContext(42): diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index 2c2d371921d2f..a45b17ec98261 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -46,10 +46,9 @@ def test_boxplot_deprecated(self): by='indic') @pytest.mark.slow - def test_radviz_deprecated(self): - df = self.iris + def test_radviz_deprecated(self, iris): with tm.assert_produces_warning(FutureWarning): - plotting.radviz(frame=df, class_column='Name') + plotting.radviz(frame=iris, class_column='Name') @pytest.mark.slow def test_plot_params(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c82c939584dc7..0473610ea2f8f 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -100,11 +100,11 @@ def 
test_scatter_matrix_axis(self): axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow - def test_andrews_curves(self): + def test_andrews_curves(self, iris): from pandas.plotting import andrews_curves from matplotlib import cm - df = self.iris + df = iris _check_plot_works(andrews_curves, frame=df, class_column='Name') @@ -165,11 +165,11 @@ def test_andrews_curves(self): andrews_curves(data=df, class_column='Name') @pytest.mark.slow - def test_parallel_coordinates(self): + def test_parallel_coordinates(self, iris): from pandas.plotting import parallel_coordinates from matplotlib import cm - df = self.iris + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column='Name') @@ -234,11 +234,11 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] @pytest.mark.slow - def test_radviz(self): + def test_radviz(self, iris): from pandas.plotting import radviz from matplotlib import cm - df = self.iris + df = iris _check_plot_works(radviz, frame=df, class_column='Name') rgba = ('#556270', '#4ECDC4', '#C7F464') @@ -272,8 +272,8 @@ def test_radviz(self): self._check_colors(handles, facecolors=colors) @pytest.mark.slow - def test_subplot_titles(self): - df = self.iris.drop('Name', axis=1).head() + def test_subplot_titles(self, iris): + df = iris.drop('Name', axis=1).head() # Use the column names as the subplot titles title = list(df.columns) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index cebbcc41c3e17..59b53cd23010e 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,4 +1,3 @@ -import os import pytest import pytz @@ -13,8 +12,8 @@ class TestAsOfMerge(object): - def read_data(self, name, dedupe=False): - path = os.path.join(tm.get_data_path(), name) + def read_data(self, datapath, name, dedupe=False): + path = datapath('reshape', 'merge', 'data', name) x = read_csv(path) if dedupe: x = (x.drop_duplicates(['time', 'ticker'], keep='last') @@ -23,15 +22,17 @@ def read_data(self, name, dedupe=False): x.time = to_datetime(x.time) return x - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): - self.trades = self.read_data('trades.csv') - self.quotes = self.read_data('quotes.csv', dedupe=True) - self.asof = self.read_data('asof.csv') - self.tolerance = self.read_data('tolerance.csv') - self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.trades = self.read_data(datapath, 'trades.csv') + self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) + self.asof = self.read_data(datapath, 'asof.csv') + self.tolerance = self.read_data(datapath, 'tolerance.csv') + self.allow_exact_matches = self.read_data(datapath, + 'allow_exact_matches.csv') self.allow_exact_matches_and_tolerance = self.read_data( - 'allow_exact_matches_and_tolerance.csv') + datapath, 'allow_exact_matches_and_tolerance.csv') def test_examples1(self): """ doc-string examples """ @@ -423,11 +424,11 @@ def test_multiby_indexed(self): pd.merge_asof(left, right, left_index=True, right_index=True, left_by=['k1', 'k2'], right_by=['k1']) - def test_basic2(self): + def test_basic2(self, datapath): - expected = self.read_data('asof2.csv') - trades = self.read_data('trades2.csv') - quotes = self.read_data('quotes2.csv', dedupe=True) + expected = self.read_data(datapath, 'asof2.csv') + trades = self.read_data(datapath, 'trades2.csv') + quotes = self.read_data(datapath, 
'quotes2.csv', dedupe=True) result = merge_asof(trades, quotes, on='time', @@ -467,14 +468,14 @@ def test_valid_join_keys(self): merge_asof(trades, quotes, by='ticker') - def test_with_duplicates(self): + def test_with_duplicates(self, datapath): q = pd.concat([self.quotes, self.quotes]).sort_values( ['time', 'ticker']).reset_index(drop=True) result = merge_asof(self.trades, q, on='time', by='ticker') - expected = self.read_data('asof.csv') + expected = self.read_data(datapath, 'asof.csv') assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 5ea27f9e34e1c..807fb2530603a 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -282,10 +282,10 @@ def test_round_frac(self): result = tmod._round_frac(0.000123456, precision=2) assert result == 0.00012 - def test_qcut_binning_issues(self): + def test_qcut_binning_issues(self, datapath): # #1978, 1979 - path = os.path.join(tm.get_data_path(), 'cut_data.csv') - arr = np.loadtxt(path) + cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv')) + arr = np.loadtxt(cut_file) result = qcut(arr, 20) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 74bc08ee9649b..b93a0206479ca 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,4 +1,3 @@ -import os from distutils.version import LooseVersion from datetime import date, datetime, timedelta @@ -518,14 +517,15 @@ def test_add(self, offset_types, tz): assert isinstance(result, Timestamp) assert result == expected_localize - def test_pickle_v0_15_2(self): + def test_pickle_v0_15_2(self, datapath): offsets = {'DateOffset': DateOffset(years=1), 'MonthBegin': MonthBegin(1), 'Day': Day(1), 'YearBegin': YearBegin(1), 'Week': Week(1)} - pickle_path = os.path.join(tm.get_data_path(), - 'dateoffset_0_15_2.pickle') + + pickle_path = datapath('tseries', 'offsets', 'data', + 'dateoffset_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) # @@ -1838,12 +1838,10 @@ def _check_roundtrip(obj): _check_roundtrip(self.offset2) _check_roundtrip(self.offset * 2) - def test_pickle_compat_0_14_1(self): + def test_pickle_compat_0_14_1(self, datapath): hdays = [datetime(2013, 1, 1) for ele in range(4)] - - pth = tm.get_data_path() - - cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle')) + pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle') + cday0_14_1 = read_pickle(pth) cday = CDay(holidays=hdays) assert cday == cday0_14_1 diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index ab7c4fb528452..4d34987e14f75 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import pandas as pd import pytest import numpy as np @@ -841,3 +842,15 @@ def test_locale(self): # GH9744 locales = tm.get_locales() assert len(locales) >= 1 + + +def test_datapath_missing(datapath, request): + if not request.config.getoption("--strict-data-files"): + pytest.skip("Need to set '--strict-data-files'") + + with pytest.raises(ValueError): + datapath('not_a_file') + + result = datapath('data', 'iris.csv') + expected = os.path.join('pandas', 'tests', 'data', 'iris.csv') + assert result == expected diff --git a/pandas/util/_test_decorators.py 
b/pandas/util/_test_decorators.py index 27c24e3a68079..c6ab24403d58d 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,7 +23,6 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ - import pytest import locale from distutils.version import LooseVersion diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 675dd94d49750..a5afcb6915034 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -6,7 +6,6 @@ import sys import tempfile import warnings -import inspect import os import subprocess import locale @@ -757,15 +756,6 @@ def ensure_clean(filename=None, return_filelike=False): print("Exception on removing file: {error}".format(error=e)) -def get_data_path(f=''): - """Return the path of a data file, these are relative to the current test - directory. - """ - # get our callers file - _, filename, _, _, _, _ = inspect.getouterframes(inspect.currentframe())[1] - base_dir = os.path.abspath(os.path.dirname(filename)) - return os.path.join(base_dir, 'data', f) - # ----------------------------------------------------------------------------- # Comparators diff --git a/setup.cfg b/setup.cfg index 6d9657737a8bd..9ec967c25e225 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,4 +32,5 @@ markers = slow: mark a test as slow network: mark a test as network high_memory: mark a test as a high-memory only -doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL +addopts = --strict-data-files +doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL \ No newline at end of file diff --git a/setup.py b/setup.py index dd026bd611727..0fd008612b5bd 100755 --- a/setup.py +++ b/setup.py @@ -735,11 +735,7 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['data/*', 'templates/*', '_libs/*.dll'], - 'pandas.tests.io': ['data/legacy_hdf/*.h5', - 'data/legacy_pickle/*/*.pickle', - 'data/legacy_msgpack/*/*.msgpack', - 'data/html_encoding/*.html']}, + package_data={'': ['templates/*', '_libs/*.dll']}, ext_modules=extensions, maintainer_email=EMAIL, description=DESCRIPTION, From c6660f6b5b0ad2b11832fadf098388d67b6726c9 Mon Sep 17 00:00:00 2001 From: Benjamin Grove Date: Tue, 26 Jun 2018 13:26:39 -0700 Subject: [PATCH 100/113] DOC: fix typo in cookbook.rst (#21635) Removing the semicolon delimiter at the end of the modified line of code allows the line's output to be displayed. --- doc/source/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index fdc3b38cfdebc..f6fa9e9f86143 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -132,7 +132,7 @@ Building Criteria .. ipython:: python - newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries; + newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries ...or (with assignment modifies the DataFrame.) 
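The cookbook fix above hinges on IPython display semantics: inside an ``.. ipython:: python`` block, as in a live IPython session, a statement that ends with a semicolon has its output suppressed, so the published docs showed nothing for that line. The short sketch below illustrates the behavior; the frame values are invented for the illustration and the snippet is not part of any patch in this series.

    import pandas as pd

    # Values made up for this sketch, in the spirit of the cookbook example.
    df = pd.DataFrame({'AAA': [4, 5, 6, 7],
                       'BBB': [10, 20, 30, 40],
                       'CCC': [100, 50, -30, -50]})

    # In IPython/Jupyter, the trailing ';' suppresses the repr of the result:
    df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'];

    # Without the trailing ';', the selected Series is echoed back:
    df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']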
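Returning to the ``datapath`` fixture and ``--strict-data-files`` option added to ``pandas/conftest.py`` earlier in this series: the converted tests all follow the same pattern, sketched below with a hypothetical test that does not appear in any patch. By default a missing data file causes the test to be skipped; running ``pytest --strict-data-files`` turns that skip into a ``ValueError``.

    import pandas as pd

    # Hypothetical test, for illustration only; ``datapath`` is the fixture
    # defined in pandas/conftest.py above.
    def test_read_iris_from_datapath(datapath):
        # datapath joins its arguments onto 'pandas/tests', so this resolves
        # to 'pandas/tests/io/data/iris.csv'.
        path = datapath('io', 'data', 'iris.csv')
        df = pd.read_csv(path)
        assert not df.empty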
From 7555378d64672a36e44205c62ca9d8f4f52aa0de Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 26 Jun 2018 22:30:42 +0100 Subject: [PATCH 101/113] DOC: minor correction to v0.23.2.txt (#21644) --- doc/source/whatsnew/v0.23.2.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index a41a6c31b0678..9c4b408a1d24b 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -61,8 +61,8 @@ Fixed Regressions Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Improved performance of membership checks in :class:`CategoricalIndex` - (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` +- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` + (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) - Improved performance of :meth:`HDFStore.groups` (and dependent functions like :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) From 45cfa62f17e87f34eae630cb5b68658eef37a45e Mon Sep 17 00:00:00 2001 From: david-liu-brattle-1 <36486871+david-liu-brattle-1@users.noreply.github.com> Date: Tue, 26 Jun 2018 18:19:41 -0400 Subject: [PATCH 102/113] Cleanup clipboard tests (#21163) --- pandas/tests/io/test_clipboard.py | 196 ++++++++++++++++++++---------- 1 file changed, 129 insertions(+), 67 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 98c0effabec84..80fddd50fc9a8 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -9,10 +9,11 @@ from pandas import DataFrame from pandas import read_clipboard from pandas import get_option +from pandas.compat import PY2 from pandas.util import testing as tm from pandas.util.testing import makeCustomDataframe as mkdf from pandas.io.clipboard.exceptions import PyperclipException -from pandas.io.clipboard import clipboard_set +from pandas.io.clipboard import clipboard_set, clipboard_get try: @@ -22,73 +23,134 @@ _DEPS_INSTALLED = 0 +def build_kwargs(sep, excel): + kwargs = {} + if excel != 'default': + kwargs['excel'] = excel + if sep != 'default': + kwargs['sep'] = sep + return kwargs + + +@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii', + 'colwidth', 'mixed', 'float', 'int']) +def df(request): + data_type = request.param + + if data_type == 'delims': + return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], + 'b': ['hi\'j', 'k\'\'lm']}) + elif data_type == 'utf8': + return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], + 'b': ['øπ∆˚¬', 'œ∑´®']}) + elif data_type == 'string': + return mkdf(5, 3, c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'long': + max_rows = get_option('display.max_rows') + return mkdf(max_rows + 1, 3, + data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'nonascii': + return pd.DataFrame({'en': 'in English'.split(), + 'es': 'en español'.split()}) + elif data_type == 'colwidth': + _cw = get_option('display.max_colwidth') + 1 + return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'mixed': + return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, + 'b': np.arange(1, 6), + 'c': list('abcde')}) + elif data_type == 'float': + return mkdf(5, 3, 
data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'int': + return mkdf(5, 3, data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + else: + raise ValueError + + @pytest.mark.single @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") class TestClipboard(object): - - @classmethod - def setup_class(cls): - cls.data = {} - cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['float'] = mkdf(5, 3, - data_gen_f=lambda r, c: float(r) + 0.01, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, - 'b': np.arange(1, 6), - 'c': list('abcde')}) - - # Test columns exceeding "max_colwidth" (GH8305) - _cw = get_option('display.max_colwidth') + 1 - cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - # Test GH-5346 - max_rows = get_option('display.max_rows') - cls.data['longdf'] = mkdf(max_rows + 1, 3, - data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - # Test for non-ascii text: GH9263 - cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), - 'es': 'en español'.split()}) - # unicode round trip test for GH 13747, GH 12529 - cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], - 'b': ['øπ∆˚¬', 'œ∑´®']}) - cls.data_types = list(cls.data.keys()) - - @classmethod - def teardown_class(cls): - del cls.data_types, cls.data - - def check_round_trip_frame(self, data_type, excel=None, sep=None, + def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): - data = self.data[data_type] data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - if sep is not None: - result = read_clipboard(sep=sep, index_col=0, encoding=encoding) - else: - result = read_clipboard(encoding=encoding) + result = read_clipboard(sep=sep or '\t', index_col=0, + encoding=encoding) tm.assert_frame_equal(data, result, check_dtype=False) - def test_round_trip_frame_sep(self): - for dt in self.data_types: - self.check_round_trip_frame(dt, sep=',') - self.check_round_trip_frame(dt, sep=r'\s+') - self.check_round_trip_frame(dt, sep='|') - - def test_round_trip_frame_string(self): - for dt in self.data_types: - self.check_round_trip_frame(dt, excel=False) - - def test_round_trip_frame(self): - for dt in self.data_types: - self.check_round_trip_frame(dt) + # Test that default arguments copy as tab delimited + @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' + 'Issue in #21104, Fixed in #21111') + def test_round_trip_frame(self, df): + self.check_round_trip_frame(df) + + # Test that explicit delimiters are respected + @pytest.mark.parametrize('sep', ['\t', ',', '|']) + def test_round_trip_frame_sep(self, df, sep): + self.check_round_trip_frame(df, sep=sep) + + # Test white space separator + @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " + "aren't handled correctly in default c engine. 
Fixed " + "in #21111 by defaulting to python engine for " + "whitespace separator") + def test_round_trip_frame_string(self, df): + df.to_clipboard(excel=False, sep=None) + result = read_clipboard() + assert df.to_string() == result.to_string() + assert df.shape == result.shape + + # Two character separator is not supported in to_clipboard + # Test that multi-character separators are not silently passed + @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") + def test_excel_sep_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=True, sep=r'\t') + + # Separator is ignored when excel=False and should produce a warning + @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") + def test_copy_delim_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=False, sep='\t') + + # Tests that the default behavior of to_clipboard is tab + # delimited and excel="True" + @pytest.mark.xfail(reason="to_clipboard defaults to space delim. Issue in " + "#21104, Fixed in #21111") + @pytest.mark.parametrize('sep', ['\t', None, 'default']) + @pytest.mark.parametrize('excel', [True, None, 'default']) + def test_clipboard_copy_tabs_default(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + if PY2: + # to_clipboard copies unicode, to_csv produces bytes. This is + # expected behavior + assert clipboard_get().encode('utf-8') == df.to_csv(sep='\t') + else: + assert clipboard_get() == df.to_csv(sep='\t') + + # Tests reading of white space separated tables + @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " + "aren't handled correctly. in default c engine. Fixed " + "in #21111 by defaulting to python engine for " + "whitespace separator") + @pytest.mark.parametrize('sep', [None, 'default']) + @pytest.mark.parametrize('excel', [False]) + def test_clipboard_copy_strings(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + result = read_clipboard(sep=r'\s+') + assert result.to_string() == df.to_string() + assert df.shape == result.shape def test_read_clipboard_infer_excel(self): # gh-19010: avoid warnings @@ -124,15 +186,15 @@ def test_read_clipboard_infer_excel(self): tm.assert_frame_equal(res, exp) - def test_invalid_encoding(self): + def test_invalid_encoding(self, df): # test case for testing invalid encoding - data = self.data['string'] with pytest.raises(ValueError): - data.to_clipboard(encoding='ascii') + df.to_clipboard(encoding='ascii') with pytest.raises(NotImplementedError): pd.read_clipboard(encoding='ascii') - def test_round_trip_valid_encodings(self): - for enc in ['UTF-8', 'utf-8', 'utf8']: - for dt in self.data_types: - self.check_round_trip_frame(dt, encoding=enc) + @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' + 'Issue in #21104, Fixed in #21111') + @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) + def test_round_trip_valid_encodings(self, enc, df): + self.check_round_trip_frame(df, encoding=enc) From d746bee41816d57d266e7e41340a1e2cd4b7fe7c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 26 Jun 2018 15:25:57 -0700 Subject: [PATCH 103/113] ENH: Update to_gbq and read_gbq to pandas-gbq 0.5.0 (#21628) * Add link to Pandas-GBQ 0.5.0 in what's new. * Remove unnecessary sleep in GBQ tests. 
Closes https://github.com/pydata/pandas-gbq/issues/177 Closes #21627 --- doc/source/whatsnew/v0.24.0.txt | 5 ++ pandas/core/frame.py | 59 +++++++++++----------- pandas/io/gbq.py | 86 ++++++++++++++++++--------------- pandas/tests/io/test_gbq.py | 15 +++--- 4 files changed, 92 insertions(+), 73 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 72e7373d0dd33..60c3e4df8d129 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -19,6 +19,11 @@ Other Enhancements - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) - Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`) +- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to + reflect changes from the `Pandas-GBQ library version 0.5.0 + `__. + (:issue:`21627`) + .. _whatsnew_0240.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0bf5acf14294a..b553cfdc72c92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1102,37 +1102,27 @@ def to_dict(self, orient='dict', into=dict): else: raise ValueError("orient '{o}' not understood".format(o=orient)) - def to_gbq(self, destination_table, project_id, chunksize=None, - verbose=None, reauth=False, if_exists='fail', private_key=None, - auth_local_webserver=False, table_schema=None): + def to_gbq(self, destination_table, project_id=None, chunksize=None, + reauth=False, if_exists='fail', private_key=None, + auth_local_webserver=False, table_schema=None, location=None, + progress_bar=True, verbose=None): """ Write a DataFrame to a Google BigQuery table. This function requires the `pandas-gbq package `__. - Authentication to the Google BigQuery service is via OAuth 2.0. - - - If ``private_key`` is provided, the library loads the JSON service - account credentials and uses those to authenticate. - - - If no ``private_key`` is provided, the library tries `application - default credentials`_. - - .. _application default credentials: - https://cloud.google.com/docs/authentication/production#providing_credentials_to_your_application - - - If application default credentials are not found or cannot be used - with BigQuery, the library authenticates with user account - credentials. In this case, you will be asked to grant permissions - for product name 'pandas GBQ'. + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. Parameters ---------- destination_table : str - Name of table to be written, in the form 'dataset.tablename'. - project_id : str - Google BigQuery Account project ID. + Name of table to be written, in the form ``dataset.tablename``. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. chunksize : int, optional Number of rows to be inserted in each chunk from the dataframe. Set to ``None`` to load the whole dataframe at once. @@ -1170,8 +1160,21 @@ def to_gbq(self, destination_table, project_id, chunksize=None, BigQuery API documentation on available names of a field. *New in version 0.3.1 of pandas-gbq*. - verbose : boolean, deprecated - *Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module + location : str, optional + Location where the load job should run. 
See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of the + target dataset. + + *New in version 0.5.0 of pandas-gbq*. + progress_bar : bool, default True + Use the library `tqdm` to show the progress bar for the upload, + chunk by chunk. + + *New in version 0.5.0 of pandas-gbq*. + verbose : bool, deprecated + Deprecated in Pandas-GBQ 0.4.0. Use the `logging module to adjust verbosity instead `__. @@ -1182,10 +1185,12 @@ def to_gbq(self, destination_table, project_id, chunksize=None, """ from pandas.io import gbq return gbq.to_gbq( - self, destination_table, project_id, chunksize=chunksize, - verbose=verbose, reauth=reauth, if_exists=if_exists, - private_key=private_key, auth_local_webserver=auth_local_webserver, - table_schema=table_schema) + self, destination_table, project_id=project_id, + chunksize=chunksize, reauth=reauth, + if_exists=if_exists, private_key=private_key, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, location=location, + progress_bar=progress_bar, verbose=verbose) @classmethod def from_records(cls, data, index=None, exclude=None, columns=None, diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index c7c16598ee432..87a0e4d5d1747 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -22,34 +22,26 @@ def _try_import(): def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, verbose=None, private_key=None, dialect='legacy', - **kwargs): + reauth=False, private_key=None, auth_local_webserver=False, + dialect='legacy', location=None, configuration=None, + verbose=None): """ Load data from Google BigQuery. This function requires the `pandas-gbq package `__. - Authentication to the Google BigQuery service is via OAuth 2.0. - - - If "private_key" is not provided: - - By default "application default credentials" are used. - - If default application credentials are not found or are restrictive, - user account credentials are used. In this case, you will be asked to - grant permissions for product name 'pandas GBQ'. - - - If "private_key" is provided: - - Service account credentials will be used to authenticate. + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. Parameters ---------- query : str SQL-Like Query to return data values. - project_id : str - Google BigQuery Account project ID. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. index_col : str, optional Name of result column to use for index in results DataFrame. col_order : list(str), optional @@ -62,6 +54,16 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, Service account private key in JSON format. Can be file path or string contents. This is useful for remote server authentication (eg. Jupyter/IPython notebook on remote host). + auth_local_webserver : boolean, default False + Use the `local webserver flow`_ instead of the `console flow`_ + when getting user credentials. + + .. _local webserver flow: + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. _console flow: + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + + *New in version 0.2.0 of pandas-gbq*. dialect : str, default 'legacy' SQL syntax dialect to use. 
Value can be one of: @@ -74,19 +76,26 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, compliant with the SQL 2011 standard. For more information see `BigQuery Standard SQL Reference `__. - verbose : boolean, deprecated - *Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module - to adjust verbosity instead - `__. - kwargs : dict - Arbitrary keyword arguments. - configuration (dict): query config parameters for job processing. + location : str, optional + Location where the query job should run. See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of any + datasets used in the query. + + *New in version 0.5.0 of pandas-gbq*. + configuration : dict, optional + Query config parameters for job processing. For example: configuration = {'query': {'useQueryCache': False}} - For more information see `BigQuery SQL Reference - `__ + For more information see `BigQuery REST API Reference + `__. + verbose : None, deprecated + Deprecated in Pandas-GBQ 0.4.0. Use the `logging module + to adjust verbosity instead + `__. Returns ------- @@ -100,20 +109,21 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, """ pandas_gbq = _try_import() return pandas_gbq.read_gbq( - query, project_id=project_id, - index_col=index_col, col_order=col_order, - reauth=reauth, verbose=verbose, - private_key=private_key, - dialect=dialect, - **kwargs) + query, project_id=project_id, index_col=index_col, + col_order=col_order, reauth=reauth, verbose=verbose, + private_key=private_key, auth_local_webserver=auth_local_webserver, + dialect=dialect, location=location, configuration=configuration) -def to_gbq(dataframe, destination_table, project_id, chunksize=None, +def to_gbq(dataframe, destination_table, project_id=None, chunksize=None, verbose=None, reauth=False, if_exists='fail', private_key=None, - auth_local_webserver=False, table_schema=None): + auth_local_webserver=False, table_schema=None, location=None, + progress_bar=True): pandas_gbq = _try_import() return pandas_gbq.to_gbq( - dataframe, destination_table, project_id, chunksize=chunksize, - verbose=verbose, reauth=reauth, if_exists=if_exists, - private_key=private_key, auth_local_webserver=auth_local_webserver, - table_schema=table_schema) + dataframe, destination_table, project_id=project_id, + chunksize=chunksize, verbose=verbose, reauth=reauth, + if_exists=if_exists, private_key=private_key, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, location=location, + progress_bar=progress_bar) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 58a84ad4d47f8..dc6c319bb3366 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -2,7 +2,6 @@ from datetime import datetime import pytz import platform -from time import sleep import os import numpy as np @@ -48,16 +47,18 @@ def _in_travis_environment(): def _get_project_id(): if _in_travis_environment(): return os.environ.get('GBQ_PROJECT_ID') - else: - return PROJECT_ID + return PROJECT_ID or os.environ.get('GBQ_PROJECT_ID') def _get_private_key_path(): if _in_travis_environment(): return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', 'travis_gbq.json']) - else: - return PRIVATE_KEY_JSON_PATH + + private_key_path = PRIVATE_KEY_JSON_PATH + if not private_key_path: + private_key_path = os.environ.get('GBQ_GOOGLE_APPLICATION_CREDENTIALS') + return private_key_path def clean_gbq_environment(private_key=None): @@ -123,11 +124,9 @@ def 
test_roundtrip(self): test_size = 20001 df = make_mixed_dataframe_v2(test_size) - df.to_gbq(destination_table, _get_project_id(), chunksize=10000, + df.to_gbq(destination_table, _get_project_id(), chunksize=None, private_key=_get_private_key_path()) - sleep(30) # <- Curses Google!!! - result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), From 476717ce0e6a195b5474cf247c6f610800062975 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 26 Jun 2018 15:27:29 -0700 Subject: [PATCH 104/113] More speedups for Period comparisons (#21606) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/_libs/tslibs/offsets.pyx | 44 +++++++++++++++++++++++++++++++++ pandas/tseries/offsets.py | 31 +---------------------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 60c3e4df8d129..5e757987d4518 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -135,7 +135,7 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) -- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`) +- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`) - .. _whatsnew_0240.docs: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 63add06db17b4..b4b27b99bdb30 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -88,6 +88,15 @@ for _d in DAYS: # --------------------------------------------------------------------- # Misc Helpers +cdef to_offset(object obj): + """ + Wrap pandas.tseries.frequencies.to_offset to keep centralize runtime + imports + """ + from pandas.tseries.frequencies import to_offset + return to_offset(obj) + + def as_datetime(obj): f = getattr(obj, 'to_pydatetime', None) if f is not None: @@ -313,6 +322,41 @@ class _BaseOffset(object): def __setattr__(self, name, value): raise AttributeError("DateOffset objects are immutable.") + def __eq__(self, other): + if is_string_object(other): + other = to_offset(other) + + try: + return self._params == other._params + except AttributeError: + # other is not a DateOffset object + return False + + return self._params == other._params + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self._params) + + @property + def _params(self): + """ + Returns a tuple containing all of the attributes needed to evaluate + equality between two DateOffset objects. 
+ """ + # NB: non-cython subclasses override property with cache_readonly + all_paras = self.__dict__.copy() + if 'holidays' in all_paras and not all_paras['holidays']: + all_paras.pop('holidays') + exclude = ['kwds', 'name', 'calendar'] + attrs = [(k, v) for k, v in all_paras.items() + if (k not in exclude) and (k[0] != '_')] + attrs = sorted(set(attrs)) + params = tuple([str(self.__class__)] + attrs) + return params + @property def kwds(self): # for backwards-compatibility diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a3f82c1a0902e..1cfd3f476f8ab 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -182,6 +182,7 @@ def __add__(date): Since 0 is a bit weird, we suggest avoiding its use. """ + _params = cache_readonly(BaseOffset._params.fget) _use_relativedelta = False _adjust_dst = False _attributes = frozenset(['n', 'normalize'] + @@ -288,18 +289,6 @@ def isAnchored(self): # if there were a canonical docstring for what isAnchored means. return (self.n == 1) - @cache_readonly - def _params(self): - all_paras = self.__dict__.copy() - if 'holidays' in all_paras and not all_paras['holidays']: - all_paras.pop('holidays') - exclude = ['kwds', 'name', 'calendar'] - attrs = [(k, v) for k, v in all_paras.items() - if (k not in exclude) and (k[0] != '_')] - attrs = sorted(set(attrs)) - params = tuple([str(self.__class__)] + attrs) - return params - # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` @@ -322,24 +311,6 @@ def _repr_attrs(self): def name(self): return self.rule_code - def __eq__(self, other): - - if isinstance(other, compat.string_types): - from pandas.tseries.frequencies import to_offset - - other = to_offset(other) - - if not isinstance(other, DateOffset): - return False - - return self._params == other._params - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash(self._params) - def __add__(self, other): if isinstance(other, (ABCDatetimeIndex, ABCSeries)): return other + self From 001dc78fb1c774564732cb38b362038b3f6968d8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 26 Jun 2018 15:29:47 -0700 Subject: [PATCH 105/113] use ccalendar instead of np_datetime (#21549) --- pandas/_libs/tslibs/ccalendar.pxd | 2 +- pandas/_libs/tslibs/np_datetime.pxd | 4 ---- pandas/_libs/tslibs/offsets.pyx | 10 ++-------- pandas/_libs/tslibs/period.pyx | 3 +-- setup.py | 1 + 5 files changed, 5 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 42473a97a7150..04fb6eaf49c84 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -6,7 +6,7 @@ from cython cimport Py_ssize_t from numpy cimport int64_t, int32_t -cdef int dayofweek(int y, int m, int m) nogil +cdef int dayofweek(int y, int m, int d) nogil cdef bint is_leapyear(int64_t year) nogil cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil cpdef int32_t get_week_of_year(int year, int month, int day) nogil diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 33b8b32bcf2dc..1a0baa8271643 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -54,10 +54,6 @@ cdef extern from "../src/datetime/np_datetime.h": PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result) nogil - int days_per_month_table[2][12] - int dayofweek(int y, int m, int d) 
nogil - int is_leapyear(int64_t year) nogil - cdef int reverse_ops[6] diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b4b27b99bdb30..841db80cf094e 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -18,12 +18,12 @@ cnp.import_array() from util cimport is_string_object, is_integer_object from ccalendar import MONTHS, DAYS +from ccalendar cimport get_days_in_month, dayofweek from conversion cimport tz_convert_single, pydt_to_i8 from frequencies cimport get_freq_code from nattype cimport NPY_NAT from np_datetime cimport (pandas_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, - is_leapyear, days_per_month_table, dayofweek) + dtstruct_to_dt64, dt64_to_dtstruct) # --------------------------------------------------------------------- # Constants @@ -494,12 +494,6 @@ class BaseOffset(_BaseOffset): # ---------------------------------------------------------------------- # RelativeDelta Arithmetic -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline int get_days_in_month(int year, int month) nogil: - return days_per_month_table[is_leapyear(year)][month - 1] - - cdef inline int year_add_months(pandas_datetimestruct dts, int months) nogil: """new year number after shifting pandas_datetimestruct number of months""" return dts.year + (dts.month + months - 1) / 12 diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index cc2fb6e0617cb..49208056f88fe 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -45,9 +45,8 @@ from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info from timedeltas cimport delta_to_nanoseconds cimport ccalendar -from ccalendar cimport dayofweek, get_day_of_year +from ccalendar cimport dayofweek, get_day_of_year, is_leapyear from ccalendar import MONTH_NUMBERS -from ccalendar cimport is_leapyear from conversion cimport tz_convert_utc_to_tzlocal from frequencies cimport (get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str, diff --git a/setup.py b/setup.py index 0fd008612b5bd..621655dd05dbc 100755 --- a/setup.py +++ b/setup.py @@ -592,6 +592,7 @@ def pxd(name): '_libs.tslibs.offsets': { 'pyxfile': '_libs/tslibs/offsets', 'pxdfiles': ['_libs/src/util', + '_libs/tslibs/ccalendar', '_libs/tslibs/conversion', '_libs/tslibs/frequencies', '_libs/tslibs/nattype'], From 59286daee4083a4b1760e4d04365576fca52f8bd Mon Sep 17 00:00:00 2001 From: Stephen Pascoe Date: Sun, 30 Aug 2015 08:58:08 +0100 Subject: [PATCH 106/113] ENH: Function to walk the group hierarchy of a PyTables HDF5 file. closes #10143 --- doc/source/api.rst | 1 + doc/source/io.rst | 19 ++++++++++++ doc/source/whatsnew/v0.24.0.txt | 3 +- pandas/io/pytables.py | 47 +++++++++++++++++++++++++++++ pandas/tests/io/test_pytables.py | 51 ++++++++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 1 deletion(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index f2c00d5d12031..8dc5d0e9fc023 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -100,6 +100,7 @@ HDFStore: PyTables (HDF5) HDFStore.select HDFStore.info HDFStore.keys + HDFStore.walk Feather ~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index ae6c4f12f04f7..9fe578524c8e0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3554,6 +3554,25 @@ everything in the sub-store and **below**, so be *careful*. 
store.remove('food') store + +You can walk through the group hierarchy using the ``walk`` method which +will yield a tuple for each group key along with the relative keys of its contents. + +.. versionadded:: 0.24.0 + + +.. ipython:: python + + for (path, subgroups, subkeys) in store.walk(): + for subgroup in subgroups: + print('GROUP: {}/{}'.format(path, subgroup)) + for subkey in subkeys: + key = '/'.join([path, subkey]) + print('KEY: {}'.format(key)) + print(store.get(key)) + + + .. warning:: Hierarchical keys cannot be retrieved as dotted (attribute) access as described above for items stored under the root node. diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5e757987d4518..abf574ae109fd 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -23,7 +23,8 @@ Other Enhancements reflect changes from the `Pandas-GBQ library version 0.5.0 `__. (:issue:`21627`) - +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- .. _whatsnew_0240.api_breaking: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 580c7923017e5..f93ad425b2c6a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1106,6 +1106,53 @@ def groups(self): g._v_name != u('table')))) ] + def walk(self, where="/"): + """ Walk the pytables group hierarchy for pandas objects + + This generator will yield the group path, subgroups and pandas object + names for each group. + Any non-pandas PyTables objects that are not a group will be ignored. + + The `where` group itself is listed first (preorder), then each of its + child groups (following an alphanumerical order) is also traversed, + following the same procedure. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + where : str, optional + Group where to start walking. + If not supplied, the root group is used. 
+ + Yields + ------ + path : str + Full path to a group (without trailing '/') + groups : list of str + names of the groups contained in `path` + leaves : list of str + names of the pandas objects contained in `path` + + """ + _tables() + self._check_if_open() + for g in self._handle.walk_groups(where): + if getattr(g._v_attrs, 'pandas_type', None) is not None: + continue + + groups = [] + leaves = [] + for child in g._v_children.values(): + pandas_type = getattr(child._v_attrs, 'pandas_type', None) + if pandas_type is None: + if isinstance(child, _table_mod.group.Group): + groups.append(child._v_name) + else: + leaves.append(child._v_name) + + yield (g._v_pathname.rstrip('/'), groups, leaves) + def get_node(self, key): """ return the node with the key or None if it does not exist """ self._check_if_open() diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index b95df3840b6c5..29063b64221c1 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -635,6 +635,57 @@ def test_get(self): pytest.raises(KeyError, store.get, 'b') + @pytest.mark.parametrize('where, expected', [ + ('/', { + '': ({'first_group', 'second_group'}, set()), + '/first_group': (set(), {'df1', 'df2'}), + '/second_group': ({'third_group'}, {'df3', 's1'}), + '/second_group/third_group': (set(), {'df4'}), + }), + ('/second_group', { + '/second_group': ({'third_group'}, {'df3', 's1'}), + '/second_group/third_group': (set(), {'df4'}), + }) + ]) + def test_walk(self, where, expected): + # GH10143 + objs = { + 'df1': pd.DataFrame([1, 2, 3]), + 'df2': pd.DataFrame([4, 5, 6]), + 'df3': pd.DataFrame([6, 7, 8]), + 'df4': pd.DataFrame([9, 10, 11]), + 's1': pd.Series([10, 9, 8]), + # Next 3 items aren't pandas objects and should be ignored + 'a1': np.array([[1, 2, 3], [4, 5, 6]]), + 'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'), + 'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i') + } + + with ensure_clean_store('walk_groups.hdf', mode='w') as store: + store.put('/first_group/df1', objs['df1']) + store.put('/first_group/df2', objs['df2']) + store.put('/second_group/df3', objs['df3']) + store.put('/second_group/s1', objs['s1']) + store.put('/second_group/third_group/df4', objs['df4']) + # Create non-pandas objects + store._handle.create_array('/first_group', 'a1', objs['a1']) + store._handle.create_table('/first_group', 'tb1', obj=objs['tb1']) + store._handle.create_table('/second_group', 'tb2', obj=objs['tb2']) + + assert len(list(store.walk(where=where))) == len(expected) + for path, groups, leaves in store.walk(where=where): + assert path in expected + expected_groups, expected_frames = expected[path] + assert expected_groups == set(groups) + assert expected_frames == set(leaves) + for leaf in leaves: + frame_path = '/'.join([path, leaf]) + obj = store.get(frame_path) + if 'df' in leaf: + tm.assert_frame_equal(obj, objs[leaf]) + else: + tm.assert_series_equal(obj, objs[leaf]) + def test_getattr(self): with ensure_clean_store(self.path) as store: From dad12525b3786327c1230d88d7e583f0df8bbc9d Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 27 Jun 2018 03:57:55 -0600 Subject: [PATCH 107/113] DOC: Fix versionadded directive typos in IntervalIndex (#21649) --- pandas/core/indexes/interval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index eb9d7efc06c27..23a655b9a51ee 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ 
-160,7 +160,7 @@ class IntervalIndex(IntervalMixin, Index): dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Attributes ---------- @@ -438,7 +438,7 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -568,7 +568,7 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -619,7 +619,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -671,7 +671,7 @@ def to_tuples(self, na_tuple=True): Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA value itself if False, ``nan``. - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- From b3b047eaa7b8e30675fa8ac33b4b34848b3c4b8c Mon Sep 17 00:00:00 2001 From: LeakedMemory Date: Wed, 27 Jun 2018 06:23:22 -0500 Subject: [PATCH 108/113] TST: Use absolute path for datapath (#21647) --- pandas/conftest.py | 4 +++- pandas/tests/util/test_testing.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 82d860b091b82..5b9c162a0a022 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -153,8 +153,10 @@ def datapath(request): ValueError If the path doesn't exist and the --strict-data-files option is set. """ + BASE_PATH = os.path.join(os.path.dirname(__file__), 'tests') + def deco(*args): - path = os.path.join('pandas', 'tests', *args) + path = os.path.join(BASE_PATH, *args) if not os.path.exists(path): if request.config.getoption("--strict-data-files"): msg = "Could not find file {} and --strict-data-files is set." diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 4d34987e14f75..95ea4658212e9 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -852,5 +852,10 @@ def test_datapath_missing(datapath, request): datapath('not_a_file') result = datapath('data', 'iris.csv') - expected = os.path.join('pandas', 'tests', 'data', 'iris.csv') + expected = os.path.join( + os.path.dirname(os.path.dirname(__file__)), + 'data', + 'iris.csv' + ) + assert result == expected From 242ccbcf14b705f4e6d3513e952e8a18c4acb798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phan=20Taljaard?= Date: Wed, 27 Jun 2018 13:59:59 +0200 Subject: [PATCH 109/113] DOC: update DataFrame.dropna's axis argument docs (#21652) --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b553cfdc72c92..42a68de52a3c4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4186,6 +4186,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, .. deprecated:: 0.23.0 Pass tuple or list to drop on multiple axes. + Only a single axis is allowed. 
how : {'any', 'all'}, default 'any' Determine if row or column is removed from DataFrame, when we have From 8cbfcbff06bf5a74b9c928e42e64d9eedcfe5b3c Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 27 Jun 2018 09:29:15 -0600 Subject: [PATCH 110/113] BUG: Let IntervalIndex constructor override inferred closed (#21584) --- doc/source/whatsnew/v0.24.0.txt | 7 +++++ pandas/_libs/interval.pyx | 19 +++++++++++--- pandas/core/indexes/interval.py | 14 +++------- .../indexes/interval/test_construction.py | 26 ++++++++++++++----- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index abf574ae109fd..406ca9ba045c9 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -205,6 +205,13 @@ Strings - - +Interval +^^^^^^^^ + +- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- +- + Indexing ^^^^^^^^ diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 5dbf509fda65e..fbb7265a17f8b 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -335,11 +335,17 @@ cdef class Interval(IntervalMixin): @cython.wraparound(False) @cython.boundscheck(False) -cpdef intervals_to_interval_bounds(ndarray intervals): +cpdef intervals_to_interval_bounds(ndarray intervals, + bint validate_closed=True): """ Parameters ---------- - intervals: ndarray object array of Intervals / nulls + intervals : ndarray + object array of Intervals / nulls + + validate_closed: boolean, default True + boolean indicating if all intervals must be closed on the same side. + Mismatching closed will raise if True, else return None for closed. Returns ------- @@ -353,6 +359,7 @@ cpdef intervals_to_interval_bounds(ndarray intervals): object closed = None, interval int64_t n = len(intervals) ndarray left, right + bint seen_closed = False left = np.empty(n, dtype=intervals.dtype) right = np.empty(n, dtype=intervals.dtype) @@ -370,10 +377,14 @@ cpdef intervals_to_interval_bounds(ndarray intervals): left[i] = interval.left right[i] = interval.right - if closed is None: + if not seen_closed: + seen_closed = True closed = interval.closed elif closed != interval.closed: - raise ValueError('intervals must all be closed on the same side') + closed = None + if validate_closed: + msg = 'intervals must all be closed on the same side' + raise ValueError(msg) return left, right, closed diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 23a655b9a51ee..80619c7beb28c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -233,7 +233,7 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, if isinstance(data, IntervalIndex): left = data.left right = data.right - closed = data.closed + closed = closed or data.closed else: # don't allow scalars @@ -241,16 +241,8 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, cls._scalar_data_error(data) data = maybe_convert_platform_interval(data) - left, right, infer_closed = intervals_to_interval_bounds(data) - - if (com._all_not_none(closed, infer_closed) and - closed != infer_closed): - # GH 18421 - msg = ("conflicting values for closed: constructor got " - "'{closed}', inferred from data '{infer_closed}'" - .format(closed=closed, infer_closed=infer_closed)) - raise ValueError(msg) - + left, right, infer_closed = intervals_to_interval_bounds( + data, validate_closed=closed is None) closed = closed or 
infer_closed return cls._simple_new(left, right, closed, name, copy=copy, diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index ac946a3421e53..3745f79d7d65d 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -312,13 +312,7 @@ def test_generic_errors(self, constructor): pass def test_constructor_errors(self, constructor): - # mismatched closed inferred from intervals vs constructor. - ivs = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')] - msg = 'conflicting values for closed' - with tm.assert_raises_regex(ValueError, msg): - constructor(ivs, closed='neither') - - # mismatched closed within intervals + # mismatched closed within intervals with no constructor override ivs = [Interval(0, 1, closed='right'), Interval(2, 3, closed='left')] msg = 'intervals must all be closed on the same side' with tm.assert_raises_regex(ValueError, msg): @@ -336,6 +330,24 @@ def test_constructor_errors(self, constructor): with tm.assert_raises_regex(TypeError, msg): constructor([0, 1]) + @pytest.mark.parametrize('data, closed', [ + ([], 'both'), + ([np.nan, np.nan], 'neither'), + ([Interval(0, 3, closed='neither'), + Interval(2, 5, closed='neither')], 'left'), + ([Interval(0, 3, closed='left'), + Interval(2, 5, closed='right')], 'neither'), + (IntervalIndex.from_breaks(range(5), closed='both'), 'right')]) + def test_override_inferred_closed(self, constructor, data, closed): + # GH 19370 + if isinstance(data, IntervalIndex): + tuples = data.to_tuples() + else: + tuples = [(iv.left, iv.right) if notna(iv) else iv for iv in data] + expected = IntervalIndex.from_tuples(tuples, closed=closed) + result = constructor(data, closed=closed) + tm.assert_index_equal(result, expected) + class TestFromIntervals(TestClassConstructors): """ From d07e61b5b3542a9a86868b155b946f87b4ad31c9 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 28 Jun 2018 03:11:48 -0700 Subject: [PATCH 111/113] TST: Use fixtures in dtypes/test_cast.py (#21661) --- pandas/conftest.py | 12 ++ pandas/tests/dtypes/test_cast.py | 249 ++++++++++++++++--------------- 2 files changed, 138 insertions(+), 123 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5b9c162a0a022..803b3add97052 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -245,6 +245,18 @@ def float_dtype(request): return request.param +@pytest.fixture(params=["complex64", "complex128"]) +def complex_dtype(request): + """ + Parameterized fixture for complex dtypes. + + * complex64 + * complex128 + """ + + return request.param + + UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"] ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 4a19682e2c558..0d6382424ccf5 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -36,7 +36,7 @@ class TestMaybeDowncast(object): - def test_downcast_conv(self): + def test_downcast(self): # test downcasting arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) @@ -53,33 +53,34 @@ def test_downcast_conv(self): expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - # GH16875 coercing of bools + # see gh-16875: coercing of booleans. 
ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected) - # conversions - + @pytest.mark.parametrize("dtype", [np.float64, object, np.int64]) + def test_downcast_conversion_no_nan(self, dtype): expected = np.array([1, 2]) - for dtype in [np.float64, object, np.int64]: - arr = np.array([1.0, 2.0], dtype=dtype) - result = maybe_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected, check_dtype=False) - - for dtype in [np.float64, object]: - expected = np.array([1.0, 2.0, np.nan], dtype=dtype) - arr = np.array([1.0, 2.0, np.nan], dtype=dtype) - result = maybe_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected) - - # empties - for dtype in [np.int32, np.float64, np.float32, np.bool_, - np.int64, object]: - arr = np.array([], dtype=dtype) - result = maybe_downcast_to_dtype(arr, 'int64') - tm.assert_almost_equal(result, np.array([], dtype=np.int64)) - assert result.dtype == np.int64 + arr = np.array([1.0, 2.0], dtype=dtype) + + result = maybe_downcast_to_dtype(arr, "infer") + tm.assert_almost_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("dtype", [np.float64, object]) + def test_downcast_conversion_nan(self, dtype): + expected = np.array([1.0, 2.0, np.nan], dtype=dtype) + arr = np.array([1.0, 2.0, np.nan], dtype=dtype) + + result = maybe_downcast_to_dtype(arr, "infer") + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("dtype", [np.int32, np.float64, np.float32, + np.bool_, np.int64, object]) + def test_downcast_conversion_empty(self, dtype): + arr = np.array([], dtype=dtype) + result = maybe_downcast_to_dtype(arr, "int64") + tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) def test_datetimelikes_nan(self): arr = np.array([1, 2, np.nan]) @@ -104,66 +105,71 @@ def test_datetime_with_timezone(self): class TestInferDtype(object): - def testinfer_dtype_from_scalar(self): - # Test that infer_dtype_from_scalar is returning correct dtype for int - # and float. + def test_infer_dtype_from_int_scalar(self, any_int_dtype): + # Test that infer_dtype_from_scalar is + # returning correct dtype for int and float. 
+ data = np.dtype(any_int_dtype).type(12) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == type(data) + + def test_infer_dtype_from_float_scalar(self, float_dtype): + float_dtype = np.dtype(float_dtype).type + data = float_dtype(12) - for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, - np.int32, np.uint64, np.int64]: - data = dtypec(12) - dtype, val = infer_dtype_from_scalar(data) - assert dtype == type(data) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == float_dtype + def test_infer_dtype_from_python_scalar(self): data = 12 dtype, val = infer_dtype_from_scalar(data) assert dtype == np.int64 - for dtypec in [np.float16, np.float32, np.float64]: - data = dtypec(12) - dtype, val = infer_dtype_from_scalar(data) - assert dtype == dtypec - data = np.float(12) dtype, val = infer_dtype_from_scalar(data) assert dtype == np.float64 - for data in [True, False]: - dtype, val = infer_dtype_from_scalar(data) - assert dtype == np.bool_ + @pytest.mark.parametrize("bool_val", [True, False]) + def test_infer_dtype_from_boolean(self, bool_val): + dtype, val = infer_dtype_from_scalar(bool_val) + assert dtype == np.bool_ - for data in [np.complex64(1), np.complex128(1)]: - dtype, val = infer_dtype_from_scalar(data) - assert dtype == np.complex_ + def test_infer_dtype_from_complex(self, complex_dtype): + data = np.dtype(complex_dtype).type(1) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == np.complex_ - for data in [np.datetime64(1, 'ns'), Timestamp(1), - datetime(2000, 1, 1, 0, 0)]: - dtype, val = infer_dtype_from_scalar(data) - assert dtype == 'M8[ns]' + @pytest.mark.parametrize("data", [np.datetime64(1, "ns"), Timestamp(1), + datetime(2000, 1, 1, 0, 0)]) + def test_infer_dtype_from_datetime(self, data): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == "M8[ns]" - for data in [np.timedelta64(1, 'ns'), Timedelta(1), - timedelta(1)]: - dtype, val = infer_dtype_from_scalar(data) - assert dtype == 'm8[ns]' + @pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), + timedelta(1)]) + def test_infer_dtype_from_timedelta(self, data): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == "m8[ns]" - for freq in ['M', 'D']: - p = Period('2011-01-01', freq=freq) - dtype, val = infer_dtype_from_scalar(p, pandas_dtype=True) - assert dtype == 'period[{0}]'.format(freq) - assert val == p.ordinal + @pytest.mark.parametrize("freq", ["M", "D"]) + def test_infer_dtype_from_period(self, freq): + p = Period("2011-01-01", freq=freq) + dtype, val = infer_dtype_from_scalar(p, pandas_dtype=True) - dtype, val = infer_dtype_from_scalar(p) - dtype == np.object_ - assert val == p + assert dtype == "period[{0}]".format(freq) + assert val == p.ordinal - # misc - for data in [date(2000, 1, 1), - Timestamp(1, tz='US/Eastern'), 'foo']: + dtype, val = infer_dtype_from_scalar(p) + assert dtype == np.object_ + assert val == p - dtype, val = infer_dtype_from_scalar(data) - assert dtype == np.object_ + @pytest.mark.parametrize("data", [date(2000, 1, 1), "foo", + Timestamp(1, tz="US/Eastern")]) + def test_infer_dtype_misc(self, data): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == np.object_ @pytest.mark.parametrize('tz', ['UTC', 'US/Eastern', 'Asia/Tokyo']) - def testinfer_from_scalar_tz(self, tz): + def test_infer_from_scalar_tz(self, tz): dt = Timestamp(1, tz=tz) dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=True) assert dtype == 'datetime64[ns, {0}]'.format(tz) @@ -173,7 +179,7 @@ def testinfer_from_scalar_tz(self, tz): 
assert dtype == np.object_ assert val == dt - def testinfer_dtype_from_scalar_errors(self): + def test_infer_dtype_from_scalar_errors(self): with pytest.raises(ValueError): infer_dtype_from_scalar(np.array([1])) @@ -329,66 +335,63 @@ def test_maybe_convert_objects_copy(self): class TestCommonTypes(object): - def test_numpy_dtypes(self): - # (source_types, destination_type) - testcases = ( - # identity - ((np.int64,), np.int64), - ((np.uint64,), np.uint64), - ((np.float32,), np.float32), - ((np.object,), np.object), - - # into ints - ((np.int16, np.int64), np.int64), - ((np.int32, np.uint32), np.int64), - ((np.uint16, np.uint64), np.uint64), - - # into floats - ((np.float16, np.float32), np.float32), - ((np.float16, np.int16), np.float32), - ((np.float32, np.int16), np.float32), - ((np.uint64, np.int64), np.float64), - ((np.int16, np.float64), np.float64), - ((np.float16, np.int64), np.float64), - - # into others - ((np.complex128, np.int32), np.complex128), - ((np.object, np.float32), np.object), - ((np.object, np.int16), np.object), - - # bool with int - ((np.dtype('bool'), np.int64), np.object), - ((np.dtype('bool'), np.int32), np.object), - ((np.dtype('bool'), np.int16), np.object), - ((np.dtype('bool'), np.int8), np.object), - ((np.dtype('bool'), np.uint64), np.object), - ((np.dtype('bool'), np.uint32), np.object), - ((np.dtype('bool'), np.uint16), np.object), - ((np.dtype('bool'), np.uint8), np.object), - - # bool with float - ((np.dtype('bool'), np.float64), np.object), - ((np.dtype('bool'), np.float32), np.object), - - ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')), - np.dtype('datetime64[ns]')), - ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')), - np.dtype('timedelta64[ns]')), - - ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ms]')), - np.dtype('datetime64[ns]')), - ((np.dtype('timedelta64[ms]'), np.dtype('timedelta64[ns]')), - np.dtype('timedelta64[ns]')), - - ((np.dtype('datetime64[ns]'), np.dtype('timedelta64[ns]')), - np.object), - ((np.dtype('datetime64[ns]'), np.int64), np.object) - ) - for src, common in testcases: - assert find_common_type(src) == common - + @pytest.mark.parametrize("source_dtypes,expected_common_dtype", [ + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + + # into ints + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + + # into floats + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + + # into others + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + + # bool with int + ((np.dtype('bool'), np.int64), np.object), + ((np.dtype('bool'), np.int32), np.object), + ((np.dtype('bool'), np.int16), np.object), + ((np.dtype('bool'), np.int8), np.object), + ((np.dtype('bool'), np.uint64), np.object), + ((np.dtype('bool'), np.uint32), np.object), + ((np.dtype('bool'), np.uint16), np.object), + ((np.dtype('bool'), np.uint8), np.object), + + # bool with float + ((np.dtype('bool'), np.float64), np.object), + ((np.dtype('bool'), np.float32), np.object), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + 
((np.dtype('datetime64[ns]'), np.dtype('datetime64[ms]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ms]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('timedelta64[ns]')), + np.object), + ((np.dtype('datetime64[ns]'), np.int64), np.object) + ]) + def test_numpy_dtypes(self, source_dtypes, expected_common_dtype): + assert find_common_type(source_dtypes) == expected_common_dtype + + def test_raises_empty_input(self): with pytest.raises(ValueError): - # empty find_common_type([]) def test_categorical_dtype(self): From 0829063acb06b326d645ce094b029b8a1f4dcb3f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 28 Jun 2018 03:22:59 -0700 Subject: [PATCH 112/113] TST: Clean old timezone issues PT2 (#21612) --- doc/source/whatsnew/v0.24.0.txt | 24 +++++++++++-------- pandas/conftest.py | 17 +++++++++++++ pandas/tests/frame/test_indexing.py | 10 ++++++++ .../indexes/datetimes/test_arithmetic.py | 4 ++++ .../indexes/datetimes/test_date_range.py | 9 +++++++ pandas/tests/indexing/test_datetime.py | 14 +++++++++++ .../tests/scalar/timestamp/test_timestamp.py | 8 +++++++ pandas/tests/series/test_constructors.py | 8 +++++++ pandas/tests/series/test_replace.py | 7 ++++++ pandas/tests/series/test_timezones.py | 8 +++++++ 10 files changed, 99 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 406ca9ba045c9..1105acda067d3 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -164,12 +164,6 @@ Datetimelike ^^^^^^^^^^^^ - Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError``(:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) Timedelta ^^^^^^^^^ @@ -181,9 +175,15 @@ Timedelta Timezones ^^^^^^^^^ -- -- -- +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError``(:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) +- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) +- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) +- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) +- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive 
:class:`Timestamp`s to tz-aware (:issue:`13051`) +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) Offsets ^^^^^^^ @@ -217,7 +217,10 @@ Indexing - The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) - When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError`` - consistently with the case of a flat :class:`Int64Index` - rather than falling back to positional indexing (:issue:`21593`) -- +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) + - MultiIndex @@ -245,6 +248,7 @@ Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) - - diff --git a/pandas/conftest.py b/pandas/conftest.py index 803b3add97052..ae08e0817de29 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -320,3 +320,20 @@ def mock(): return importlib.import_module("unittest.mock") else: return pytest.importorskip("mock") + + +@pytest.fixture(params=['__eq__', '__ne__', '__le__', + '__lt__', '__ge__', '__gt__']) +def all_compare_operators(request): + """ + Fixture for dunder names for common compare operations + + * >= + * > + * == + * != + * < + * <= + """ + + return request.param diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index be37e696ea0a3..c7aaf900b17fa 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2248,6 +2248,16 @@ def test_setitem_datetimelike_with_inference(self): index=list('ABCDEFGH')) assert_series_equal(result, expected) + @pytest.mark.parametrize('idxer', ['var', ['var']]) + def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): + # GH 11365 + tz = tz_naive_fixture + idx = date_range(start='2015-07-12', periods=3, freq='H', tz=tz) + expected = DataFrame(1.2, index=idx, columns=['var']) + result = DataFrame(index=idx, columns=['var']) + result.loc[:, idxer] = expected + tm.assert_frame_equal(result, expected) + def test_at_time_between_time_datetimeindex(self): index = date_range("2012-01-01", "2012-01-05", freq='30min') df = DataFrame(randn(len(index), 5), index=index) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 0649083a440df..ff31ffee13217 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -276,6 +276,10 @@ def test_comparison_tzawareness_compat(self, op): with pytest.raises(TypeError): op(dz, ts) + # GH 12601: Check comparison against Timestamps and DatetimeIndex + with pytest.raises(TypeError): + op(ts, dz) + @pytest.mark.parametrize('op', [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le]) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index ec37bbbcb6c02..47d4d15420f1d 100644 
--- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -292,6 +292,15 @@ def test_construct_over_dst(self): freq='H', tz='US/Pacific') tm.assert_index_equal(result, expected) + def test_construct_with_different_start_end_string_format(self): + # GH 12064 + result = date_range('2013-01-01 00:00:00+09:00', + '2013/01/01 02:00:00+09:00', freq='H') + expected = DatetimeIndex([Timestamp('2013-01-01 00:00:00+09:00'), + Timestamp('2013-01-01 01:00:00+09:00'), + Timestamp('2013-01-01 02:00:00+09:00')]) + tm.assert_index_equal(result, expected) + class TestGenRangeGeneration(object): diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index a5c12e4152c90..751372380d262 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -252,3 +252,17 @@ def test_series_partial_set_period(self): check_stacklevel=False): result = ser.loc[keys] tm.assert_series_equal(result, exp) + + def test_nanosecond_getitem_setitem_with_tz(self): + # GH 11679 + data = ['2016-06-28 08:30:00.123456789'] + index = pd.DatetimeIndex(data, dtype='datetime64[ns, America/Chicago]') + df = DataFrame({'a': [10]}, index=index) + result = df.loc[df.index[0]] + expected = Series(10, index=['a'], name=df.index[0]) + tm.assert_series_equal(result, expected) + + result = df.copy() + result.loc[df.index[0], 'a'] = -1 + expected = DataFrame(-1, index=index, columns=['a']) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 8dc9903b7356d..5272059163a07 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -542,6 +542,14 @@ def test_construct_timestamp_near_dst(self, offset): result = Timestamp(expected, tz='Europe/Helsinki') assert result == expected + @pytest.mark.parametrize('arg', [ + '2013/01/01 00:00:00+09:00', '2013-01-01 00:00:00+09:00']) + def test_construct_with_different_string_format(self, arg): + # GH 12064 + result = Timestamp(arg) + expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + assert result == expected + class TestTimestamp(object): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 27cfec0dbf20d..fe224436c52e6 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1185,3 +1185,11 @@ def test_constructor_range_dtype(self, dtype): expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64') result = Series(range(5), dtype=dtype) tm.assert_series_equal(result, expected) + + def test_constructor_tz_mixed_data(self): + # GH 13051 + dt_list = [Timestamp('2016-05-01 02:03:37'), + Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')] + result = Series(dt_list) + expected = Series(dt_list, dtype=object) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 2c07d87865f53..a3b92798879f5 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -108,6 +108,13 @@ def test_replace_gh5319(self): pd.Timestamp('20120101')) tm.assert_series_equal(result, expected) + # GH 11792: Test with replacing NaT in a list with tz data + ts = pd.Timestamp('2015/01/01', tz='UTC') + s = pd.Series([pd.NaT, pd.Timestamp('2015/01/01', tz='UTC')]) + result = s.replace([np.nan, pd.NaT], pd.Timestamp.min) + expected 
= pd.Series([pd.Timestamp.min, ts], dtype=object) + tm.assert_series_equal(expected, result) + def test_replace_with_single_list(self): ser = pd.Series([0, 1, 2, 3, 4]) result = ser.replace([1, 2, 3]) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index b54645d04bd1a..f2433163352ac 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -300,3 +300,11 @@ def test_getitem_pydatetime_tz(self, tzstr): dt = datetime(2012, 12, 24, 17, 0) time_datetime = tslib._localize_pydatetime(dt, tz) assert ts[time_pandas] == ts[time_datetime] + + def test_series_truncate_datetimeindex_tz(self): + # GH 9243 + idx = date_range('4/1/2005', '4/30/2005', freq='D', tz='US/Pacific') + s = Series(range(len(idx)), index=idx) + result = s.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4)) + expected = Series([1, 2, 3], index=idx[1:4]) + tm.assert_series_equal(result, expected) From da3b9031cd1ad48ae026710a7109403962eaba59 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 28 Jun 2018 20:55:45 +0100 Subject: [PATCH 113/113] Whatsnew Timestamp bug --- doc/source/whatsnew/v0.23.2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9c4b408a1d24b..8c36d51a5fd16 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -54,7 +54,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) -- +- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) .. _whatsnew_0232.performance:
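A minimal sketch of the behaviour the whatsnew entry above describes, assuming a pandas build that includes the patches in this series (GH 21262): a Timestamp that already sits on an exact multiple of the rounding frequency is returned unchanged by round/floor/ceil, while off-boundary values still round as before.

    import pandas as pd

    # On-boundary timestamp: an exact multiple of the 15-minute frequency,
    # so round/floor/ceil should all return it unchanged after this fix.
    ts = pd.Timestamp('2018-01-01 00:15:00')
    assert ts.floor('15T') == ts
    assert ts.ceil('15T') == ts
    assert ts.round('15T') == ts

    # Off-boundary timestamp: still rounds to the nearest multiple of the
    # frequency (here 2 minutes), exactly as before the fix.
    assert (pd.Timestamp('2018-01-01 00:02:06').round('2T') ==
            pd.Timestamp('2018-01-01 00:02:00'))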