From 03d07dfe353d37de5865879ea5acd237b4df1772 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Apr 2020 12:22:16 -0700 Subject: [PATCH 01/26] BUG: 2D indexing on DTA/TDA/PA --- pandas/core/arrays/datetimelike.py | 12 ++-------- pandas/core/indexes/extension.py | 5 +++- pandas/tests/arrays/test_datetimelike.py | 29 ++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c0bbbebac7c33..4fabd8f558fee 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -550,10 +550,7 @@ def __getitem__(self, key): key = np.asarray(key, dtype=bool) key = check_array_indexer(self, key) - if key.all(): - key = slice(0, None, None) - else: - key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): # see https://github.com/pandas-dev/pandas/issues/31299, need to allow # this for now (would otherwise raise in check_array_indexer) @@ -561,7 +558,7 @@ def __getitem__(self, key): else: key = check_array_indexer(self, key) - is_period = is_period_dtype(self) + is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq else: @@ -577,11 +574,6 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) - if result.ndim > 1: - # To support MPL which performs slicing with 2 dim - # even though it only has 1 dim by definition - return result - return self._simple_new(result, dtype=self.dtype, freq=freq) def __setitem__( diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index f38a4fb83c64f..c752990531b34 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -214,7 +214,10 @@ class ExtensionIndex(Index): def __getitem__(self, key): result = self._data[key] if isinstance(result, type(self._data)): - return type(self)(result, name=self.name) + if result.ndim == 1: + return type(self)(result, name=self.name) + # Unpack to ndarray for MPL compat + result = result._data # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 83995ab26cb56..fe35344f46688 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -60,6 +60,12 @@ def timedelta_index(request): class SharedTests: index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + @pytest.fixture + def arr1d(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + return arr + def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific # to the case where one has length-1, which numpy would broadcast @@ -204,6 +210,18 @@ def test_searchsorted(self): result = arr.searchsorted(pd.NaT) assert result == 0 + def test_getitem_2d(self, arr1d): + # 2d slicing on a 1D array + expected = type(arr1d)(arr1d._data[:, np.newaxis], dtype=arr1d.dtype) + result = arr1d[:, np.newaxis] + tm.assert_equal(result, expected) + + # Lookup on a 2D array + arr2d = expected + expected = type(arr2d)(arr2d._data[:3, 0], dtype=arr2d.dtype) + result = arr2d[:3, 0] + tm.assert_equal(result, expected) + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") @@ -265,6 +283,13 @@ class 
TestDatetimeArray(SharedTests): array_cls = DatetimeArray dtype = pd.Timestamp + @pytest.fixture + def arr1d(self, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) + dta = dti._data + return dta + def test_round(self, tz_naive_fixture): # GH#24064 tz = tz_naive_fixture @@ -645,6 +670,10 @@ class TestPeriodArray(SharedTests): array_cls = PeriodArray dtype = pd.Period + @pytest.fixture + def arr1d(self, period_index): + return period_index._data + def test_from_pi(self, period_index): pi = period_index arr = PeriodArray(pi) From 8cf6c3baa9e7e1c75446b653a97e56fd903d6e4c Mon Sep 17 00:00:00 2001 From: tv3141 Date: Sat, 4 Apr 2020 01:01:46 +0100 Subject: [PATCH 02/26] BUG: Fix segfault in GroupBy.count and DataFrame.count (#32842) --- doc/source/whatsnew/v1.1.0.rst | 3 ++- pandas/_libs/lib.pyx | 6 ++++-- pandas/core/frame.py | 24 ++++++++++++----------- pandas/tests/groupby/test_counting.py | 11 ++++++++++- pandas/tests/test_multilevel.py | 28 +++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1c5c8ed23273d..8bff34dbdadad 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -326,7 +326,7 @@ Numeric - Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) - Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) -- +- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) Conversion ^^^^^^^^^^ @@ -424,6 +424,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6e7e8ff51f201..9802b29b1dbc7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -798,14 +798,16 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, with nogil: for i in range(n): for j in range(k): - counts[labels[i], j] += mask[i, j] + if mask[i, j]: + counts[labels[i], j] += 1 else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: for i in range(n): for j in range(k): - counts[i, labels[j]] += mask[i, j] + if mask[i, j]: + counts[i, labels[j]] += 1 return counts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index be04bbc7942c4..71b755bbf9665 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7891,18 +7891,21 @@ def _count_level(self, level, axis=0, numeric_only=False): f"Can only count levels on hierarchical {self._get_axis_name(axis)}." 
) + # Mask NaNs: Mask rows or columns where the index level is NaN, and all + # values in the DataFrame that are NaN if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might # upcast everything to object - mask = notna(frame).values + values_mask = notna(frame).values else: # But use the speedup when we have homogeneous dtypes - mask = notna(frame.values) + values_mask = notna(frame.values) + index_mask = notna(count_axis.get_level_values(level=level)) if axis == 1: - # We're transposing the mask rather than frame to avoid potential - # upcasts to object, which induces a ~20x slowdown - mask = mask.T + mask = index_mask & values_mask + else: + mask = index_mask.reshape(-1, 1) & values_mask if isinstance(level, str): level = count_axis._get_level_number(level) @@ -7910,15 +7913,14 @@ def _count_level(self, level, axis=0, numeric_only=False): level_name = count_axis._names[level] level_index = count_axis.levels[level]._shallow_copy(name=level_name) level_codes = ensure_int64(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) - - result = DataFrame(counts, index=level_index, columns=agg_axis) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) if axis == 1: - # Undo our earlier transpose - return result.T + result = DataFrame(counts, index=agg_axis, columns=level_index) else: - return result + result = DataFrame(counts, index=level_index, columns=agg_axis) + + return result def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index b4239d7d34a90..56a18757da6e7 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp import pandas._testing as tm @@ -220,3 +220,12 @@ def test_count_with_only_nans_in_first_group(self): mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) expected = Series([], index=mi, dtype=np.int64, name="C") tm.assert_series_equal(result, expected, check_index_type=False) + + def test_count_groupby_column_with_nan_in_groupby_column(self): + # https://github.com/pandas-dev/pandas/issues/32841 + df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) + res = df.groupby(["B"]).count() + expected = DataFrame( + index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} + ) + tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 0fdcc513ee126..dd0bac683c35c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -248,6 +248,34 @@ def _check_counts(frame, axis=0): result = self.frame.count(level=0, numeric_only=True) tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) + def test_count_index_with_nan(self): + # https://github.com/pandas-dev/pandas/issues/21824 + df = DataFrame( + { + "Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24.0, 5, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + + # count on row labels + res = df.set_index(["Person", "Single"]).count(level="Person") + expected = DataFrame( + index=Index(["John", "Myla"], name="Person"), + columns=Index(["Age"]), + data=[2, 2], + ) + 
tm.assert_frame_equal(res, expected) + + # count on column labels + res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1) + expected = DataFrame( + columns=Index(["John", "Myla"], name="Person"), + index=Index(["Age"]), + data=[[2, 2]], + ) + tm.assert_frame_equal(res, expected) + def test_count_level_series(self): index = MultiIndex( levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], From 7072981651df66778c3719666c3cec5583e9c3d7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Apr 2020 17:21:31 -0700 Subject: [PATCH 03/26] TYP: require Index objects earlier in internals (#33100) --- pandas/core/internals/__init__.py | 4 ---- pandas/core/internals/construction.py | 25 +++++++++++--------- pandas/core/internals/managers.py | 33 ++++++++++++++++----------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index e70652b81c42f..bc45b7c74ecc1 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -18,8 +18,6 @@ BlockManager, SingleBlockManager, concatenate_block_managers, - create_block_manager_from_arrays, - create_block_manager_from_blocks, ) __all__ = [ @@ -40,6 +38,4 @@ "BlockManager", "SingleBlockManager", "concatenate_block_managers", - "create_block_manager_from_arrays", - "create_block_manager_from_blocks", ] diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3e0fb8455884a..fc7da4155db36 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,6 +3,7 @@ constructors before passing them to a BlockManager. """ from collections import abc +from typing import Tuple import numpy as np import numpy.ma as ma @@ -29,7 +30,6 @@ ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, - ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex, ) @@ -44,7 +44,7 @@ get_objs_combined_axis, union_indexes, ) -from pandas.core.internals import ( +from pandas.core.internals.managers import ( create_block_manager_from_arrays, create_block_manager_from_blocks, ) @@ -53,12 +53,16 @@ # BlockManager Interface -def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, verify_integrity=True): +def arrays_to_mgr( + arrays, arr_names, index, columns, dtype=None, verify_integrity: bool = True +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. 
""" + arr_names = ensure_index(arr_names) + if verify_integrity: # figure out the index, if necessary if index is None: @@ -70,6 +74,9 @@ def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, verify_integrit arrays = _homogenize(arrays, index, dtype) columns = ensure_index(columns) + else: + columns = ensure_index(columns) + index = ensure_index(index) # from BlockManager perspective axes = [columns, index] @@ -163,7 +170,8 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): values = [values] if columns is None: - columns = list(range(len(values))) + columns = Index(range(len(values))) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here @@ -416,7 +424,7 @@ def get_names_from_index(data): return index -def _get_axes(N, K, index, columns): +def _get_axes(N, K, index, columns) -> Tuple[Index, Index]: # helper to create the axes as indexes # return axes or defaults @@ -635,12 +643,7 @@ def sanitize_index(data, index: Index): if len(data) != len(index): raise ValueError("Length of values does not match length of index") - if isinstance(data, ABCIndexClass): - pass - elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)): - data = data._values - - elif isinstance(data, np.ndarray): + if isinstance(data, np.ndarray): # coerce datetimelike types if data.dtype.kind in ["M", "m"]: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 45027bde58f14..2f1206e800d9b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2,7 +2,7 @@ import itertools import operator import re -from typing import Dict, List, Optional, Sequence, Tuple, TypeVar, Union +from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union import warnings import numpy as np @@ -341,7 +341,7 @@ def _verify_integrity(self) -> None: tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: - construction_error(tot_items, block.shape[1:], self.axes) + raise construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError( "Number of manager items must equal union of " @@ -1648,7 +1648,7 @@ def concat( # Constructor Helpers -def create_block_manager_from_blocks(blocks, axes): +def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: try: if len(blocks) == 1 and not isinstance(blocks[0], Block): # if blocks[0] is of length 0, return empty blocks @@ -1669,10 +1669,15 @@ def create_block_manager_from_blocks(blocks, axes): except ValueError as e: blocks = [getattr(b, "values", b) for b in blocks] tot_items = sum(b.shape[0] for b in blocks) - construction_error(tot_items, blocks[0].shape[1:], axes, e) + raise construction_error(tot_items, blocks[0].shape[1:], axes, e) -def create_block_manager_from_arrays(arrays, names, axes): +def create_block_manager_from_arrays( + arrays, names: Index, axes: List[Index] +) -> BlockManager: + assert isinstance(names, Index) + assert isinstance(axes, list) + assert all(isinstance(x, Index) for x in axes) try: blocks = form_blocks(arrays, names, axes) @@ -1680,7 +1685,7 @@ def create_block_manager_from_arrays(arrays, names, axes): mgr._consolidate_inplace() return mgr except ValueError as e: - construction_error(len(arrays), arrays[0].shape, axes, e) + raise construction_error(len(arrays), arrays[0].shape, axes, e) def construction_error(tot_items, block_shape, axes, e=None): @@ 
-1695,23 +1700,25 @@ def construction_error(tot_items, block_shape, axes, e=None): if len(implied) <= 2: implied = implied[::-1] + # We return the exception object instead of raising it so that we + # can raise it in the caller; mypy plays better with that if passed == implied and e is not None: - raise e + return e if block_shape[0] == 0: - raise ValueError("Empty data passed with indices specified.") - raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + return ValueError("Empty data passed with indices specified.") + return ValueError(f"Shape of passed values is {passed}, indices imply {implied}") # ----------------------------------------------------------------------- -def form_blocks(arrays, names, axes): +def form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? - items_dict = defaultdict(list) + items_dict: DefaultDict[str, List] = defaultdict(list) extra_locs = [] - names_idx = ensure_index(names) + names_idx = names if names_idx.equals(axes[0]): names_indexer = np.arange(len(names_idx)) else: @@ -1729,7 +1736,7 @@ def form_blocks(arrays, names, axes): block_type = get_block_type(v) items_dict[block_type.__name__].append((i, k, v)) - blocks = [] + blocks: List[Block] = [] if len(items_dict["FloatBlock"]): float_blocks = _multi_blockify(items_dict["FloatBlock"]) blocks.extend(float_blocks) From 9526c32b5317ad0c51ebd4fbc47e73407a4c48cf Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 4 Apr 2020 18:29:07 +0300 Subject: [PATCH 04/26] DOC: Fixed examples in `pandas/core/accessor.py` (#33260) --- ci/code_checks.sh | 4 ++++ pandas/core/accessor.py | 13 +++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3e9138814fbdf..093e0b831a0aa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -268,6 +268,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then # Individual files + MSG='Doctests accessor.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/accessor.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests base.py' ; echo $MSG pytest -q --doctest-modules pandas/core/base.py RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index b6ca19bde8009..f970cefe15527 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -257,12 +257,13 @@ def plot(self): Back in an interactive IPython session: - >>> ds = pd.DataFrame({{'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}}) - >>> ds.geo.center - (5.0, 10.0) - >>> ds.geo.plot() - # plots data on a map + .. 
code-block:: ipython + + In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10), + ...: "latitude": np.linspace(0, 20)}}) + In [2]: ds.geo.center + Out[2]: (5.0, 10.0) + In [3]: ds.geo.plot() # plots data on a map """ def decorator(accessor): From 87c470493d12393de4f99fb2697c49fb7e9331a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Apr 2020 10:15:46 -0700 Subject: [PATCH 05/26] Troubleshoot Travis (#33280) --- pandas/tests/io/json/test_pandas.py | 36 ++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d9071a80b5db7..b74abc965f7fa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -15,6 +15,16 @@ from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json import pandas._testing as tm +_seriesd = tm.getSeriesData() + +_frame = DataFrame(_seriesd) + +_cat_frame = _frame.copy() +cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15) +_cat_frame.index = pd.CategoricalIndex(cat, name="E") +_cat_frame["E"] = list(reversed(cat)) +_cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64") + def assert_json_roundtrip_equal(result, expected, orient): if orient == "records" or orient == "values": @@ -26,6 +36,12 @@ def assert_json_roundtrip_equal(result, expected, orient): @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: + @pytest.fixture(autouse=True) + def setup(self): + self.categorical = _cat_frame.copy() + + yield + def test_frame_double_encoded_labels(self, orient): df = DataFrame( [["a", "b"], ["c", "d"]], @@ -167,21 +183,25 @@ def test_roundtrip_str_axes(self, orient, convert_axes, numpy, dtype): @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) def test_roundtrip_categorical(self, orient, convert_axes, numpy): - cats = ["a", "b"] - df = pd.DataFrame( - pd.Categorical(cats), index=pd.CategoricalIndex(cats), columns=["cat"] - ) + # TODO: create a better frame to test with and improve coverage + if orient in ("index", "columns"): + pytest.xfail(f"Can't have duplicate index values for orient '{orient}')") - data = df.to_json(orient=orient) - if numpy and orient != "split": + data = self.categorical.to_json(orient=orient) + if numpy and orient in ("records", "values"): pytest.xfail(f"Orient {orient} is broken with numpy=True") result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy ) - # Categorical dtypes are not preserved on round trip - expected = pd.DataFrame(cats, index=cats, columns=["cat"]) + expected = self.categorical.copy() + expected.index = expected.index.astype(str) # Categorical not preserved + expected.index.name = None # index names aren't preserved in JSON + + if not numpy and orient == "index": + expected = expected.sort_index() + assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("convert_axes", [True, False]) From de536574995d88e7ee115365345bf405bbbda69c Mon Sep 17 00:00:00 2001 From: Benjamin Beier Liu Date: Sat, 4 Apr 2020 17:07:09 -0400 Subject: [PATCH 06/26] TST: add DataFrame test for construct from tuple case from GH-32776 (#33267) --- pandas/tests/frame/test_constructors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 9f40e8c6931c8..fcdc62753ca0a 100644 --- 
a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1336,6 +1336,7 @@ def test_constructor_mixed_type_rows(self): (((), ()), [(), ()]), (((), ()), [[], []]), (([], []), [[], []]), + (([1], [2]), [[1], [2]]), # GH 32776 (([1, 2, 3], [4, 5, 6]), [[1, 2, 3], [4, 5, 6]]), ], ) From c1f2906c733614ba063c3370e074936109d2572e Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sun, 5 Apr 2020 00:09:59 +0300 Subject: [PATCH 07/26] DOC: Fixed examples in `pandas/core/window` (#33266) --- ci/code_checks.sh | 4 ++ pandas/core/window/ewm.py | 31 +++-------- pandas/core/window/expanding.py | 34 ++++-------- pandas/core/window/rolling.py | 91 ++++++++++----------------------- 4 files changed, 49 insertions(+), 111 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 093e0b831a0aa..cd9e4384fd0d9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -326,6 +326,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/tools/ RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests window' ; echo $MSG + pytest -q --doctest-modules pandas/core/window/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests tseries' ; echo $MSG pytest -q --doctest-modules pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 0ec876583dcde..2759280dc1d1c 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -167,33 +167,18 @@ def _constructor(self): """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 >>> df.ewm(alpha=0.5).mean() A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 """ ) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 140e0144d0a2d..146c139806bca 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -37,7 +37,8 @@ class Expanding(_Rolling_and_Expanding): Examples -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) + >>> df B 0 0.0 1 1.0 @@ -98,33 +99,18 @@ def _get_window(self, other=None, **kwargs): """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 
-0.640505 0.289374 -1.550670 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 >>> df.ewm(alpha=0.5).mean() A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 """ ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index dc8cf839d0bcb..729e4069b1309 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1039,33 +1039,18 @@ def _get_window( """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 - - >>> df.rolling(3, win_type='boxcar').agg('mean') - A B C - 0 NaN NaN NaN - 1 NaN NaN NaN - 2 -0.885035 0.212600 -0.711689 - 3 -0.323928 -0.200122 -1.093408 - 4 -0.071445 -0.431533 -1.075833 - 5 0.504739 0.676083 -0.996353 - 6 0.358206 1.903256 -0.774200 - 7 0.906020 1.283573 0.085482 - 8 -0.096361 0.818139 0.472290 - 9 0.070889 0.134399 -0.031308 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.rolling(2, win_type="boxcar").agg("mean") + A B C + 0 NaN NaN NaN + 1 1.5 4.5 7.5 + 2 2.5 5.5 8.5 """ ) @@ -1904,46 +1889,24 @@ def _validate_freq(self): """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 - >>> df.rolling(3).sum() - A B C - 0 NaN NaN NaN - 1 NaN NaN NaN - 2 -2.655105 0.637799 -2.135068 - 3 -0.971785 -0.600366 -3.280224 - 4 -0.214334 -1.294599 -3.227500 - 5 1.514216 2.028250 -2.989060 - 6 1.074618 5.709767 -2.322600 - 7 2.718061 3.850718 0.256446 - 8 -0.289082 2.454418 1.416871 - 9 0.212668 0.403198 -0.093924 - - >>> df.rolling(3).agg({'A':'sum', 'B':'min'}) - A B - 0 NaN NaN - 1 NaN NaN - 2 -2.655105 -0.165272 - 3 -0.971785 -1.340923 - 4 -0.214334 -1.340923 - 5 1.514216 -1.340923 - 6 1.074618 0.211596 - 7 2.718061 -1.647453 - 8 -0.289082 -1.647453 - 9 0.212668 -1.647453 + >>> df.rolling(2).sum() + A B C + 0 NaN NaN NaN + 1 3.0 9.0 15.0 + 2 5.0 11.0 17.0 + + >>> df.rolling(2).agg({"A": "sum", "B": "min"}) + A B + 0 NaN NaN + 1 3.0 4.0 + 2 5.0 5.0 """ ) From f98db693b848806298a1e6644fb2199f04a93746 Mon Sep 17 00:00:00 2001 From: Farhan Reynaldo Date: Sun, 5 Apr 2020 04:23:02 +0700 Subject: [PATCH 08/26] DOC: Fix error in Series.clip and DataFrame.clip (#33282) --- pandas/core/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0ecfbce460b3a..c202bf846047f 100644 --- 
a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7299,6 +7299,12 @@ def clip( Same type as calling object with the values outside the clip boundaries replaced. + See Also + -------- + Series.clip : Trim values at input threshold in series. + DataFrame.clip : Trim values at input threshold in dataframe. + numpy.clip : Clip (limit) the values in an array. + Examples -------- >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} From 1de584d4fce8d7e0f0bb86a1e9419074ef72f5b5 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sun, 5 Apr 2020 00:35:34 +0300 Subject: [PATCH 09/26] TYP: Fixed type annotaions in `scripts/validate_rst_title_capitalization` (#33268) --- scripts/validate_rst_title_capitalization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 59d422a1605a0..3d19e37ac7a1d 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -14,7 +14,7 @@ import os import re import sys -from typing import Generator, List, Tuple +from typing import Iterable, List, Tuple CAPITALIZATION_EXCEPTIONS = { "pandas", @@ -148,7 +148,7 @@ def correct_title_capitalization(title: str) -> str: return correct_title -def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: +def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]: """ Algorithm to identify particular text that should be considered headings in an RST file. @@ -184,7 +184,7 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: previous_line = line -def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: +def find_rst_files(source_paths: List[str]) -> Iterable[str]: """ Given the command line arguments of directory paths, this method yields the strings of the .rst file directories that these paths contain. @@ -214,7 +214,7 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: yield filename -def main(source_paths: List[str], output_format: str) -> bool: +def main(source_paths: List[str], output_format: str) -> int: """ The main method to print all headings with incorrect capitalization. 
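A minimal sketch of why the ``Generator`` -> ``Iterable`` annotation change
above is safe (illustrative code only, not part of the patch): a generator
function already satisfies the looser ``Iterable`` contract, and callers that
only loop over the result never touch the wider generator protocol.

    from typing import Iterable, Tuple

    def titles() -> Iterable[Tuple[str, int]]:
        # A generator is a valid Iterable; the narrower
        # Generator[Tuple[str, int], None, None] annotation adds nothing
        # for callers that simply iterate.
        yield ("Sample Title", 1)

    for title, line_no in titles():
        print(title, line_no)
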
From d507fe652f9cc123a91a20c5c0d9220e1bc49cee Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 5 Apr 2020 11:27:56 +0100 Subject: [PATCH 10/26] TYP: pandas/core/dtypes/dtypes.py (#31384) --- pandas/core/dtypes/dtypes.py | 74 +++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d7ba150e3ec9d..17c4c6ba1c701 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,7 +3,18 @@ """ import re -from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + MutableMapping, + Optional, + Tuple, + Type, + Union, + cast, +) import numpy as np import pytz @@ -16,6 +27,15 @@ from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass from pandas.core.dtypes.inference import is_bool, is_list_like +if TYPE_CHECKING: + import pyarrow # noqa: F401 + from pandas.core.arrays import ( # noqa: F401 + IntervalArray, + PeriodArray, + DatetimeArray, + ) + from pandas import Categorical # noqa: F401 + str_type = str @@ -68,7 +88,7 @@ def register(self, dtype: Type[ExtensionDtype]) -> None: """ Parameters ---------- - dtype : ExtensionDtype + dtype : ExtensionDtype class """ if not issubclass(dtype, ExtensionDtype): raise ValueError("can only register pandas extension dtypes") @@ -122,7 +142,7 @@ class PandasExtensionDtype(ExtensionDtype): # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None - str: Optional[str_type] = None + str: str_type num = 100 shape: Tuple[int, ...] = tuple() itemsize = 8 @@ -500,7 +520,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: return np.bitwise_xor.reduce(hashed) @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["Categorical"]: """ Return the array type associated with this dtype. @@ -508,7 +528,7 @@ def construct_array_type(cls): ------- type """ - from pandas import Categorical + from pandas import Categorical # noqa: F811 return Categorical @@ -672,9 +692,9 @@ class DatetimeTZDtype(PandasExtensionDtype): _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache: Dict[str_type, PandasExtensionDtype] = {} - def __init__(self, unit="ns", tz=None): + def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None): if isinstance(unit, DatetimeTZDtype): - unit, tz = unit.unit, unit.tz + unit, tz = unit.unit, unit.tz # type: ignore if unit != "ns": if isinstance(unit, str) and tz is None: @@ -704,7 +724,7 @@ def __init__(self, unit="ns", tz=None): self._tz = tz @property - def unit(self): + def unit(self) -> str_type: """ The precision of the datetime data. """ @@ -718,7 +738,7 @@ def tz(self): return self._tz @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["DatetimeArray"]: """ Return the array type associated with this dtype. @@ -726,12 +746,12 @@ def construct_array_type(cls): ------- type """ - from pandas.core.arrays import DatetimeArray + from pandas.core.arrays import DatetimeArray # noqa: F811 return DatetimeArray @classmethod - def construct_from_string(cls, string: str_type): + def construct_from_string(cls, string: str_type) -> "DatetimeTZDtype": """ Construct a DatetimeTZDtype from a string. 
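The annotations in this patch describe real runtime behavior; a quick
illustration (an assumed usage sketch, not part of the diff):

    import pandas as pd

    # construct_array_type returns the array *class* itself, hence Type[...]
    assert pd.CategoricalDtype.construct_array_type() is pd.Categorical

    # construct_from_string round-trips the string form of the dtype,
    # matching the "DatetimeTZDtype" return annotation added above
    dtype = pd.DatetimeTZDtype.construct_from_string("datetime64[ns, UTC]")
    print(dtype.unit, dtype.tz)  # ns UTC
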
@@ -789,7 +809,7 @@ def __eq__(self, other: Any) -> bool: and str(self.tz) == str(other.tz) ) - def __setstate__(self, state): + def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) @@ -884,7 +904,7 @@ def _parse_dtype_strict(cls, freq): raise ValueError("could not construct PeriodDtype") @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type) -> "PeriodDtype": """ Strict construction from a string, raise a TypeError if not possible @@ -934,7 +954,7 @@ def __setstate__(self, state): self._freq = state["freq"] @classmethod - def is_dtype(cls, dtype) -> bool: + def is_dtype(cls, dtype: object) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -955,7 +975,7 @@ def is_dtype(cls, dtype) -> bool: return super().is_dtype(dtype) @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["PeriodArray"]: """ Return the array type associated with this dtype. @@ -967,9 +987,13 @@ def construct_array_type(cls): return PeriodArray - def __from_arrow__(self, array): - """Construct PeriodArray from pyarrow Array/ChunkedArray.""" - import pyarrow + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "PeriodArray": + """ + Construct PeriodArray from pyarrow Array/ChunkedArray. + """ + import pyarrow # noqa: F811 from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1075,7 +1099,7 @@ def subtype(self): return self._subtype @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["IntervalArray"]: """ Return the array type associated with this dtype. @@ -1142,7 +1166,7 @@ def __setstate__(self, state): self._subtype = state["subtype"] @classmethod - def is_dtype(cls, dtype) -> bool: + def is_dtype(cls, dtype: object) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -1160,9 +1184,13 @@ def is_dtype(cls, dtype) -> bool: return False return super().is_dtype(dtype) - def __from_arrow__(self, array): - """Construct IntervalArray from pyarrow Array/ChunkedArray.""" - import pyarrow + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "IntervalArray": + """ + Construct IntervalArray from pyarrow Array/ChunkedArray. 
+ """ + import pyarrow # noqa: F811 from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): From b7c9824d5e5896be2effb0a317cbb58ff6ae12f7 Mon Sep 17 00:00:00 2001 From: Benjamin Beier Liu Date: Sun, 5 Apr 2020 14:28:16 -0400 Subject: [PATCH 11/26] TST: add date range test for reindex case from GH-32740 (#33265) * add test on date range timezone reindex * undo * fix * move code to datetime test * import timedelta * adding assertion the np array results * comply to coding standard * update * update * assert_numpy_array_equal * comply black format * format checking * white space * bleck --- .../tests/indexes/datetimes/test_datetime.py | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 1529a259c49af..e109c7a4f1c8d 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,4 +1,4 @@ -from datetime import date +from datetime import date, timedelta import dateutil import numpy as np @@ -44,6 +44,45 @@ def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): assert str(index.reindex([])[0].tz) == "US/Eastern" assert str(index.reindex(np.array([]))[0].tz) == "US/Eastern" + def test_reindex_with_same_tz(self): + # GH 32740 + rng_a = date_range("2010-01-01", "2010-01-02", periods=24, tz="utc") + rng_b = date_range("2010-01-01", "2010-01-02", periods=23, tz="utc") + result1, result2 = rng_a.reindex( + rng_b, method="nearest", tolerance=timedelta(seconds=20) + ) + expected_list1 = [ + "2010-01-01 00:00:00", + "2010-01-01 01:05:27.272727272", + "2010-01-01 02:10:54.545454545", + "2010-01-01 03:16:21.818181818", + "2010-01-01 04:21:49.090909090", + "2010-01-01 05:27:16.363636363", + "2010-01-01 06:32:43.636363636", + "2010-01-01 07:38:10.909090909", + "2010-01-01 08:43:38.181818181", + "2010-01-01 09:49:05.454545454", + "2010-01-01 10:54:32.727272727", + "2010-01-01 12:00:00", + "2010-01-01 13:05:27.272727272", + "2010-01-01 14:10:54.545454545", + "2010-01-01 15:16:21.818181818", + "2010-01-01 16:21:49.090909090", + "2010-01-01 17:27:16.363636363", + "2010-01-01 18:32:43.636363636", + "2010-01-01 19:38:10.909090909", + "2010-01-01 20:43:38.181818181", + "2010-01-01 21:49:05.454545454", + "2010-01-01 22:54:32.727272727", + "2010-01-02 00:00:00", + ] + expected1 = DatetimeIndex( + expected_list1, dtype="datetime64[ns, UTC]", freq=None, + ) + expected2 = np.array([0] + [-1] * 21 + [23], dtype=np.int64,) + tm.assert_index_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) + def test_time_loc(self): # GH8667 from datetime import time from pandas._libs.index import _SIZE_CUTOFF From adba667d6c8f6e9cf0824378941de77097cf1b2c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 5 Apr 2020 12:12:07 -0700 Subject: [PATCH 12/26] REF: BlockManager.combine -> _combine (#33294) --- pandas/core/internals/managers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2f1206e800d9b..59d3fbc306947 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -729,7 +729,7 @@ def get_bool_data(self, copy: bool = False) -> "BlockManager": Whether to copy the blocks """ self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_bool], copy) + return self._combine([b for b in self.blocks if b.is_bool], copy) def 
get_numeric_data(self, copy: bool = False) -> "BlockManager": """ @@ -739,9 +739,9 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager": Whether to copy the blocks """ self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_numeric], copy) + return self._combine([b for b in self.blocks if b.is_numeric], copy) - def combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": + def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": """ return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() @@ -896,7 +896,7 @@ def to_dict(self, copy: bool = True): for b in self.blocks: bd.setdefault(str(b.dtype), []).append(b) - return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()} + return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} def fast_xs(self, loc: int) -> ArrayLike: """ From 5550f2772dd4b700947831d2b5099e229ad148f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 5 Apr 2020 12:16:59 -0700 Subject: [PATCH 13/26] REF: make kwargs explicit in BlockManager methods (#33295) --- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/managers.py | 32 ++++++++++++++++++++----------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bd90325114ee1..d8b54fd5cffb3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1282,7 +1282,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: new_values = _block_shape(new_values, ndim=self.ndim) return [self.make_block(values=new_values)] - def shift(self, periods, axis: int = 0, fill_value=None): + def shift(self, periods: int, axis: int = 0, fill_value=None): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 59d3fbc306947..230cb7af75f58 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -545,14 +545,24 @@ def get_axe(block, qs, axes): def isna(self, func) -> "BlockManager": return self.apply("apply", func=func) - def where(self, **kwargs) -> "BlockManager": - if kwargs.pop("align", True): + def where( + self, other, cond, align: bool, errors: str, try_cast: bool, axis: int + ) -> "BlockManager": + if align: align_keys = ["other", "cond"] else: align_keys = ["cond"] - kwargs["other"] = extract_array(kwargs["other"], extract_numpy=True) + other = extract_array(other, extract_numpy=True) - return self.apply("where", align_keys=align_keys, **kwargs) + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, + errors=errors, + try_cast=try_cast, + axis=axis, + ) def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) @@ -584,11 +594,13 @@ def diff(self, n: int, axis: int) -> "BlockManager": def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) - def shift(self, **kwargs) -> "BlockManager": - return self.apply("shift", **kwargs) + def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) - def fillna(self, **kwargs) -> "BlockManager": - return self.apply("fillna", **kwargs) + def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager": + return self.apply( + "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + ) def downcast(self) -> "BlockManager": return self.apply("downcast") @@ -753,9 +765,7 @@ def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": new_blocks = [] for b in blocks: b = b.copy(deep=copy) - b.mgr_locs = algos.take_1d( - inv_indexer, b.mgr_locs.as_array, axis=0, allow_fill=False - ) + b.mgr_locs = inv_indexer[b.mgr_locs.indexer] new_blocks.append(b) axes = list(self.axes) From 7113911f9a1866054d17ecfda2ea0d8053cca711 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 5 Apr 2020 20:21:29 +0100 Subject: [PATCH 14/26] DOC/CLN: remove versionadded/changed:: 0.21 (#33301) --- doc/source/development/contributing.rst | 8 ++++---- doc/source/user_guide/basics.rst | 4 ---- doc/source/user_guide/categorical.rst | 2 -- doc/source/user_guide/groupby.rst | 2 -- doc/source/user_guide/io.rst | 6 ------ doc/source/user_guide/merging.rst | 4 ---- pandas/_libs/lib.pyx | 2 -- pandas/core/arrays/categorical.py | 7 ------- pandas/core/common.py | 2 -- pandas/core/dtypes/dtypes.py | 2 -- pandas/core/frame.py | 10 ---------- pandas/core/generic.py | 19 ------------------- pandas/core/indexes/base.py | 4 ---- pandas/core/indexes/category.py | 2 -- pandas/core/indexes/datetimes.py | 6 ------ pandas/core/resample.py | 2 -- pandas/core/reshape/pivot.py | 3 --- pandas/core/series.py | 6 ------ pandas/io/formats/style.py | 2 -- pandas/io/html.py | 2 -- pandas/io/json/_json.py | 5 ----- pandas/io/parquet.py | 4 ---- pandas/io/pickle.py | 1 - pandas/io/pytables.py | 5 ----- 24 files changed, 4 insertions(+), 106 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 31241287c61cb..ba7f7eb907f4a 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ 
-791,7 +791,7 @@ the ``pandas.util._decorators.deprecate``: from pandas.util._decorators import deprecate - deprecate('old_func', 'new_func', '0.21.0') + deprecate('old_func', 'new_func', '1.1.0') Otherwise, you need to do it manually: @@ -803,7 +803,7 @@ Otherwise, you need to do it manually: def old_func(): """Summary of the function. - .. deprecated:: 0.21.0 + .. deprecated:: 1.1.0 Use new_func instead. """ warnings.warn('Use new_func instead.', FutureWarning, stacklevel=2) @@ -1354,9 +1354,9 @@ directive is used. The sphinx syntax for that is: .. code-block:: rst - .. versionadded:: 0.21.0 + .. versionadded:: 1.1.0 -This will put the text *New in version 0.21.0* wherever you put the sphinx +This will put the text *New in version 1.1.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method (`example `__) or a new keyword argument (`example `__). diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index aa93f37a313f9..055b43bc1e59b 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1224,8 +1224,6 @@ following can be done: This means that the reindexed Series's index is the same Python object as the DataFrame's index. -.. versionadded:: 0.21.0 - :meth:`DataFrame.reindex` also supports an "axis-style" calling convention, where you specify a single ``labels`` argument and the ``axis`` it applies to. @@ -1435,8 +1433,6 @@ Series can also be used: If the mapping doesn't include a column/index label, it isn't renamed. Note that extra labels in the mapping don't throw an error. -.. versionadded:: 0.21.0 - :meth:`DataFrame.rename` also supports an "axis-style" calling convention, where you specify a single ``mapper`` and the ``axis`` to apply that mapping to. diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index a55326db748fd..d4faf527a4790 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -211,8 +211,6 @@ To get back to the original ``Series`` or NumPy array, use CategoricalDtype ---------------- -.. versionchanged:: 0.21.0 - A categorical's type is fully described by 1. ``categories``: a sequence of unique values and no missing values diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8cd229070e365..b06c3afa6dfe8 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1327,8 +1327,6 @@ See the :ref:`visualization documentation` for more. Piping function calls ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.21.0 - Similar to the functionality provided by ``DataFrame`` and ``Series``, functions that take ``GroupBy`` objects can be chained together using a ``pipe`` method to allow for a cleaner, more readable syntax. To read about ``.pipe`` in general terms, diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d68dc24bae658..a4cc1f9ee02ca 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -461,8 +461,6 @@ specification: pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes -.. versionadded:: 0.21.0 - Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. 
For more control on the categories and order, create a @@ -2171,8 +2169,6 @@ Line delimited json pandas is able to read and write line-delimited json files that are common in data processing pipelines using Hadoop or Spark. -.. versionadded:: 0.21.0 - For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream. .. ipython:: python @@ -4646,8 +4642,6 @@ Read from a feather file. Parquet ------- -.. versionadded:: 0.21.0 - `Apache Parquet `__ provides a partitioned binary columnar serialization for data frames. It is designed to make reading and writing data frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a variety of compression techniques to shrink the file size as much as possible diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 49f4bbb6beb19..0450c81958a51 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -573,8 +573,6 @@ all standard database join operations between ``DataFrame`` or named ``Series`` dataset. * "many_to_many" or "m:m": allowed, but does not result in checks. - .. versionadded:: 0.21.0 - .. note:: Support for specifying index levels as the ``on``, ``left_on``, and @@ -773,8 +771,6 @@ Here is another example with duplicate join keys in DataFrames: Checking for duplicate keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.21.0 - Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before merge operations and so should protect against memory overflows. Checking key diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9802b29b1dbc7..276c2d5198831 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1197,8 +1197,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: skipna : bool, default True Ignore NaN values when inferring the type. - .. versionadded:: 0.21.0 - Returns ------- str diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 55c42f59f865e..ad82d68baa5b3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -242,8 +242,6 @@ class Categorical(ExtensionArray, PandasObject): dtype : CategoricalDtype An instance of ``CategoricalDtype`` to use for this categorical. - .. versionadded:: 0.21.0 - Attributes ---------- categories : Index @@ -257,8 +255,6 @@ class Categorical(ExtensionArray, PandasObject): The instance of ``CategoricalDtype`` storing the ``categories`` and ``ordered``. - .. versionadded:: 0.21.0 - Methods ------- from_codes @@ -876,8 +872,6 @@ def rename_categories(self, new_categories, inplace=False): are passed through and extra categories in the mapping are ignored. - .. versionadded:: 0.21.0. - * callable : a callable that is called on all items in the old categories and whose return values comprise the new categories. 
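For context, the dict-like/callable behavior described in this docstring can
be exercised as follows (an illustrative sketch, not part of the patch):

    import pandas as pd

    cat = pd.Categorical(["a", "a", "b"])

    # dict-like: unlisted categories pass through, extra keys are ignored
    print(cat.rename_categories({"a": "A", "c": "C"}).categories.tolist())
    # ['A', 'b']

    # callable: applied to every existing category
    print(cat.rename_categories(str.upper).categories.tolist())
    # ['A', 'B']
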
@@ -1306,7 +1300,6 @@ def __setstate__(self, state): if not isinstance(state, dict): raise Exception("invalid pickle state") - # compat with pre 0.21.0 CategoricalDtype change if "_dtype" not in state: state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) diff --git a/pandas/core/common.py b/pandas/core/common.py index 4ff1a93737d41..8b152162dc95a 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -357,8 +357,6 @@ def standardize_mapping(into): """ Helper function to standardize a supplied mapping. - .. versionadded:: 0.21.0 - Parameters ---------- into : instance or subclass of collections.abc.Mapping diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 17c4c6ba1c701..4be5da9c4c54a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -189,8 +189,6 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness. - .. versionchanged:: 0.21.0 - Parameters ---------- categories : sequence, optional diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 71b755bbf9665..ddb7be405d77a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -245,8 +245,6 @@ dataset. * "many_to_many" or "m:m": allowed, but does not result in checks. - .. versionadded:: 0.21.0 - Returns ------- DataFrame @@ -1339,8 +1337,6 @@ def to_dict(self, orient="dict", into=dict): instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. - .. versionadded:: 0.21.0 - Returns ------- dict, list or collections.abc.Mapping @@ -2118,8 +2114,6 @@ def to_parquet( """ Write a DataFrame to the binary parquet format. - .. versionadded:: 0.21.0 - This function writes the dataframe as a `parquet file `_. You can choose different parquet backends, and have the option of compression. See @@ -3749,13 +3743,9 @@ def drop( index : single label or list-like Alternative to specifying axis (``labels, axis=0`` is equivalent to ``index=labels``). - - .. versionadded:: 0.21.0 columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). - - .. versionadded:: 0.21.0 level : int or level name, optional For MultiIndex, level from which the labels will be removed. inplace : bool, default False diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c202bf846047f..9640c1e087f47 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -526,13 +526,6 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): Indexes for%(extended_summary_sub)s row labels can be changed by assigning a list-like or Index. - .. versionchanged:: 0.21.0 - - The signature is now `labels` and `axis`, consistent with - the rest of pandas API. Previously, the `axis` and `labels` - arguments were respectively the first and second positional - arguments. - Parameters ---------- labels : list-like, Index @@ -1178,8 +1171,6 @@ def _set_axis_name(self, name, axis=0, inplace=False): inplace : bool, default False If `True`, do operation inplace and return None. - .. versionadded:: 0.21.0 - Returns ------- Series, DataFrame, or None @@ -2146,7 +2137,6 @@ def to_json( only used when the first argument is a filename. By default, the compression is inferred from the filename. - .. versionadded:: 0.21.0 .. 
versionchanged:: 0.24.0 'infer' option added and set to default index : bool, default True @@ -2663,7 +2653,6 @@ def to_pickle( parameter is equivalent to setting its value to HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html. - .. versionadded:: 0.21.0. See Also -------- @@ -3794,8 +3783,6 @@ def reindex_like( the same size as the index and its dtype must exactly match the index's type. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- Series or DataFrame @@ -4235,8 +4222,6 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: the same size as the index and its dtype must exactly match the index's type. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- %(klass)s with changed index. @@ -5750,8 +5735,6 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: columns unchanged. The inference rules are the same as during normal Series/DataFrame construction. - .. versionadded:: 0.21.0 - Returns ------- converted : same type as input object @@ -7287,8 +7270,6 @@ def clip( Align object with lower and upper along the given axis. inplace : bool, default False Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 *args, **kwargs Additional keywords have no effect but might be accepted for compatibility with numpy. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5fec68d257167..b97f0366579b3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2836,8 +2836,6 @@ def get_loc(self, key, method=None, tolerance=None): the index at the matching location most satisfy the equation ``abs(index[loc] - key) <= tolerance``. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- loc : int if unique index, slice if monotonic index, else mask @@ -2909,8 +2907,6 @@ def get_loc(self, key, method=None, tolerance=None): the same size as the index and its dtype must exactly match the index's type. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- indexer : ndarray of int diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 073e1967678ec..635bf32639075 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -91,8 +91,6 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): dtype : CategoricalDtype or "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. - - .. versionadded:: 0.21.0 copy : bool, default False Make a copy of input ndarray. name : object, optional diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 92c3b9125d269..68d6229e798f5 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1014,16 +1014,10 @@ def bdate_range( Weekmask of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings are passed. The default value None is equivalent to 'Mon Tue Wed Thu Fri'. - - .. versionadded:: 0.21.0 - holidays : list-like or None, default None Dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings are passed. - - .. versionadded:: 0.21.0 - closed : str, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None). 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2e1dcf8da5bd4..b17092caabdd1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -467,8 +467,6 @@ def nearest(self, limit=None): limit : int, optional Limit of how many values to fill. - .. versionadded:: 0.21.0 - Returns ------- Series or DataFrame diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b3b0166334413..17473ac26dfd6 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -500,9 +500,6 @@ def crosstab( margins_name : str, default 'All' Name of the row/column that will contain the totals when margins is True. - - .. versionadded:: 0.21.0 - dropna : bool, default True Do not include columns whose entries are all NaN. normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False diff --git a/pandas/core/series.py b/pandas/core/series.py index 03b82365358ac..2b073b3c5cebf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1509,8 +1509,6 @@ def to_dict(self, into=dict): instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. - .. versionadded:: 0.21.0 - Returns ------- collections.abc.Mapping @@ -4067,12 +4065,8 @@ def drop( index : single label or list-like Redundant for application on Series, but 'index' can be used instead of 'labels'. - - .. versionadded:: 0.21.0 columns : single label or list-like No change is made to the Series; use 'index' or 'labels' instead. - - .. versionadded:: 0.21.0 level : int or level name, optional For MultiIndex, level for which the labels will be removed. inplace : bool, default False diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 718534e42ec25..fecdf3b758f0f 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -765,8 +765,6 @@ def where( Updates the HTML representation with a style which is selected in accordance with the return value of a function. - .. versionadded:: 0.21.0 - Parameters ---------- cond : callable diff --git a/pandas/io/html.py b/pandas/io/html.py index ce6674ffb9588..442a2791fc6e6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1057,8 +1057,6 @@ def read_html( the header, otherwise the function attempts to find the header within the body (by putting rows with only ```` elements into the header). - .. versionadded:: 0.21.0 - Similar to :func:`~read_csv` the `header` argument is applied **after** `skiprows` is applied. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d6b90ae99973e..b955b83dbfde5 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -490,9 +490,6 @@ def read_json( for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - - .. versionadded:: 0.21.0 - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -500,8 +497,6 @@ def read_json( otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - .. 
versionadded:: 0.21.0 - Returns ------- Series or DataFrame diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9ae9729fc05ee..46320355512d1 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -260,8 +260,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. - .. versionadded:: 0.21.0 - Parameters ---------- path : str, path object or file-like object @@ -287,8 +285,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): 'pyarrow' is unavailable. columns : list, default=None If not None, only these columns will be read from the file. - - .. versionadded:: 0.21.1 **kwargs Any additional kwargs are passed to the engine. diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 4e731b8ecca11..6faebf56a11ab 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -43,7 +43,6 @@ def to_pickle( HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html - .. versionadded:: 0.21.0 See Also -------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8c213803170a3..3e4b25088e094 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -306,9 +306,6 @@ def read_hdf( By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. - - .. versionadded:: 0.21.0 support for __fspath__ protocol. - key : object, optional The group identifier in the store. Can be omitted if the HDF file contains a single pandas object. @@ -1462,8 +1459,6 @@ def info(self) -> str: """ Print detailed information on the store. - .. versionadded:: 0.21.0 - Returns ------- str From 9c74fc911934f37e587270de94597d49326b94eb Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 5 Apr 2020 14:39:55 -0500 Subject: [PATCH 15/26] CLN: Add/refine type hints to some functions in core.dtypes.cast (#33286) --- pandas/core/dtypes/cast.py | 20 +++++++++++++------- pandas/core/groupby/generic.py | 6 +++--- pandas/core/groupby/groupby.py | 16 +++++++++++----- pandas/core/groupby/ops.py | 2 +- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 57c17f48e01ce..223cc43d158e6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -3,6 +3,7 @@ """ from datetime import date, datetime, timedelta +from typing import TYPE_CHECKING, Type import numpy as np @@ -63,6 +64,7 @@ ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, + ABCExtensionArray, ABCPeriodArray, ABCPeriodIndex, ABCSeries, @@ -70,6 +72,10 @@ from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import isna, notna +if TYPE_CHECKING: + from pandas import Series + from pandas.core.arrays import ExtensionArray # noqa: F401 + _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max @@ -246,9 +252,7 @@ def trans(x): return result -def maybe_cast_result( - result, obj: ABCSeries, numeric_only: bool = False, how: str = "" -): +def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""): """ Try casting result to a different type if appropriate @@ -256,8 +260,8 @@ def maybe_cast_result( ---------- result : array-like Result to cast. - obj : ABCSeries - Input series from which result was calculated. + obj : Series + Input Series from which result was calculated. 
numeric_only : bool, default False Whether to cast only numerics or datetimes as well. how : str, default "" @@ -313,13 +317,13 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: return d.get((dtype, how), dtype) -def maybe_cast_to_extension_array(cls, obj, dtype=None): +def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): """ Call to `_from_sequence` that returns the object unchanged on Exception. Parameters ---------- - cls : ExtensionArray subclass + cls : class, subclass of ExtensionArray obj : arraylike Values to pass to cls._from_sequence dtype : ExtensionDtype, optional @@ -329,6 +333,8 @@ def maybe_cast_to_extension_array(cls, obj, dtype=None): ExtensionArray or obj """ assert isinstance(cls, type), f"must pass a type: {cls}" + assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" + assert issubclass(cls, ABCExtensionArray), assertion_msg try: result = cls._from_sequence(obj, dtype=dtype) except Exception: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 093c925acbc49..88580f6ebb3ed 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -151,7 +151,7 @@ def pinner(cls): @pin_whitelisted_properties(Series, base.series_apply_whitelist) -class SeriesGroupBy(GroupBy): +class SeriesGroupBy(GroupBy[Series]): _apply_whitelist = base.series_apply_whitelist def _iterate_slices(self) -> Iterable[Series]: @@ -815,7 +815,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): @pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist) -class DataFrameGroupBy(GroupBy): +class DataFrameGroupBy(GroupBy[DataFrame]): _apply_whitelist = base.dataframe_apply_whitelist @@ -1462,7 +1462,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) # TODO: we have no test cases that get here with EA dtypes; - # try_cast may not be needed if EAs never get here + # maybe_cast_result may not be needed if EAs never get here if cast: res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dff712ee17ea6..1474e173b4f8c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -17,6 +17,7 @@ class providing the base-class of operations. Callable, Dict, FrozenSet, + Generic, Hashable, Iterable, List, @@ -24,6 +25,7 @@ class providing the base-class of operations. Optional, Tuple, Type, + TypeVar, Union, ) @@ -353,13 +355,13 @@ def _group_selection_context(groupby): ] -class _GroupBy(PandasObject, SelectionMixin): +class _GroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): _group_selection = None _apply_whitelist: FrozenSet[str] = frozenset() def __init__( self, - obj: NDFrame, + obj: FrameOrSeries, keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, @@ -995,7 +997,11 @@ def _apply_filter(self, indices, dropna): return filtered -class GroupBy(_GroupBy): +# To track operations that expand dimensions, like ohlc +OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) + + +class GroupBy(_GroupBy[FrameOrSeries]): """ Class for grouping and aggregating relational data. 
@@ -2420,8 +2426,8 @@ def tail(self, n=5): return self._selected_obj[mask] def _reindex_output( - self, output: FrameOrSeries, fill_value: Scalar = np.NaN - ) -> FrameOrSeries: + self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN + ) -> OutputFrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 742de397956c0..8d535374a083f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -682,7 +682,7 @@ def _aggregate_series_pure_python(self, obj: Series, func): assert result is not None result = lib.maybe_convert_objects(result, try_float=0) - # TODO: try_cast back to EA? + # TODO: maybe_cast_to_extension_array? return result, counts From d1cc4e7f86446710aca59be001dc1830caae13d3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 5 Apr 2020 12:53:13 -0700 Subject: [PATCH 16/26] DEPR: Index.is_mixed (#33291) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexes/base.py | 8 +++++++- pandas/tests/indexes/test_base.py | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8bff34dbdadad..d283d4450e6bf 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -256,6 +256,7 @@ Deprecations - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) +- :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b97f0366579b3..cc03208b34fe6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1955,6 +1955,12 @@ def is_mixed(self) -> bool: >>> idx.is_mixed() False """ + warnings.warn( + "Index.is_mixed is deprecated and will be removed in a future version. 
" + "Check index.inferred_type directly instead.", + FutureWarning, + stacklevel=2, + ) return self.inferred_type in ["mixed"] def holds_integer(self) -> bool: @@ -3131,7 +3137,7 @@ def is_int(v): # convert the slice to an indexer here # if we are mixed and have integers - if is_positional and self.is_mixed(): + if is_positional: try: # Validate start & stop if start is not None: diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 35ee81229b716..0417208868314 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1160,6 +1160,12 @@ def test_intersection_difference(self, indices, sort): diff = indices.difference(indices, sort=sort) tm.assert_index_equal(inter, diff) + def test_is_mixed_deprecated(self): + # GH#32922 + index = self.create_index() + with tm.assert_produces_warning(FutureWarning): + index.is_mixed() + @pytest.mark.parametrize( "indices, expected", [ From 57a6537d094694fefd6e2a381d47aaf4cdfc9570 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sun, 5 Apr 2020 23:00:28 +0300 Subject: [PATCH 17/26] CLN: Added static types _libs/algos (#33271) --- pandas/_libs/algos.pyx | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7a32b8957003e..6b6ead795584f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -50,18 +50,17 @@ from pandas._libs.khash cimport ( import pandas._libs.missing as missing -cdef float64_t FP_ERR = 1e-13 - -cdef float64_t NaN = np.NaN - -cdef int64_t NPY_NAT = get_nat() +cdef: + float64_t FP_ERR = 1e-13 + float64_t NaN = np.NaN + int64_t NPY_NAT = get_nat() tiebreakers = { - 'average': TIEBREAK_AVERAGE, - 'min': TIEBREAK_MIN, - 'max': TIEBREAK_MAX, - 'first': TIEBREAK_FIRST, - 'dense': TIEBREAK_DENSE, + "average": TIEBREAK_AVERAGE, + "min": TIEBREAK_MIN, + "max": TIEBREAK_MAX, + "first": TIEBREAK_FIRST, + "dense": TIEBREAK_DENSE, } @@ -120,6 +119,7 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr): kh_int64_t *table int ret = 0 list uniques = [] + ndarray[int64_t, ndim=1] result table = kh_init_int64() kh_resize_int64(table, 10) @@ -261,7 +261,7 @@ def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric: @cython.boundscheck(False) @cython.wraparound(False) -def nancorr(const float64_t[:, :] mat, bint cov=0, minp=None): +def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): cdef: Py_ssize_t i, j, xi, yi, N, K bint minpv @@ -325,7 +325,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=0, minp=None): @cython.boundscheck(False) @cython.wraparound(False) -def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): +def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1) -> ndarray: cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result @@ -581,7 +581,7 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t, ndim=1] indexer @@ -810,18 +810,14 @@ def rank_1d( """ cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - ndarray[rank_t] sorted_data, values - ndarray[float64_t] ranks ndarray[int64_t] argsorted ndarray[uint8_t, cast=True] sorted_mask - rank_t val, nan_value - float64_t sum_ranks = 0 int tiebreak = 0 - bint keep_na = 0 + 
bint keep_na = False bint isnan, condition float64_t count = 0.0 @@ -1034,19 +1030,14 @@ def rank_2d( """ cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - Py_ssize_t infs - ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - ndarray[int64_t, ndim=2] argsorted - rank_t val, nan_value - float64_t sum_ranks = 0 int tiebreak = 0 - bint keep_na = 0 + bint keep_na = False float64_t count = 0.0 bint condition, skip_condition From 55171c219b2eee991269821298782d0ac31f6451 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 5 Apr 2020 13:04:26 -0700 Subject: [PATCH 18/26] CLN: remove BlockManager.__contains__ (#33293) --- pandas/core/internals/managers.py | 3 --- pandas/tests/internals/test_internals.py | 4 ---- 2 files changed, 7 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 230cb7af75f58..ac8de977b9a1a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -791,9 +791,6 @@ def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": bm = type(self)(new_blocks, new_axes, do_integrity_check=False) return bm - def __contains__(self, item) -> bool: - return item in self.items - @property def nblocks(self) -> int: return len(self.blocks) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 657849874f091..57fbc9ab13f84 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -301,10 +301,6 @@ def test_duplicate_ref_loc_failure(self): mgr = BlockManager(blocks, axes) mgr.iget(1) - def test_contains(self, mgr): - assert "a" in mgr - assert "baz" not in mgr - def test_pickle(self, mgr): mgr2 = tm.round_trip_pickle(mgr) From eb080f3fa7ea63fbe2be866dea00cc76fbb66b0a Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 6 Apr 2020 11:29:10 +0300 Subject: [PATCH 19/26] DOC: Improved doc for `Index.equals` (#33289) --- pandas/core/indexes/base.py | 57 ++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cc03208b34fe6..3aa8a1e93355d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4195,15 +4195,64 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other) -> bool: + def equals(self, other: Any) -> bool: """ - Determine if two Index objects contain the same elements. + Determine if two Index object are equal. + + The things that are being compared are: + + * The elements inside the Index object. + * The order of the elements inside the Index object. + + Parameters + ---------- + other : Any + The other object to compare against. Returns ------- bool - True if "other" is an Index and it has the same elements as calling - index; False otherwise. + True if "other" is an Index and it has the same elements and order + as the calling index; False otherwise. 
+ + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3]) + >>> idx1 + Int64Index([1, 2, 3], dtype='int64') + >>> idx1.equals(pd.Index([1, 2, 3])) + True + + The elements inside are compared + + >>> idx2 = pd.Index(["1", "2", "3"]) + >>> idx2 + Index(['1', '2', '3'], dtype='object') + + >>> idx1.equals(idx2) + False + + The oreder is compared + + >>> ascending_idx = pd.Index([1, 2, 3]) + >>> ascending_idx + Int64Index([1, 2, 3], dtype='int64') + >>> descending_idx = pd.Index([3, 2, 1]) + >>> descending_idx + Int64Index([3, 2, 1], dtype='int64') + >>> ascending_idx.equals(descending_idx) + False + + The dtype is *not* compared + + >>> int64_idx = pd.Int64Index([1, 2, 3]) + >>> int64_idx + Int64Index([1, 2, 3], dtype='int64') + >>> uint64_idx = pd.UInt64Index([1, 2, 3]) + >>> uint64_idx + UInt64Index([1, 2, 3], dtype='uint64') + >>> int64_idx.equals(uint64_idx) + True """ if self.is_(other): return True From 2f5af5e3d95e43168546449cbd9ab96c9a1a4f5a Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 6 Apr 2020 08:05:16 -0500 Subject: [PATCH 20/26] DOC/CLN: Fix docstring typo (#33320) --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3aa8a1e93355d..d0319e9181bad 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4232,7 +4232,7 @@ def equals(self, other: Any) -> bool: >>> idx1.equals(idx2) False - The oreder is compared + The order is compared >>> ascending_idx = pd.Index([1, 2, 3]) >>> ascending_idx From dfa6f44b638ba9ca6567fc8a4b79561c7bdcdaf0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 06:59:54 -0700 Subject: [PATCH 21/26] DOC: include Offset.__call__ to autosummary to fix sphinx warning (#33309) --- doc/source/reference/offset_frequency.rst | 45 +++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index fc1c6d6bd6d47..17544cb7a1225 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -37,6 +37,7 @@ Methods DateOffset.onOffset DateOffset.is_anchored DateOffset.is_on_offset + DateOffset.__call__ BusinessDay ----------- @@ -69,6 +70,7 @@ Methods BusinessDay.onOffset BusinessDay.is_anchored BusinessDay.is_on_offset + BusinessDay.__call__ BusinessHour ------------ @@ -100,6 +102,7 @@ Methods BusinessHour.onOffset BusinessHour.is_anchored BusinessHour.is_on_offset + BusinessHour.__call__ CustomBusinessDay ----------------- @@ -131,6 +134,7 @@ Methods CustomBusinessDay.onOffset CustomBusinessDay.is_anchored CustomBusinessDay.is_on_offset + CustomBusinessDay.__call__ CustomBusinessHour ------------------ @@ -162,6 +166,7 @@ Methods CustomBusinessHour.onOffset CustomBusinessHour.is_anchored CustomBusinessHour.is_on_offset + CustomBusinessHour.__call__ MonthOffset ----------- @@ -194,6 +199,7 @@ Methods MonthOffset.onOffset MonthOffset.is_anchored MonthOffset.is_on_offset + MonthOffset.__call__ MonthEnd -------- @@ -226,6 +232,7 @@ Methods MonthEnd.onOffset MonthEnd.is_anchored MonthEnd.is_on_offset + MonthEnd.__call__ MonthBegin ---------- @@ -258,6 +265,7 @@ Methods MonthBegin.onOffset MonthBegin.is_anchored MonthBegin.is_on_offset + MonthBegin.__call__ BusinessMonthEnd ---------------- @@ -290,6 +298,7 @@ Methods BusinessMonthEnd.onOffset BusinessMonthEnd.is_anchored BusinessMonthEnd.is_on_offset + BusinessMonthEnd.__call__ 
BusinessMonthBegin ------------------ @@ -322,6 +331,7 @@ Methods BusinessMonthBegin.onOffset BusinessMonthBegin.is_anchored BusinessMonthBegin.is_on_offset + BusinessMonthBegin.__call__ CustomBusinessMonthEnd ---------------------- @@ -354,6 +364,7 @@ Methods CustomBusinessMonthEnd.onOffset CustomBusinessMonthEnd.is_anchored CustomBusinessMonthEnd.is_on_offset + CustomBusinessMonthEnd.__call__ CustomBusinessMonthBegin ------------------------ @@ -386,6 +397,7 @@ Methods CustomBusinessMonthBegin.onOffset CustomBusinessMonthBegin.is_anchored CustomBusinessMonthBegin.is_on_offset + CustomBusinessMonthBegin.__call__ SemiMonthOffset --------------- @@ -418,6 +430,7 @@ Methods SemiMonthOffset.onOffset SemiMonthOffset.is_anchored SemiMonthOffset.is_on_offset + SemiMonthOffset.__call__ SemiMonthEnd ------------ @@ -450,6 +463,7 @@ Methods SemiMonthEnd.onOffset SemiMonthEnd.is_anchored SemiMonthEnd.is_on_offset + SemiMonthEnd.__call__ SemiMonthBegin -------------- @@ -482,6 +496,7 @@ Methods SemiMonthBegin.onOffset SemiMonthBegin.is_anchored SemiMonthBegin.is_on_offset + SemiMonthBegin.__call__ Week ---- @@ -514,6 +529,7 @@ Methods Week.onOffset Week.is_anchored Week.is_on_offset + Week.__call__ WeekOfMonth ----------- @@ -545,6 +561,7 @@ Methods WeekOfMonth.onOffset WeekOfMonth.is_anchored WeekOfMonth.is_on_offset + WeekOfMonth.__call__ LastWeekOfMonth --------------- @@ -576,6 +593,7 @@ Methods LastWeekOfMonth.onOffset LastWeekOfMonth.is_anchored LastWeekOfMonth.is_on_offset + LastWeekOfMonth.__call__ QuarterOffset ------------- @@ -608,6 +626,7 @@ Methods QuarterOffset.onOffset QuarterOffset.is_anchored QuarterOffset.is_on_offset + QuarterOffset.__call__ BQuarterEnd ----------- @@ -640,6 +659,7 @@ Methods BQuarterEnd.onOffset BQuarterEnd.is_anchored BQuarterEnd.is_on_offset + BQuarterEnd.__call__ BQuarterBegin ------------- @@ -672,6 +692,7 @@ Methods BQuarterBegin.onOffset BQuarterBegin.is_anchored BQuarterBegin.is_on_offset + BQuarterBegin.__call__ QuarterEnd ---------- @@ -704,6 +725,7 @@ Methods QuarterEnd.onOffset QuarterEnd.is_anchored QuarterEnd.is_on_offset + QuarterEnd.__call__ QuarterBegin ------------ @@ -736,6 +758,7 @@ Methods QuarterBegin.onOffset QuarterBegin.is_anchored QuarterBegin.is_on_offset + QuarterBegin.__call__ YearOffset ---------- @@ -768,6 +791,7 @@ Methods YearOffset.onOffset YearOffset.is_anchored YearOffset.is_on_offset + YearOffset.__call__ BYearEnd -------- @@ -800,6 +824,7 @@ Methods BYearEnd.onOffset BYearEnd.is_anchored BYearEnd.is_on_offset + BYearEnd.__call__ BYearBegin ---------- @@ -832,6 +857,7 @@ Methods BYearBegin.onOffset BYearBegin.is_anchored BYearBegin.is_on_offset + BYearBegin.__call__ YearEnd ------- @@ -864,6 +890,7 @@ Methods YearEnd.onOffset YearEnd.is_anchored YearEnd.is_on_offset + YearEnd.__call__ YearBegin --------- @@ -896,6 +923,7 @@ Methods YearBegin.onOffset YearBegin.is_anchored YearBegin.is_on_offset + YearBegin.__call__ FY5253 ------ @@ -929,6 +957,7 @@ Methods FY5253.onOffset FY5253.is_anchored FY5253.is_on_offset + FY5253.__call__ FY5253Quarter ------------- @@ -962,6 +991,7 @@ Methods FY5253Quarter.is_anchored FY5253Quarter.is_on_offset FY5253Quarter.year_has_extra_week + FY5253Quarter.__call__ Easter ------ @@ -993,6 +1023,7 @@ Methods Easter.onOffset Easter.is_anchored Easter.is_on_offset + Easter.__call__ Tick ---- @@ -1024,6 +1055,7 @@ Methods Tick.onOffset Tick.is_anchored Tick.is_on_offset + Tick.__call__ Day --- @@ -1055,6 +1087,7 @@ Methods Day.onOffset Day.is_anchored Day.is_on_offset + Day.__call__ Hour ---- @@ -1086,6 
+1119,7 @@ Methods Hour.onOffset Hour.is_anchored Hour.is_on_offset + Hour.__call__ Minute ------ @@ -1117,6 +1151,7 @@ Methods Minute.onOffset Minute.is_anchored Minute.is_on_offset + Minute.__call__ Second ------ @@ -1148,6 +1183,7 @@ Methods Second.onOffset Second.is_anchored Second.is_on_offset + Second.__call__ Milli ----- @@ -1179,6 +1215,7 @@ Methods Milli.onOffset Milli.is_anchored Milli.is_on_offset + Milli.__call__ Micro ----- @@ -1210,6 +1247,7 @@ Methods Micro.onOffset Micro.is_anchored Micro.is_on_offset + Micro.__call__ Nano ---- @@ -1241,6 +1279,7 @@ Methods Nano.onOffset Nano.is_anchored Nano.is_on_offset + Nano.__call__ BDay ---- @@ -1277,6 +1316,7 @@ Methods BDay.is_on_offset BDay.rollback BDay.rollforward + BDay.__call__ BMonthEnd --------- @@ -1312,6 +1352,7 @@ Methods BMonthEnd.is_on_offset BMonthEnd.rollback BMonthEnd.rollforward + BMonthEnd.__call__ BMonthBegin ----------- @@ -1347,6 +1388,7 @@ Methods BMonthBegin.is_on_offset BMonthBegin.rollback BMonthBegin.rollforward + BMonthBegin.__call__ CBMonthEnd ---------- @@ -1386,6 +1428,7 @@ Methods CBMonthEnd.is_on_offset CBMonthEnd.rollback CBMonthEnd.rollforward + CBMonthEnd.__call__ CBMonthBegin ------------ @@ -1425,6 +1468,7 @@ Methods CBMonthBegin.is_on_offset CBMonthBegin.rollback CBMonthBegin.rollforward + CBMonthBegin.__call__ CDay ---- @@ -1461,6 +1505,7 @@ Methods CDay.is_on_offset CDay.rollback CDay.rollforward + CDay.__call__ .. _api.frequencies: From c52f8316f206cbbb1efd4ec8002d72d92a16fd3e Mon Sep 17 00:00:00 2001 From: neilkg <33635204+neilkg@users.noreply.github.com> Date: Mon, 6 Apr 2020 10:33:37 -0400 Subject: [PATCH 22/26] BUG: DataFrame._item_cache not cleared on on .copy() (#33299) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 1 + pandas/tests/frame/test_api.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d283d4450e6bf..7cb7db27ae603 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -364,6 +364,7 @@ Indexing - Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`) - Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`) - Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`) +- Bug in :meth:`DataFrame.copy` _item_cache not invalidated after copy causes post-copy value updates to not be reflected (:issue:`31784`) Missing ^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9640c1e087f47..82cc45ee16c00 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5665,6 +5665,7 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: dtype: object """ data = self._data.copy(deep=deep) + self._clear_item_cache() return self._constructor(data).__finalize__(self) def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 91627b46c2fee..4149485be181d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -540,3 +540,21 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + + def test_cache_on_copy(self): + # GH 31784 _item_cache not cleared on copy 
causes incorrect reads after updates + df = DataFrame({"a": [1]}) + + df["x"] = [0] + df["a"] + + df.copy() + + df["a"].values[0] = -1 + + tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]})) + + df["y"] = [0] + + assert df["a"].values[0] == -1 + tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]})) From 563a64e01bedd33828a517200fab58e9e719f7dd Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 6 Apr 2020 17:48:11 +0300 Subject: [PATCH 23/26] CLN: Added static types for `pandas/_libs/reduction.pyx` (#33316) --- pandas/_libs/reduction.pyx | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 9f8579606014a..4a9c89848a9d8 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -36,7 +36,12 @@ cdef class Reducer: object dummy, f, labels, typ, ityp, index ndarray arr - def __init__(self, ndarray arr, object f, axis=1, dummy=None, labels=None): + def __init__( + self, ndarray arr, object f, int axis=1, object dummy=None, object labels=None + ): + cdef: + Py_ssize_t n, k + n, k = (arr).shape if axis == 0: @@ -60,7 +65,7 @@ cdef class Reducer: self.dummy, self.typ, self.index, self.ityp = self._check_dummy( dummy=dummy) - cdef _check_dummy(self, dummy=None): + cdef _check_dummy(self, object dummy=None): cdef: object index = None, typ = None, ityp = None @@ -147,7 +152,7 @@ cdef class Reducer: cdef class _BaseGrouper: - cdef _check_dummy(self, dummy): + cdef _check_dummy(self, object dummy): # both values and index must be an ndarray! values = dummy.values @@ -190,13 +195,16 @@ cdef class _BaseGrouper: """ Call self.f on our new group, then update to the next group. """ + cdef: + object res + cached_ityp._engine.clear_mapping() res = self.f(cached_typ) res = _extract_result(res) if not initialized: # On the first pass, we check the output shape to see # if this looks like a reduction. 
- initialized = 1 + initialized = True _check_result_array(res, len(self.dummy_arr)) islider.advance(group_size) @@ -534,7 +542,11 @@ cdef class BlockSlider: cdef: char **base_ptrs - def __init__(self, frame): + def __init__(self, object frame): + cdef: + Py_ssize_t i + object b + self.frame = frame self.dummy = frame[:0] self.index = self.dummy.index From b44021d3bebcc259ab9c58a7c4184a13c5da88f9 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 6 Apr 2020 17:49:13 +0300 Subject: [PATCH 24/26] Changed files permissions to be the same (#33318) Co-authored-by: MomIsBestFriend <> --- pandas/core/indexing.py | 0 pandas/io/parsers.py | 0 pandas/tests/io/generate_legacy_storage_files.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 pandas/core/indexing.py mode change 100755 => 100644 pandas/io/parsers.py mode change 100755 => 100644 pandas/tests/io/generate_legacy_storage_files.py diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py old mode 100755 new mode 100644 diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py old mode 100755 new mode 100644 diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py old mode 100755 new mode 100644 From 927b6959eeea3115e51d085a682499990a904b3c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Apr 2020 18:11:07 +0200 Subject: [PATCH 25/26] PERF: fix placement when slicing a Series (#33324) --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ac8de977b9a1a..c6efd6a2ac6a7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1568,7 +1568,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager": blk = self._block array = blk._slice(slobj) - block = blk.make_block_same_class(array, placement=range(len(array))) + block = blk.make_block_same_class(array, placement=slice(0, len(array))) return type(self)(block, self.index[slobj]) @property From 2f3cd7da6a289cbf463d097124af206714815908 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 14:55:29 -0700 Subject: [PATCH 26/26] BUG: scalar lookup on 2D DTA/TDA/PA --- pandas/core/arrays/datetimelike.py | 2 ++ pandas/core/internals/blocks.py | 9 ++------- pandas/tests/arrays/test_datetimelike.py | 5 +++++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4fabd8f558fee..ce42fad31ef78 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -574,6 +574,8 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) + if lib.is_scalar(result): + return self._box_func(result) return self._simple_new(result, dtype=self.dtype, freq=freq) def __setitem__( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d8b54fd5cffb3..415fa7a2aa0e3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -6,7 +6,7 @@ import numpy as np -from pandas._libs import NaT, Timestamp, algos as libalgos, lib, writers +from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -2024,12 +2024,7 @@ def array_values(self): def 
iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta # TODO(EA2D): this can be removed if we ever have 2D EA - result = super().iget(key) - if isinstance(result, np.datetime64): - result = Timestamp(result) - elif isinstance(result, np.timedelta64): - result = Timedelta(result) - return result + return self.array_values().reshape(self.shape)[key] def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index fe35344f46688..88263997d7784 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -222,6 +222,11 @@ def test_getitem_2d(self, arr1d): result = arr2d[:3, 0] tm.assert_equal(result, expected) + # Scalar lookup + result = arr2d[-1, 0] + expected = arr1d[-1] + assert result == expected + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D")