@pytest.fixture(params=indices_dict.keys())
def indices(request):
    """
    Parametrized fixture returning one "simple" index per ``indices_dict`` key.

    These indices are deliberately plain and are unlikely to cover corner
    cases, e.g.
        - no names
        - no NaTs/NaNs
        - no values near implementation bounds
        - ...
    """
    index = indices_dict[request.param]
    # hand out a copy so a test may mutate it (e.g. set .name) without
    # touching the shared object stored in indices_dict
    return index.copy()


@pytest.fixture
def series_with_simple_index(indices):
    """
    Fixture for tests on series whose index type varies.

    Returns the result of ``_create_series`` for each index provided by the
    ``indices`` fixture.
    """
    series = _create_series(indices)
    return series
def test_none_comparison(self, series_with_simple_index):
    """
    Check comparisons between a Series and ``None``.

    At positions 0 and 1, ``==`` must be False and ``!=`` must be True,
    with ``None`` on either side.  Ordering comparisons must raise
    ``TypeError`` for datetime64 (naive or tz-aware) data and be False at
    positions 0 and 1 otherwise.
    """
    series = series_with_simple_index
    if isinstance(series.index, IntervalIndex):
        # IntervalIndex breaks on "series[0] = np.nan" below
        pytest.skip("IntervalIndex doesn't support assignment")
    if len(series) < 2:
        # every check below reads result.iat[0] and result.iat[1], so at
        # least two elements are required (not merely non-empty data)
        pytest.skip("Test doesn't make sense on fewer than two elements")

    # bug brought up by #1079
    # changed from TypeError in 0.17.0
    series[0] = np.nan

    # noinspection PyComparisonWithNone
    result = series == None  # noqa
    assert not result.iat[0]
    assert not result.iat[1]

    # noinspection PyComparisonWithNone
    result = series != None  # noqa
    assert result.iat[0]
    assert result.iat[1]

    result = None == series  # noqa
    assert not result.iat[0]
    assert not result.iat[1]

    result = None != series  # noqa
    assert result.iat[0]
    assert result.iat[1]

    if is_datetime64_dtype(series) or is_datetime64tz_dtype(series):
        # Following DatetimeIndex (and Timestamp) convention,
        # inequality comparisons with Series[datetime64] raise
        msg = "Invalid comparison"
        with pytest.raises(TypeError, match=msg):
            None > series
        with pytest.raises(TypeError, match=msg):
            series > None
    else:
        result = None > series
        assert not result.iat[0]
        assert not result.iat[1]

        result = series < None
        assert not result.iat[0]
        assert not result.iat[1]


def test_ndarray_compat_properties(self, index_or_series_obj):
    """
    Check the ndarray-like attributes of the object under test.

    ``shape``, ``dtype``, ``T`` and ``nbytes`` must be available, the
    deprecated ndarray attributes must be absent, ``item()`` must raise
    ``ValueError``, and the object must be one-dimensional with
    ``size == len(obj)``.
    """
    obj = index_or_series_obj

    # Check that we work.
    for p in ["shape", "dtype", "T", "nbytes"]:
        assert getattr(obj, p, None) is not None

    # deprecated properties
    for p in ["flags", "strides", "itemsize", "base", "data"]:
        assert not hasattr(obj, p)

    msg = "can only convert an array of size 1 to a Python scalar"
    with pytest.raises(ValueError, match=msg):
        obj.item()  # len > 1

    assert obj.ndim == 1
    assert obj.size == len(obj)

    # single-element containers do convert to a scalar
    assert Index([1]).item() == 1
    assert Series([1]).item() == 1
def test_value_counts_unique_nunique(self, index_or_series_obj):
    """
    Check ``value_counts``, ``unique`` and ``nunique`` on repeated data.

    The object is rebuilt so that its n-th element occurs n+1 times; the
    expected counts therefore run from ``len(orig)`` down to 1, indexed by
    the original values in reverse order.
    """
    orig = index_or_series_obj
    obj = orig.copy()
    klass = type(obj)
    values = obj._values

    if orig.duplicated().any():
        pytest.xfail(
            "The test implementation isn't flexible enough to deal"
            " with duplicated values. This isn't a bug in the"
            " application code, but in the test code."
        )

    # create repeated values, 'n'th element is repeated by n+1 times
    # (computed once, before obj is rebound to its repeated form)
    repeats = range(1, len(obj) + 1)
    if isinstance(obj, Index):
        expected_index = Index(obj[::-1])
        expected_index.name = None
        obj = obj.repeat(repeats)
    else:
        expected_index = Index(values[::-1])
        idx = obj.index.repeat(repeats)
        # take-based repeat
        take_positions = np.repeat(np.arange(len(obj)), repeats)
        rep = values.take(take_positions)
        obj = klass(rep, index=idx)

    # check values has the same dtype as the original
    assert obj.dtype == orig.dtype

    expected_s = Series(
        range(len(orig), 0, -1), index=expected_index, dtype="int64"
    )

    result = obj.value_counts()
    tm.assert_series_equal(result, expected_s)
    assert result.index.name is None

    result = obj.unique()
    if isinstance(obj, Index):
        assert isinstance(result, type(obj))
        tm.assert_index_equal(result, orig)
        assert result.dtype == orig.dtype
    elif is_datetime64tz_dtype(obj):
        # datetimetz Series returns array of Timestamp
        assert result[0] == orig[0]
        for r in result:
            assert isinstance(r, Timestamp)

        tm.assert_numpy_array_equal(
            result.astype(object), orig._values.astype(object)
        )
    else:
        tm.assert_numpy_array_equal(result, orig.values)
        assert result.dtype == orig.dtype

    # dropna=True would break for MultiIndex
    assert obj.nunique(dropna=False) == len(np.unique(obj.values))


@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
    """
    Check ``value_counts``, ``unique`` and ``nunique`` when nulls are present.

    The first two values are replaced by a null (``iNaT``/``pd.NaT`` for
    datetime-like data, ``null_obj`` otherwise) and the n-th element is then
    repeated n+1 times, so the first three entries of the object are null.
    Objects that cannot hold or report nulls are skipped.
    """
    orig = index_or_series_obj
    obj = orig.copy()
    klass = type(obj)
    values = obj._ndarray_values
    num_values = len(orig)

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
        pytest.skip(f"values of {klass} cannot be changed")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip("MultiIndex doesn't support isna")

    # special assign to the numpy array
    if is_datetime64tz_dtype(obj):
        if isinstance(obj, DatetimeIndex):
            v = obj.asi8
            v[0:2] = iNaT
            values = obj._shallow_copy(v)
        else:
            obj = obj.copy()
            obj[0:2] = pd.NaT
            values = obj._values

    elif needs_i8_conversion(obj):
        values[0:2] = iNaT
        values = obj._shallow_copy(values)
    else:
        values[0:2] = null_obj

    # check values has the same dtype as the original
    assert values.dtype == obj.dtype

    # create repeated values, 'n'th element is repeated by n+1
    # times
    if isinstance(obj, (DatetimeIndex, PeriodIndex)):
        expected_index = obj.copy()
        expected_index.name = None

        # attach name to klass
        obj = klass(values.repeat(range(1, len(obj) + 1)))
        obj.name = "a"
    else:
        if isinstance(obj, DatetimeIndex):
            expected_index = orig._values._shallow_copy(values)
        else:
            expected_index = Index(values)
        expected_index.name = None
        obj = obj.repeat(range(1, len(obj) + 1))
        obj.name = "a"

    # check values has the same dtype as the original
    assert obj.dtype == orig.dtype

    # check values correctly have NaN: elements 0 and 1 were nulled and are
    # repeated 1 and 2 times, hence the first three positions.
    # builtin bool instead of the deprecated np.bool alias
    nanloc = np.zeros(len(obj), dtype=bool)
    nanloc[:3] = True
    if isinstance(obj, Index):
        tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
    else:
        exp = Series(nanloc, obj.index, name="a")
        tm.assert_series_equal(pd.isna(obj), exp)

    # counts of the non-null values, largest first
    expected_data = list(range(num_values, 2, -1))
    expected_data_na = expected_data.copy()
    if expected_data_na:
        # the null itself occurs 1 + 2 = 3 times and is listed last
        expected_data_na.append(3)
    expected_s_na = Series(
        expected_data_na,
        index=expected_index[num_values - 1 : 0 : -1],
        dtype="int64",
        name="a",
    )
    expected_s = Series(
        expected_data,
        index=expected_index[num_values - 1 : 1 : -1],
        dtype="int64",
        name="a",
    )

    result_s_na = obj.value_counts(dropna=False)
    tm.assert_series_equal(result_s_na, expected_s_na)
    assert result_s_na.index.name is None
    assert result_s_na.name == "a"
    result_s = obj.value_counts()
    # compare the same result object whose names are checked below
    tm.assert_series_equal(result_s, expected_s)
    assert result_s.index.name is None
    assert result_s.name == "a"

    result = obj.unique()
    if isinstance(obj, Index):
        tm.assert_index_equal(result, Index(values[1:], name="a"))
    elif is_datetime64tz_dtype(obj):
        # unable to compare NaT / nan
        tm.assert_extension_array_equal(result[1:], values[2:])
        assert result[0] is pd.NaT
    elif len(obj) > 0:
        tm.assert_numpy_array_equal(result[1:], values[2:])

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # text; these checks index result[0], so they are kept under the
        # non-empty guard - confirm against the original file
        assert pd.isna(result[0])
        assert result.dtype == orig.dtype

    assert obj.nunique() == max(0, num_values - 2)
    assert obj.nunique(dropna=False) == max(0, num_values - 1)
@pytest.mark.parametrize("sort", [None, False]) def test_difference_base(self, sort, indices): - if isinstance(indices, CategoricalIndex): - return - first = indices[2:] second = indices[:4] - answer = indices[4:] + if isinstance(indices, CategoricalIndex) or indices.is_boolean(): + answer = [] + else: + answer = indices[4:] result = first.difference(second, sort) assert tm.equalContents(result, answer)