diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 57c625ced8a43..d78419c12ce0d 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self): class All: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] - param_names = ["N", "case"] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + param_names = ["N", "case", "dtype"] - def setup(self, N, case): + def setup(self, N, case, dtype): val = case != "fast" - self.s = Series([val] * N) + self.s = Series([val] * N, dtype=dtype) - def time_all(self, N, case): + def time_all(self, N, case, dtype): self.s.all() class Any: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] - param_names = ["N", "case"] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + param_names = ["N", "case", "dtype"] - def setup(self, N, case): + def setup(self, N, case, dtype): val = case == "fast" - self.s = Series([val] * N) + self.s = Series([val] * N, dtype=dtype) - def time_any(self, N, case): + def time_any(self, N, case, dtype): self.s.any() @@ -265,11 +265,14 @@ class NanOps: "prod", ], [10 ** 3, 10 ** 6], - ["int8", "int32", "int64", "float64"], + ["int8", "int32", "int64", "float64", "Int64", "boolean"], ] param_names = ["func", "N", "dtype"] def setup(self, func, N, dtype): + if func == "argmax" and dtype in {"Int64", "boolean"}: + # Skip argmax for nullable int since this doesn't work yet (GH-24382) + raise NotImplementedError self.s = Series([1] * N, dtype=dtype) self.func = getattr(self.s, func) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index ec67394e55a1e..ebbd3c9eddfdb 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -7,11 +7,17 @@ class FrameOps: - params = [ops, ["float", "int"], [0, 1]] + params = [ops, ["float", "int", "Int64"], [0, 1]] param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): - df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + if op == "mad" and dtype == "Int64" and axis == 1: + # GH-33036 + raise NotImplementedError + values = np.random.randn(100000, 4) + if dtype == "Int64": + values = values.astype(int) + df = pd.DataFrame(values).astype(dtype) self.df_func = getattr(df, op) def time_op(self, op, dtype, axis): diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 15b4128424eb1..5401cc81785ab 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -283,14 +283,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/tools/datetimes.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests top-level reshaping functions' ; echo $MSG - pytest -q --doctest-modules \ - pandas/core/reshape/concat.py \ - pandas/core/reshape/pivot.py \ - pandas/core/reshape/reshape.py \ - pandas/core/reshape/tile.py \ - pandas/core/reshape/melt.py \ - -k"-crosstab -pivot_table -cut" + MSG='Doctests reshaping functions' ; echo $MSG + pytest -q --doctest-modules pandas/core/reshape/ RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests interval classes' ; echo $MSG @@ -325,6 +319,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests generic.py' ; echo $MSG pytest -q --doctest-modules pandas/core/generic.py RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests tseries' ; echo $MSG + pytest -q --doctest-modules pandas/tseries/ + RET=$(($RET + $?)) ; echo $MSG "DONE" fi ### DOCSTRINGS ### diff --git 
a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 1b3bcb799d5ce..412a5f9e7485f 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -118,7 +118,7 @@ done by requesting the pandas ``dtypes`` attribute: titanic.dtypes For each of the columns, the used data type is enlisted. The data types -in this ``DataFrame`` are integers (``int64``), floats (``float63``) and +in this ``DataFrame`` are integers (``int64``), floats (``float64``) and strings (``object``). .. note:: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f3aff0654530e..d68dc24bae658 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -28,7 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`; + binary;`ORC Format `__;:ref:`read_orc`; binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; @@ -4817,7 +4817,7 @@ ORC .. versionadded:: 1.0.0 -Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization +Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2e4d0fecaf5cf..234c12ce79822 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -641,21 +641,40 @@ You can check whether elements contain a pattern: .. ipython:: python pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.match(pattern) -The distinction between ``match`` and ``contains`` is strictness: ``match`` -relies on strict ``re.match``, while ``contains`` relies on ``re.search``. +.. versionadded:: 1.1.0 -Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take -an extra ``na`` argument so missing values can be considered True or False: +.. ipython:: python + + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], + dtype="string").str.fullmatch(pattern) + +.. note:: + + The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: + ``fullmatch`` tests whether the entire string matches the regular expression; + ``match`` tests whether there is a match of the regular expression that begins + at the first character of the string; and ``contains`` tests whether there is + a match of the regular expression at any position within the string. + + The corresponding functions in the ``re`` package for these three match modes are + `re.fullmatch `_, + `re.match `_, and + `re.search `_, + respectively. 
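As a quick illustration of the three match modes described in the note above, here is a minimal sketch using the standard-library ``re`` functions it links to, with the ``pattern`` from the surrounding examples (illustrative only, not part of the patch):

.. code-block:: python

   import re

   pattern = r'[0-9][a-z]'

   re.search(pattern, '4dx')     # finds '4d' anywhere in the string -> str.contains is True
   re.match(pattern, '4dx')      # '4d' matches at the start         -> str.match is True
   re.fullmatch(pattern, '4dx')  # None, trailing 'x' is unmatched   -> str.fullmatch is False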
+ +Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and +``endswith`` take an extra ``na`` argument so missing values can be considered +True or False: .. ipython:: python diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 692df075f25cb..20415bba99476 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -69,6 +69,7 @@ Other enhancements - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) +- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - @@ -167,6 +168,32 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss ... KeyError: Timestamp('1970-01-01 00:00:00') +:meth:`DataFrame.merge` preserves right frame's row order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) + +.. ipython:: python + + left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) + right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) + left_df + right_df + +*Previous behavior*: + +.. code-block:: python + + >>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right") + animal max_speed + 0 pig 11 + 1 quetzal 80 + +*New behavior*: + +.. ipython:: python + + left_df.merge(right_df, on=['animal', 'max_speed'], how="right") + .. --------------------------------------------------------------------------- .. _whatsnew_110.api_breaking.assignment_to_multiple_columns: @@ -228,6 +255,8 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). +- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). + .. 
--------------------------------------------------------------------------- @@ -254,13 +283,14 @@ Datetimelike - Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`) - :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`) - Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`) +- Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`) Timedelta ^^^^^^^^^ - Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`) - Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta`` incorrectly returning ``NaT`` (:issue:`31869`) -- +- Timedeltas now understand ``µs`` as an identifier for microseconds (:issue:`32899`) Timezones ^^^^^^^^^ @@ -286,7 +316,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). - @@ -308,6 +338,9 @@ Indexing - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) - Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) - Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`) +- Fix to preserve the ability to index with the "nearest" method with xarray's CFTimeIndex, an :class:`Index` subclass (`pydata/xarray#3751 <https://github.com/pydata/xarray/pull/3751>`_, :issue:`32905`). +- Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`) +- Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`) Missing ^^^^^^^ @@ -369,6 +402,8 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` producing an inconsistent type when aggregating Boolean series (:issue:`32894`) + Reshaping ^^^^^^^^^ @@ -381,11 +416,16 @@ Reshaping - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) +- Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type.
Previously the ``replace`` would fail silently (:issue:`18634`) +- Bug where an inplace operation on a Series added the column back to the DataFrame it was originally dropped from (with ``inplace=True``) (:issue:`30484`) - Bug in :meth:`DataFrame.apply` where callback was called with :class:`Series` parameter even though ``raw=True`` requested. (:issue:`32423`) - Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`) +- Bug in :meth:`concat` where passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) +- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) + Sparse ^^^^^^ diff --git a/environment.yml b/environment.yml index 532c36038fcaf..cf579738f6fe9 100644 --- a/environment.yml +++ b/environment.yml @@ -101,6 +101,7 @@ dependencies: - s3fs # pandas.read_csv... when using 's3://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray + - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown - pip: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b7f17aee35a44..7a32b8957003e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -38,8 +38,15 @@ cimport pandas._libs.util as util from pandas._libs.util cimport numeric, get_nat from pandas._libs.khash cimport ( - khiter_t, kh_destroy_int64, kh_put_int64, kh_init_int64, kh_int64_t, - kh_resize_int64, kh_get_int64) + kh_destroy_int64, + kh_get_int64, + kh_init_int64, + kh_int64_t, + kh_put_int64, + kh_resize_int64, + khiter_t, +) + import pandas._libs.missing as missing @@ -791,8 +798,13 @@ ctypedef fused rank_t: @cython.wraparound(False) @cython.boundscheck(False) -def rank_1d(rank_t[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_1d( + rank_t[:] in_arr, + ties_method="average", + bint ascending=True, + na_option="keep", + bint pct=False, +): """ Fast NaN-friendly version of ``scipy.stats.rankdata``. """ @@ -1009,8 +1021,14 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', return ranks -def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_2d( + rank_t[:, :] in_arr, + int axis=0, + ties_method="average", + bint ascending=True, + na_option="keep", + bint pct=False, +): """ Fast NaN-friendly version of ``scipy.stats.rankdata``. 
""" @@ -1190,9 +1208,12 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) -def diff_2d(diff_t[:, :] arr, - out_t[:, :] out, - Py_ssize_t periods, int axis): +def diff_2d( + diff_t[:, :] arr, + out_t[:, :] out, + Py_ssize_t periods, + int axis, +): cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.is_f_contig() diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0ba5cb7e9bc40..4d26842cc0277 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -2,10 +2,19 @@ import warnings import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, intp_t, - float64_t, float32_t, - int64_t, int32_t, int16_t, int8_t, - uint64_t, uint32_t, uint16_t, uint8_t +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + intp_t, + ndarray, + uint8_t, + uint16_t, + uint32_t, + uint64_t, ) cnp.import_array() @@ -364,7 +373,7 @@ cdef class ObjectEngine(IndexEngine): cdef class DatetimeEngine(Int64Engine): - cdef _get_box_dtype(self): + cdef str _get_box_dtype(self): return 'M8[ns]' cdef int64_t _unbox_scalar(self, scalar) except? -1: @@ -454,7 +463,7 @@ cdef class DatetimeEngine(Int64Engine): cdef class TimedeltaEngine(DatetimeEngine): - cdef _get_box_dtype(self): + cdef str _get_box_dtype(self): return 'm8[ns]' cdef int64_t _unbox_scalar(self, scalar) except? -1: diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 316943edee124..f9aedeb8ad93e 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -2,7 +2,8 @@ cdef class _NDFrameIndexerBase: """ A base class for _NDFrameIndexer for fast instantiation and attribute access. """ - cdef public object obj, name, _ndim + cdef public: + object obj, name, _ndim def __init__(self, name, obj): self.obj = obj diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 3bebd7e23fb5a..d69b417f6e056 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -20,7 +20,6 @@ cdef class BlockPlacement: cdef: slice _as_slice object _as_array - bint _has_slice, _has_array, _is_known_slice_like def __init__(self, val): @@ -56,12 +55,13 @@ cdef class BlockPlacement: def __str__(self) -> str: cdef: slice s = self._ensure_has_slice() + if s is not None: v = self._as_slice else: v = self._as_array - return f'{type(self).__name__}({v})' + return f"{type(self).__name__}({v})" def __repr__(self) -> str: return str(self) @@ -69,6 +69,7 @@ cdef class BlockPlacement: def __len__(self) -> int: cdef: slice s = self._ensure_has_slice() + if s is not None: return slice_len(s) else: @@ -78,6 +79,7 @@ cdef class BlockPlacement: cdef: slice s = self._ensure_has_slice() Py_ssize_t start, stop, step, _ + if s is not None: start, stop, step, _ = slice_get_indices_ex(s) return iter(range(start, stop, step)) @@ -88,15 +90,17 @@ cdef class BlockPlacement: def as_slice(self) -> slice: cdef: slice s = self._ensure_has_slice() - if s is None: - raise TypeError('Not slice-like') - else: + + if s is not None: return s + else: + raise TypeError("Not slice-like") @property def indexer(self): cdef: slice s = self._ensure_has_slice() + if s is not None: return s else: @@ -104,29 +108,34 @@ cdef class BlockPlacement: def isin(self, arr): from pandas.core.indexes.api import Int64Index + return Int64Index(self.as_array, copy=False).isin(arr) @property def as_array(self): cdef: Py_ssize_t start, stop, end, _ + if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) # NOTE: this is the C-optimized equivalent of - 
# np.arange(start, stop, step, dtype=np.int64) + # `np.arange(start, stop, step, dtype=np.int64)` self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) self._has_array = True + return self._as_array @property def is_slice_like(self) -> bool: cdef: slice s = self._ensure_has_slice() + return s is not None def __getitem__(self, loc): cdef: slice s = self._ensure_has_slice() + if s is not None: val = slice_getitem(s, loc) else: @@ -141,11 +150,12 @@ cdef class BlockPlacement: return BlockPlacement(np.delete(self.as_array, loc, axis=0)) def append(self, others): - if len(others) == 0: + if not len(others): return self - return BlockPlacement(np.concatenate([self.as_array] + - [o.as_array for o in others])) + return BlockPlacement( + np.concatenate([self.as_array] + [o.as_array for o in others]) + ) cdef iadd(self, other): cdef: @@ -163,8 +173,7 @@ cdef class BlockPlacement: start += other_int stop += other_int - if ((step > 0 and start < 0) or - (step < 0 and stop < step)): + if (step > 0 and start < 0) or (step < 0 and stop < step): raise ValueError("iadd causes length change") if stop < 0: @@ -191,6 +200,7 @@ cdef class BlockPlacement: if not self._has_slice: self._as_slice = indexer_as_slice(self._as_array) self._has_slice = True + return self._as_slice @@ -240,8 +250,7 @@ cdef slice slice_canonize(slice s): return slice(start, stop, step) -cpdef Py_ssize_t slice_len( - slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: +cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: """ Get length of a bounded slice. @@ -258,8 +267,7 @@ cpdef Py_ssize_t slice_len( if slc is None: raise TypeError("slc must be slice") - PySlice_GetIndicesEx(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return length @@ -277,8 +285,7 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - PySlice_GetIndicesEx(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return start, stop, step, length @@ -378,8 +385,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): # blockno handling. 
cdef: int64_t cur_blkno - Py_ssize_t i, start, stop, n, diff - + Py_ssize_t i, start, stop, n, diff, tot_len object blkno object group_dict = defaultdict(list) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 2240c821cd239..6e41ff189592c 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,8 +1,16 @@ import numbers from operator import le, lt -from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, - PyObject_RichCompare) +from cpython.object cimport ( + Py_EQ, + Py_GE, + Py_GT, + Py_LE, + Py_LT, + Py_NE, + PyObject_RichCompare, +) + import cython from cython import Py_ssize_t @@ -10,9 +18,16 @@ from cython import Py_ssize_t import numpy as np cimport numpy as cnp from numpy cimport ( - int64_t, int32_t, float64_t, float32_t, uint64_t, + NPY_QUICKSORT, + PyArray_ArgSort, + PyArray_Take, + float32_t, + float64_t, + int32_t, + int64_t, ndarray, - PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take) + uint64_t, +) cnp.import_array() diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index cbe0e71153565..54892a7e4bc77 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -78,7 +78,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) def left_outer_join(const int64_t[:] left, const int64_t[:] right, - Py_ssize_t max_groups, sort=True): + Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 ndarray[int64_t] left_count, right_count, left_sorter, right_sorter @@ -670,7 +670,7 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 HashTable hash_table @@ -678,7 +678,7 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -739,7 +739,7 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 HashTable hash_table @@ -747,7 +747,7 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -802,7 +802,7 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, asof_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: @@ -853,19 +853,19 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, def asof_join_backward(asof_t[:] left_values, asof_t[:] right_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -906,19 +906,19 @@ def asof_join_backward(asof_t[:] left_values, def asof_join_forward(asof_t[:] left_values, 
asof_t[:] right_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -960,7 +960,7 @@ def asof_join_forward(asof_t[:] left_values, def asof_join_nearest(asof_t[:] left_values, asof_t[:] right_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index ca3b83852b098..b5fe73df5d9be 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from cpython.object cimport PyObject from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6aa9a8b2dedfd..6c6f6a8600ba2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -530,14 +530,14 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): cdef: Py_ssize_t i, n = len(mask) Py_ssize_t start = 0, end = 0 - bint started = 0, finished = 0 + bint started = False, finished = False for i in range(n): if mask[i]: if finished: return mask.view(np.bool_) if not started: - started = 1 + started = True start = i else: if finished: @@ -545,7 +545,7 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): if started: end = i - finished = 1 + finished = True if not started: return slice(0, 0) @@ -657,13 +657,13 @@ def clean_index_list(obj: list): cdef: Py_ssize_t i, n = len(obj) object val - bint all_arrays = 1 + bint all_arrays = True for i in range(n): val = obj[i] if not (isinstance(val, list) or util.is_array(val) or hasattr(val, '_data')): - all_arrays = 0 + all_arrays = False break if all_arrays: @@ -692,7 +692,7 @@ def clean_index_list(obj: list): @cython.boundscheck(False) @cython.wraparound(False) def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, - object closed='left', bint hasnans=0): + object closed='left', bint hasnans=False): """ Int64 (datetime64) version of generic python version in ``groupby.py``. """ @@ -1064,29 +1064,29 @@ cdef class Seen: bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz - def __cinit__(self, bint coerce_numeric=0): + def __cinit__(self, bint coerce_numeric=False): """ Initialize a Seen instance. Parameters ---------- - coerce_numeric : bint, default 0 + coerce_numeric : bool, default False Whether or not to force conversion to a numeric data type if initial methods to convert to numeric fail. """ - self.int_ = 0 - self.nat_ = 0 - self.bool_ = 0 - self.null_ = 0 - self.nan_ = 0 - self.uint_ = 0 - self.sint_ = 0 - self.float_ = 0 - self.object_ = 0 - self.complex_ = 0 - self.datetime_ = 0 - self.timedelta_ = 0 - self.datetimetz_ = 0 + self.int_ = False + self.nat_ = False + self.bool_ = False + self.null_ = False + self.nan_ = False + self.uint_ = False + self.sint_ = False + self.float_ = False + self.object_ = False + self.complex_ = False + self.datetime_ = False + self.timedelta_ = False + self.datetimetz_ = False self.coerce_numeric = coerce_numeric cdef inline bint check_uint64_conflict(self) except -1: @@ -1127,8 +1127,8 @@ cdef class Seen: """ Set flags indicating that a null value was encountered. 
""" - self.null_ = 1 - self.float_ = 1 + self.null_ = True + self.float_ = True cdef saw_int(self, object val): """ @@ -1147,7 +1147,7 @@ cdef class Seen: val : Python int Value with which to set the flags. """ - self.int_ = 1 + self.int_ = True self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) @@ -1445,9 +1445,9 @@ def infer_datetimelike_array(arr: object) -> object: """ cdef: Py_ssize_t i, n = len(arr) - bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 - bint seen_tz_aware = 0, seen_tz_naive = 0 - bint seen_nat = 0 + bint seen_timedelta = False, seen_date = False, seen_datetime = False + bint seen_tz_aware = False, seen_tz_naive = False + bint seen_nat = False list objs = [] object v @@ -1463,27 +1463,27 @@ def infer_datetimelike_array(arr: object) -> object: # nan or None pass elif v is NaT: - seen_nat = 1 + seen_nat = True elif PyDateTime_Check(v): # datetime - seen_datetime = 1 + seen_datetime = True # disambiguate between tz-naive and tz-aware if v.tzinfo is None: - seen_tz_naive = 1 + seen_tz_naive = True else: - seen_tz_aware = 1 + seen_tz_aware = True if seen_tz_naive and seen_tz_aware: return 'mixed' elif util.is_datetime64_object(v): # np.datetime64 - seen_datetime = 1 + seen_datetime = True elif PyDate_Check(v): - seen_date = 1 + seen_date = True elif is_timedelta(v): # timedelta, or timedelta64 - seen_timedelta = 1 + seen_timedelta = True else: return "mixed" @@ -2035,10 +2035,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, @cython.boundscheck(False) @cython.wraparound(False) -def maybe_convert_objects(ndarray[object] objects, bint try_float=0, - bint safe=0, bint convert_datetime=0, - bint convert_timedelta=0, - bint convert_to_nullable_integer=0): +def maybe_convert_objects(ndarray[object] objects, bint try_float=False, + bint safe=False, bint convert_datetime=False, + bint convert_timedelta=False, + bint convert_to_nullable_integer=False): """ Type inference function-- convert object array to proper dtype @@ -2102,45 +2102,45 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, val = objects[i] if val is None: - seen.null_ = 1 + seen.null_ = True floats[i] = complexes[i] = fnan mask[i] = True elif val is NaT: - seen.nat_ = 1 + seen.nat_ = True if convert_datetime: idatetimes[i] = NPY_NAT if convert_timedelta: itimedeltas[i] = NPY_NAT if not (convert_datetime or convert_timedelta): - seen.object_ = 1 + seen.object_ = True break elif val is np.nan: - seen.nan_ = 1 + seen.nan_ = True mask[i] = True floats[i] = complexes[i] = val elif util.is_bool_object(val): - seen.bool_ = 1 + seen.bool_ = True bools[i] = val elif util.is_float_object(val): floats[i] = complexes[i] = val - seen.float_ = 1 + seen.float_ = True elif util.is_datetime64_object(val): if convert_datetime: idatetimes[i] = convert_to_tsobject( val, None, None, 0, 0).value - seen.datetime_ = 1 + seen.datetime_ = True else: - seen.object_ = 1 + seen.object_ = True break elif is_timedelta(val): if convert_timedelta: itimedeltas[i] = convert_to_timedelta64(val, 'ns') - seen.timedelta_ = 1 + seen.timedelta_ = True else: - seen.object_ = 1 + seen.object_ = True break elif util.is_integer_object(val): - seen.int_ = 1 + seen.int_ = True floats[i] = val complexes[i] = val if not seen.null_: @@ -2149,7 +2149,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if ((seen.uint_ and seen.sint_) or val > oUINT64_MAX or val < oINT64_MIN): - seen.object_ = 1 + seen.object_ = True break if seen.uint_: 
@@ -2162,32 +2162,32 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif util.is_complex_object(val): complexes[i] = val - seen.complex_ = 1 + seen.complex_ = True elif PyDateTime_Check(val) or util.is_datetime64_object(val): # if we have an tz's attached then return the objects if convert_datetime: if getattr(val, 'tzinfo', None) is not None: - seen.datetimetz_ = 1 + seen.datetimetz_ = True break else: - seen.datetime_ = 1 + seen.datetime_ = True idatetimes[i] = convert_to_tsobject( val, None, None, 0, 0).value else: - seen.object_ = 1 + seen.object_ = True break elif try_float and not isinstance(val, str): # this will convert Decimal objects try: floats[i] = float(val) complexes[i] = complex(val) - seen.float_ = 1 + seen.float_ = True except (ValueError, TypeError): - seen.object_ = 1 + seen.object_ = True break else: - seen.object_ = 1 + seen.object_ = True break # we try to coerce datetime w/tz but must all have the same tz @@ -2195,7 +2195,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if is_datetime_with_singletz_array(objects): from pandas import DatetimeIndex return DatetimeIndex(objects) - seen.object_ = 1 + seen.object_ = True if not seen.object_: if not safe: @@ -2294,7 +2294,7 @@ no_default = object() #: Sentinel indicating the default value. @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, +def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, object na_value=no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2343,16 +2343,16 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, if convert: return maybe_convert_objects(result, - try_float=0, - convert_datetime=0, - convert_timedelta=0) + try_float=False, + convert_datetime=False, + convert_timedelta=False) return result @cython.boundscheck(False) @cython.wraparound(False) -def map_infer(ndarray arr, object f, bint convert=1): +def map_infer(ndarray arr, object f, bint convert=True): """ Substitute for np.vectorize with pandas-friendly dtype inference. 
@@ -2385,9 +2385,9 @@ def map_infer(ndarray arr, object f, bint convert=1): if convert: return maybe_convert_objects(result, - try_float=0, - convert_datetime=0, - convert_timedelta=0) + try_float=False, + convert_datetime=False, + convert_timedelta=False) return result diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index d4303ac28b9a5..5ab42a736712f 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from numpy cimport ndarray, uint8_t cpdef bint checknull(object val) diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index c0971b91a2fa1..658600cdfbe6c 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -1,7 +1,15 @@ import operator -from cpython.object cimport (PyObject_RichCompareBool, - Py_EQ, Py_NE, Py_LT, Py_LE, Py_GT, Py_GE) +from cpython.object cimport ( + Py_EQ, + Py_GE, + Py_GT, + Py_LE, + Py_LT, + Py_NE, + PyObject_RichCompareBool, +) + import cython from cython import Py_ssize_t diff --git a/pandas/_libs/properties.pyx b/pandas/_libs/properties.pyx index 857119789ab45..0e04c5417cd7e 100644 --- a/pandas/_libs/properties.pyx +++ b/pandas/_libs/properties.pyx @@ -1,7 +1,6 @@ from cython import Py_ssize_t -from cpython.dict cimport ( - PyDict_Contains, PyDict_GetItem, PyDict_SetItem) +from cpython.dict cimport PyDict_Contains, PyDict_GetItem, PyDict_SetItem cdef class CachedProperty: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 94e757624c136..53bcf5be2586a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -114,7 +114,7 @@ def ints_to_pydatetime( const int64_t[:] arr, object tz=None, object freq=None, - bint fold=0, + bint fold=False, str box="datetime" ): """ @@ -288,7 +288,8 @@ def format_array_from_datetime( cdef: int64_t val, ns, N = len(values) ndarray[int64_t] consider_values - bint show_ms = 0, show_us = 0, show_ns = 0, basic_format = 0 + bint show_ms = False, show_us = False, show_ns = False + bint basic_format = False ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts @@ -576,10 +577,10 @@ cpdef array_to_datetime( ndarray[object] oresult npy_datetimestruct dts bint utc_convert = bool(utc) - bint seen_integer = 0 - bint seen_string = 0 - bint seen_datetime = 0 - bint seen_datetime_offset = 0 + bint seen_integer = False + bint seen_string = False + bint seen_datetime = False + bint seen_datetime_offset = False bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' @@ -606,7 +607,7 @@ cpdef array_to_datetime( iresult[i] = NPY_NAT elif PyDateTime_Check(val): - seen_datetime = 1 + seen_datetime = True if val.tzinfo is not None: if utc_convert: _ts = convert_datetime_to_tsobject(val, None) @@ -622,17 +623,17 @@ cpdef array_to_datetime( check_dts_bounds(&dts) elif PyDate_Check(val): - seen_datetime = 1 + seen_datetime = True iresult[i] = pydate_to_dt64(val, &dts) check_dts_bounds(&dts) elif is_datetime64_object(val): - seen_datetime = 1 + seen_datetime = True iresult[i] = get_datetime64_nanos(val) elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition - seen_integer = 1 + seen_integer = True if val != val or val == NPY_NAT: iresult[i] = NPY_NAT @@ -651,7 +652,7 @@ cpdef array_to_datetime( elif isinstance(val, str): # string - seen_string = 1 + seen_string = True if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT @@ -693,7 +694,7 @@ cpdef array_to_datetime( raise TypeError("invalid string coercion to datetime") if tz is 
not None: - seen_datetime_offset = 1 + seen_datetime_offset = True # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead out_tzoffset_vals.add(tz.total_seconds()) @@ -709,7 +710,7 @@ cpdef array_to_datetime( # where we left off value = dtstruct_to_dt64(&dts) if out_local == 1: - seen_datetime_offset = 1 + seen_datetime_offset = True # Store the out_tzoffset in seconds # since we store the total_seconds of # dateutil.tz.tzoffset objects diff --git a/pandas/_libs/tslibs/c_timestamp.pxd b/pandas/_libs/tslibs/c_timestamp.pxd index e41197d0f20a2..d095b6027d2f9 100644 --- a/pandas/_libs/tslibs/c_timestamp.pxd +++ b/pandas/_libs/tslibs/c_timestamp.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport datetime from numpy cimport int64_t diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 08f539a70a7ed..59ecaaaf2266e 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cython cimport Py_ssize_t from numpy cimport int64_t, int32_t diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index bb20296e24587..e5b2a37860068 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport datetime from numpy cimport int64_t, int32_t diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 57483783faf9f..a318bea14b52b 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -595,8 +595,12 @@ cdef inline void localize_tso(_TSObject obj, tzinfo tz): obj.tzinfo = tz -cdef inline bint _infer_tsobject_fold(_TSObject obj, ndarray[int64_t] trans, - int64_t[:] deltas, int32_t pos): +cdef inline bint _infer_tsobject_fold( + _TSObject obj, + const int64_t[:] trans, + const int64_t[:] deltas, + int32_t pos, +): """ Infer _TSObject fold property from value by assuming 0 and then setting to 1 if necessary. @@ -738,7 +742,7 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz): @cython.wraparound(False) @cython.boundscheck(False) -cdef int64_t[:] _normalize_local(int64_t[:] stamps, tzinfo tz): +cdef int64_t[:] _normalize_local(const int64_t[:] stamps, tzinfo tz): """ Normalize each of the (nanosecond) timestamps in the given array by rounding down to the beginning of the day (i.e. midnight) for the @@ -818,7 +822,7 @@ cdef inline int64_t _normalized_stamp(npy_datetimestruct *dts) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def is_date_array_normalized(int64_t[:] stamps, object tz=None): +def is_date_array_normalized(const int64_t[:] stamps, object tz=None): """ Check if all of the given (nanosecond) timestamps are normalized to midnight, i.e. hour == minute == second == 0. 
If the optional timezone diff --git a/pandas/_libs/tslibs/frequencies.pxd b/pandas/_libs/tslibs/frequencies.pxd index 6ec67ce250505..1b7efb8c5dfdf 100644 --- a/pandas/_libs/tslibs/frequencies.pxd +++ b/pandas/_libs/tslibs/frequencies.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - cpdef str get_rule_month(object source, str default=*) cpdef get_freq_code(freqstr) diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index dae5bdc3f93b1..bd97462381b58 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport datetime from numpy cimport int64_t diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index ebedee79405e5..c936d42b34db5 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport date, datetime from numpy cimport int64_t, int32_t diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 2829a27b9905c..5a553be537e52 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -1,3 +1 @@ -# -*- coding: utf-8 -*- - cdef to_offset(object obj) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0849ba0f29624..a66c9cd86d00c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -609,8 +609,13 @@ cdef inline int month_add_months(npy_datetimestruct dts, int months) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def shift_quarters(int64_t[:] dtindex, int quarters, - int q1start_month, object day, int modby=3): +def shift_quarters( + const int64_t[:] dtindex, + int quarters, + int q1start_month, + object day, + int modby=3, +): """ Given an int64 array representing nanosecond timestamps, shift all elements by the specified number of quarters using DateOffset semantics. @@ -759,7 +764,7 @@ def shift_quarters(int64_t[:] dtindex, int quarters, @cython.wraparound(False) @cython.boundscheck(False) -def shift_months(int64_t[:] dtindex, int months, object day=None): +def shift_months(const int64_t[:] dtindex, int months, object day=None): """ Given an int64-based datetime index, shift all elements specified number of months using DateOffset semantics diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 097309b17823b..b08592755f2ee 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from numpy cimport int64_t # Exposed for tslib, not intended for outside use. 
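Several signatures above (``shift_quarters``, ``shift_months``, ``_normalize_local``, ``is_date_array_normalized``) change ``int64_t[:]`` parameters to ``const int64_t[:]``. The motivation, sketched below in plain Python with NumPy (the Cython error text is paraphrased, and this snippet only illustrates the read-only-buffer behaviour, not pandas internals):

.. code-block:: python

   import numpy as np

   stamps = np.arange(5, dtype='int64')
   stamps.setflags(write=False)     # read-only, like buffers an Index may hand out

   memoryview(stamps).readonly      # True: the buffer advertises itself as read-only

   # In Cython, a plain typed-memoryview argument such as `int64_t[:] stamps`
   # rejects such input with "ValueError: buffer source array is read-only",
   # whereas `const int64_t[:] stamps` accepts it -- safe here because these
   # functions only ever read from the buffer.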
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 457f3eb0749c2..c8bf317cbf041 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -82,6 +82,7 @@ cdef dict timedelta_abbrevs = { "us": "us", "microseconds": "us", "microsecond": "us", + "µs": "us", "micro": "us", "micros": "us", "u": "us", @@ -101,7 +102,7 @@ _no_input = object() @cython.boundscheck(False) @cython.wraparound(False) -def ints_to_pytimedelta(int64_t[:] arr, box=False): +def ints_to_pytimedelta(const int64_t[:] arr, box=False): """ convert an i8 repr to an ndarray of timedelta or Timedelta (if box == True) diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 5e55e6e8d5297..3cb4b6cd8113b 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 50c4a41f97a82..6d6ae8f8576ad 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - cpdef bint is_utc(object tz) cdef bint is_tzlocal(object tz) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a9702f91107ec..6915783ac3aaa 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -549,8 +549,9 @@ cdef int64_t _tz_convert_tzlocal_fromutc(int64_t val, tzinfo tz, bint *fold): @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz, - bint to_utc=True): +cdef int64_t[:] _tz_convert_dst( + const int64_t[:] values, tzinfo tz, bint to_utc=True, +): """ tz_convert for non-UTC non-tzlocal cases where we have to check DST transitions pointwise. diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index a90d2f77e44d1..1d1963fb04818 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -846,7 +846,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, res, prev - bint err = 0 + bint err = False int ret = 0 skiplist_t *sl Py_ssize_t i, j diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index ebf98232da58b..091d76df26a17 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -82,7 +82,7 @@ def convert_json_to_lines(arr: object) -> str: """ cdef: Py_ssize_t i = 0, num_open_brackets_seen = 0, length - bint in_quotes = 0, is_escaping = 0 + bint in_quotes = False, is_escaping = False ndarray[uint8_t, ndim=1] narr unsigned char val, newline, comma, left_bracket, right_bracket, quote unsigned char backslash diff --git a/pandas/_testing.py b/pandas/_testing.py index e69263b81e1aa..1f6b645c821c8 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -2662,3 +2662,34 @@ def external_error_raised( import pytest return pytest.raises(expected_exception, match=None) + + +cython_table = pd.core.base.SelectionMixin._cython_table.items() + + +def get_cython_table_params(ndframe, func_names_and_expected): + """ + Combine frame, functions from SelectionMixin._cython_table + keys and expected result. 
+ + Parameters + ---------- + ndframe : DataFrame or Series + func_names_and_expected : Sequence of two items + The first item is a name of a NDFrame method ('sum', 'prod') etc. + The second item is the expected return value. + + Returns + ------- + list + List of three items (DataFrame, function, expected result) + """ + results = [] + for func_name, expected in func_names_and_expected: + results.append((ndframe, func_name, expected)) + results += [ + (ndframe, func, expected) + for func, name in cython_table + if name == func_name + ] + return results diff --git a/pandas/_typing.py b/pandas/_typing.py index 3b7392f781525..e1b6a5e2e6876 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -11,6 +11,7 @@ List, Mapping, Optional, + Type, TypeVar, Union, ) @@ -44,7 +45,9 @@ # other -Dtype = Union[str, np.dtype, "ExtensionDtype"] +Dtype = Union[ + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] +] DtypeObj = Union[np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] diff --git a/pandas/conftest.py b/pandas/conftest.py index 903e1a5dec132..ad21d46e601e8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -23,6 +23,7 @@ from decimal import Decimal import operator import os +from typing import List from dateutil.tz import tzlocal, tzutc import hypothesis @@ -31,6 +32,7 @@ import pytest from pytz import FixedOffset, utc +from pandas._typing import Dtype import pandas.util._test_decorators as td import pandas as pd @@ -309,7 +311,7 @@ def __init__(self, *args, **kwargs): @pytest.fixture -def non_mapping_dict_subclass(): +def non_dict_mapping_subclass(): """ Fixture for a non-mapping dictionary subclass. """ @@ -368,6 +370,17 @@ def _create_multiindex(): return mi +def _create_mi_with_dt64tz_level(): + """ + MultiIndex with a level that is a tzaware DatetimeIndex. 
+ """ + # GH#8367 round trip with pickle + return MultiIndex.from_product( + [[1, 2], ["a", "b"], pd.date_range("20130101", periods=3, tz="US/Eastern")], + names=["one", "two", "three"], + ) + + indices_dict = { "unicode": tm.makeUnicodeIndex(100), "string": tm.makeStringIndex(100), @@ -384,6 +397,7 @@ def _create_multiindex(): "interval": tm.makeIntervalIndex(100), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), } @@ -404,6 +418,10 @@ def indices(request): return indices_dict[request.param].copy() +# Needed to generate cartesian product of indices +index_fixture2 = indices + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- @@ -786,14 +804,14 @@ def utc_fixture(request): UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"] -SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] +SIGNED_INT_DTYPES: List[Dtype] = [int, "int8", "int16", "int32", "int64"] SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"] ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES -FLOAT_DTYPES = [float, "float32", "float64"] -COMPLEX_DTYPES = [complex, "complex64", "complex128"] -STRING_DTYPES = [str, "str", "U"] +FLOAT_DTYPES: List[Dtype] = [float, "float32", "float64"] +COMPLEX_DTYPES: List[Dtype] = [complex, "complex64", "complex128"] +STRING_DTYPES: List[Dtype] = [str, "str", "U"] DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] @@ -1119,10 +1137,7 @@ def spmatrix(request): return getattr(sparse, request.param + "_matrix") -_cython_table = pd.core.base.SelectionMixin._cython_table.items() - - -@pytest.fixture(params=list(_cython_table)) +@pytest.fixture(params=list(tm.cython_table)) def cython_table_items(request): """ Yields a tuple of a function and its corresponding name. Correspond to @@ -1131,34 +1146,6 @@ def cython_table_items(request): return request.param -def _get_cython_table_params(ndframe, func_names_and_expected): - """ - Combine frame, functions from SelectionMixin._cython_table - keys and expected result. - - Parameters - ---------- - ndframe : DataFrame or Series - func_names_and_expected : Sequence of two items - The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value. 
- - Returns - ------- - list - List of three items (DataFrame, function, expected result) - """ - results = [] - for func_name, expected in func_names_and_expected: - results.append((ndframe, func_name, expected)) - results += [ - (ndframe, func, expected) - for func, name in _cython_table - if name == func_name - ] - return results - - @pytest.fixture( params=[ getattr(pd.offsets, o) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5b324bc5753ec..9afdb82467f90 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -700,7 +700,7 @@ def value_counts( result = result.sort_index() # if we are dropna and we have NO values - if dropna and (result.values == 0).all(): + if dropna and (result._values == 0).all(): result = result.iloc[0:0] # normalizing is by len of all (regardless of dropna) @@ -713,7 +713,7 @@ def value_counts( # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna) result.name = name - counts = result.values + counts = result._values else: keys, counts = _value_counts_arraylike(values, dropna) @@ -823,7 +823,7 @@ def mode(values, dropna: bool = True) -> "Series": # categorical is a fast-path if is_categorical_dtype(values): if isinstance(values, Series): - return Series(values.values.mode(dropna=dropna), name=values.name) + return Series(values._values.mode(dropna=dropna), name=values.name) return values.mode(dropna=dropna) if dropna and needs_i8_conversion(values.dtype): diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py new file mode 100644 index 0000000000000..0fb2605b554c2 --- /dev/null +++ b/pandas/core/array_algos/masked_reductions.py @@ -0,0 +1,47 @@ +""" +masked_reductions.py is for reduction algorithms using a mask-based approach +for missing values. +""" + +import numpy as np + +from pandas._libs import missing as libmissing +from pandas.compat.numpy import _np_version_under1p17 + +from pandas.core.nanops import check_below_min_count + + +def sum( + values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0, +): + """ + Sum for 1D masked array. + + Parameters + ---------- + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. 
+ """ + if not skipna: + if mask.any(): + return libmissing.NA + else: + if check_below_min_count(values.shape, None, min_count): + return libmissing.NA + return np.sum(values) + else: + if check_below_min_count(values.shape, mask, min_count): + return libmissing.NA + + if _np_version_under1p17: + return np.sum(values[~mask]) + else: + return np.sum(values, where=~mask) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index bf3469924a700..1d538824e6d82 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -2,7 +2,6 @@ ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin, - try_cast_to_ea, ) from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical @@ -19,7 +18,6 @@ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", - "try_cast_to_ea", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 67e3807c477fb..af897e86a14d4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -19,6 +19,7 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import is_array_like, is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -32,29 +33,6 @@ _extension_array_shared_docs: Dict[str, str] = dict() -def try_cast_to_ea(cls_or_instance, obj, dtype=None): - """ - Call to `_from_sequence` that returns the object unchanged on Exception. - - Parameters - ---------- - cls_or_instance : ExtensionArray subclass or instance - obj : arraylike - Values to pass to cls._from_sequence - dtype : ExtensionDtype, optional - - Returns - ------- - ExtensionArray or obj - """ - try: - result = cls_or_instance._from_sequence(obj, dtype=dtype) - except Exception: - # We can't predict what downstream EA constructors may raise - result = obj - return result - - class ExtensionArray: """ Abstract base class for custom 1-D array types. @@ -1214,7 +1192,7 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. 
- res = try_cast_to_ea(self, arr) + res = maybe_cast_to_extension_array(type(self), arr) if not isinstance(res, type(self)): # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d93b5fbc83312..442d4ca8cef6d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -27,6 +27,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer from .masked import BaseMaskedArray @@ -695,6 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name == "sum": + return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + # coerce to a nan-aware float if needed if self._hasna: data = self.to_numpy("float64", na_value=np.nan) @@ -706,7 +710,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): return libmissing.NA # if we have numeric op that would result in an int, coerce to int if possible - if name in ["sum", "prod"] and notna(result): + if name == "prod" and notna(result): int_result = np.int64(result) if int_result == result: result = int_result diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bfccc6f244219..c11d879840fb9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -19,7 +19,11 @@ ) from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike +from pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + maybe_cast_to_extension_array, + maybe_infer_to_datetimelike, +) from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -47,11 +51,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d -from pandas.core.arrays.base import ( - ExtensionArray, - _extension_array_shared_docs, - try_cast_to_ea, -) +from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -2568,7 +2568,7 @@ def _get_codes_for_values(values, categories): # scalar objects. e.g. 
# Categorical(array[Period, Period], categories=PeriodIndex(...)) cls = categories.dtype.construct_array_type() - values = try_cast_to_ea(cls, values) + values = maybe_cast_to_extension_array(cls, values) if not isinstance(values, cls): # exception raised in _from_sequence values = ensure_object(values) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c3e79f40e7451..a153b4e06157b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -846,14 +846,14 @@ def searchsorted(self, value, side="left", sorter=None): elif isinstance(value, self._recognized_scalars): value = self._scalar_type(value) - elif isinstance(value, np.ndarray): + elif is_list_like(value) and not isinstance(value, type(self)): + value = array(value) + if not type(self)._is_recognized_dtype(value): raise TypeError( "searchsorted requires compatible dtype or scalar, " f"not {type(value).__name__}" ) - value = type(self)(value) - self._check_compatible_with(value) if not (isinstance(value, (self._scalar_type, type(self))) or (value is NaT)): raise TypeError(f"Unexpected type for 'value': {type(value)}") @@ -905,7 +905,7 @@ def value_counts(self, dropna=False): index = Index( cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name ) - return Series(result.values, index=index, name=result.name) + return Series(result._values, index=index, name=result.name) def map(self, mapper): # TODO(GH-23179): Add ExtensionArray.map diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f2880c5cbee42..4f3c68aa03b16 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,6 +27,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison @@ -560,6 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name == "sum": + return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) if self._hasna: @@ -577,7 +581,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"]: + elif name in ["min", "max", "prod"]: # GH#31409 more performant than casting-then-checking result = com.cast_scalar_indexer(result) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d852ea4f584c9..22ce5a6f87a43 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -152,7 +152,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): if isinstance(data, ABCSeries) and is_interval_dtype(data): - data = data.values + data = data._values if isinstance(data, (cls, ABCIntervalIndex)): left = data.left diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 47892b55b3ce8..cf6c16d4cad5d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -244,11 +244,11 @@ def value_counts(self, dropna: bool = True) -> "Series": # TODO(extension) # if we have allow Index to hold an ExtensionArray # this is easier - index = value_counts.index.values.astype(object) + 
index = value_counts.index._values.astype(object) # if we want nans, count the mask if dropna: - counts = value_counts.values + counts = value_counts._values else: counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f82790ac4c3d9..dbca8e74f5e1b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -13,7 +13,8 @@ from pandas import compat from pandas.core import ops -from pandas.core.arrays import PandasArray +from pandas.core.arrays import IntegerArray, PandasArray +from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna @@ -271,6 +272,13 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self + elif isinstance(dtype, _IntegerDtype): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = 0 + values = arr.astype(dtype.numpy_dtype) + return IntegerArray(values, mask, copy=False) + return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): diff --git a/pandas/core/base.py b/pandas/core/base.py index 148be3f50c0e7..9ff0d60b9cd6a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -123,15 +123,11 @@ def __setattr__(self, key: str, value): object.__setattr__(self, key, value) -class GroupByError(Exception): +class DataError(Exception): pass -class DataError(GroupByError): - pass - - -class SpecificationError(GroupByError): +class SpecificationError(Exception): pass @@ -372,7 +368,7 @@ def _agg_1dim(name, how, subset=None): ) return colg.aggregate(how) - def _agg_2dim(name, how): + def _agg_2dim(how): """ aggregate a 2-dim with how """ @@ -660,7 +656,7 @@ def item(self): ): # numpy returns ints instead of datetime64/timedelta64 objects, # which we need to wrap in Timestamp/Timedelta/Period regardless. 
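The ``StringArray.astype`` branch added above makes casting a nullable string column to a nullable integer dtype work end to end. Assuming the patch is applied, something like the following should round-trip missing values as ``<NA>``:

    import pandas as pd

    s = pd.Series(["1", "2", None], dtype="string")
    print(s.astype("Int64"))
    # 0       1
    # 1       2
    # 2    <NA>
    # dtype: Int64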
- return self.values.item() + return self._values.item() if len(self) == 1: return next(iter(self)) @@ -1132,10 +1128,8 @@ def _map_values(self, mapper, na_action=None): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values return self._values.map(mapper) - if is_extension_array_dtype(self.dtype): - values = self._values - else: - values = self.values + + values = self._values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) diff --git a/pandas/core/common.py b/pandas/core/common.py index fd7b4fd80bc5e..4ff1a93737d41 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -213,7 +213,7 @@ def asarray_tuplesafe(values, dtype=None): if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): values = list(values) elif isinstance(values, ABCIndexClass): - return values.values + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 19a8898a2987c..327ec21c3c11c 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -24,7 +24,3 @@ def result_type_many(*arrays_and_dtypes): except ValueError: # we have > NPY_MAXARGS terms in our expression return reduce(np.result_type, arrays_and_dtypes) - - -class NameResolutionError(NameError): - pass diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index a488aac08e060..b74f99fca21c7 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ Top level ``eval`` module. """ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 97c02428cbdf9..da9646aa8c46f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -16,7 +16,7 @@ iNaT, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import Dtype +from pandas._typing import Dtype, DtypeObj from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -246,6 +246,97 @@ def trans(x): return result +def maybe_cast_result( + result, obj: ABCSeries, numeric_only: bool = False, how: str = "" +): + """ + Try casting result to a different type if appropriate + + Parameters + ---------- + result : array-like + Result to cast. + obj : ABCSeries + Input series from which result was calculated. + numeric_only : bool, default False + Whether to cast only numerics or datetimes as well. + how : str, default "" + How the result was computed. + + Returns + ------- + result : array-like + result maybe casted to the dtype. + """ + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype + dtype = maybe_cast_result_dtype(dtype, how) + + if not is_scalar(result): + if is_extension_array_dtype(dtype) and dtype.kind != "M": + # The result may be of any type, cast back to original + # type if it's compatible. + if len(result) and isinstance(result[0], dtype.type): + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) + + return result + + +def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: + """ + Get the desired dtype of a result based on the + input dtype and how it was computed. 
+ + Parameters + ---------- + dtype : DtypeObj + Input dtype. + how : str + How the result was computed. + + Returns + ------- + DtypeObj + The desired dtype of the result. + """ + d = { + (np.dtype(np.bool), "add"): np.dtype(np.int64), + (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), + (np.dtype(np.bool), "sum"): np.dtype(np.int64), + } + return d.get((dtype, how), dtype) + + +def maybe_cast_to_extension_array(cls, obj, dtype=None): + """ + Call to `_from_sequence` that returns the object unchanged on Exception. + + Parameters + ---------- + cls : ExtensionArray subclass + obj : arraylike + Values to pass to cls._from_sequence + dtype : ExtensionDtype, optional + + Returns + ------- + ExtensionArray or obj + """ + assert isinstance(cls, type), f"must pass a type: {cls}" + try: + result = cls._from_sequence(obj, dtype=dtype) + except Exception: + # We can't predict what downstream EA constructors may raise + result = obj + return result + + def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. @@ -888,7 +979,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif is_timedelta64_dtype(dtype): from pandas import to_timedelta - return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) + return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): msg = ( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f5997a13e785d..b4b7fb36ee4d0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -188,7 +188,9 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: TypeError: if the value isn't an int or can't be converted to one. """ if not is_scalar(value): - raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") + raise TypeError( + f"Value needs to be a scalar value, was type {type(value).__name__}" + ) try: new_value = int(value) assert new_value == value diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 682a0722de3b7..581067b65b3bf 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -229,7 +229,7 @@ def _isna_ndarraylike(obj): if not is_extension: # Avoid accessing `.values` on things like # PeriodIndex, which may be expensive. 
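``maybe_cast_result_dtype`` is just a lookup table keyed on ``(input dtype, how)``. A toy restatement of that table, with illustrative names and ``np.bool_`` in place of the deprecated ``np.bool`` alias:

    import numpy as np

    _result_dtypes = {
        (np.dtype(np.bool_), "add"): np.dtype(np.int64),
        (np.dtype(np.bool_), "cumsum"): np.dtype(np.int64),
        (np.dtype(np.bool_), "sum"): np.dtype(np.int64),
    }

    def result_dtype(dtype, how):
        # fall back to the input dtype when no override is registered
        return _result_dtypes.get((dtype, how), dtype)

    print(result_dtype(np.dtype(np.bool_), "sum"))    # int64
    print(result_dtype(np.dtype(np.float64), "sum"))  # float64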
- values = getattr(obj, "values", obj) + values = getattr(obj, "_values", obj) else: values = obj @@ -270,7 +270,7 @@ def _isna_ndarraylike(obj): def _isna_ndarraylike_old(obj): - values = getattr(obj, "values", obj) + values = getattr(obj, "_values", obj) dtype = values.dtype if is_string_dtype(dtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8deeb415c17c9..1e9f8995b6bed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2300,7 +2300,7 @@ def to_html( ) # ---------------------------------------------------------------------- - @Appender(info.__doc__) + @doc(info) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ) -> None: @@ -3525,6 +3525,9 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: n = len(row_labels) if n != len(col_labels): raise ValueError("Row labels must have same size as column labels") + if not (self.index.is_unique and self.columns.is_unique): + # GH#33041 + raise ValueError("DataFrame.lookup requires unique index and columns") thresh = 1000 if not self._is_mixed_type or n > thresh: @@ -3897,7 +3900,7 @@ def rename( columns : dict-like or function Alternative to specifying axis (``mapper, axis=1`` is equivalent to ``columns=mapper``). - axis : int or str + axis : {0 or 'index', 1 or 'columns'}, default 0 Axis to target with ``mapper``. Can be either the axis name ('index', 'columns') or number (0, 1). The default is 'index'. copy : bool, default True @@ -5260,6 +5263,9 @@ def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": ---------- i, j : int or str Levels of the indices to be swapped. Can pass level name as string. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise. Returns ------- @@ -5289,7 +5295,7 @@ def reorder_levels(self, order, axis=0) -> "DataFrame": order : list of int or list of str List representing new level order. Reference level by number (position) or by key (label). - axis : int + axis : {0 or 'index', 1 or 'columns'}, default 0 Where to reorder levels. Returns @@ -7783,7 +7789,7 @@ def count(self, axis=0, level=None, numeric_only=False): ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index' counts are generated for each column. - If 1 or 'columns' counts are generated for each **row**. + If 1 or 'columns' counts are generated for each row. level : int or str, optional If the axis is a `MultiIndex` (hierarchical), count along a particular `level`, collapsing into a `DataFrame`. @@ -8341,7 +8347,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): ---------- q : float or array-like, default 0.5 (50% quantile) Value between 0 <= q <= 1, the quantile(s) to compute. - axis : {0, 1, 'index', 'columns'} (default 0) + axis : {0, 1, 'index', 'columns'}, default 0 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 
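The new guard in ``DataFrame.lookup`` (GH#33041) turns silently wrong results on duplicated labels into an explicit error. With the patch applied, the following should raise:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["x", "x"])
    try:
        df.lookup(["x"], ["A"])
    except ValueError as err:
        print(err)  # DataFrame.lookup requires unique index and columns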
numeric_only : bool, default True If False, the quantile of datetime and timedelta data will be diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8c6a5c9d020b4..5348040808e63 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1723,7 +1723,7 @@ def items(self): for h in self._info_axis: yield h, self[h] - @Appender(items.__doc__) + @doc(items) def iteritems(self): return self.items() @@ -7071,7 +7071,7 @@ def asof(self, where, subset=None): return Series(np.nan, index=self.columns, name=where[0]) - locs = self.index.asof_locs(where, ~(nulls.values)) + locs = self.index.asof_locs(where, ~(nulls._values)) # mask the missing missing = locs == -1 @@ -7230,7 +7230,7 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): raise ValueError("Cannot use an NA value as a clip threshold") result = self - mask = isna(self.values) + mask = isna(self._values) with np.errstate(all="ignore"): if upper is not None: @@ -8604,12 +8604,12 @@ def _where( if self.ndim == 1: - icond = cond.values + icond = cond._values # GH 2745 / GH 4192 # treat like a scalar if len(other) == 1: - other = np.array(other[0]) + other = other[0] # GH 3235 # match True cond to other @@ -8978,7 +8978,7 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: return new_obj.__finalize__(self) def tshift( - self: FrameOrSeries, periods: int = 1, freq=None, axis=0 + self: FrameOrSeries, periods: int = 1, freq=None, axis: Axis = 0 ) -> FrameOrSeries: """ Shift the time index, using the index's frequency if available. @@ -9020,22 +9020,22 @@ def tshift( if isinstance(freq, str): freq = to_offset(freq) - block_axis = self._get_block_manager_axis(axis) + axis = self._get_axis_number(axis) if isinstance(index, PeriodIndex): orig_freq = to_offset(index.freq) - if freq == orig_freq: - new_data = self._data.copy() - new_data.axes[block_axis] = index.shift(periods) - elif orig_freq is not None: + if freq != orig_freq: + assert orig_freq is not None # for mypy raise ValueError( f"Given freq {freq.rule_code} does not match " f"PeriodIndex freq {orig_freq.rule_code}" ) + new_ax = index.shift(periods) else: - new_data = self._data.copy() - new_data.axes[block_axis] = index.shift(periods, freq) + new_ax = index.shift(periods, freq) - return self._constructor(new_data).__finalize__(self) + result = self.copy() + result.set_axis(new_ax, axis, inplace=True) + return result.__finalize__(self) def truncate( self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True @@ -10222,7 +10222,7 @@ def _add_series_or_dataframe_operations(cls): """ from pandas.core.window import EWM, Expanding, Rolling, Window - @Appender(Rolling.__doc__) + @doc(Rolling) def rolling( self, window, @@ -10260,14 +10260,14 @@ def rolling( cls.rolling = rolling - @Appender(Expanding.__doc__) + @doc(Expanding) def expanding(self, min_periods=1, center=False, axis=0): axis = self._get_axis_number(axis) return Expanding(self, min_periods=min_periods, center=center, axis=axis) cls.expanding = expanding - @Appender(EWM.__doc__) + @doc(EWM) def ewm( self, com=None, @@ -10541,13 +10541,14 @@ def _doc_parms(cls): skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. -*args, **kwargs : +*args, **kwargs Additional keywords have no effect but might be accepted for compatibility with NumPy. Returns ------- %(name1)s or %(name2)s + Return cumulative %(desc)s of %(name1)s or %(name2)s. 
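The rewritten ``tshift`` now shifts the index via ``set_axis`` on a copy instead of poking at block-manager axes; observable behavior should be unchanged, e.g.:

    import pandas as pd

    s = pd.Series([1, 2, 3], index=pd.period_range("2020-01", periods=3, freq="M"))
    print(s.tshift(2).index)
    # PeriodIndex(['2020-03', '2020-04', '2020-05'], dtype='period[M]', freq='M')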
See Also -------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4102b8527b6aa..b7c071a8dfbbf 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -34,6 +34,8 @@ from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( + maybe_cast_result, + maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, maybe_downcast_to_dtype, @@ -526,7 +528,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(result._values, ids) if cast: - out = self._try_cast(out, self.obj) + out = maybe_cast_result(out, self.obj, how=func_nm) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -1072,8 +1074,10 @@ def _cython_agg_blocks( assert not isinstance(result, DataFrame) if result is not no_result: - # see if we can cast the block back to the original dtype - result = maybe_downcast_numeric(result, block.dtype) + # see if we can cast the block to the desired dtype + # this may not be the original dtype + dtype = maybe_cast_result_dtype(block.dtype, how) + result = maybe_downcast_numeric(result, dtype) if block.is_extension and isinstance(result, np.ndarray): # e.g. block.values was an IntegerArray @@ -1175,7 +1179,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: else: if cast: - result[item] = self._try_cast(result[item], data) + result[item] = maybe_cast_result(result[item], data) result_columns = obj.columns if cannot_agg: @@ -1460,7 +1464,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: # TODO: we have no test cases that get here with EA dtypes; # try_cast may not be needed if EAs never get here if cast: - res = self._try_cast(res, obj.iloc[:, i]) + res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 19e51d05feb92..86171944d0c78 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -39,11 +39,10 @@ class providing the base-class of operations. from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, is_datetime64_dtype, - is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -53,7 +52,7 @@ class providing the base-class of operations. from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea +from pandas.core.arrays import Categorical, DatetimeArray from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -792,36 +791,6 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False): - """ - Try to cast the result to our obj original type, - we may have roundtripped through object in the mean-time. - - If numeric_only is True, then only try to cast numerics - and not datetimelikes. 
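Because ``_cython_agg_blocks`` now downcasts to the dtype returned by ``maybe_cast_result_dtype`` rather than the block's original dtype, boolean columns aggregated with ``sum`` are reported as int64 instead of being forced back to bool:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "flag": [True, True, False]})
    print(df.groupby("key")["flag"].sum())
    # key
    # a    2
    # b    0
    # Name: flag, dtype: int64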
- - """ - if obj.ndim > 1: - dtype = obj._values.dtype - else: - dtype = obj.dtype - - if not is_scalar(result): - if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The function can return something of any type, so check - # if the type is compatible with the calling EA. - # datetime64tz is handled correctly in agg_series, - # so is excluded here. - - if len(result) and isinstance(result[0], dtype.type): - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) - - elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: - result = maybe_downcast_to_dtype(result, dtype) - - return result - def _transform_should_cast(self, func_nm: str) -> bool: """ Parameters @@ -852,7 +821,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): continue if self._transform_should_cast(how): - result = self._try_cast(result, obj) + result = maybe_cast_result(result, obj, how=how) key = base.OutputKey(label=name, position=idx) output[key] = result @@ -895,12 +864,12 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = self._try_cast(result_column, obj) + output[key] = maybe_cast_result(result_column, obj, how=how) idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj) + output[key] = maybe_cast_result(result, obj, how=how) idx += 1 if len(output) == 0: @@ -929,7 +898,7 @@ def _python_agg_general(self, func, *args, **kwargs): assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, numeric_only=True) + output[key] = maybe_cast_result(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -944,7 +913,7 @@ def _python_agg_general(self, func, *args, **kwargs): if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[key] = self._try_cast(values[mask], result) + output[key] = maybe_cast_result(values[mask], result) return self._wrap_aggregated_output(output) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 577c874c9cbbe..742de397956c0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -525,9 +525,7 @@ def _cython_operation( np.empty(out_shape, dtype=out_dtype), fill_value=np.nan ) counts = np.zeros(self.ngroups, dtype=np.int64) - result = self._aggregate( - result, counts, values, codes, func, is_datetimelike, min_count - ) + result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = _maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan @@ -590,14 +588,7 @@ def transform(self, values, how: str, axis: int = 0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, - result, - counts, - values, - comp_ids, - agg_func, - is_datetimelike: bool, - min_count: int = -1, + self, result, counts, values, comp_ids, agg_func, min_count: int = -1, ): if agg_func is libgroupby.group_nth: # different signature from the others diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 8cfe1f4ac469c..feb9881ffdb81 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -321,7 +321,7 @@ def __new__(cls, data: "Series"): orig.array, name=orig.name, copy=False, - dtype=orig.values.categories.dtype, + 
dtype=orig._values.categories.dtype, ) if is_datetime64_dtype(data.dtype): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 83064fe22eaff..f6a422180b0df 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -395,10 +395,10 @@ def __new__( raise ValueError("Index data must be 1-dimensional") return cls._simple_new(subarr, name) - elif hasattr(data, "__array__"): - return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) elif data is None or is_scalar(data): raise cls._scalar_data_error(data) + elif hasattr(data, "__array__"): + return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) else: if tupleize_cols and is_list_like(data): # GH21470: convert iterable to list before determining if empty @@ -670,7 +670,7 @@ def astype(self, dtype, copy=True): return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) + return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) try: casted = self.values.astype(dtype, copy=copy) @@ -3049,8 +3049,9 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) - left_distances = np.abs(self[left_indexer] - target) - right_distances = np.abs(self[right_indexer] - target) + target_values = target._values + left_distances = np.abs(self._values[left_indexer] - target_values) + right_distances = np.abs(self._values[right_indexer] - target_values) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3059,13 +3060,16 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: right_indexer, ) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, tolerance) + indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer def _filter_indexer_tolerance( - self, target: "Index", indexer: np.ndarray, tolerance + self, + target: Union["Index", np.ndarray, ExtensionArray], + indexer: np.ndarray, + tolerance, ) -> np.ndarray: - distance = abs(self.values[indexer] - target) + distance = abs(self._values[indexer] - target) indexer = np.where(distance <= tolerance, indexer, -1) return indexer diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 52423c4008399..2cae09ed08f36 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -243,8 +243,11 @@ def _simple_new(cls, values: Categorical, name: Label = None): @Appender(Index._shallow_copy.__doc__) def _shallow_copy(self, values=None, name: Label = no_default): + name = self.name if name is no_default else name + if values is not None: values = Categorical(values, dtype=self.dtype) + return super()._shallow_copy(values=values, name=name) def _is_dtype_compat(self, other) -> bool: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ca1995adc1ea9..ad6a3600752b6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -287,7 +287,7 @@ def _is_dates_only(self) -> bool: """ from pandas.io.formats.format import _is_dates_only - return _is_dates_only(self.values) and self.tz is None + return self.tz is None and _is_dates_only(self._values) def __reduce__(self): diff --git a/pandas/core/indexes/interval.py 
b/pandas/core/indexes/interval.py index f4942b72a6ad4..d5df661efa692 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1104,9 +1104,9 @@ def func(self, other, sort=sort): # GH 19101: ensure empty results have correct dtype if result.empty: - result = result.values.astype(self.dtype.subtype) + result = result._values.astype(self.dtype.subtype) else: - result = result.values + result = result._values return type(self).from_tuples(result, closed=self.closed, name=result_name) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1bcda72e77f2f..b00af4653dfe3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -21,7 +21,7 @@ from pandas._typing import AnyArrayLike, Scalar from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( @@ -986,7 +986,7 @@ def _engine(self): def _constructor(self): return MultiIndex.from_tuples - @Appender(Index._shallow_copy.__doc__) + @doc(Index._shallow_copy) def _shallow_copy( self, values=None, @@ -1098,7 +1098,7 @@ def view(self, cls=None): result._id = self._id return result - @Appender(Index.__contains__.__doc__) + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: hash(key) try: @@ -1119,7 +1119,7 @@ def f(l): return any(f(l) for l in self._inferred_type_levels) - @Appender(Index.memory_usage.__doc__) + @doc(Index.memory_usage) def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize @@ -1351,7 +1351,7 @@ def _set_names(self, names, level=None, validate=True): # -------------------------------------------------------------------- - @Appender(Index._get_grouper_for_level.__doc__) + @doc(Index._get_grouper_for_level) def _get_grouper_for_level(self, mapper, level): indexer = self.codes[level] level_index = self.levels[level] @@ -1462,7 +1462,7 @@ def _inferred_type_levels(self): """ return a list of the inferred types, one for each level """ return [i.inferred_type for i in self.levels] - @Appender(Index.duplicated.__doc__) + @doc(Index.duplicated) def duplicated(self, keep="first"): shape = map(len, self.levels) ids = get_group_index(self.codes, shape, sort=False, xnull=False) @@ -1475,7 +1475,7 @@ def fillna(self, value=None, downcast=None): """ raise NotImplementedError("isna is not defined for MultiIndex") - @Appender(Index.dropna.__doc__) + @doc(Index.dropna) def dropna(self, how="any"): nans = [level_codes == -1 for level_codes in self.codes] if how == "any": @@ -1548,7 +1548,7 @@ def get_level_values(self, level): values = self._get_level_values(level) return values - @Appender(Index.unique.__doc__) + @doc(Index.unique) def unique(self, level=None): if level is None: @@ -3423,7 +3423,7 @@ def _convert_can_do_setop(self, other): # -------------------------------------------------------------------- - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): @@ -3498,7 +3498,7 @@ def _wrap_joined_index(self, joined, other): names = self.names if self.names == other.names else None return MultiIndex.from_tuples(joined, names=names) - @Appender(Index.isin.__doc__) + @doc(Index.isin) def isin(self, values, 
level=None): if level is None: values = MultiIndex.from_tuples(values, names=self.names)._values diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 3a6f3630c19e7..e2be58a56018d 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -4,7 +4,7 @@ from pandas._libs import index as libindex, lib from pandas._typing import Dtype, Label -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -95,14 +95,14 @@ def _validate_dtype(cls, dtype: Dtype) -> None: f"Incorrect `dtype` passed: expected {expected}, received {dtype}" ) - @Appender(Index._maybe_cast_slice_bound.__doc__) + @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) - @Appender(Index._shallow_copy.__doc__) + @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = lib.no_default): if values is not None and not self._can_hold_na and values.dtype.kind == "f": name = self.name if name is lib.no_default else name @@ -158,7 +158,7 @@ def is_all_dates(self) -> bool: """ return False - @Appender(Index.insert.__doc__) + @doc(Index.insert) def insert(self, loc: int, item): # treat NA values as nans: if is_scalar(item) and isna(item): @@ -295,7 +295,7 @@ class UInt64Index(IntegerIndex): _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) - @Appender(Index._convert_arr_indexer.__doc__) + @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so that the values returned # from indexing are also uint64. 
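These hunks swap ``@Appender(parent.__doc__)`` for ``@doc(parent)`` throughout the index classes. A toy stand-in for the decorator, to show the intent only; the real ``pandas.util._decorators.doc`` also performs template substitution:

    def doc(*docstrings):
        def decorator(func):
            # concatenate docstrings taken from strings or other callables
            func.__doc__ = "".join(
                d if isinstance(d, str) else (d.__doc__ or "") for d in docstrings
            )
            return func
        return decorator

    class Base:
        def isin(self, values):
            """Return a boolean mask of membership."""

    class Sub(Base):
        @doc(Base.isin)
        def isin(self, values):
            return [v in {1, 2} for v in values]

    print(Sub.isin.__doc__)  # Return a boolean mask of membership.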
@@ -307,7 +307,7 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) - @Appender(Index._convert_index_indexer.__doc__) + @doc(Index._convert_index_indexer) def _convert_index_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are @@ -357,7 +357,7 @@ def inferred_type(self) -> str: """ return "floating" - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): @@ -369,17 +369,17 @@ def astype(self, dtype, copy=True): # TODO(jreback); this can change once we have an EA Index type # GH 13149 arr = astype_nansafe(self._values, dtype=dtype) - return Int64Index(arr) + return Int64Index(arr, name=self.name) return super().astype(dtype, copy=copy) # ---------------------------------------------------------------- # Indexing Methods - @Appender(Index._should_fallback_to_positional.__doc__) + @doc(Index._should_fallback_to_positional) def _should_fallback_to_positional(self): return False - @Appender(Index._convert_slice_indexer.__doc__) + @doc(Index._convert_slice_indexer) def _convert_slice_indexer(self, key: slice, kind: str): assert kind in ["loc", "getitem"] @@ -433,7 +433,7 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans - @Appender(Index.get_loc.__doc__) + @doc(Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if is_bool(key): # Catch this to avoid accidentally casting to 1.0 @@ -453,7 +453,7 @@ def get_loc(self, key, method=None, tolerance=None): def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 - @Appender(Index.isin.__doc__) + @doc(Index.isin) def isin(self, values, level=None): if level is not None: self._validate_index_level(level) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f6bf02b6df676..68d7e8dd384f0 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -10,7 +10,7 @@ from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.period import Period from pandas._typing import DtypeObj, Label -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -312,7 +312,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: def _mpl_repr(self): # how to represent ourselves to matplotlib - return self.astype(object).values + return self.astype(object)._values @property def _formatter_func(self): @@ -327,7 +327,7 @@ def _engine(self): period = weakref.ref(self) return self._engine_type(period, len(self)) - @Appender(Index.__contains__.__doc__) + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: if isinstance(key, Period): if key.freq != self.freq: @@ -389,7 +389,7 @@ def asof_locs(self, where, mask: np.ndarray) -> np.ndarray: """ where_idx = where if isinstance(where_idx, DatetimeIndex): - where_idx = PeriodIndex(where_idx.values, freq=self.freq) + where_idx = PeriodIndex(where_idx._values, freq=self.freq) elif not isinstance(where_idx, PeriodIndex): raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex") elif where_idx.freq != self.freq: @@ -405,7 +405,7 @@ def asof_locs(self, where, mask: np.ndarray) -> np.ndarray: return result - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True, how="start"): dtype = 
pandas_dtype(dtype) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 2c038564f4e6f..b463b8d738d30 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -11,7 +11,7 @@ from pandas._typing import Label import pandas.compat as compat from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -342,7 +342,7 @@ def __contains__(self, key: Any) -> bool: return False return key in self._range - @Appender(Int64Index.get_loc.__doc__) + @doc(Int64Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if method is None and tolerance is None: if is_integer(key) or (is_float(key) and key.is_integer()): @@ -386,7 +386,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def tolist(self): return list(self._range) - @Appender(Int64Index._shallow_copy.__doc__) + @doc(Int64Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name @@ -397,7 +397,7 @@ def _shallow_copy(self, values=None, name: Label = no_default): else: return Int64Index._simple_new(values, name=name) - @Appender(Int64Index.copy.__doc__) + @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if name is None: @@ -619,7 +619,7 @@ def _union(self, other, sort): return type(self)(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) - @Appender(Int64Index.join.__doc__) + @doc(Int64Index.join) def join(self, other, how="left", level=None, return_indexers=False, sort=False): if how == "outer" and self is not other: # note: could return RangeIndex in more circumstances diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 588cb3e37bced..6acf9562f9b80 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -2,7 +2,7 @@ from pandas._libs import NaT, Timedelta, index as libindex from pandas._typing import DtypeObj, Label -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( _TD_DTYPE, @@ -195,7 +195,7 @@ def _formatter_func(self): # ------------------------------------------------------------------- - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 935ff09585b17..b2a8c7a0864b8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -55,6 +55,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, + ABCIndexClass, ABCPandasArray, ABCSeries, ) @@ -653,6 +654,20 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, dtype) return isinstance(element, dtype) + def should_store(self, value: ArrayLike) -> bool: + """ + Should we set self.values[indexer] = value inplace or do we need to cast? 
+ + Parameters + ---------- + value : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + return is_dtype_equal(value.dtype, self.dtype) + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -833,21 +848,24 @@ def setitem(self, indexer, value): else: # current dtype cannot store value, coerce to common dtype - find_dtype = False if hasattr(value, "dtype"): dtype = value.dtype - find_dtype = True elif lib.is_scalar(value) and not isna(value): dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - find_dtype = True - if find_dtype: - dtype = find_common_type([values.dtype, dtype]) - if not is_dtype_equal(self.dtype, dtype): - b = self.astype(dtype) - return b.setitem(indexer, value) + else: + # e.g. we are bool dtype and value is nan + # TODO: watch out for case with listlike value and scalar/empty indexer + dtype, _ = maybe_promote(np.array(value).dtype) + return self.astype(dtype).setitem(indexer, value) + + dtype = find_common_type([values.dtype, dtype]) + assert not is_dtype_equal(self.dtype, dtype) + # otherwise should have _can_hold_element + + return self.astype(dtype).setitem(indexer, value) # value must be storeable at this moment if is_extension_array_dtype(getattr(value, "dtype", None)): @@ -857,11 +875,6 @@ def setitem(self, indexer, value): else: arr_value = np.array(value) - # cast the values to a type that can hold nan (if necessary) - if not self._can_hold_element(value): - dtype, _ = maybe_promote(arr_value.dtype) - values = values.astype(dtype) - if transpose: values = values.T @@ -881,11 +894,7 @@ def setitem(self, indexer, value): # be e.g. a list; see GH#6043 values[indexer] = value - elif ( - exact_match - and is_categorical_dtype(arr_value.dtype) - and not is_categorical_dtype(values) - ): + elif exact_match and is_categorical_dtype(arr_value.dtype): # GH25495 - If the current dtype is not categorical, # we need to create a new categorical block values[indexer] = value @@ -919,7 +928,7 @@ def putmask( Parameters ---------- - mask : the condition to respect + mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object inplace : bool, default False Perform inplace modification. @@ -931,10 +940,10 @@ def putmask( ------- List[Block] """ - new_values = self.values if inplace else self.values.copy() + mask = _extract_bool_array(mask) + assert not isinstance(new, (ABCIndexClass, ABCSeries, ABCDataFrame)) - new = getattr(new, "values", new) - mask = getattr(mask, "values", mask) + new_values = self.values if inplace else self.values.copy() # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: @@ -1314,7 +1323,7 @@ def where( Parameters ---------- other : a ndarray/object - cond : the condition to respect + cond : np.ndarray[bool], SparseArray[bool], or BooleanArray errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. 
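The restructured ``Block.setitem`` above always computes a common dtype and recurses through ``astype`` when the current dtype cannot hold the new value, instead of tracking a ``find_dtype`` flag. The user-visible effect is the familiar upcast, e.g.:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    df.loc[1, "a"] = np.nan   # an int64 block cannot hold NaN
    print(df["a"].dtype)      # float64, via the common-dtype cast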
On error return original object @@ -1322,10 +1331,13 @@ def where( Returns ------- - a new block(s), the result of the func + List[Block] """ import pandas.core.computation.expressions as expressions + cond = _extract_bool_array(cond) + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) + assert errors in ["raise", "ignore"] transpose = self.ndim == 2 @@ -1334,9 +1346,6 @@ def where( if transpose: values = values.T - other = getattr(other, "_values", getattr(other, "values", other)) - cond = getattr(cond, "values", cond) - # If the default broadcasting would go in the wrong direction, then # explicitly reshape other instead if getattr(other, "ndim", 0) >= 1: @@ -1634,9 +1643,9 @@ def putmask( """ inplace = validate_bool_kwarg(inplace, "inplace") - # use block's copy logic. - # .values may be an Index which does shallow copy by default - new_values = self.values if inplace else self.copy().values + mask = _extract_bool_array(mask) + + new_values = self.values if inplace else self.values.copy() if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -1752,10 +1761,7 @@ def setitem(self, indexer, value): def get_values(self, dtype=None): # ExtensionArrays must be iterable, so this works. - values = np.asarray(self.values) - if values.ndim == self.ndim - 1: - values = values.reshape((1,) + values.shape) - return values + return np.asarray(self.values).reshape(self.shape) def array_values(self) -> ExtensionArray: return self.values @@ -1865,19 +1871,19 @@ def shift( def where( self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: - if isinstance(other, ABCDataFrame): - # ExtensionArrays are 1-D, so if we get here then - # `other` should be a DataFrame with a single column. - assert other.shape[1] == 1 - other = other.iloc[:, 0] - other = extract_array(other, extract_numpy=True) + cond = _extract_bool_array(cond) + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) - if isinstance(cond, ABCDataFrame): - assert cond.shape[1] == 1 - cond = cond.iloc[:, 0] + if isinstance(other, np.ndarray) and other.ndim == 2: + # TODO(EA2D): unnecessary with 2D EAs + assert other.shape[1] == 1 + other = other[:, 0] - cond = extract_array(cond, extract_numpy=True) + if isinstance(cond, np.ndarray) and cond.ndim == 2: + # TODO(EA2D): unnecessary with 2D EAs + assert cond.shape[1] == 1 + cond = cond[:, 0] if lib.is_scalar(other) and isna(other): # The default `other` for Series / Frame is np.nan @@ -2021,11 +2027,6 @@ def to_native_types( ) return formatter.get_result_as_array() - def should_store(self, value: ArrayLike) -> bool: - # when inserting a column should not coerce integers to floats - # unnecessarily - return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype - class ComplexBlock(FloatOrComplexBlock): __slots__ = () @@ -2058,9 +2059,6 @@ def _can_hold_element(self, element: Any) -> bool: ) return is_integer(element) - def should_store(self, value: ArrayLike) -> bool: - return is_integer_dtype(value) and value.dtype == self.dtype - class DatetimeLikeBlockMixin: """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" @@ -2069,9 +2067,6 @@ class DatetimeLikeBlockMixin: def _holder(self): return DatetimeArray - def should_store(self, value): - return is_dtype_equal(self.dtype, value.dtype) - @property def fill_value(self): return np.datetime64("NaT", "ns") @@ -2081,15 +2076,17 @@ def get_values(self, dtype=None): return object dtype as boxed values, such as 
Timestamps/Timedelta """ if is_object_dtype(dtype): - values = self.values.ravel() - result = self._holder(values).astype(object) - return result.reshape(self.values.shape) + # DTA/TDA constructor and astype can handle 2D + return self._holder(self.values).astype(object) return self.values def internal_values(self): # Override to return DatetimeArray and TimedeltaArray return self.array_values() + def array_values(self): + return self._holder._simple_new(self.values) + def iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta # TODO(EA2D): this can be removed if we ever have 2D EA @@ -2216,12 +2213,6 @@ def set(self, locs, values): self.values[locs] = values - def external_values(self): - return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - - def array_values(self) -> ExtensionArray: - return DatetimeArray._simple_new(self.values) - class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ @@ -2234,7 +2225,8 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") - should_store = DatetimeBlock.should_store + should_store = Block.should_store + array_values = ExtensionBlock.array_values @property def _holder(self): @@ -2293,14 +2285,16 @@ def get_values(self, dtype=None): if is_object_dtype(dtype): values = values.astype(object) - values = np.asarray(values) + # TODO(EA2D): reshape unnecessary with 2D EAs + # Ensure that our shape is correct for DataFrame. + # ExtensionArrays are always 1-D, even in a DataFrame when + # the analogous NumPy-backed column would be a 2-D ndarray. + return np.asarray(values).reshape(self.shape) - if self.ndim == 2: - # Ensure that our shape is correct for DataFrame. - # ExtensionArrays are always 1-D, even in a DataFrame when - # the analogous NumPy-backed column would be a 2-D ndarray. - values = values.reshape(1, -1) - return values + def external_values(self): + # NB: this is different from np.asarray(self.values), since that + # return an object-dtype ndarray of Timestamps. 
+ return np.asarray(self.values.astype("datetime64[ns]", copy=False)) def _slice(self, slicer): """ return a slice of my values """ @@ -2467,12 +2461,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): ) return rvalues - def external_values(self): - return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) - - def array_values(self) -> ExtensionArray: - return TimedeltaArray._simple_new(self.values) - class BoolBlock(NumericBlock): __slots__ = () @@ -2485,11 +2473,6 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, np.bool_) return isinstance(element, (bool, np.bool_)) - def should_store(self, value: ArrayLike) -> bool: - return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype( - value - ) - def replace( self, to_replace, value, inplace=False, filter=None, regex=False, convert=True ): @@ -2577,15 +2560,6 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] def _can_hold_element(self, element: Any) -> bool: return True - def should_store(self, value: ArrayLike) -> bool: - return not ( - issubclass( - value.dtype.type, - (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_), - ) - or is_extension_array_dtype(value) - ) - def replace( self, to_replace, value, inplace=False, filter=None, regex=False, convert=True ): @@ -2816,6 +2790,8 @@ class CategoricalBlock(ExtensionBlock): _can_hold_na = True _concatenator = staticmethod(concat_categorical) + should_store = Block.should_store + def __init__(self, values, placement, ndim=None): # coerce to categorical if we can values = extract_array(values) @@ -2826,22 +2802,6 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return Categorical - def should_store(self, arr: ArrayLike): - return isinstance(arr, self._holder) and is_dtype_equal(self.dtype, arr.dtype) - - def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - values = self.values - if slicer is not None: - # Categorical is always one dimension - values = values[slicer] - mask = isna(values) - values = np.array(values, dtype="object") - values[mask] = na_rep - - # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) - def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. @@ -3119,3 +3079,16 @@ def _putmask_preserve(nv, n): v = v.astype(dtype) return _putmask_preserve(v, n) + + +def _extract_bool_array(mask: ArrayLike) -> np.ndarray: + """ + If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. + """ + if isinstance(mask, ExtensionArray): + # We could have BooleanArray, Sparse[bool], ... 
+ mask = np.asarray(mask, dtype=np.bool_) + + assert isinstance(mask, np.ndarray), type(mask) + assert mask.dtype == bool, mask.dtype + return mask diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b245ac09029a2..dda932cafe73b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -33,6 +33,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject +from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index from pandas.core.internals.blocks import ( @@ -426,7 +427,7 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: for k, obj in aligned_args.items(): axis = obj._info_axis_number - kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)._values if callable(f): applied = b.apply(f, **kwargs) @@ -552,6 +553,7 @@ def where(self, **kwargs) -> "BlockManager": align_keys = ["other", "cond"] else: align_keys = ["cond"] + kwargs["other"] = extract_array(kwargs["other"], extract_numpy=True) return self.apply("where", align_keys=align_keys, **kwargs) @@ -567,6 +569,7 @@ def putmask( align_keys = ["new", "mask"] else: align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) return self.apply( "putmask", diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 87f937f9e7087..822ab775e7e46 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1238,7 +1238,7 @@ def _maybe_null_out( result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], - shape: Tuple, + shape: Tuple[int, ...], min_count: int = 1, ) -> float: """ @@ -1260,16 +1260,43 @@ def _maybe_null_out( # GH12941, use None to auto cast null result[null_mask] = None elif result is not NaT: - if mask is not None: - null_mask = mask.size - mask.sum() - else: - null_mask = np.prod(shape) - if null_mask < min_count: + if check_below_min_count(shape, mask, min_count): result = np.nan return result +def check_below_min_count( + shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int +): + """ + Check for the `min_count` keyword. Returns True if below `min_count` (when + missing value should be returned from the reduction). + + Parameters + ---------- + shape : tuple + The shape of the values (`values.shape`). + mask : ndarray or None + Boolean numpy array (typically of same shape as `shape`) or None. + min_count : int + Keyword passed through from sum/prod call. 
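``_extract_bool_array`` normalizes any boolean-holding ExtensionArray into a plain ``ndarray[bool]`` before ``putmask``/``where`` run. Outside the internals the same conversion looks like this (assuming the mask holds no ``pd.NA``):

    import numpy as np
    import pandas as pd

    mask = pd.array([True, False, True], dtype="boolean")
    np_mask = np.asarray(mask, dtype=np.bool_)
    print(np_mask, np_mask.dtype)  # [ True False  True] bool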
+ + Returns + ------- + bool + """ + if min_count > 0: + if mask is None: + # no missing values, only check size + non_nulls = np.prod(shape) + else: + non_nulls = mask.size - mask.sum() + if non_nulls < min_count: + return True + return False + + def _zero_out_fperr(arg): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index c7f58d738b578..5dd7af454cbd1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -45,7 +45,7 @@ def comp_method_OBJECT_ARRAY(op, x, y): y = y.astype(np.object_) if isinstance(y, (ABCSeries, ABCIndex)): - y = y.values + y = y._values if x.shape != y.shape: raise ValueError("Shapes must match", x.shape, y.shape) diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 203ea3946d1b2..7b03b4b449ea5 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -53,7 +53,7 @@ def _make_flex_doc(op_name, typ): return doc -_add_example_SERIES = """ +_common_examples_algebra_SERIES = """ Examples -------- >>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) @@ -69,33 +69,44 @@ def _make_flex_doc(op_name, typ): b NaN d 1.0 e NaN -dtype: float64 ->>> a.add(b, fill_value=0) -a 2.0 -b 1.0 -c 1.0 -d 1.0 -e NaN -dtype: float64 -""" +dtype: float64""" -_sub_example_SERIES = """ +_common_examples_comparison_SERIES = """ Examples -------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e']) >>> a a 1.0 b 1.0 c 1.0 d NaN +e 1.0 dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f']) >>> b -a 1.0 -b NaN +a 0.0 +b 1.0 +c 2.0 +d NaN +f 1.0 +dtype: float64""" + +_add_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 d 1.0 e NaN dtype: float64 +""" +) + +_sub_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.subtract(b, fill_value=0) a 0.0 b 1.0 @@ -104,24 +115,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_mul_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_mul_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.multiply(b, fill_value=0) a 1.0 b 0.0 @@ -130,24 +128,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_div_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_div_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.divide(b, fill_value=0) a 1.0 b inf @@ -156,24 +141,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_floordiv_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_floordiv_example_SERIES = ( + _common_examples_algebra_SERIES + + """ 
>>> a.floordiv(b, fill_value=0) a 1.0 b NaN @@ -182,24 +154,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_mod_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_mod_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.mod(b, fill_value=0) a 0.0 b NaN @@ -208,23 +167,10 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ -_pow_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +) +_pow_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.pow(b, fill_value=0) a 1.0 b 1.0 @@ -233,6 +179,89 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) + +_ne_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.ne(b, fill_value=0) +a False +b True +c True +d True +e True +dtype: bool +""" +) + +_eq_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.eq(b, fill_value=0) +a True +b False +c False +d False +e False +dtype: bool +""" +) + +_lt_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.lt(b, fill_value=0) +a False +b False +c True +d False +e False +f True +dtype: bool +""" +) + +_le_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.le(b, fill_value=0) +a False +b True +c True +d False +e False +f True +dtype: bool +""" +) + +_gt_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.gt(b, fill_value=0) +a True +b False +c False +d False +e True +f False +dtype: bool +""" +) + +_ge_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.ge(b, fill_value=0) +a True +b True +c False +d False +e True +f False +dtype: bool +""" +) _returns_series = """Series\n The result of the operation.""" @@ -306,42 +335,42 @@ def _make_flex_doc(op_name, typ): "op": "==", "desc": "Equal to", "reverse": None, - "series_examples": None, + "series_examples": _eq_example_SERIES, "series_returns": _returns_series, }, "ne": { "op": "!=", "desc": "Not equal to", "reverse": None, - "series_examples": None, + "series_examples": _ne_example_SERIES, "series_returns": _returns_series, }, "lt": { "op": "<", "desc": "Less than", "reverse": None, - "series_examples": None, + "series_examples": _lt_example_SERIES, "series_returns": _returns_series, }, "le": { "op": "<=", "desc": "Less than or equal to", "reverse": None, - "series_examples": None, + "series_examples": _le_example_SERIES, "series_returns": _returns_series, }, "gt": { "op": ">", "desc": "Greater than", "reverse": None, - "series_examples": None, + "series_examples": _gt_example_SERIES, "series_returns": _returns_series, }, "ge": { "op": ">=", "desc": "Greater than or equal to", "reverse": None, - "series_examples": None, + "series_examples": _ge_example_SERIES, "series_returns": _returns_series, }, } diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index c04658565f235..0cf1ac4d107f6 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -93,7 +93,8 @@ def _wrap_inplace_method(method): def f(self, other): result = method(self, other) - + # Delete cacher + self._reset_cacher() # this makes sure that we are 
aligned like the input # we are updating inplace so we want to ignore is_copy self._update_inplace( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f19a82ab6f86a..9e3318db3cfb9 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -11,7 +11,7 @@ from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -858,7 +858,7 @@ def var(self, ddof=1, *args, **kwargs): nv.validate_resampler_func("var", args, kwargs) return self._downsample("var", ddof=ddof) - @Appender(GroupBy.size.__doc__) + @doc(GroupBy.size) def size(self): result = self._downsample("size") if not len(self.ax): @@ -871,7 +871,7 @@ def size(self): result = Series([], index=result.index, dtype="int64", name=name) return result - @Appender(GroupBy.count.__doc__) + @doc(GroupBy.count) def count(self): result = self._downsample("count") if not len(self.ax): @@ -1596,7 +1596,7 @@ def _get_period_bins(self, ax): def _take_new_index(obj, indexer, new_index, axis=0): if isinstance(obj, ABCSeries): - new_values = algos.take_1d(obj.values, indexer) + new_values = algos.take_1d(obj._values, indexer) return obj._constructor(new_values, index=new_index, name=obj.name) elif isinstance(obj, ABCDataFrame): if axis == 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 091129707228f..b4497ce1780e6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,6 +2,7 @@ Concat routines. """ +from collections import abc from typing import Iterable, List, Mapping, Union, overload import numpy as np @@ -85,7 +86,7 @@ def concat( Parameters ---------- objs : a sequence or mapping of Series or DataFrame objects - If a dict is passed, the sorted keys will be used as the `keys` + If a mapping is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. 
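The hunk below replaces the `isinstance(objs, dict)` check with `abc.Mapping`, so any mapping, not only `dict` and its subclasses, gets the keyed-concatenation treatment described in the docstring above. A minimal sketch of the behaviour, assuming pandas is imported as `pd` and using `collections.UserDict` as a stand-in for an arbitrary non-`dict` mapping (the toy frames are illustrative only):

>>> from collections import UserDict
>>> pieces = UserDict(
...     {"x": pd.DataFrame({"a": [1]}), "y": pd.DataFrame({"a": [2]})}
... )
>>> pd.concat(pieces)
     a
x 0  1
y 0  2

Before this change, a `UserDict` would fall through to the iterable path, where iteration yields only its keys, and fail; with the `abc.Mapping` check it is routed through the same `keys` logic as a plain `dict`.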
@@ -315,7 +316,7 @@ def __init__( "Only can inner (intersect) or outer (union) join the other axis" ) - if isinstance(objs, dict): + if isinstance(objs, abc.Mapping): if keys is None: keys = list(objs.keys()) objs = [objs[k] for k in keys] diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 782b8043430e1..c3e170b0e39c4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -105,12 +105,12 @@ def melt( if is_extension_array_dtype(id_data): id_data = concat([id_data] * K, ignore_index=True) else: - id_data = np.tile(id_data.values, K) + id_data = np.tile(id_data._values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] - mdata[value_name] = frame.values.ravel("F") + mdata[value_name] = frame._values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N) @@ -170,13 +170,13 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr pivot_cols = [] for target, names in zip(keys, values): - to_concat = [data[col].values for col in names] + to_concat = [data[col]._values for col in names] mdata[target] = concat_compat(to_concat) pivot_cols.append(target) for col in id_cols: - mdata[col] = np.tile(data[col].values, K) + mdata[col] = np.tile(data[col]._values, K) if dropna: mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index acd4a68e3fd09..4b1fd73d9950e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,14 +6,14 @@ import datetime from functools import partial import string -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Tuple, Union, cast import warnings import numpy as np from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin -from pandas._typing import FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -24,6 +24,7 @@ is_array_like, is_bool, is_bool_dtype, + is_categorical, is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, @@ -222,7 +223,14 @@ def merge_ordered( Examples -------- - >>> A + >>> df1 = pd.DataFrame( + ... { + ... "key": ["a", "c", "e", "a", "c", "e"], + ... "lvalue": [1, 2, 3, 1, 2, 3], + ... "group": ["a", "a", "a", "b", "b", "b"] + ... } + ... 
) + >>> df1 key lvalue group 0 a 1 a 1 c 2 a @@ -231,24 +239,25 @@ def merge_ordered( 4 c 2 b 5 e 3 b - >>> B - Key rvalue - 0 b 1 - 1 c 2 - 2 d 3 - - >>> merge_ordered(A, B, fill_method='ffill', left_by='group') - group key lvalue rvalue - 0 a a 1 NaN - 1 a b 1 1.0 - 2 a c 2 2.0 - 3 a d 2 3.0 - 4 a e 3 3.0 - 5 b a 1 NaN - 6 b b 1 1.0 - 7 b c 2 2.0 - 8 b d 2 3.0 - 9 b e 3 3.0 + >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + >>> df2 + key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + + >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group") + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1.0 + 2 c 2 a 2.0 + 3 d 2 a 3.0 + 4 e 3 a 3.0 + 5 a 1 b NaN + 6 b 1 b 1.0 + 7 c 2 b 2.0 + 8 d 2 b 3.0 + 9 e 3 b 3.0 """ def _merger(x, y): @@ -369,15 +378,14 @@ def merge_asof( Examples -------- - >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) + >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) >>> left a left_val 0 1 a 1 5 b 2 10 c - >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - ... 'right_val': [1, 2, 3, 6, 7]}) + >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) >>> right a right_val 0 1 1 @@ -386,25 +394,25 @@ def merge_asof( 3 6 6 4 7 7 - >>> pd.merge_asof(left, right, on='a') + >>> pd.merge_asof(left, right, on="a") a left_val right_val 0 1 a 1 1 5 b 3 2 10 c 7 - >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) + >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False) a left_val right_val 0 1 a NaN 1 5 b 3.0 2 10 c 7.0 - >>> pd.merge_asof(left, right, on='a', direction='forward') + >>> pd.merge_asof(left, right, on="a", direction="forward") a left_val right_val 0 1 a 1.0 1 5 b 6.0 2 10 c NaN - >>> pd.merge_asof(left, right, on='a', direction='nearest') + >>> pd.merge_asof(left, right, on="a", direction="nearest") a left_val right_val 0 1 a 1 1 5 b 6 @@ -412,15 +420,14 @@ def merge_asof( We can use indexed DataFrames as well. - >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10]) + >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) >>> left left_val 1 a 5 b 10 c - >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]}, - ... index=[1, 2, 3, 6, 7]) + >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) >>> right right_val 1 1 2 2 3 3 6 6 7 7 @@ -437,6 +444,32 @@ def merge_asof( Here is a real-world time-series example + >>> quotes = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.030"), + ... pd.Timestamp("2016-05-25 13:30:00.041"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.049"), + ... pd.Timestamp("2016-05-25 13:30:00.072"), + ... pd.Timestamp("2016-05-25 13:30:00.075") + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT" + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... } + ... ) >>> quotes time ticker bid ask 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + >>> trades = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ...
pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048") + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100] + ... } + ... ) >>> trades time ticker price quantity 0 2016-05-25 13:30:00.023 MSFT 51.95 75 @@ -458,9 +505,7 @@ def merge_asof( By default we are taking the asof of the quotes - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker') + >>> pd.merge_asof(trades, quotes, on="time", by="ticker") time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 @@ -470,10 +515,9 @@ def merge_asof( We only asof within 2ms between the quote time and the trade time - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('2ms')) + >>> pd.merge_asof( + ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN @@ -485,11 +529,14 @@ def merge_asof( and we exclude exact matches on time. However *prior* data will propagate forward - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('10ms'), - ... allow_exact_matches=False) + >>> pd.merge_asof( + ... trades, + ... quotes, + ... on="time", + ... by="ticker", + ... tolerance=pd.Timedelta("10ms"), + ... allow_exact_matches=False + ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 @@ -1271,7 +1318,7 @@ def _get_join_indexers( # get left & right join labels and num. of levels at each location mapped = ( - _factorize_keys(left_keys[n], right_keys[n], sort=sort) + _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) for n in range(len(left_keys)) ) zipped = zip(*mapped) @@ -1283,8 +1330,8 @@ def _get_join_indexers( # factorize keys to a dense i8 space # `count` is the num. 
of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": @@ -1347,7 +1394,7 @@ def _convert_to_mulitindex(index) -> MultiIndex: if isinstance(index, MultiIndex): return index else: - return MultiIndex.from_arrays([index.values], names=[index.name]) + return MultiIndex.from_arrays([index._values], names=[index.name]) # For multi-multi joins with one overlapping level, # the returned index is of type Index @@ -1672,10 +1719,10 @@ def flip(xs) -> np.ndarray: # values to compare left_values = ( - self.left.index.values if self.left_index else self.left_join_keys[-1] + self.left.index._values if self.left_index else self.left_join_keys[-1] ) right_values = ( - self.right.index.values if self.right_index else self.right_join_keys[-1] + self.right.index._values if self.right_index else self.right_join_keys[-1] ) tolerance = self.tolerance @@ -1822,7 +1869,59 @@ def _right_outer_join(x, y, max_groups): return left_indexer, right_indexer -def _factorize_keys(lk, rk, sort=True): +def _factorize_keys( + lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" +) -> Tuple[np.ndarray, np.ndarray, int]: + """ + Encode left and right keys as enumerated types. + + This is used to get the join indexers to be used when merging DataFrames. + + Parameters + ---------- + lk : array-like + Left key. + rk : array-like + Right key. + sort : bool, default True + If True, the encoding is done such that the unique elements in the + keys are sorted. + how : {'left', 'right', 'outer', 'inner'}, default 'inner' + Type of merge. + + Returns + ------- + array + Left (resp. right if called with `how='right'`) labels, as enumerated type. + array + Right (resp. left if called with `how='right'`) labels, as enumerated type. + int + Number of unique elements in union of left and right labels. + + See Also + -------- + merge : Merge DataFrame or named Series objects + with a database-style join. + algorithms.factorize : Encode the object as an enumerated type + or categorical variable. + + Examples + -------- + >>> lk = np.array(["a", "c", "b"]) + >>> rk = np.array(["a", "c"]) + + Here, the unique values are `'a', 'b', 'c'`. 
With the default + `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk) + (array([0, 2, 1]), array([0, 2]), 3) + + With `sort=False`, the encoding will correspond to the order + in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) + (array([0, 1, 2]), array([0, 1]), 3) + """ # Some pre-processing for non-ndarray lk / rk lk = extract_array(lk, extract_numpy=True) rk = extract_array(rk, extract_numpy=True) @@ -1834,8 +1933,11 @@ def _factorize_keys(lk, rk, sort=True): rk, _ = rk._values_for_factorize() elif ( - is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) + is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) ): + assert is_categorical(lk) and is_categorical(rk) + lk = cast(Categorical, lk) + rk = cast(Categorical, rk) if lk.categories.equals(rk.categories): # if we exactly match in categories, allow us to factorize on codes rk = rk.codes @@ -1892,6 +1994,8 @@ def _factorize_keys(lk, rk, sort=True): np.putmask(rlab, rmask, count) count += 1 + if how == "right": + return rlab, llab, count return llab, rlab, count diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index a8801d8ab3f6e..b3b0166334413 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -456,10 +456,10 @@ def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFram if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name indexed = data._constructor( - data[values].values, index=index, columns=values + data[values]._values, index=index, columns=values ) else: - indexed = data._constructor_sliced(data[values].values, index=index) + indexed = data._constructor_sliced(data[values]._values, index=index) return indexed.unstack(columns) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 145cf43112be3..88e61d2392773 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -338,7 +338,7 @@ def _unstack_multiple(data, clocs, fill_value=None): comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) - if rlocs == []: + if not rlocs: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: @@ -363,7 +363,7 @@ def _unstack_multiple(data, clocs, fill_value=None): for i in range(len(clocs)): val = clocs[i] result = result.unstack(val, fill_value=fill_value) - clocs = [v if i > v else v - 1 for v in clocs] + clocs = [v if v < val else v - 1 for v in clocs] return result @@ -541,9 +541,9 @@ def factorize(index): ) if frame._is_homogeneous_type: - # For homogeneous EAs, frame.values will coerce to object. So + # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead.
- dtypes = list(frame.dtypes.values) + dtypes = list(frame.dtypes._values) dtype = dtypes[0] if is_extension_array_dtype(dtype): @@ -554,11 +554,11 @@ def factorize(index): new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA - new_values = frame.values.ravel() + new_values = frame._values.ravel() else: # non-homogeneous - new_values = frame.values.ravel() + new_values = frame._values.ravel() if dropna: mask = notna(new_values) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b9eb89b4d14c6..11fb8cc121fb8 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -171,24 +171,26 @@ def cut( ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) ... # doctest: +ELLIPSIS - (a 0.0 - b 1.0 - c 2.0 - d 3.0 - e 4.0 - dtype: float64, array([0, 2, 4, 6, 8])) + (a 1.0 + b 2.0 + c 3.0 + d 4.0 + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 8, 10])) Use the `drop` option when bins are not unique >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, ... right=False, duplicates='drop') ... # doctest: +ELLIPSIS - (a 0.0 - b 1.0 - c 2.0 + (a 1.0 + b 2.0 + c 3.0 d 3.0 - e 3.0 - dtype: float64, array([0, 2, 4, 6, 8])) + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 10])) Passing an IntervalIndex for `bins` results in those categories exactly. Notice that values not covered by the IntervalIndex are set to NaN. 0 @@ -197,7 +199,7 @@ def cut( >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) - [NaN, (0, 1], NaN, (2, 3], (4, 5]] + [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 7abb14303f8cc..6949270317f7c 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -19,8 +19,7 @@ def cartesian_product(X): Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) - [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), - array([1, 2, 1, 2, 1, 2])] + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])] diff --git a/pandas/core/series.py b/pandas/core/series.py @@ ... @@ def combine(self, other, func, fill_value=None) -> "Series": if is_categorical_dtype(self.dtype): pass elif is_extension_array_dtype(self.dtype): + # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA.
- new_values = try_cast_to_ea(self._values, new_values) + new_values = maybe_cast_to_extension_array(type(self._values), new_values) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other) -> "Series": @@ -3852,7 +3857,7 @@ def f(x): # GH#23179 some EAs do not have `map` mapped = self._values.map(f) else: - values = self.astype(object).values + values = self.astype(object)._values mapped = lib.map_infer(values, f, convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7f26c7a26d4d8..59b8b37f72695 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union import warnings import numpy as np @@ -10,7 +10,7 @@ import pandas._libs.lib as lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype +from pandas._typing import ArrayLike, Dtype, Scalar from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -205,7 +205,7 @@ def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): return np.ndarray(0, dtype=dtype) if isinstance(arr, ABCSeries): - arr = arr.values + arr = arr._values # TODO: extract_array? if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) if na_mask: @@ -787,9 +787,15 @@ def rep(x, r): return result -def str_match(arr, pat, case=True, flags=0, na=np.nan): +def str_match( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): """ - Determine if each string matches a regular expression. + Determine if each string starts with a match of a regular expression. Parameters ---------- @@ -808,6 +814,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): See Also -------- + fullmatch : Stricter matching that requires the entire string to match. contains : Analogous, but less strict, relying on re.search instead of re.match. extract : Extract matched groups. @@ -823,6 +830,50 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): return _na_map(f, arr, na, dtype=dtype) +def str_fullmatch( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): + """ + Determine if each string entirely matches a regular expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. 
+ """ + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + dtype = bool + f = lambda x: regex.fullmatch(x) is not None + + return _na_map(f, arr, na, dtype=dtype) + + def _get_single_group_name(rx): try: return list(rx.groupindex.keys()).pop() @@ -2034,8 +2085,8 @@ def __init__(self, data): self._is_categorical = is_categorical_dtype(data) self._is_string = data.dtype.name == "string" - # .values.categories works for both Series/Index - self._parent = data.values.categories if self._is_categorical else data + # ._values.categories works for both Series/Index + self._parent = data._values.categories if self._is_categorical else data # save orig to blow up categoricals to the right type self._orig = data self._freeze() @@ -2236,7 +2287,7 @@ def _get_series_list(self, others): if isinstance(others, ABCSeries): return [others] elif isinstance(others, ABCIndexClass): - return [Series(others.values, index=others)] + return [Series(others._values, index=others)] elif isinstance(others, ABCDataFrame): return [others[x] for x in others] elif isinstance(others, np.ndarray) and others.ndim == 2: @@ -2762,6 +2813,12 @@ def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_fullmatch) + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case=True, flags=0, na=np.nan): + result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_replace) @forbid_nonstring_types(["bytes"]) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7414165ab5711..3dd17f5747df9 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -260,7 +260,7 @@ def _convert_listlike_datetimes( Parameters ---------- arg : list, tuple, ndarray, Series, Index - date to be parced + date to be parsed name : object None or string for the Index name tz : object diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index ed0b816f64800..fcde494f7f751 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -296,7 +296,7 @@ def zsqrt(x): mask = x < 0 if isinstance(x, ABCDataFrame): - if mask.values.any(): + if mask._values.any(): result[mask] = 0 else: if mask.any(): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 63d0b8abe59d9..3528be7608798 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -934,6 +934,8 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs): # hide the matplotlib default for size, in case we want to change # the handling of this argument later s = 20 + elif s in data.columns: + s = data[s] super().__init__(data, x, y, s=s, **kwargs) if is_integer(c) and not self.data.columns.holds_integer(): c = self.data.columns[c] diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 7a8146ef14de0..ce50266c756a8 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -46,7 +46,9 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if dropna: s = s.dropna() - if op in ("sum", "prod"): + if op == "sum": + assert isinstance(getattr(s, op)(), np.int_) + elif op == "prod": 
assert isinstance(getattr(s, op)(), np.int64) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) diff --git a/pandas/tests/arrays/integer/__init__.py b/pandas/tests/arrays/integer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/integer/conftest.py b/pandas/tests/arrays/integer/conftest.py new file mode 100644 index 0000000000000..994fccf837f08 --- /dev/null +++ b/pandas/tests/arrays/integer/conftest.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from pandas.core.arrays import integer_array +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return integer_array( + list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100], + dtype=dtype, + ) + + +@pytest.fixture +def data_missing(dtype): + return integer_array([np.nan, 1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py new file mode 100644 index 0000000000000..18f1dac3c13b2 --- /dev/null +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -0,0 +1,348 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_float, is_float_dtype, is_scalar +from pandas.core.arrays import IntegerArray, integer_array +from pandas.tests.extension.base import BaseOpsUtil + + +class TestArithmeticOps(BaseOpsUtil): + def _check_divmod_op(self, s, op, other, exc=None): + super()._check_divmod_op(s, op, other, None) + + def _check_op(self, s, op_name, other, exc=None): + op = self.get_op_from_name(op_name) + result = op(s, other) + + # compute expected + mask = s.isna() + + # if s is a DataFrame, squeeze to a Series + # for comparison + if isinstance(s, pd.DataFrame): + result = result.squeeze() + s = s.squeeze() + mask = mask.squeeze() + + # other array is an Integer + if isinstance(other, IntegerArray): + omask = getattr(other, "mask", None) + mask = getattr(other, "data", other) + if omask is not None: + mask |= omask + + # 1 ** na is na, so need to unmask those + if op_name == "__pow__": + mask = np.where(~s.isna() & (s == 1), False, mask) + + elif op_name == "__rpow__": + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) + + # float result type or float op + if ( + is_float_dtype(other) + or is_float(other) + or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] + ): + rs = s.astype("float") + expected = op(rs, other) + self._check_op_float(result, expected, mask, s, op_name, other) + + # integer result type + else: + rs = pd.Series(s.values._data, name=s.name) + expected = op(rs, other) + self._check_op_integer(result, expected, mask, s, op_name, other) + + def _check_op_float(self, result, expected, mask, s, op_name, other): + # check comparisons that are resulting in float dtypes + + 
expected[mask] = np.nan + if "floordiv" in op_name: + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + mask2 = np.isinf(expected) & np.isnan(result) + expected[mask2] = np.nan + tm.assert_series_equal(result, expected) + + def _check_op_integer(self, result, expected, mask, s, op_name, other): + # check comparisons that are resulting in integer dtypes + + # to compare properly, we convert the expected + # to float, mask to nans and convert infs + # if we have uints then we process as uints + # then convert to float + # and we ultimately want to create a IntArray + # for comparisons + + fill_value = 0 + + # mod/rmod turn floating 0 into NaN while + # integer works as expected (no nan) + if op_name in ["__mod__", "__rmod__"]: + if is_scalar(other): + if other == 0: + expected[s.values == 0] = 0 + else: + expected = expected.fillna(0) + else: + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 + try: + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value + original = expected + expected = expected.astype(s.dtype) + + except ValueError: + + expected = expected.astype(float) + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value + original = expected + expected = expected.astype(s.dtype) + + expected[mask] = pd.NA + + # assert that the expected astype is ok + # (skip for unsigned as they have wrap around) + if not s.dtype.is_unsigned_integer: + original = pd.Series(original) + + # we need to fill with 0's to emulate what an astype('int') does + # (truncation) for certain ops + if op_name in ["__rtruediv__", "__rdiv__"]: + mask |= original.isna() + original = original.fillna(0).astype("int") + + original = original.astype("float") + original[mask] = np.nan + tm.assert_series_equal(original, expected.astype("float")) + + # assert our expected result + tm.assert_series_equal(result, expected) + + def test_arith_integer_array(self, data, all_arithmetic_operators): + # we operate with a rhs of an integer array + + op = all_arithmetic_operators + + s = pd.Series(data) + rhs = pd.Series([1] * len(data), dtype=data.dtype) + rhs.iloc[-1] = np.nan + + self._check_op(s, op, rhs) + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + # scalar + op = all_arithmetic_operators + s = pd.Series(data) + self._check_op(s, op, 1, exc=TypeError) + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + op = all_arithmetic_operators + df = pd.DataFrame({"A": data}) + self._check_op(df, op, 1, exc=TypeError) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + # ndarray & other series + op = all_arithmetic_operators + s = pd.Series(data) + other = np.ones(len(s), dtype=s.dtype.type) + self._check_op(s, op, other, exc=TypeError) + + def test_arith_coerce_scalar(self, data, all_arithmetic_operators): + + op = all_arithmetic_operators + s = pd.Series(data) + + other = 0.01 + self._check_op(s, op, other) + + @pytest.mark.parametrize("other", [1.0, np.array(1.0)]) + def test_arithmetic_conversion(self, all_arithmetic_operators, other): + # if we have a float operand we should have a float result + # if that is equal to an integer + op = self.get_op_from_name(all_arithmetic_operators) + + s = pd.Series([1, 2, 3], dtype="Int64") + result = op(s, other) + assert result.dtype is np.dtype("float") + + def test_arith_len_mismatch(self, all_arithmetic_operators): + # operating with a list-like 
with non-matching length raises + op = self.get_op_from_name(all_arithmetic_operators) + other = np.array([1.0]) + + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(ValueError, match="Lengths must match"): + op(s, other) + + @pytest.mark.parametrize("other", [0, 0.5]) + def test_arith_zero_dim_ndarray(self, other): + arr = integer_array([1, None, 2]) + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + msg = ( + r"(:?can only perform ops with numeric values)" + r"|(:?IntegerArray cannot perform the operation mod)" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + with pytest.raises(TypeError, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(TypeError, match=msg): + ops(pd.Series("foo", index=s.index)) + + if op != "__rpow__": + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly + msg = ( + "can only perform ops with numeric values|" + "cannot perform .* with this index type: DatetimeArray|" + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. *" + ) + with pytest.raises(TypeError, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + msg = r"can only perform ops with 1-d structures" + with pytest.raises(NotImplementedError, match=msg): + opa(np.arange(len(s)).reshape(-1, len(s))) + + @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) + def test_divide_by_zero(self, zero, negative): + # https://github.com/pandas-dev/pandas/issues/27398 + a = pd.array([0, 1, -1, None], dtype="Int64") + result = a / zero + expected = np.array([np.nan, np.inf, -np.inf, np.nan]) + if negative: + expected *= -1 + tm.assert_numpy_array_equal(result, expected) + + def test_pow_scalar(self): + a = pd.array([-1, 0, 1, None, 2], dtype="Int64") + result = a ** 0 + expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. 
+ + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + def test_pow_array(self): + a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) + b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) + result = a ** b + expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) + tm.assert_extension_array_equal(result, expected) + + def test_rpow_one_to_na(self): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = integer_array([np.nan, np.nan]) + result = np.array([1.0, 2.0]) ** arr + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +def test_cross_type_arithmetic(): + + df = pd.DataFrame( + { + "A": pd.Series([1, 2, np.nan], dtype="Int64"), + "B": pd.Series([1, np.nan, 3], dtype="UInt8"), + "C": [1, 2, 3], + } + ) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, None], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("op", ["mean"]) +def test_reduce_to_float(op): + # some reduce ops always return float, even if the result + # is a rounded number + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + assert isinstance(result, float) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_comparison.py b/pandas/tests/arrays/integer/test_comparison.py new file mode 100644 index 0000000000000..d76ed2c21ca0e --- /dev/null +++ b/pandas/tests/arrays/integer/test_comparison.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.base import BaseOpsUtil + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = op(pd.Series(data._data), other) + + # fill the nan locations + expected[data._mask] = pd.NA + expected = expected.astype("boolean") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = 
op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + def test_no_shared_mask(self, data): + result = data + 1 + assert np.shares_memory(result._mask, data._mask) is False + + def test_compare_to_string(self, any_nullable_int_dtype): + # GH 28930 + s = pd.Series([1, None], dtype=any_nullable_int_dtype) + result = s == "a" + expected = pd.Series([False, pd.NA], dtype="boolean") + + self.assert_series_equal(result, expected) + + def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): + # GH 28930 + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") + + method = getattr(s1, all_compare_operators) + result = method(2) + + method = getattr(s2, all_compare_operators) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA + + self.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py new file mode 100644 index 0000000000000..4a62a35e23d93 --- /dev/null +++ b/pandas/tests/arrays/integer/test_construction.py @@ -0,0 +1,238 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_integer +from pandas.core.arrays import IntegerArray, integer_array +from pandas.core.arrays.integer import Int8Dtype, Int32Dtype, Int64Dtype + + +def test_uses_pandas_na(): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + + +def test_from_dtype_from_float(data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / array + expected = pd.Series(data).dropna().reset_index(drop=True) + dropped = np.array(data.dropna()).astype(np.dtype((dtype.type))) + result = 
pd.Series(dropped, dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + +def test_conversions(data_missing): + + # astype to object series + df = pd.DataFrame({"A": data_missing}) + result = df["A"].astype("object") + expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") + tm.assert_series_equal(result, expected) + + # convert to object ndarray + # we assert that we are exactly equal + # including type conversions of scalars + result = df["A"].astype("object").values + expected = np.array([pd.NA, 1], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + for r, e in zip(result, expected): + if pd.isnull(r): + assert pd.isnull(e) + elif is_integer(r): + assert r == e + assert is_integer(e) + else: + assert r == e + assert type(r) == type(e) + + +def test_integer_array_constructor(): + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") + + result = IntegerArray(values, mask) + expected = integer_array([1, 2, 3, np.nan], dtype="int64") + tm.assert_extension_array_equal(result, expected) + + msg = r".* should be .* numpy array. Use the 'integer_array' function instead" + with pytest.raises(TypeError, match=msg): + IntegerArray(values.tolist(), mask) + + with pytest.raises(TypeError, match=msg): + IntegerArray(values, mask.tolist()) + + with pytest.raises(TypeError, match=msg): + IntegerArray(values.astype(float), mask) + msg = r"__init__\(\) missing 1 required positional argument: 'mask'" + with pytest.raises(TypeError, match=msg): + IntegerArray(values) + + +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) +def test_integer_array_constructor_none_is_nan(a, b): + result = integer_array(a) + expected = integer_array(b) + tm.assert_extension_array_equal(result, expected) + + +def test_integer_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") + + result = IntegerArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = IntegerArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + "foo", + 1, + 1.0, + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [[1, 2], [3, 4]], + [np.nan, {"a": 1}], + ], +) +def test_to_integer_array_error(values): + # error in converting existing arrays to IntegerArrays + msg = ( + r"(:?.* cannot be converted to an IntegerDtype)" + r"|(:?values must be a 1D list-like)" + ) + with pytest.raises(TypeError, match=msg): + integer_array(values) + + +def test_to_integer_array_inferred_dtype(): + # if values has dtype -> respect it + result = integer_array(np.array([1, 2], dtype="int8")) + assert result.dtype == Int8Dtype() + result = integer_array(np.array([1, 2], dtype="int32")) + assert result.dtype == Int32Dtype() + + # if values have no dtype -> always int64 + result = integer_array([1, 2]) + assert result.dtype == Int64Dtype() + + +def test_to_integer_array_dtype_keyword(): + result = integer_array([1, 2], dtype="int8") + assert result.dtype == Int8Dtype() + + # if values has dtype -> override it + result = integer_array(np.array([1, 2], dtype="int8"), dtype="int32") + assert result.dtype == Int32Dtype() + + +def test_to_integer_array_float(): + result = integer_array([1.0, 2.0]) + expected = 
integer_array([1, 2]) + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): + integer_array([1.5, 2.0]) + + # for float dtypes, the itemsize is not preserved + result = integer_array(np.array([1.0, 2.0], dtype="float32")) + assert result.dtype == Int64Dtype() + + +@pytest.mark.parametrize( + "bool_values, int_values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), + ([False, True], [0, 1], "Int64", Int64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), + ], +) +def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype): + result = integer_array(bool_values, dtype=target_dtype) + assert result.dtype == expected_dtype + expected = integer_array(int_values, dtype=target_dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, to_dtype, result_dtype", + [ + (np.array([1], dtype="int64"), None, Int64Dtype), + (np.array([1, np.nan]), None, Int64Dtype), + (np.array([1, np.nan]), "int8", Int8Dtype), + ], +) +def test_to_integer_array(values, to_dtype, result_dtype): + # convert existing arrays to IntegerArrays + result = integer_array(values, dtype=to_dtype) + assert result.dtype == result_dtype() + expected = integer_array(values, dtype=result_dtype()) + tm.assert_extension_array_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_arrow_roundtrip(data): + # roundtrip possible from arrow 0.16.0 + import pyarrow as pa + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_arrow_from_arrow_uint(): + # https://github.com/pandas-dev/pandas/issues/31896 + # possible mismatch in types + import pyarrow as pa + + dtype = pd.UInt32Dtype() + result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) + expected = pd.array([1, 2, 3, 4, None], dtype="UInt32") + + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py new file mode 100644 index 0000000000000..ee1ec86745246 --- /dev/null +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -0,0 +1,251 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.generic import ABCIndexClass + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import integer_array +from pandas.core.arrays.integer import Int8Dtype, UInt32Dtype + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + # TODO(#22346): preserve Int64 dtype + # for ops that enable (mean would actually work here + # but generally it is a float return value) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, 
None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + if op == "sum": + assert isinstance(result, np.int64) + else: + assert isinstance(result, int) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_astype_nansafe(): + # see gh-22343 + arr = integer_array([np.nan, 1, 2], dtype="Int8") + msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." + + with pytest.raises(ValueError, match=msg): + arr.astype("uint32") + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_construct_index(all_data, dropna): + # ensure that we do not coerce to Float64Index, rather + # keep as Index + + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Index(integer_array(other, dtype=all_data.dtype)) + expected = pd.Index(other, dtype=object) + + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_astype_index(all_data, dropna): + # as an int/uint index to Index + + all_data = all_data[:10] + if dropna: + other = all_data[~all_data.isna()] + else: + other = all_data + + dtype = all_data.dtype + idx = pd.Index(np.array(other)) + assert isinstance(idx, ABCIndexClass) + + result = idx.astype(dtype) + expected = idx.astype(object).astype(dtype) + tm.assert_index_equal(result, expected) + + +def test_astype(all_data): + all_data = all_data[:10] + + ints = all_data[~all_data.isna()] + mixed = all_data + dtype = Int8Dtype() + + # coerce to same type - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype) + expected = pd.Series(ints) + tm.assert_series_equal(result, expected) + + # coerce to same other - ints + s = pd.Series(ints) + result = s.astype(dtype) + expected = pd.Series(ints, dtype=dtype) + tm.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype.numpy_dtype) + expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) + tm.assert_series_equal(result, expected) + + # coerce to same type - mixed + s = pd.Series(mixed) + result = s.astype(all_data.dtype) + expected = pd.Series(mixed) + tm.assert_series_equal(result, expected) + + # coerce to same other - mixed + s = pd.Series(mixed) + result = s.astype(dtype) + expected = pd.Series(mixed, dtype=dtype) + tm.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - mixed + s = pd.Series(mixed) + msg = r"cannot convert to .*-dtype NumPy array with missing values.*" + with pytest.raises(ValueError, match=msg): + s.astype(all_data.dtype.numpy_dtype) + + # coerce to object + s = pd.Series(mixed) + result = s.astype("object") + expected = pd.Series(np.asarray(mixed)) + tm.assert_series_equal(result, expected) + + +def test_astype_to_larger_numpy(): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) +def test_astype_specific_casting(dtype): + s = pd.Series([1, 2, 3], dtype="Int64") + result 
= s.astype(dtype) + expected = pd.Series([1, 2, 3], dtype=dtype) + tm.assert_series_equal(result, expected) + + s = pd.Series([1, 2, 3, None], dtype="Int64") + result = s.astype(dtype) + expected = pd.Series([1, 2, 3, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_astype_dt64(): + # GH#32435 + arr = pd.array([1, 2, 3, pd.NA]) * 10 ** 9 + + result = arr.astype("datetime64[ns]") + + expected = np.array([1, 2, 3, "NaT"], dtype="M8[s]").astype("M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + +def test_construct_cast_invalid(dtype): + + msg = "cannot safely" + arr = [1.2, 2.3, 3.7] + with pytest.raises(TypeError, match=msg): + integer_array(arr, dtype=dtype) + + with pytest.raises(TypeError, match=msg): + pd.Series(arr).astype(dtype) + + arr = [1.2, 2.3, 3.7, np.nan] + with pytest.raises(TypeError, match=msg): + integer_array(arr, dtype=dtype) + + with pytest.raises(TypeError, match=msg): + pd.Series(arr).astype(dtype) + + +@pytest.mark.parametrize("in_series", [True, False]) +def test_to_numpy_na_nan(in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("in_series", [True, False]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) +def test_to_numpy_dtype(dtype, in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +def test_to_numpy_na_raises(dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + +def test_astype_str(): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", ""], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + + +def test_astype_boolean(): + # https://github.com/pandas-dev/pandas/issues/31102 + a = pd.array([1, 0, -1, 2, None], dtype="Int64") + result = a.astype("boolean") + expected = pd.array([True, False, True, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py new file mode 100644 index 0000000000000..58913189593a9 --- /dev/null +++ b/pandas/tests/arrays/integer/test_function.py @@ -0,0 +1,110 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import integer_array + + +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") +def test_ufuncs_single_int(ufunc): + a = integer_array([1, 2, -3, np.nan]) + result = ufunc(a) + expected = integer_array(ufunc(a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = 
pd.Series(integer_array(ufunc(a.astype(float)))) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = integer_array([1, 2, -3, np.nan]) + with np.errstate(invalid="ignore"): + result = ufunc(a) + expected = ufunc(a.astype(float)) + tm.assert_numpy_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid="ignore"): + result = ufunc(s) + expected = ufunc(s.astype(float)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) +def test_ufuncs_binary_int(ufunc): + # two IntegerArrays + a = integer_array([1, 2, -3, np.nan]) + result = ufunc(a, a) + expected = integer_array(ufunc(a.astype(float), a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = integer_array(ufunc(a.astype(float), arr)) + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = integer_array(ufunc(arr, a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with scalar + result = ufunc(a, 1) + expected = integer_array(ufunc(a.astype(float), 1)) + tm.assert_extension_array_equal(result, expected) + + result = ufunc(1, a) + expected = integer_array(ufunc(1, a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) +def test_ufunc_reduce_raises(values): + a = integer_array(values) + msg = r"The 'reduce' method is not supported." + with pytest.raises(NotImplementedError, match=msg): + np.add.reduce(a) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +# TODO(jreback) - these need testing / are broken + +# shift + +# set_index (destroys type) diff --git a/pandas/tests/arrays/integer/test_indexing.py b/pandas/tests/arrays/integer/test_indexing.py new file mode 100644 index 0000000000000..4b953d699108b --- /dev/null +++ b/pandas/tests/arrays/integer/test_indexing.py @@ -0,0 +1,19 @@ +import pandas as pd +import pandas._testing as tm + + +def test_array_setitem_nullable_boolean_mask(): + # GH 31446 + ser = pd.Series([1, 2], dtype="Int64") + result = ser.where(ser > 1) + expected = pd.Series([pd.NA, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_array_setitem(): + # GH 31446 + arr = pd.Series([1, 2], dtype="Int64").array + arr[arr > 1] = 1 + + expected = pd.array([1, 1], dtype="Int64") + tm.assert_extension_array_equal(arr, expected) diff --git a/pandas/tests/arrays/integer/test_repr.py 
b/pandas/tests/arrays/integer/test_repr.py new file mode 100644 index 0000000000000..bdc5724e85e0d --- /dev/null +++ b/pandas/tests/arrays/integer/test_repr.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays import integer_array +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (Int8Dtype(), "Int8Dtype()"), + (Int16Dtype(), "Int16Dtype()"), + (Int32Dtype(), "Int32Dtype()"), + (Int64Dtype(), "Int64Dtype()"), + (UInt8Dtype(), "UInt8Dtype()"), + (UInt16Dtype(), "UInt16Dtype()"), + (UInt32Dtype(), "UInt32Dtype()"), + (UInt64Dtype(), "UInt64Dtype()"), + ], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(integer_array([1, None, 3])) + expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64" + assert result == expected + + +def test_repr_array_long(): + data = integer_array([1, 2, None] * 1000) + expected = ( + "<IntegerArray>\n" + "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n" + " ...\n" + " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n" + "Length: 3000, dtype: Int64" + ) + result = repr(data) + assert result == expected + + +def test_frame_repr(data_missing): + + df = pd.DataFrame({"A": data_missing}) + result = repr(df) + expected = " A\n0 <NA>\n1 1" + assert result == expected diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5e2f14af341ab..fe770eed84b62 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -214,6 +214,14 @@ def test_from_sequence_no_mutate(copy): tm.assert_numpy_array_equal(a, original) +def test_astype_int(): + arr = pd.array(["1", pd.NA, "3"], dtype="string") + + result = arr.astype("Int64") + expected = pd.array([1, pd.NA, 3], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index e505917da1dc4..928173aa82797 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -812,3 +812,38 @@ def test_to_numpy_extra(array): assert result[0] == result[1] tm.assert_equal(array, original) + + +@pytest.mark.parametrize( + "values", + [ + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.TimedeltaIndex([1, 2], unit="D"), + pd.PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"), + ], +) +@pytest.mark.parametrize("klass", [list, np.array, pd.array, pd.Series]) +def test_searchsorted_datetimelike_with_listlike(values, klass): + # https://github.com/pandas-dev/pandas/issues/32762 + result = values.searchsorted(klass(values)) + expected = np.array([0, 1], dtype=result.dtype) + + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.TimedeltaIndex([1, 2], unit="D"), + pd.PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"), + ], +) +@pytest.mark.parametrize( + "arg", [[1, 2], ["a", "b"], [pd.Timestamp("2020-01-01", tz="Europe/London")]
* 2] +) +def test_searchsorted_datetimelike_with_listlike_invalid_dtype(values, arg): + # https://github.com/pandas-dev/pandas/issues/32762 + msg = "Unexpected type|Cannot compare" + with pytest.raises(TypeError, match=msg): + values.searchsorted(arg) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py deleted file mode 100644 index 70a029bd74bda..0000000000000 --- a/pandas/tests/arrays/test_integer.py +++ /dev/null @@ -1,1125 +0,0 @@ -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas.core.dtypes.generic import ABCIndexClass - -import pandas as pd -import pandas._testing as tm -from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar -from pandas.core.arrays import IntegerArray, integer_array -from pandas.core.arrays.integer import ( - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, -) -from pandas.tests.extension.base import BaseOpsUtil - - -def make_data(): - return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] - - -@pytest.fixture( - params=[ - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, - ] -) -def dtype(request): - return request.param() - - -@pytest.fixture -def data(dtype): - return integer_array(make_data(), dtype=dtype) - - -@pytest.fixture -def data_missing(dtype): - return integer_array([np.nan, 1], dtype=dtype) - - -@pytest.fixture(params=["data", "data_missing"]) -def all_data(request, data, data_missing): - """Parametrized fixture giving 'data' and 'data_missing'""" - if request.param == "data": - return data - elif request.param == "data_missing": - return data_missing - - -def test_dtypes(dtype): - # smoke tests on auto dtype construction - - if dtype.is_signed_integer: - assert np.dtype(dtype.type).kind == "i" - else: - assert np.dtype(dtype.type).kind == "u" - assert dtype.name is not None - - -@pytest.mark.parametrize( - "dtype, expected", - [ - (Int8Dtype(), "Int8Dtype()"), - (Int16Dtype(), "Int16Dtype()"), - (Int32Dtype(), "Int32Dtype()"), - (Int64Dtype(), "Int64Dtype()"), - (UInt8Dtype(), "UInt8Dtype()"), - (UInt16Dtype(), "UInt16Dtype()"), - (UInt32Dtype(), "UInt32Dtype()"), - (UInt64Dtype(), "UInt64Dtype()"), - ], -) -def test_repr_dtype(dtype, expected): - assert repr(dtype) == expected - - -def test_repr_array(): - result = repr(integer_array([1, None, 3])) - expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64" - assert result == expected - - -def test_repr_array_long(): - data = integer_array([1, 2, None] * 1000) - expected = ( - "<IntegerArray>\n" - "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n" - " ...\n" - " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n" - "Length: 3000, dtype: Int64" - ) - result = repr(data) - assert result == expected - - -class TestConstructors: - def test_uses_pandas_na(self): - a = pd.array([1, None], dtype=pd.Int64Dtype()) - assert a[1] is pd.NA - - def test_from_dtype_from_float(self, data): - # construct from our dtype & string dtype - dtype = data.dtype - - # from float - expected = pd.Series(data) - result = pd.Series( - data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype) - ) - tm.assert_series_equal(result, expected) - - # from int / list - expected = pd.Series(data) - result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) - tm.assert_series_equal(result, expected) - - # from int / array - expected = pd.Series(data).dropna().reset_index(drop=True) - dropped =
np.array(data.dropna()).astype(np.dtype((dtype.type))) - result = pd.Series(dropped, dtype=str(dtype)) - tm.assert_series_equal(result, expected) - - -class TestArithmeticOps(BaseOpsUtil): - def _check_divmod_op(self, s, op, other, exc=None): - super()._check_divmod_op(s, op, other, None) - - def _check_op(self, s, op_name, other, exc=None): - op = self.get_op_from_name(op_name) - result = op(s, other) - - # compute expected - mask = s.isna() - - # if s is a DataFrame, squeeze to a Series - # for comparison - if isinstance(s, pd.DataFrame): - result = result.squeeze() - s = s.squeeze() - mask = mask.squeeze() - - # other array is an Integer - if isinstance(other, IntegerArray): - omask = getattr(other, "mask", None) - mask = getattr(other, "data", other) - if omask is not None: - mask |= omask - - # 1 ** na is na, so need to unmask those - if op_name == "__pow__": - mask = np.where(~s.isna() & (s == 1), False, mask) - - elif op_name == "__rpow__": - other_is_one = other == 1 - if isinstance(other_is_one, pd.Series): - other_is_one = other_is_one.fillna(False) - mask = np.where(other_is_one, False, mask) - - # float result type or float op - if ( - is_float_dtype(other) - or is_float(other) - or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] - ): - rs = s.astype("float") - expected = op(rs, other) - self._check_op_float(result, expected, mask, s, op_name, other) - - # integer result type - else: - rs = pd.Series(s.values._data, name=s.name) - expected = op(rs, other) - self._check_op_integer(result, expected, mask, s, op_name, other) - - def _check_op_float(self, result, expected, mask, s, op_name, other): - # check comparisons that are resulting in float dtypes - - expected[mask] = np.nan - if "floordiv" in op_name: - # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) - mask2 = np.isinf(expected) & np.isnan(result) - expected[mask2] = np.nan - tm.assert_series_equal(result, expected) - - def _check_op_integer(self, result, expected, mask, s, op_name, other): - # check comparisons that are resulting in integer dtypes - - # to compare properly, we convert the expected - # to float, mask to nans and convert infs - # if we have uints then we process as uints - # then convert to float - # and we ultimately want to create a IntArray - # for comparisons - - fill_value = 0 - - # mod/rmod turn floating 0 into NaN while - # integer works as expected (no nan) - if op_name in ["__mod__", "__rmod__"]: - if is_scalar(other): - if other == 0: - expected[s.values == 0] = 0 - else: - expected = expected.fillna(0) - else: - expected[ - (s.values == 0).fillna(False) - & ((expected == 0).fillna(False) | expected.isna()) - ] = 0 - try: - expected[ - ((expected == np.inf) | (expected == -np.inf)).fillna(False) - ] = fill_value - original = expected - expected = expected.astype(s.dtype) - - except ValueError: - - expected = expected.astype(float) - expected[ - ((expected == np.inf) | (expected == -np.inf)).fillna(False) - ] = fill_value - original = expected - expected = expected.astype(s.dtype) - - expected[mask] = pd.NA - - # assert that the expected astype is ok - # (skip for unsigned as they have wrap around) - if not s.dtype.is_unsigned_integer: - original = pd.Series(original) - - # we need to fill with 0's to emulate what an astype('int') does - # (truncation) for certain ops - if op_name in ["__rtruediv__", "__rdiv__"]: - mask |= original.isna() - original = original.fillna(0).astype("int") - - original = original.astype("float") - original[mask] = np.nan - 
tm.assert_series_equal(original, expected.astype("float")) - - # assert our expected result - tm.assert_series_equal(result, expected) - - def test_arith_integer_array(self, data, all_arithmetic_operators): - # we operate with a rhs of an integer array - - op = all_arithmetic_operators - - s = pd.Series(data) - rhs = pd.Series([1] * len(data), dtype=data.dtype) - rhs.iloc[-1] = np.nan - - self._check_op(s, op, rhs) - - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - # scalar - op = all_arithmetic_operators - s = pd.Series(data) - self._check_op(s, op, 1, exc=TypeError) - - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): - # frame & scalar - op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) - self._check_op(df, op, 1, exc=TypeError) - - def test_arith_series_with_array(self, data, all_arithmetic_operators): - # ndarray & other series - op = all_arithmetic_operators - s = pd.Series(data) - other = np.ones(len(s), dtype=s.dtype.type) - self._check_op(s, op, other, exc=TypeError) - - def test_arith_coerce_scalar(self, data, all_arithmetic_operators): - - op = all_arithmetic_operators - s = pd.Series(data) - - other = 0.01 - self._check_op(s, op, other) - - @pytest.mark.parametrize("other", [1.0, np.array(1.0)]) - def test_arithmetic_conversion(self, all_arithmetic_operators, other): - # if we have a float operand we should have a float result - # if that is equal to an integer - op = self.get_op_from_name(all_arithmetic_operators) - - s = pd.Series([1, 2, 3], dtype="Int64") - result = op(s, other) - assert result.dtype is np.dtype("float") - - def test_arith_len_mismatch(self, all_arithmetic_operators): - # operating with a list-like with non-matching length raises - op = self.get_op_from_name(all_arithmetic_operators) - other = np.array([1.0]) - - s = pd.Series([1, 2, 3], dtype="Int64") - with pytest.raises(ValueError, match="Lengths must match"): - op(s, other) - - @pytest.mark.parametrize("other", [0, 0.5]) - def test_arith_zero_dim_ndarray(self, other): - arr = integer_array([1, None, 2]) - result = arr + np.array(other) - expected = arr + other - tm.assert_equal(result, expected) - - def test_error(self, data, all_arithmetic_operators): - # invalid ops - - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) - - # invalid scalars - msg = ( - r"(:?can only perform ops with numeric values)" - r"|(:?IntegerArray cannot perform the operation mod)" - ) - with pytest.raises(TypeError, match=msg): - ops("foo") - with pytest.raises(TypeError, match=msg): - ops(pd.Timestamp("20180101")) - - # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", index=s.index)) - - if op != "__rpow__": - # TODO(extension) - # rpow with a datetimelike coerces the integer array incorrectly - msg = ( - "can only perform ops with numeric values|" - "cannot perform .* with this index type: DatetimeArray|" - "Addition/subtraction of integers and integer-arrays " - "with DatetimeArray is no longer supported. 
*" - ) - with pytest.raises(TypeError, match=msg): - ops(pd.Series(pd.date_range("20180101", periods=len(s)))) - - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - - msg = r"can only perform ops with 1-d structures" - with pytest.raises(NotImplementedError, match=msg): - opa(np.arange(len(s)).reshape(-1, len(s))) - - @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) - def test_divide_by_zero(self, zero, negative): - # https://github.com/pandas-dev/pandas/issues/27398 - a = pd.array([0, 1, -1, None], dtype="Int64") - result = a / zero - expected = np.array([np.nan, np.inf, -np.inf, np.nan]) - if negative: - expected *= -1 - tm.assert_numpy_array_equal(result, expected) - - def test_pow_scalar(self): - a = pd.array([-1, 0, 1, None, 2], dtype="Int64") - result = a ** 0 - expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = a ** 1 - expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = a ** pd.NA - expected = pd.array([None, None, 1, None, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = a ** np.nan - expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - # reversed - a = a[1:] # Can't raise integers to negative powers. - - result = 0 ** a - expected = pd.array([1, 0, None, 0], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = 1 ** a - expected = pd.array([1, 1, 1, 1], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = pd.NA ** a - expected = pd.array([1, None, None, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = np.nan ** a - expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - def test_pow_array(self): - a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) - b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) - result = a ** b - expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) - tm.assert_extension_array_equal(result, expected) - - def test_rpow_one_to_na(self): - # https://github.com/pandas-dev/pandas/issues/22022 - # https://github.com/pandas-dev/pandas/issues/29997 - arr = integer_array([np.nan, np.nan]) - result = np.array([1.0, 2.0]) ** arr - expected = np.array([1.0, np.nan]) - tm.assert_numpy_array_equal(result, expected) - - -class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, data, op_name, other): - op = self.get_op_from_name(op_name) - - # array - result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other), dtype="boolean") - - # fill the nan locations - expected[data._mask] = pd.NA - - tm.assert_series_equal(result, expected) - - # series - s = pd.Series(data) - result = op(s, other) - - expected = op(pd.Series(data._data), other) - - # fill the nan locations - expected[data._mask] = pd.NA - expected = expected.astype("boolean") - - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) - def test_scalar(self, other, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([1, 0, None], dtype="Int64") - - result = op(a, other) - - if other is pd.NA: - expected = pd.array([None, None, None], dtype="boolean") - else: - values = op(a._data, other) - 
expected = pd.arrays.BooleanArray(values, a._mask, copy=True) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = pd.NA - tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) - - def test_array(self, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([0, 1, 2, None, None, None], dtype="Int64") - b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") - - result = op(a, b) - values = op(a._data, b._data) - mask = a._mask | b._mask - - expected = pd.arrays.BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = pd.NA - tm.assert_extension_array_equal( - a, pd.array([0, 1, 2, None, None, None], dtype="Int64") - ) - tm.assert_extension_array_equal( - b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") - ) - - def test_compare_with_booleanarray(self, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([True, False, None] * 3, dtype="boolean") - b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") - other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") - expected = op(a, other) - result = op(a, b) - tm.assert_extension_array_equal(result, expected) - - def test_no_shared_mask(self, data): - result = data + 1 - assert np.shares_memory(result._mask, data._mask) is False - - def test_compare_to_string(self, any_nullable_int_dtype): - # GH 28930 - s = pd.Series([1, None], dtype=any_nullable_int_dtype) - result = s == "a" - expected = pd.Series([False, pd.NA], dtype="boolean") - - self.assert_series_equal(result, expected) - - def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): - # GH 28930 - s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) - s2 = pd.Series([1, None, 3], dtype="float") - - method = getattr(s1, all_compare_operators) - result = method(2) - - method = getattr(s2, all_compare_operators) - expected = method(2).astype("boolean") - expected[s2.isna()] = pd.NA - - self.assert_series_equal(result, expected) - - -class TestCasting: - @pytest.mark.parametrize("dropna", [True, False]) - def test_construct_index(self, all_data, dropna): - # ensure that we do not coerce to Float64Index, rather - # keep as Index - - all_data = all_data[:10] - if dropna: - other = np.array(all_data[~all_data.isna()]) - else: - other = all_data - - result = pd.Index(integer_array(other, dtype=all_data.dtype)) - expected = pd.Index(other, dtype=object) - - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("dropna", [True, False]) - def test_astype_index(self, all_data, dropna): - # as an int/uint index to Index - - all_data = all_data[:10] - if dropna: - other = all_data[~all_data.isna()] - else: - other = all_data - - dtype = all_data.dtype - idx = pd.Index(np.array(other)) - assert isinstance(idx, ABCIndexClass) - - result = idx.astype(dtype) - expected = idx.astype(object).astype(dtype) - tm.assert_index_equal(result, expected) - - def test_astype(self, all_data): - all_data = all_data[:10] - - ints = all_data[~all_data.isna()] - mixed = all_data - dtype = Int8Dtype() - - # coerce to same type - ints - s = pd.Series(ints) - result = s.astype(all_data.dtype) - expected = pd.Series(ints) - tm.assert_series_equal(result, expected) - - # coerce to same other - ints - s = pd.Series(ints) - result = s.astype(dtype) - expected = pd.Series(ints, dtype=dtype) - tm.assert_series_equal(result, 
expected) - - # coerce to same numpy_dtype - ints - s = pd.Series(ints) - result = s.astype(all_data.dtype.numpy_dtype) - expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) - tm.assert_series_equal(result, expected) - - # coerce to same type - mixed - s = pd.Series(mixed) - result = s.astype(all_data.dtype) - expected = pd.Series(mixed) - tm.assert_series_equal(result, expected) - - # coerce to same other - mixed - s = pd.Series(mixed) - result = s.astype(dtype) - expected = pd.Series(mixed, dtype=dtype) - tm.assert_series_equal(result, expected) - - # coerce to same numpy_dtype - mixed - s = pd.Series(mixed) - msg = r"cannot convert to .*-dtype NumPy array with missing values.*" - with pytest.raises(ValueError, match=msg): - s.astype(all_data.dtype.numpy_dtype) - - # coerce to object - s = pd.Series(mixed) - result = s.astype("object") - expected = pd.Series(np.asarray(mixed)) - tm.assert_series_equal(result, expected) - - def test_astype_to_larger_numpy(self): - a = pd.array([1, 2], dtype="Int32") - result = a.astype("int64") - expected = np.array([1, 2], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - a = pd.array([1, 2], dtype="UInt32") - result = a.astype("uint64") - expected = np.array([1, 2], dtype="uint64") - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) - def test_astype_specific_casting(self, dtype): - s = pd.Series([1, 2, 3], dtype="Int64") - result = s.astype(dtype) - expected = pd.Series([1, 2, 3], dtype=dtype) - tm.assert_series_equal(result, expected) - - s = pd.Series([1, 2, 3, None], dtype="Int64") - result = s.astype(dtype) - expected = pd.Series([1, 2, 3, None], dtype=dtype) - tm.assert_series_equal(result, expected) - - def test_astype_dt64(self): - # GH#32435 - arr = pd.array([1, 2, 3, pd.NA]) * 10 ** 9 - - result = arr.astype("datetime64[ns]") - - expected = np.array([1, 2, 3, "NaT"], dtype="M8[s]").astype("M8[ns]") - tm.assert_numpy_array_equal(result, expected) - - def test_construct_cast_invalid(self, dtype): - - msg = "cannot safely" - arr = [1.2, 2.3, 3.7] - with pytest.raises(TypeError, match=msg): - integer_array(arr, dtype=dtype) - - with pytest.raises(TypeError, match=msg): - pd.Series(arr).astype(dtype) - - arr = [1.2, 2.3, 3.7, np.nan] - with pytest.raises(TypeError, match=msg): - integer_array(arr, dtype=dtype) - - with pytest.raises(TypeError, match=msg): - pd.Series(arr).astype(dtype) - - @pytest.mark.parametrize("in_series", [True, False]) - def test_to_numpy_na_nan(self, in_series): - a = pd.array([0, 1, None], dtype="Int64") - if in_series: - a = pd.Series(a) - - result = a.to_numpy(dtype="float64", na_value=np.nan) - expected = np.array([0.0, 1.0, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - result = a.to_numpy(dtype="int64", na_value=-1) - expected = np.array([0, 1, -1], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - result = a.to_numpy(dtype="bool", na_value=False) - expected = np.array([False, True, False], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("in_series", [True, False]) - @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) - def test_to_numpy_dtype(self, dtype, in_series): - a = pd.array([0, 1], dtype="Int64") - if in_series: - a = pd.Series(a) - - result = a.to_numpy(dtype=dtype) - expected = np.array([0, 1], dtype=dtype) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("dtype", 
["float64", "int64", "bool"]) - def test_to_numpy_na_raises(self, dtype): - a = pd.array([0, 1, None], dtype="Int64") - with pytest.raises(ValueError, match=dtype): - a.to_numpy(dtype=dtype) - - def test_astype_str(self): - a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=object) - - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) - - def test_astype_boolean(self): - # https://github.com/pandas-dev/pandas/issues/31102 - a = pd.array([1, 0, -1, 2, None], dtype="Int64") - result = a.astype("boolean") - expected = pd.array([True, False, True, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_frame_repr(data_missing): - - df = pd.DataFrame({"A": data_missing}) - result = repr(df) - expected = " A\n0 \n1 1" - assert result == expected - - -def test_conversions(data_missing): - - # astype to object series - df = pd.DataFrame({"A": data_missing}) - result = df["A"].astype("object") - expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") - tm.assert_series_equal(result, expected) - - # convert to object ndarray - # we assert that we are exactly equal - # including type conversions of scalars - result = df["A"].astype("object").values - expected = np.array([pd.NA, 1], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - for r, e in zip(result, expected): - if pd.isnull(r): - assert pd.isnull(e) - elif is_integer(r): - assert r == e - assert is_integer(e) - else: - assert r == e - assert type(r) == type(e) - - -def test_integer_array_constructor(): - values = np.array([1, 2, 3, 4], dtype="int64") - mask = np.array([False, False, False, True], dtype="bool") - - result = IntegerArray(values, mask) - expected = integer_array([1, 2, 3, np.nan], dtype="int64") - tm.assert_extension_array_equal(result, expected) - - msg = r".* should be .* numpy array. 
Use the 'integer_array' function instead" - with pytest.raises(TypeError, match=msg): - IntegerArray(values.tolist(), mask) - - with pytest.raises(TypeError, match=msg): - IntegerArray(values, mask.tolist()) - - with pytest.raises(TypeError, match=msg): - IntegerArray(values.astype(float), mask) - msg = r"__init__\(\) missing 1 required positional argument: 'mask'" - with pytest.raises(TypeError, match=msg): - IntegerArray(values) - - -@pytest.mark.parametrize( - "a, b", - [ - ([1, None], [1, np.nan]), - ([None], [np.nan]), - ([None, np.nan], [np.nan, np.nan]), - ([np.nan, np.nan], [np.nan, np.nan]), - ], -) -def test_integer_array_constructor_none_is_nan(a, b): - result = integer_array(a) - expected = integer_array(b) - tm.assert_extension_array_equal(result, expected) - - -def test_integer_array_constructor_copy(): - values = np.array([1, 2, 3, 4], dtype="int64") - mask = np.array([False, False, False, True], dtype="bool") - - result = IntegerArray(values, mask) - assert result._data is values - assert result._mask is mask - - result = IntegerArray(values, mask, copy=True) - assert result._data is not values - assert result._mask is not mask - - -@pytest.mark.parametrize( - "values", - [ - ["foo", "bar"], - ["1", "2"], - "foo", - 1, - 1.0, - pd.date_range("20130101", periods=2), - np.array(["foo"]), - [[1, 2], [3, 4]], - [np.nan, {"a": 1}], - ], -) -def test_to_integer_array_error(values): - # error in converting existing arrays to IntegerArrays - msg = ( - r"(:?.* cannot be converted to an IntegerDtype)" - r"|(:?values must be a 1D list-like)" - ) - with pytest.raises(TypeError, match=msg): - integer_array(values) - - -def test_to_integer_array_inferred_dtype(): - # if values has dtype -> respect it - result = integer_array(np.array([1, 2], dtype="int8")) - assert result.dtype == Int8Dtype() - result = integer_array(np.array([1, 2], dtype="int32")) - assert result.dtype == Int32Dtype() - - # if values have no dtype -> always int64 - result = integer_array([1, 2]) - assert result.dtype == Int64Dtype() - - -def test_to_integer_array_dtype_keyword(): - result = integer_array([1, 2], dtype="int8") - assert result.dtype == Int8Dtype() - - # if values has dtype -> override it - result = integer_array(np.array([1, 2], dtype="int8"), dtype="int32") - assert result.dtype == Int32Dtype() - - -def test_to_integer_array_float(): - result = integer_array([1.0, 2.0]) - expected = integer_array([1, 2]) - tm.assert_extension_array_equal(result, expected) - - with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): - integer_array([1.5, 2.0]) - - # for float dtypes, the itemsize is not preserved - result = integer_array(np.array([1.0, 2.0], dtype="float32")) - assert result.dtype == Int64Dtype() - - -@pytest.mark.parametrize( - "bool_values, int_values, target_dtype, expected_dtype", - [ - ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), - ([False, True], [0, 1], "Int64", Int64Dtype()), - ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), - ], -) -def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype): - result = integer_array(bool_values, dtype=target_dtype) - assert result.dtype == expected_dtype - expected = integer_array(int_values, dtype=target_dtype) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "values, to_dtype, result_dtype", - [ - (np.array([1], dtype="int64"), None, Int64Dtype), - (np.array([1, np.nan]), None, Int64Dtype), - (np.array([1, np.nan]), "int8", Int8Dtype), - ], -) -def 
test_to_integer_array(values, to_dtype, result_dtype): - # convert existing arrays to IntegerArrays - result = integer_array(values, dtype=to_dtype) - assert result.dtype == result_dtype() - expected = integer_array(values, dtype=result_dtype()) - tm.assert_extension_array_equal(result, expected) - - -def test_cross_type_arithmetic(): - - df = pd.DataFrame( - { - "A": pd.Series([1, 2, np.nan], dtype="Int64"), - "B": pd.Series([1, np.nan, 3], dtype="UInt8"), - "C": [1, 2, 3], - } - ) - - result = df.A + df.C - expected = pd.Series([2, 4, np.nan], dtype="Int64") - tm.assert_series_equal(result, expected) - - result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, None], dtype="boolean") - tm.assert_series_equal(result, expected) - - result = df.A + df.B - expected = pd.Series([2, np.nan, np.nan], dtype="Int64") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) -def test_preserve_dtypes(op): - # TODO(#22346): preserve Int64 dtype - # for ops that enable (mean would actually work here - # but generally it is a float return value) - df = pd.DataFrame( - { - "A": ["a", "b", "b"], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype="Int64"), - } - ) - - # op - result = getattr(df.C, op)() - assert isinstance(result, int) - - # groupby - result = getattr(df.groupby("A"), op)() - - expected = pd.DataFrame( - {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, - index=pd.Index(["a", "b"], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("op", ["mean"]) -def test_reduce_to_float(op): - # some reduce ops always return float, even if the result - # is a rounded number - df = pd.DataFrame( - { - "A": ["a", "b", "b"], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype="Int64"), - } - ) - - # op - result = getattr(df.C, op)() - assert isinstance(result, float) - - # groupby - result = getattr(df.groupby("A"), op)() - - expected = pd.DataFrame( - {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, - index=pd.Index(["a", "b"], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -def test_astype_nansafe(): - # see gh-22343 - arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." 
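
An aside on the cast failure this (since relocated) test pins down: a plain NumPy integer dtype has no slot for pd.NA, so astype refuses outright rather than silently inventing a fill value, and to_numpy with an explicit na_value is the supported escape hatch. A minimal sketch, assuming only the numpy and pandas imports already used in this file:

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int8")
    try:
        arr.astype("uint32")  # the target ndarray cannot represent pd.NA
    except ValueError as err:
        print(err)  # cannot convert to 'uint32'-dtype NumPy array with missing values.
    floats = arr.to_numpy(dtype="float64", na_value=np.nan)  # NA becomes nan
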
- - with pytest.raises(ValueError, match=msg): - arr.astype("uint32") - - -@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) -# np.sign emits a warning with nans, -@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") -def test_ufuncs_single_int(ufunc): - a = integer_array([1, 2, -3, np.nan]) - result = ufunc(a) - expected = integer_array(ufunc(a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = ufunc(s) - expected = pd.Series(integer_array(ufunc(a.astype(float)))) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): - a = integer_array([1, 2, -3, np.nan]) - with np.errstate(invalid="ignore"): - result = ufunc(a) - expected = ufunc(a.astype(float)) - tm.assert_numpy_array_equal(result, expected) - - s = pd.Series(a) - with np.errstate(invalid="ignore"): - result = ufunc(s) - expected = ufunc(s.astype(float)) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) -def test_ufuncs_binary_int(ufunc): - # two IntegerArrays - a = integer_array([1, 2, -3, np.nan]) - result = ufunc(a, a) - expected = integer_array(ufunc(a.astype(float), a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - # IntegerArray with numpy array - arr = np.array([1, 2, 3, 4]) - result = ufunc(a, arr) - expected = integer_array(ufunc(a.astype(float), arr)) - tm.assert_extension_array_equal(result, expected) - - result = ufunc(arr, a) - expected = integer_array(ufunc(arr, a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - # IntegerArray with scalar - result = ufunc(a, 1) - expected = integer_array(ufunc(a.astype(float), 1)) - tm.assert_extension_array_equal(result, expected) - - result = ufunc(1, a) - expected = integer_array(ufunc(1, a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("values", [[0, 1], [0, None]]) -def test_ufunc_reduce_raises(values): - a = integer_array(values) - msg = r"The 'reduce' method is not supported." 
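
For context on the restriction asserted here: invoking the ufunc reduction machinery directly would operate on the raw data buffer and ignore the NA mask, so IntegerArray rejects it, while the pandas reductions are mask-aware. A short sketch of both paths, assuming the nullable-integer behaviour of this era of pandas:

    import numpy as np
    import pandas as pd

    a = pd.array([1, 2, None], dtype="Int64")
    try:
        np.add.reduce(a)  # would reduce over the raw buffer, mask ignored
    except NotImplementedError:
        pass
    assert pd.Series(a).sum() == 3  # mask-aware reduction skips pd.NA
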
- with pytest.raises(NotImplementedError, match=msg): - np.add.reduce(a) - - -@td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(data): - # protocol added in 0.15.0 - import pyarrow as pa - - arr = pa.array(data) - expected = np.array(data, dtype=object) - expected[data.isna()] = None - expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) - assert arr.equals(expected) - - -@td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_roundtrip(data): - # roundtrip possible from arrow 0.16.0 - import pyarrow as pa - - df = pd.DataFrame({"a": data}) - table = pa.table(df) - assert table.field("a").type == str(data.dtype.numpy_dtype) - result = table.to_pandas() - tm.assert_frame_equal(result, df) - - -@td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_from_arrow_uint(): - # https://github.com/pandas-dev/pandas/issues/31896 - # possible mismatch in types - import pyarrow as pa - - dtype = pd.UInt32Dtype() - result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) - expected = pd.array([1, 2, 3, 4, None], dtype="UInt32") - - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "pandasmethname, kwargs", - [ - ("var", {"ddof": 0}), - ("var", {"ddof": 1}), - ("kurtosis", {}), - ("skew", {}), - ("sem", {}), - ], -) -def test_stat_method(pandasmethname, kwargs): - s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") - pandasmeth = getattr(s, pandasmethname) - result = pandasmeth(**kwargs) - s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") - pandasmeth = getattr(s2, pandasmethname) - expected = pandasmeth(**kwargs) - assert expected == result - - -def test_value_counts_na(): - arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") - result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") - tm.assert_series_equal(result, expected) - - result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") - tm.assert_series_equal(result, expected) - - -def test_array_setitem_nullable_boolean_mask(): - # GH 31446 - ser = pd.Series([1, 2], dtype="Int64") - result = ser.where(ser > 1) - expected = pd.Series([pd.NA, 2], dtype="Int64") - tm.assert_series_equal(result, expected) - - -def test_array_setitem(): - # GH 31446 - arr = pd.Series([1, 2], dtype="Int64").array - arr[arr > 1] = 1 - - expected = pd.array([1, 1], dtype="Int64") - tm.assert_extension_array_equal(arr, expected) - - -# TODO(jreback) - these need testing / are broken - -# shift - -# set_index (destroys type) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 923447889d04c..a7aacc9e0968a 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1424,6 +1424,24 @@ def test_lookup_raises(self, float_frame): with pytest.raises(ValueError, match="same size"): float_frame.lookup(["a", "b", "c"], ["a"]) + def test_lookup_requires_unique_axes(self): + # GH#33041 raise with a helpful error message + df = pd.DataFrame(np.random.randn(6).reshape(3, 2), columns=["A", "A"]) + + rows = [0, 1] + cols = ["A", "A"] + + # homogeneous-dtype case + with pytest.raises(ValueError, match="requires unique index and columns"): + df.lookup(rows, cols) + with pytest.raises(ValueError, match="requires unique index and columns"): + df.T.lookup(cols, rows) + + # heterogeneous dtype + df["B"] = 0 + with pytest.raises(ValueError, match="requires 
unique index and columns"): + df.lookup(rows, cols) + def test_set_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py new file mode 100644 index 0000000000000..622c93d1c2fdc --- /dev/null +++ b/pandas/tests/frame/indexing/test_insert.py @@ -0,0 +1,68 @@ +""" +test_insert is specifically for the DataFrame.insert method; not to be +confused with tests with "insert" in their names that are really testing +__setitem__. +""" +import numpy as np +import pytest + +from pandas import DataFrame, Index +import pandas._testing as tm + + +class TestDataFrameInsert: + def test_insert(self): + df = DataFrame( + np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] + ) + + df.insert(0, "foo", df["a"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) + tm.assert_series_equal(df["a"], df["foo"], check_names=False) + + df.insert(2, "bar", df["c"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) + tm.assert_almost_equal(df["c"], df["bar"], check_names=False) + + with pytest.raises(ValueError, match="already exists"): + df.insert(1, "a", df["b"]) + + msg = "cannot insert c, already exists" + with pytest.raises(ValueError, match=msg): + df.insert(1, "c", df["b"]) + + df.columns.name = "some_name" + # preserve columns name field + df.insert(0, "baz", df["c"]) + assert df.columns.name == "some_name" + + def test_insert_column_bug_4032(self): + + # GH#4032, inserting a column and renaming causing errors + df = DataFrame({"b": [1.1, 2.2]}) + + df = df.rename(columns={}) + df.insert(0, "a", [1, 2]) + result = df.rename(columns={}) + + str(result) + expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + df.insert(0, "c", [1.3, 2.3]) + result = df.rename(columns={}) + + str(result) + expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_insert_with_columns_dups(self): + # GH#14291 + df = DataFrame() + df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) + df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) + df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) + exp = DataFrame( + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + ) + tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index bbf8ee5978e7c..24eb424bd5735 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -591,3 +591,40 @@ def test_where_tz_values(self, tz_naive_fixture): ) result = df1.where(mask, df2) tm.assert_frame_equal(exp, result) + + def test_df_where_change_dtype(self): + # GH#16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) + def test_df_where_with_category(self, kwargs): + # GH#16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = 
df.C.astype("category") + + result = df.where(mask, **kwargs) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + + # Check Series.where while we're here + result = df.A.where(mask[:, 0], **kwargs) + expected = Series(A, name="A") + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py new file mode 100644 index 0000000000000..36a9a6b5b3d58 --- /dev/null +++ b/pandas/tests/frame/methods/test_align.py @@ -0,0 +1,245 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +class TestDataFrameAlign: + def test_align_float(self, float_frame): + af, bf = float_frame.align(float_frame) + assert af._data is not float_frame._data + + af, bf = float_frame.align(float_frame, copy=False) + assert af._data is float_frame._data + + # axis = 0 + other = float_frame.iloc[:-5, :3] + af, bf = float_frame.align(other, axis=0, fill_value=-1) + + tm.assert_index_equal(bf.columns, other.columns) + + # test fill value + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) + diff_b = other.index.difference(join_idx) + diff_a_vals = af.reindex(diff_a).values + diff_b_vals = bf.reindex(diff_b).values + assert (diff_a_vals == -1).all() + + af, bf = float_frame.align(other, join="right", axis=0) + tm.assert_index_equal(bf.columns, other.columns) + tm.assert_index_equal(bf.index, other.index) + tm.assert_index_equal(af.index, other.index) + + # axis = 1 + other = float_frame.iloc[:-5, :3].copy() + af, bf = float_frame.align(other, axis=1) + tm.assert_index_equal(bf.columns, float_frame.columns) + tm.assert_index_equal(bf.index, other.index) + + # test fill value + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) + diff_b = other.index.difference(join_idx) + diff_a_vals = af.reindex(diff_a).values + + # TODO(wesm): unused? 
+ diff_b_vals = bf.reindex(diff_b).values # noqa + + assert (diff_a_vals == -1).all() + + af, bf = float_frame.align(other, join="inner", axis=1) + tm.assert_index_equal(bf.columns, other.columns) + + af, bf = float_frame.align(other, join="inner", axis=1, method="pad") + tm.assert_index_equal(bf.columns, other.columns) + + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None + ) + tm.assert_index_equal(bf.index, Index([])) + + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + # Try to align DataFrame to Series along bad axis + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + float_frame.align(af.iloc[0, :3], join="inner", axis=2) + + # align dataframe to series with broadcast or not + idx = float_frame.index + s = Series(range(len(idx)), index=idx) + + left, right = float_frame.align(s, axis=0) + tm.assert_index_equal(left.index, float_frame.index) + tm.assert_index_equal(right.index, float_frame.index) + assert isinstance(right, Series) + + left, right = float_frame.align(s, broadcast_axis=1) + tm.assert_index_equal(left.index, float_frame.index) + expected = {c: s for c in float_frame.columns} + expected = DataFrame( + expected, index=float_frame.index, columns=float_frame.columns + ) + tm.assert_frame_equal(right, expected) + + # see gh-9558 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df[df["a"] == 2] + expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + result = df.where(df["a"] == 2, 0) + expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) + tm.assert_frame_equal(result, expected) + + def test_align_int(self, int_frame): + # test other non-float types + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + af, bf = int_frame.align(other, join="inner", axis=1, method="pad") + tm.assert_index_equal(bf.columns, other.columns) + + def test_align_mixed_type(self, float_string_frame): + + af, bf = float_string_frame.align( + float_string_frame, join="inner", axis=1, method="pad" + ) + tm.assert_index_equal(bf.columns, float_string_frame.columns) + + def test_align_mixed_float(self, mixed_float_frame): + # mixed floats/ints + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + af, bf = mixed_float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + def test_align_mixed_int(self, mixed_int_frame): + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + af, bf = mixed_int_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + def test_align_multiindex(self): + # GH#10665 + # same test cases as test_align_multiindex in test_series.py + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx) + df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = df1.align(df2, join="left") + res2l, res2r = df2.align(df1, join="right") + + expl = df1 + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_frame_equal(expr, res1r) + 
tm.assert_frame_equal(expr, res2l) + + res1l, res1r = df1.align(df2, join="right") + res2l, res2r = df2.align(df1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_frame_equal(expr, res1r) + tm.assert_frame_equal(expr, res2l) + + def test_align_series_combinations(self): + df = pd.DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = pd.Series([1, 2, 4], index=list("ABD"), name="x") + + # frame + series + res1, res2 = df.align(s, axis=0) + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") + + tm.assert_frame_equal(res1, exp1) + tm.assert_series_equal(res2, exp2) + + # series + frame + res1, res2 = s.align(df) + tm.assert_series_equal(res1, exp2) + tm.assert_frame_equal(res2, exp1) + + def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): + aa, ab = a.align( + b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis + ) + + join_index, join_columns = None, None + + ea, eb = a, b + if axis is None or axis == 0: + join_index = a.index.join(b.index, how=how) + ea = ea.reindex(index=join_index) + eb = eb.reindex(index=join_index) + + if axis is None or axis == 1: + join_columns = a.columns.join(b.columns, how=how) + ea = ea.reindex(columns=join_columns) + eb = eb.reindex(columns=join_columns) + + ea = ea.fillna(axis=fill_axis, method=method, limit=limit) + eb = eb.fillna(axis=fill_axis, method=method, limit=limit) + + tm.assert_frame_equal(aa, ea) + tm.assert_frame_equal(ab, eb) + + @pytest.mark.parametrize("meth", ["pad", "bfill"]) + @pytest.mark.parametrize("ax", [0, 1, None]) + @pytest.mark.parametrize("fax", [0, 1]) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) + def test_align_fill_method(self, how, meth, ax, fax, float_frame): + df = float_frame + self._check_align_fill(df, how, meth, ax, fax) + + def _check_align_fill(self, frame, kind, meth, ax, fax): + left = frame.iloc[0:4, :10] + right = frame.iloc[2:, 6:] + empty = frame.iloc[:0, :0] + + self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) + + # empty left + self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) + + # empty right + self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) + + # both empty + self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index e6d002369f758..0bc234dcb39aa 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -1,7 +1,12 @@ +import re + import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd +from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm 
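
Before the new drop tests below, the contract they lean on in one place: missing labels raise KeyError by default, and errors="ignore" skips them while still dropping the labels that do exist. A sketch on a hypothetical one-row frame:

    import pandas as pd

    df = pd.DataFrame({"d": [1], "e": [2], "f": [3]}, index=["a"])
    try:
        df.drop(["g"], axis=1)  # unknown column raises
    except KeyError:
        pass
    result = df.drop(["e", "g"], axis=1, errors="ignore")  # only "e" is dropped
    assert list(result.columns) == ["d", "f"]
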
@@ -52,3 +57,204 @@ def test_drop_with_non_unique_datetime_index_and_invalid_keys(): with pytest.raises(KeyError, match="not found in axis"): df.drop(["a", "b"]) # Dropping with labels not exist in the index + + +class TestDataFrameDrop: + def test_drop_names(self): + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + df.index.name, df.columns.name = "first", "second" + df_dropped_b = df.drop("b") + df_dropped_e = df.drop("e", axis=1) + df_inplace_b, df_inplace_e = df.copy(), df.copy() + df_inplace_b.drop("b", inplace=True) + df_inplace_e.drop("e", axis=1, inplace=True) + for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): + assert obj.index.name == "first" + assert obj.columns.name == "second" + assert list(df.columns) == ["d", "e", "f"] + + msg = r"\['g'\] not found in axis" + with pytest.raises(KeyError, match=msg): + df.drop(["g"]) + with pytest.raises(KeyError, match=msg): + df.drop(["g"], 1) + + # errors = 'ignore' + dropped = df.drop(["g"], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["b", "g"], errors="ignore") + expected = Index(["a", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["g"], axis=1, errors="ignore") + expected = Index(["d", "e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + dropped = df.drop(["d", "g"], axis=1, errors="ignore") + expected = Index(["e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + # GH 16398 + dropped = df.drop([], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + def test_drop(self): + simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) + tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) + tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) + tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) + tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) + + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop(5) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop("C", 1) + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop([1, 5]) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop(["A", "C"], 1) + + # errors = 'ignore' + tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) + + # non-unique - wheee! 
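
The non-unique cases that follow hinge on drop matching by label, so a duplicated label removes every row or column that carries it at once, as in this sketch mirroring the nu_df construction below:

    import pandas as pd

    df = pd.DataFrame([[0, -3, "a"], [1, -2, "b"]], columns=["a", "a", "b"])
    result = df.drop("a", axis=1)  # both "a" columns go
    assert list(result.columns) == ["b"]
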
+ nu_df = DataFrame( + list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] + ) + tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) + tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) + tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 + + nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) + + # inplace cache issue + # GH#5628 + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) + expected = df[~(df.b > 0)] + df.drop(labels=df[df.b > 0].index, inplace=True) + tm.assert_frame_equal(df, expected) + + def test_drop_multiindex_not_lexsorted(self): + # GH#11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns.is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index="a", columns=["b", "c"], values="d" + ) + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns.is_lexsorted() + + # compare the results + tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) + + expected = lexsorted_df.drop("a", axis=1) + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.drop("a", axis=1) + + tm.assert_frame_equal(result, expected) + + def test_drop_api_equivalence(self): + # equivalence of the labels/axis and index/columns API's (GH#12392) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.drop("a") + res2 = df.drop(index="a") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop("d", 1) + res2 = df.drop(columns="d") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(labels="e", axis=1) + res2 = df.drop(columns="e") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0) + res2 = df.drop(index=["a"]) + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) + res2 = df.drop(index=["a"], columns=["d"]) + tm.assert_frame_equal(res1, res2) + + msg = "Cannot specify both 'labels' and 'index'/'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", index="b") + + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", columns="b") + + msg = "Need to specify at least one of 'labels', 'index' or 'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(axis=1) + + data = [[1, 2, 3], [1, 2, 3]] + + @pytest.mark.parametrize( + "actual", + [ + DataFrame(data=data, index=["a", "a"]), + DataFrame(data=data, index=["a", "b"]), + DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), + DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + ], + ) + def test_raise_on_drop_duplicate_index(self, actual): + + # GH#19186 + level = 0 if isinstance(actual.index, MultiIndex) else None + msg = re.escape("\"['c'] not found in axis\"") + with pytest.raises(KeyError, match=msg): + actual.drop("c", level=level, axis=0) + with pytest.raises(KeyError, match=msg): + actual.T.drop("c", level=level, axis=1) + expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") + tm.assert_frame_equal(expected_no_err, actual) 
+        expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
+        tm.assert_frame_equal(expected_no_err.T, actual)
+
+    @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
+    @pytest.mark.parametrize("drop_labels", [[], [1], [2]])
+    def test_drop_empty_list(self, index, drop_labels):
+        # GH#21494
+        expected_index = [i for i in index if i not in drop_labels]
+        frame = pd.DataFrame(index=index).drop(drop_labels)
+        tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index))
+
+    @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
+    @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
+    def test_drop_non_empty_list(self, index, drop_labels):
+        # GH#21494
+        with pytest.raises(KeyError, match="not found in axis"):
+            pd.DataFrame(index=index).drop(drop_labels)
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index 6dee4424f1cec..e328523253144 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -12,7 +12,6 @@
 import pandas as pd
 from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna
 import pandas._testing as tm
-from pandas.conftest import _get_cython_table_params
 from pandas.core.apply import frame_apply
 from pandas.core.base import SpecificationError
 
@@ -1323,7 +1322,7 @@ def func(group_col):
     @pytest.mark.parametrize(
         "df, func, expected",
         chain(
-            _get_cython_table_params(
+            tm.get_cython_table_params(
                 DataFrame(),
                 [
                     ("sum", Series(dtype="float64")),
@@ -1338,7 +1337,7 @@ def func(group_col):
                     ("median", Series(dtype="float64")),
                 ],
             ),
-            _get_cython_table_params(
+            tm.get_cython_table_params(
                 DataFrame([[np.nan, 1], [1, 2]]),
                 [
                     ("sum", Series([1.0, 3])),
@@ -1365,10 +1364,10 @@ def test_agg_cython_table(self, df, func, expected, axis):
     @pytest.mark.parametrize(
         "df, func, expected",
         chain(
-            _get_cython_table_params(
+            tm.get_cython_table_params(
                 DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
             ),
-            _get_cython_table_params(
+            tm.get_cython_table_params(
                 DataFrame([[np.nan, 1], [1, 2]]),
                 [
                     ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
@@ -1390,7 +1389,7 @@ def test_agg_cython_table_transform(self, df, func, expected, axis):
 
     @pytest.mark.parametrize(
         "df, func, expected",
-        _get_cython_table_params(
+        tm.get_cython_table_params(
             DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]]
         ),
     )
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 2150e1da9e8ad..9e0b51767df2c 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -530,6 +530,15 @@ def test_arith_flex_zero_len_raises(self):
         with pytest.raises(NotImplementedError, match="fill_value"):
             df_len0.sub(df["A"], axis=None, fill_value=3)
 
+    def test_flex_add_scalar_fill_value(self):
+        # GH#12723
+        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float")
+        df = pd.DataFrame({"foo": dat}, index=range(6))
+
+        exp = df.fillna(0).add(2)
+        res = df.add(2, fill_value=0)
+        tm.assert_frame_equal(res, exp)
+
 
 class TestFrameArithmetic:
     def test_td64_op_nat_casting(self):
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
index ea21359c2f75c..d1d55d38f4a9a 100644
--- a/pandas/tests/frame/test_axis_select_reindex.py
+++ b/pandas/tests/frame/test_axis_select_reindex.py
@@ -1,11 +1,8 @@
 from datetime import datetime
-import re
 
 import numpy as np
 import pytest
 
-from pandas.errors import PerformanceWarning
-
 import pandas as pd
 from pandas import Categorical, DataFrame, Index, 
MultiIndex, Series, date_range, isna import pandas._testing as tm @@ -15,52 +12,7 @@ class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing - def test_drop_names(self): - df = DataFrame( - [[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=["a", "b", "c"], - columns=["d", "e", "f"], - ) - df.index.name, df.columns.name = "first", "second" - df_dropped_b = df.drop("b") - df_dropped_e = df.drop("e", axis=1) - df_inplace_b, df_inplace_e = df.copy(), df.copy() - df_inplace_b.drop("b", inplace=True) - df_inplace_e.drop("e", axis=1, inplace=True) - for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): - assert obj.index.name == "first" - assert obj.columns.name == "second" - assert list(df.columns) == ["d", "e", "f"] - - msg = r"\['g'\] not found in axis" - with pytest.raises(KeyError, match=msg): - df.drop(["g"]) - with pytest.raises(KeyError, match=msg): - df.drop(["g"], 1) - - # errors = 'ignore' - dropped = df.drop(["g"], errors="ignore") - expected = Index(["a", "b", "c"], name="first") - tm.assert_index_equal(dropped.index, expected) - - dropped = df.drop(["b", "g"], errors="ignore") - expected = Index(["a", "c"], name="first") - tm.assert_index_equal(dropped.index, expected) - - dropped = df.drop(["g"], axis=1, errors="ignore") - expected = Index(["d", "e", "f"], name="second") - tm.assert_index_equal(dropped.columns, expected) - - dropped = df.drop(["d", "g"], axis=1, errors="ignore") - expected = Index(["e", "f"], name="second") - tm.assert_index_equal(dropped.columns, expected) - - # GH 16398 - dropped = df.drop([], errors="ignore") - expected = Index(["a", "b", "c"], name="first") - tm.assert_index_equal(dropped.index, expected) - - def test_drop_col_still_multiindex(self): + def test_delitem_col_still_multiindex(self): arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]] tuples = sorted(zip(*arrays)) @@ -70,120 +22,6 @@ def test_drop_col_still_multiindex(self): del df[("a", "", "")] assert isinstance(df.columns, MultiIndex) - def test_drop(self): - simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) - tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) - tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) - tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) - tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) - - with pytest.raises(KeyError, match=r"\[5\] not found in axis"): - simple.drop(5) - with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop("C", 1) - with pytest.raises(KeyError, match=r"\[5\] not found in axis"): - simple.drop([1, 5]) - with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop(["A", "C"], 1) - - # errors = 'ignore' - tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) - tm.assert_frame_equal( - simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] - ) - tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) - tm.assert_frame_equal( - simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] - ) - - # non-unique - wheee! 
- nu_df = DataFrame( - list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] - ) - tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) - tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) - tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 - - nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) - nu_df.columns = list("abc") - tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) - tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) - - # inplace cache issue - # GH 5628 - df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) - expected = df[~(df.b > 0)] - df.drop(labels=df[df.b > 0].index, inplace=True) - tm.assert_frame_equal(df, expected) - - def test_drop_multiindex_not_lexsorted(self): - # GH 11640 - - # define the lexsorted version - lexsorted_mi = MultiIndex.from_tuples( - [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] - ) - lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) - assert lexsorted_df.columns.is_lexsorted() - - # define the non-lexsorted version - not_lexsorted_df = DataFrame( - columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] - ) - not_lexsorted_df = not_lexsorted_df.pivot_table( - index="a", columns=["b", "c"], values="d" - ) - not_lexsorted_df = not_lexsorted_df.reset_index() - assert not not_lexsorted_df.columns.is_lexsorted() - - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - - expected = lexsorted_df.drop("a", axis=1) - with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.drop("a", axis=1) - - tm.assert_frame_equal(result, expected) - - def test_drop_api_equivalence(self): - # equivalence of the labels/axis and index/columns API's (GH12392) - df = DataFrame( - [[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=["a", "b", "c"], - columns=["d", "e", "f"], - ) - - res1 = df.drop("a") - res2 = df.drop(index="a") - tm.assert_frame_equal(res1, res2) - - res1 = df.drop("d", 1) - res2 = df.drop(columns="d") - tm.assert_frame_equal(res1, res2) - - res1 = df.drop(labels="e", axis=1) - res2 = df.drop(columns="e") - tm.assert_frame_equal(res1, res2) - - res1 = df.drop(["a"], axis=0) - res2 = df.drop(index=["a"]) - tm.assert_frame_equal(res1, res2) - - res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) - res2 = df.drop(index=["a"], columns=["d"]) - tm.assert_frame_equal(res1, res2) - - msg = "Cannot specify both 'labels' and 'index'/'columns'" - with pytest.raises(ValueError, match=msg): - df.drop(labels="a", index="b") - - with pytest.raises(ValueError, match=msg): - df.drop(labels="a", columns="b") - - msg = "Need to specify at least one of 'labels', 'index' or 'columns'" - with pytest.raises(ValueError, match=msg): - df.drop(axis=1) - def test_merge_join_different_levels(self): # GH 9455 @@ -558,188 +396,6 @@ def test_reindex_api_equivalence(self): for res in [res2, res3]: tm.assert_frame_equal(res1, res) - def test_align_float(self, float_frame): - af, bf = float_frame.align(float_frame) - assert af._data is not float_frame._data - - af, bf = float_frame.align(float_frame, copy=False) - assert af._data is float_frame._data - - # axis = 0 - other = float_frame.iloc[:-5, :3] - af, bf = float_frame.align(other, axis=0, fill_value=-1) - - tm.assert_index_equal(bf.columns, other.columns) - - # test fill value - join_idx = float_frame.index.join(other.index) - diff_a = float_frame.index.difference(join_idx) - diff_b = other.index.difference(join_idx) - diff_a_vals = 
af.reindex(diff_a).values - diff_b_vals = bf.reindex(diff_b).values - assert (diff_a_vals == -1).all() - - af, bf = float_frame.align(other, join="right", axis=0) - tm.assert_index_equal(bf.columns, other.columns) - tm.assert_index_equal(bf.index, other.index) - tm.assert_index_equal(af.index, other.index) - - # axis = 1 - other = float_frame.iloc[:-5, :3].copy() - af, bf = float_frame.align(other, axis=1) - tm.assert_index_equal(bf.columns, float_frame.columns) - tm.assert_index_equal(bf.index, other.index) - - # test fill value - join_idx = float_frame.index.join(other.index) - diff_a = float_frame.index.difference(join_idx) - diff_b = other.index.difference(join_idx) - diff_a_vals = af.reindex(diff_a).values - - # TODO(wesm): unused? - diff_b_vals = bf.reindex(diff_b).values # noqa - - assert (diff_a_vals == -1).all() - - af, bf = float_frame.align(other, join="inner", axis=1) - tm.assert_index_equal(bf.columns, other.columns) - - af, bf = float_frame.align(other, join="inner", axis=1, method="pad") - tm.assert_index_equal(bf.columns, other.columns) - - af, bf = float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None - ) - tm.assert_index_equal(bf.index, Index([])) - - af, bf = float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - tm.assert_index_equal(bf.index, Index([])) - - # Try to align DataFrame to Series along bad axis - msg = "No axis named 2 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - float_frame.align(af.iloc[0, :3], join="inner", axis=2) - - # align dataframe to series with broadcast or not - idx = float_frame.index - s = Series(range(len(idx)), index=idx) - - left, right = float_frame.align(s, axis=0) - tm.assert_index_equal(left.index, float_frame.index) - tm.assert_index_equal(right.index, float_frame.index) - assert isinstance(right, Series) - - left, right = float_frame.align(s, broadcast_axis=1) - tm.assert_index_equal(left.index, float_frame.index) - expected = {c: s for c in float_frame.columns} - expected = DataFrame( - expected, index=float_frame.index, columns=float_frame.columns - ) - tm.assert_frame_equal(right, expected) - - # see gh-9558 - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = df[df["a"] == 2] - expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - result = df.where(df["a"] == 2, 0) - expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) - tm.assert_frame_equal(result, expected) - - def test_align_int(self, int_frame): - # test other non-float types - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - af, bf = int_frame.align(other, join="inner", axis=1, method="pad") - tm.assert_index_equal(bf.columns, other.columns) - - def test_align_mixed_type(self, float_string_frame): - - af, bf = float_string_frame.align( - float_string_frame, join="inner", axis=1, method="pad" - ) - tm.assert_index_equal(bf.columns, float_string_frame.columns) - - def test_align_mixed_float(self, mixed_float_frame): - # mixed floats/ints - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - af, bf = mixed_float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - tm.assert_index_equal(bf.index, Index([])) - - def test_align_mixed_int(self, mixed_int_frame): - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - af, bf = mixed_int_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - 
tm.assert_index_equal(bf.index, Index([])) - - def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): - aa, ab = a.align( - b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis - ) - - join_index, join_columns = None, None - - ea, eb = a, b - if axis is None or axis == 0: - join_index = a.index.join(b.index, how=how) - ea = ea.reindex(index=join_index) - eb = eb.reindex(index=join_index) - - if axis is None or axis == 1: - join_columns = a.columns.join(b.columns, how=how) - ea = ea.reindex(columns=join_columns) - eb = eb.reindex(columns=join_columns) - - ea = ea.fillna(axis=fill_axis, method=method, limit=limit) - eb = eb.fillna(axis=fill_axis, method=method, limit=limit) - - tm.assert_frame_equal(aa, ea) - tm.assert_frame_equal(ab, eb) - - @pytest.mark.parametrize("meth", ["pad", "bfill"]) - @pytest.mark.parametrize("ax", [0, 1, None]) - @pytest.mark.parametrize("fax", [0, 1]) - @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) - def test_align_fill_method(self, how, meth, ax, fax, float_frame): - df = float_frame - self._check_align_fill(df, how, meth, ax, fax) - - def _check_align_fill(self, frame, kind, meth, ax, fax): - left = frame.iloc[0:4, :10] - right = frame.iloc[2:, 6:] - empty = frame.iloc[:0, :0] - - self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # empty left - self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # empty right - self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # both empty - self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - def test_align_int_fill_bug(self): # GH #910 X = np.arange(10 * 10, dtype="float64").reshape(10, 10) @@ -754,61 +410,6 @@ def test_align_int_fill_bug(self): expected = df2 - df2.mean() tm.assert_frame_equal(result, expected) - def test_align_multiindex(self): - # GH 10665 - # same test cases as test_align_multiindex in test_series.py - - midx = pd.MultiIndex.from_product( - [range(2), range(3), range(2)], names=("a", "b", "c") - ) - idx = pd.Index(range(2), name="b") - df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx) - df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx) - - # these must be the same results (but flipped) - res1l, res1r = df1.align(df2, join="left") - res2l, res2r = df2.align(df1, join="right") - - expl = df1 - tm.assert_frame_equal(expl, res1l) - tm.assert_frame_equal(expl, res2r) - expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) - tm.assert_frame_equal(expr, res1r) - tm.assert_frame_equal(expr, res2l) - - res1l, res1r = df1.align(df2, join="right") - res2l, res2r = df2.align(df1, join="left") - - exp_idx = pd.MultiIndex.from_product( - [range(2), range(2), range(2)], names=("a", "b", "c") - ) - expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) - tm.assert_frame_equal(expl, res1l) - tm.assert_frame_equal(expl, res2r) - expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) - tm.assert_frame_equal(expr, res1r) - tm.assert_frame_equal(expr, res2l) - - def test_align_series_combinations(self): - df = pd.DataFrame({"a": [1, 3, 
5], "b": [1, 3, 5]}, index=list("ACE")) - s = pd.Series([1, 2, 4], index=list("ABD"), name="x") - - # frame + series - res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame( - {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, - index=list("ABCDE"), - ) - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") - - tm.assert_frame_equal(res1, exp1) - tm.assert_series_equal(res2, exp2) - - # series + frame - res1, res2 = s.align(df) - tm.assert_series_equal(res1, exp2) - tm.assert_frame_equal(res2, exp1) - def test_filter(self, float_frame, float_string_frame): # Items filtered = float_frame.filter(["A", "B", "E"]) @@ -1116,42 +717,23 @@ def test_reindex_multi_categorical_time(self): expected = pd.DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) tm.assert_frame_equal(result, expected) - data = [[1, 2, 3], [1, 2, 3]] - @pytest.mark.parametrize( - "actual", - [ - DataFrame(data=data, index=["a", "a"]), - DataFrame(data=data, index=["a", "b"]), - DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), - DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), - ], + "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"] ) - def test_raise_on_drop_duplicate_index(self, actual): - - # issue 19186 - level = 0 if isinstance(actual.index, MultiIndex) else None - msg = re.escape("\"['c'] not found in axis\"") - with pytest.raises(KeyError, match=msg): - actual.drop("c", level=level, axis=0) - with pytest.raises(KeyError, match=msg): - actual.T.drop("c", level=level, axis=1) - expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") - tm.assert_frame_equal(expected_no_err, actual) - expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") - tm.assert_frame_equal(expected_no_err.T, actual) - - @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) - @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) - def test_drop_empty_list(self, index, drop_labels): - # GH 21494 - expected_index = [i for i in index if i not in drop_labels] - frame = pd.DataFrame(index=index).drop(drop_labels) - tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) - - @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) - @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) - def test_drop_non_empty_list(self, index, drop_labels): - # GH 21494 - with pytest.raises(KeyError, match="not found in axis"): - pd.DataFrame(index=index).drop(drop_labels) + @pytest.mark.parametrize("inplace", [False, True]) + def test_inplace_drop_and_operation(self, operation, inplace): + # GH 30484 + df = pd.DataFrame({"x": range(5)}) + expected = df.copy() + df["y"] = range(5) + y = df["y"] + + with tm.assert_produces_warning(None): + if inplace: + df.drop("y", axis=1, inplace=inplace) + else: + df = df.drop("y", axis=1, inplace=inplace) + + # Perform operation and check result + getattr(y, operation)(1) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 323a13a940ac3..2cda4ba16f7ce 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -453,22 +453,6 @@ def test_astype_extension_dtypes_duplicate_col(self, dtype): expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) - def test_df_where_with_category(self, kwargs): - # GH 16979 - df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) - mask = 
np.array([[True, False, True], [False, True, True]]) - - # change type to category - df.A = df.A.astype("category") - df.B = df.B.astype("category") - df.C = df.C.astype("category") - - result = df.A.where(mask[:, 0], **kwargs) - expected = Series(pd.Categorical([0, np.nan], categories=[0, 3]), name="A") - - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] ) @@ -479,31 +463,6 @@ def test_astype_column_metadata(self, dtype): df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) - def test_df_where_change_dtype(self): - # GH 16979 - df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) - mask = np.array([[True, False, False], [False, False, True]]) - - result = df.where(mask) - expected = DataFrame( - [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") - ) - - tm.assert_frame_equal(result, expected) - - # change type to category - df.A = df.A.astype("category") - df.B = df.B.astype("category") - df.C = df.C.astype("category") - - result = df.where(mask) - A = pd.Categorical([0, np.nan], categories=[0, 3]) - B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) - C = pd.Categorical([np.nan, 5], categories=[2, 5]) - expected = DataFrame({"A": A, "B": B, "C": C}) - - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_object(self, dtype, unit): diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 470da25a922a1..e4de749c5f5c5 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -694,12 +694,3 @@ def test_fill_corner(self, float_frame, float_string_frame): # TODO(wesm): unused? result = empty_float.fillna(value=0) # noqa - - def test_fill_value_when_combine_const(self): - # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") - df = DataFrame({"foo": dat}, index=range(6)) - - exp = df.fillna(0).add(2) - res = df.add(2, fill_value=0) - tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 33f71602f4713..9d1b6abff6241 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -3,14 +3,14 @@ import numpy as np import pytest -from pandas import DataFrame, Index, MultiIndex, Series +from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm # Column add, remove, delete. 
class TestDataFrameMutateColumns:
-    def test_insert_error_msmgs(self):
+    def test_setitem_error_msgs(self):
 
         # GH 7432
         df = DataFrame(
@@ -30,7 +30,7 @@ def test_insert_error_msmgs(self):
         with pytest.raises(TypeError, match=msg):
             df["gr"] = df.groupby(["b", "c"]).count()
 
-    def test_insert_benchmark(self):
+    def test_setitem_benchmark(self):
         # from the vb_suite/frame_methods/frame_insert_columns
         N = 10
         K = 5
@@ -41,18 +41,12 @@ def test_insert_benchmark(self):
         expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
         tm.assert_frame_equal(df, expected)
 
-    def test_insert(self):
+    def test_setitem_different_dtype(self):
         df = DataFrame(
             np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
         )
-        df.insert(0, "foo", df["a"])
-        tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"]))
-        tm.assert_series_equal(df["a"], df["foo"], check_names=False)
 
-        df.insert(2, "bar", df["c"])
-        tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"]))
-        tm.assert_almost_equal(df["c"], df["bar"], check_names=False)
 
         # diff dtype
 
@@ -82,17 +76,7 @@ def test_insert(self):
         )
         tm.assert_series_equal(result, expected)
 
-        with pytest.raises(ValueError, match="already exists"):
-            df.insert(1, "a", df["b"])
-        msg = "cannot insert c, already exists"
-        with pytest.raises(ValueError, match=msg):
-            df.insert(1, "c", df["b"])
-
-        df.columns.name = "some_name"
-        # preserve columns name field
-        df.insert(0, "baz", df["c"])
-        assert df.columns.name == "some_name"
-
+    def test_setitem_empty_columns(self):
         # GH 13522
         df = DataFrame(index=["A", "B", "C"])
         df["X"] = df.index
@@ -165,22 +149,3 @@ def test_pop_non_unique_cols(self):
         assert "b" in df.columns
         assert "a" not in df.columns
         assert len(df.index) == 2
-
-    def test_insert_column_bug_4032(self):
-
-        # GH4032, inserting a column and renaming causing errors
-        df = DataFrame({"b": [1.1, 2.2]})
-        df = df.rename(columns={})
-        df.insert(0, "a", [1, 2])
-
-        result = df.rename(columns={})
-        str(result)
-        expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"])
-        tm.assert_frame_equal(result, expected)
-        df.insert(0, "c", [1.3, 2.3])
-
-        result = df.rename(columns={})
-        str(result)
-
-        expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"])
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
index 233c0f4bd3544..2530886802921 100644
--- a/pandas/tests/frame/test_nonunique_indexes.py
+++ b/pandas/tests/frame/test_nonunique_indexes.py
@@ -513,14 +513,3 @@ def test_set_value_by_index(self):
 
         df.iloc[:, 0] = 3
         tm.assert_series_equal(df.iloc[:, 1], expected)
-
-    def test_insert_with_columns_dups(self):
-        # GH 14291
-        df = pd.DataFrame()
-        df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True)
-        df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
-        df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
-        exp = pd.DataFrame(
-            [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
-        )
-        tm.assert_frame_equal(df, exp)
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index 48cf37a9abc8b..6d786d9580542 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -17,9 +17,6 @@
 import pandas.io.formats.format as fmt
 
-# Segregated collection of methods that require the BlockManager internal data
-# structure
-
 
 class TestDataFrameReprInfoEtc:
     def test_repr_empty(self):
@@ -137,6 +134,10 @@ def 
test_unicode_string_with_unicode(self):
         df = DataFrame({"A": ["\u05d0"]})
         str(df)
 
+    def test_repr_unicode_columns(self):
+        df = DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]})
+        repr(df.columns)  # should not raise UnicodeDecodeError
+
     def test_str_to_bytes_raises(self):
         # GH 26447
         df = DataFrame({"A": ["abc"]})
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index 4f039baa5c7bd..9d3c40ce926d7 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -765,6 +765,60 @@ def test_unstack_unused_level(self, cols):
         expected.index = expected.index.droplevel("C")
         tm.assert_frame_equal(result, expected)
 
+    def test_unstack_long_index(self):
+        # GH 32624: Error when using a lot of indices to unstack.
+        # The error occurred only if a lot of indices were used.
+        df = pd.DataFrame(
+            [[1]],
+            columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]),
+            index=pd.MultiIndex.from_tuples(
+                [[0, 0, 1, 0, 0, 0, 1]],
+                names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
+            ),
+        )
+        result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
+        expected = pd.DataFrame(
+            [[1]],
+            columns=pd.MultiIndex.from_tuples(
+                [[0, 0, 1, 0, 0, 0, 1]],
+                names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
+            ),
+            index=pd.Index([0], name="i1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_multi_level_cols(self):
+        # GH 24729: Unstack a df with multi-level columns
+        df = pd.DataFrame(
+            [[0.0, 0.0], [0.0, 0.0]],
+            columns=pd.MultiIndex.from_tuples(
+                [["B", "C"], ["B", "D"]], names=["c1", "c2"]
+            ),
+            index=pd.MultiIndex.from_tuples(
+                [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"],
+            ),
+        )
+        assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
+
+    def test_unstack_multi_level_rows_and_cols(self):
+        # GH 28306: Unstack a df with multi-level cols and rows
+        df = pd.DataFrame(
+            [[1, 2], [3, 4], [-1, -2], [-3, -4]],
+            columns=pd.MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
+            index=pd.MultiIndex.from_tuples(
+                [
+                    ["m1", "P3", 222],
+                    ["m1", "A5", 111],
+                    ["m2", "P3", 222],
+                    ["m2", "A5", 111],
+                ],
+                names=["i1", "i2", "i3"],
+            ),
+        )
+        result = df.unstack(["i3", "i2"])
+        expected = df.unstack(["i3"]).unstack(["i2"])
+        tm.assert_frame_equal(result, expected)
+
     def test_unstack_nan_index(self):  # GH7466
         def cast(val):
             val_str = "" if val != val else val
diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py
index 250fe950a05fc..b6abdf09a7f62 100644
--- a/pandas/tests/generic/test_to_xarray.py
+++ b/pandas/tests/generic/test_to_xarray.py
@@ -1,5 +1,3 @@
-from distutils.version import LooseVersion
-
 import numpy as np
 import pytest
 
@@ -9,21 +7,9 @@
 from pandas import DataFrame, Series
 import pandas._testing as tm
 
-try:
-    import xarray
-
-    _XARRAY_INSTALLED = True
-except ImportError:
-    _XARRAY_INSTALLED = False
-
 
 class TestDataFrameToXArray:
-    @pytest.mark.skipif(
-        not _XARRAY_INSTALLED
-        or _XARRAY_INSTALLED
-        and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"),
-        reason="xarray >= 0.10.0 required",
-    )
+    @td.skip_if_no("xarray", "0.10.0")
     def test_to_xarray_index_types(self, indices):
         if isinstance(indices, pd.MultiIndex):
             pytest.skip("MultiIndex is tested separately")
@@ -106,12 +92,7 @@ def test_to_xarray(self):
 
 
 class TestSeriesToXArray:
-    @pytest.mark.skipif(
-        not _XARRAY_INSTALLED
-        or _XARRAY_INSTALLED
-        and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"),
-        reason="xarray >= 0.10.0 required",
-    )
+    @td.skip_if_no("xarray", 
"0.10.0") def test_to_xarray_index_types(self, indices): if isinstance(indices, pd.MultiIndex): pytest.skip("MultiIndex is tested separately") diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1265547653d7b..e860ea1a3d052 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_integer_dtype + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm @@ -340,6 +342,30 @@ def test_groupby_agg_coercing_bools(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "op", + [ + lambda x: x.sum(), + lambda x: x.cumsum(), + lambda x: x.transform("sum"), + lambda x: x.transform("cumsum"), + lambda x: x.agg("sum"), + lambda x: x.agg("cumsum"), + ], +) +def test_bool_agg_dtype(op): + # GH 7001 + # Bool sum aggregations result in int + df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) + s = df.set_index("a")["b"] + + result = op(df.groupby("a"))["b"].dtype + assert is_integer_dtype(result) + + result = op(s.groupby("a")).dtype + assert is_integer_dtype(result) + + def test_order_aggregate_multiple_funcs(): # GH 25692 df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5662d41e19885..b8d8f56512a69 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1765,7 +1765,7 @@ def test_tuple_as_grouping(): } ) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=r"('a', 'b')"): df[["a", "b", "c"]].groupby(("a", "b")) result = df.groupby(("a", "b"))["c"].sum() diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 6b8bd9e805a0c..7cac13efb71f3 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -214,7 +214,7 @@ def test_timegrouper_with_reg_groups(self): result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum() tm.assert_frame_equal(result, expected) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="The level foo is not valid"): df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum() # multi names @@ -235,7 +235,8 @@ def test_timegrouper_with_reg_groups(self): tm.assert_frame_equal(result, expected) # error as we have both a level and a name! - with pytest.raises(ValueError): + msg = "The Grouper cannot specify both a key and a level!" 
+ with pytest.raises(ValueError, match=msg): df.groupby( [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"] ).sum() diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 9e6a8f34c135d..02b32c46e7d6f 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import Index, MultiIndex @@ -7,14 +8,15 @@ class TestIndexConstructor: # Tests for the Index constructor, specifically for cases that do # not return a subclass - def test_constructor_corner(self): + @pytest.mark.parametrize("value", [1, np.int64(1)]) + def test_constructor_corner(self, value): # corner case msg = ( r"Index\(\.\.\.\) must be called with a collection of some " - "kind, 0 was passed" + f"kind, {value} was passed" ) with pytest.raises(TypeError, match=msg): - Index(0) + Index(value) @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]]) def test_construction_list_mixed_tuples(self, index_vals): diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index c18cd1f252c83..6e8e81230b2bb 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas._config.config as cf - from pandas._libs import index as libindex from pandas.core.dtypes.dtypes import CategoricalDtype @@ -100,65 +98,6 @@ def test_method_delegation(self): with pytest.raises(ValueError, match=msg): ci.set_categories(list("cab"), inplace=True) - def test_contains(self): - - ci = self.create_index(categories=list("cabdef")) - - assert "a" in ci - assert "z" not in ci - assert "e" not in ci - assert np.nan not in ci - - # assert codes NOT in index - assert 0 not in ci - assert 1 not in ci - - ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) - assert np.nan in ci - - @pytest.mark.parametrize( - "item, expected", - [ - (pd.Interval(0, 1), True), - (1.5, True), - (pd.Interval(0.5, 1.5), False), - ("a", False), - (pd.Timestamp(1), False), - (pd.Timedelta(1), False), - ], - ids=str, - ) - def test_contains_interval(self, item, expected): - # GH 23705 - ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) - result = item in ci - assert result is expected - - def test_contains_list(self): - # GH#21729 - idx = pd.CategoricalIndex([1, 2, 3]) - - assert "a" not in idx - - with pytest.raises(TypeError, match="unhashable type"): - ["a"] in idx - - with pytest.raises(TypeError, match="unhashable type"): - ["a", "b"] in idx - - @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) - def test_where(self, klass): - i = self.create_index() - cond = [True] * len(i) - expected = i - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * (len(i) - 1) - expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_append(self): ci = self.create_index() @@ -488,7 +427,7 @@ def test_equals_categorical(self): assert not ci.equals(CategoricalIndex(list("aabca") + [np.nan], ordered=True)) assert ci.equals(ci.copy()) - def test_equals_categoridcal_unordered(self): + def test_equals_categorical_unordered(self): # https://github.com/pandas-dev/pandas/issues/16603 a = 
pd.CategoricalIndex(["A"], categories=["A", "B"]) b = pd.CategoricalIndex(["A"], categories=["B", "A"]) @@ -503,106 +442,6 @@ def test_frame_repr(self): expected = " A\na 1\nb 2\nc 3" assert result == expected - def test_string_categorical_index_repr(self): - # short - idx = pd.CategoricalIndex(["a", "bb", "ccc"]) - expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa - assert repr(idx) == expected - - # multiple lines - idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) - expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', - 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', - 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # truncated - idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) - expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', - ... - 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa - - assert repr(idx) == expected - - # larger categories - idx = pd.CategoricalIndex(list("abcdefghijklmmo")) - expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', - 'm', 'm', 'o'], - categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # short - idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - assert repr(idx) == expected - - # multiple lines - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', - 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # truncated - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', - ... 
- 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa - - assert repr(idx) == expected - - # larger categories - idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) - expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', - 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # Emable Unicode option ----------------------------------------- - with cf.option_context("display.unicode.east_asian_width", True): - - # short - idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - assert repr(idx) == expected - - # multiple lines - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # truncated - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', - ... - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa - - assert repr(idx) == expected - - # larger categories - idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) - expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', - 'さ', 'し', 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - def test_fillna_categorical(self): # GH 11343 idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py new file mode 100644 index 0000000000000..a5607224f6448 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -0,0 +1,108 @@ +""" +Tests for CategoricalIndex.__repr__ and related methods. +""" +import pandas._config.config as cf + +import pandas as pd + + +class TestCategoricalIndexRepr: + def test_string_categorical_index_repr(self): + # short + idx = pd.CategoricalIndex(["a", "bb", "ccc"]) + expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + ... 
+                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
+                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)"""  # noqa
+
+        assert repr(idx) == expected
+
+        # larger categories
+        idx = pd.CategoricalIndex(list("abcdefghijklmmo"))
+        expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+                  'm', 'm', 'o'],
+                 categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')"""  # noqa
+
+        assert repr(idx) == expected
+
+        # short
+        idx = pd.CategoricalIndex(["あ", "いい", "ううう"])
+        expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
+        assert repr(idx) == expected
+
+        # multiple lines
+        idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10)
+        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
+                  'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
+                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
+
+        assert repr(idx) == expected
+
+        # truncated
+        idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100)
+        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
+                  ...
+                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
+                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa
+
+        assert repr(idx) == expected
+
+        # larger categories
+        idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
+        expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
+                  'す', 'せ', 'そ'],
+                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa
+
+        assert repr(idx) == expected
+
+        # Enable Unicode option -----------------------------------------
+        with cf.option_context("display.unicode.east_asian_width", True):
+
+            # short
+            idx = pd.CategoricalIndex(["あ", "いい", "ううう"])
+            expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
+            assert repr(idx) == expected
+
+            # multiple lines
+            idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10)
+            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
+                  'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
+                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
+
+            assert repr(idx) == expected
+
+            # truncated
+            idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100)
+            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+                  'ううう', 'あ',
+                  ... 
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', + 'さ', 'し', 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 1d41e17e327a8..a36568bbbe633 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index +from pandas import CategoricalIndex, Index, IntervalIndex import pandas._testing as tm @@ -250,3 +250,67 @@ def test_get_indexer(self): msg = "method='nearest' not implemented yet for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest") + + +class TestWhere: + @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) + def test_where(self, klass): + i = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + +class TestContains: + def test_contains(self): + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False) + + assert "a" in ci + assert "z" not in ci + assert "e" not in ci + assert np.nan not in ci + + # assert codes NOT in index + assert 0 not in ci + assert 1 not in ci + + def test_contains_nan(self): + ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) + assert np.nan in ci + + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) + def test_contains_interval(self, item, expected): + # GH 23705 + ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) + result = item in ci + assert result is expected + + def test_contains_list(self): + # GH#21729 + idx = pd.CategoricalIndex([1, 2, 3]) + + assert "a" not in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in idx diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1473058b2a0a9..964cf320a422b 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -49,34 +49,6 @@ def test_pickle_compat_construction(self): with pytest.raises(TypeError, match=msg): self._holder() - def test_to_series(self): - # assert that we are creating a copy of the index - - idx = self.create_index() - s = idx.to_series() - assert s.values is not idx.values - assert s.index is not idx - assert s.name == idx.name - - def test_to_series_with_arguments(self): - # GH18699 - - # index kwarg - idx = self.create_index() - s = idx.to_series(index=idx) - - assert s.values is not idx.values - assert s.index is idx - assert s.name == idx.name - - 
# name kwarg
-        idx = self.create_index()
-        s = idx.to_series(name="__test")
-
-        assert s.values is not idx.values
-        assert s.index is not idx
-        assert s.name != idx.name
-
     @pytest.mark.parametrize("name", [None, "new_name"])
     def test_to_frame(self, name):
         # see GH-15230, GH-22580
@@ -198,15 +170,6 @@ def test_logical_compat(self):
         with pytest.raises(TypeError, match="cannot perform any"):
             idx.any()
 
-    def test_boolean_context_compat(self):
-
-        # boolean context compat
-        idx = self.create_index()
-
-        with pytest.raises(ValueError, match="The truth value of a"):
-            if idx:
-                pass
-
     def test_reindex_base(self):
         idx = self.create_index()
         expected = np.arange(idx.size, dtype=np.intp)
@@ -253,14 +216,6 @@ def test_repr_roundtrip(self):
         idx = self.create_index()
         tm.assert_index_equal(eval(repr(idx)), idx)
 
-    def test_str(self):
-
-        # test the string repr
-        idx = self.create_index()
-        idx.name = "foo"
-        assert "'foo'" in str(idx)
-        assert type(idx).__name__ in str(idx)
-
     def test_repr_max_seq_item_setting(self):
         # GH10182
         idx = self.create_index()
diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py
index 916f722247a14..34169a670c169 100644
--- a/pandas/tests/indexes/datetimes/test_astype.py
+++ b/pandas/tests/indexes/datetimes/test_astype.py
@@ -22,27 +22,32 @@ class TestDatetimeIndex:
     def test_astype(self):
         # GH 13149, GH 13209
-        idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN])
+        idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx")
 
         result = idx.astype(object)
-        expected = Index([Timestamp("2016-05-16")] + [NaT] * 3, dtype=object)
+        expected = Index(
+            [Timestamp("2016-05-16")] + [NaT] * 3, dtype=object, name="idx"
+        )
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(int)
         expected = Int64Index(
-            [1463356800000000000] + [-9223372036854775808] * 3, dtype=np.int64
+            [1463356800000000000] + [-9223372036854775808] * 3,
+            dtype=np.int64,
+            name="idx",
         )
         tm.assert_index_equal(result, expected)
 
-        rng = date_range("1/1/2000", periods=10)
+        rng = date_range("1/1/2000", periods=10, name="idx")
         result = rng.astype("i8")
-        tm.assert_index_equal(result, Index(rng.asi8))
+        tm.assert_index_equal(result, Index(rng.asi8, name="idx"))
         tm.assert_numpy_array_equal(result.values, rng.asi8)
 
     def test_astype_uint(self):
-        arr = date_range("2000", periods=2)
+        arr = date_range("2000", periods=2, name="idx")
         expected = pd.UInt64Index(
-            np.array([946684800000000000, 946771200000000000], dtype="uint64")
+            np.array([946684800000000000, 946771200000000000], dtype="uint64"),
+            name="idx",
         )
 
         tm.assert_index_equal(arr.astype("uint64"), expected)
@@ -148,7 +153,7 @@ def test_astype_str(self):
 
     def test_astype_datetime64(self):
         # GH 13149, GH 13209
-        idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN])
+        idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx")
 
         result = idx.astype("datetime64[ns]")
         tm.assert_index_equal(result, idx)
@@ -158,10 +163,12 @@ def test_astype_datetime64(self):
         tm.assert_index_equal(result, idx)
         assert result is idx
 
-        idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST")
+        idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST", name="idx")
         result = idx_tz.astype("datetime64[ns]")
         expected = DatetimeIndex(
-            ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"], dtype="datetime64[ns]"
+            ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"],
+            dtype="datetime64[ns]",
+            name="idx",
         )
         tm.assert_index_equal(result, expected)
 
@@ -273,8 +280,8 @@ def _check_rng(rng):
     def test_integer_index_astype_datetime(self, tz, dtype):
         # GH 20997, 20964, 24559
         val = [pd.Timestamp("2018-01-01", tz=tz).value]
-        result = pd.Index(val).astype(dtype)
-        expected = pd.DatetimeIndex(["2018-01-01"], tz=tz)
+        result = pd.Index(val, name="idx").astype(dtype)
+        expected = pd.DatetimeIndex(["2018-01-01"], tz=tz, name="idx")
         tm.assert_index_equal(result, expected)
 
     def test_dti_astype_period(self):
@@ -292,10 +299,11 @@ class TestAstype:
     @pytest.mark.parametrize("tz", [None, "US/Central"])
     def test_astype_category(self, tz):
-        obj = pd.date_range("2000", periods=2, tz=tz)
+        obj = pd.date_range("2000", periods=2, tz=tz, name="idx")
         result = obj.astype("category")
         expected = pd.CategoricalIndex(
-            [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)]
+            [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)],
+            name="idx",
         )
         tm.assert_index_equal(result, expected)
 
@@ -305,9 +313,9 @@ def test_astype_array_fallback(self, tz):
 
     @pytest.mark.parametrize("tz", [None, "US/Central"])
     def test_astype_array_fallback(self, tz):
-        obj = pd.date_range("2000", periods=2, tz=tz)
+        obj = pd.date_range("2000", periods=2, tz=tz, name="idx")
         result = obj.astype(bool)
-        expected = pd.Index(np.array([True, True]))
+        expected = pd.Index(np.array([True, True]), name="idx")
         tm.assert_index_equal(result, expected)
 
         result = obj._data.astype(bool)
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index 12c4abe7a1b00..1529a259c49af 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -104,13 +104,6 @@ def test_week_of_month_frequency(self):
         expected = DatetimeIndex(dates, freq="WOM-1SAT")
         tm.assert_index_equal(result, expected)
 
-    def test_hash_error(self):
-        index = date_range("20010101", periods=10)
-        with pytest.raises(
-            TypeError, match=f"unhashable type: '{type(index).__name__}'"
-        ):
-            hash(index)
-
     def test_stringified_slice_with_tz(self):
         # GH#2658
         start = "2013-01-07"
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
index cbb598286aefe..cbf6b7b63bd50 100644
--- a/pandas/tests/indexes/datetimes/test_ops.py
+++ b/pandas/tests/indexes/datetimes/test_ops.py
@@ -1,5 +1,4 @@
 from datetime import datetime
-import warnings
 
 import numpy as np
 import pytest
@@ -16,7 +15,7 @@
 )
 import pandas._testing as tm
 
-from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour
+from pandas.tseries.offsets import BDay, Day, Hour
 
 START, END = datetime(2009, 1, 1), datetime(2010, 1, 1)
 
@@ -443,23 +442,6 @@ def test_copy(self):
         repr(cp)
         tm.assert_index_equal(cp, self.rng)
 
-    def test_shift(self):
-        shifted = self.rng.shift(5)
-        assert shifted[0] == self.rng[5]
-        assert shifted.freq == self.rng.freq
-
-        shifted = self.rng.shift(-5)
-        assert shifted[5] == self.rng[0]
-        assert shifted.freq == self.rng.freq
-
-        shifted = self.rng.shift(0)
-        assert shifted[0] == self.rng[0]
-        assert shifted.freq == self.rng.freq
-
-        rng = date_range(START, END, freq=BMonthEnd())
-        shifted = rng.shift(1, freq=BDay())
-        assert shifted[0] == rng[0] + BDay()
-
     def test_equals(self):
         assert not self.rng.equals(list(self.rng))
 
@@ -497,32 +479,6 @@ def test_copy(self):
         repr(cp)
         tm.assert_index_equal(cp, self.rng)
 
-    def test_shift(self):
-
-        shifted = self.rng.shift(5)
-        assert shifted[0] == self.rng[5]
-        assert shifted.freq == self.rng.freq
-
-        shifted = self.rng.shift(-5)
-        assert shifted[5] == self.rng[0]
-        assert shifted.freq == self.rng.freq
-
-        shifted = self.rng.shift(0)
-        assert shifted[0] == self.rng[0]
-        assert shifted.freq == self.rng.freq
-
-        with warnings.catch_warnings(record=True):
-            warnings.simplefilter("ignore", pd.errors.PerformanceWarning)
-            rng = date_range(START, END, freq=BMonthEnd())
-            shifted = rng.shift(1, freq=CDay())
-            assert shifted[0] == rng[0] + CDay()
-
-    def test_shift_periods(self):
-        # GH#22458 : argument 'n' was deprecated in favor of 'periods'
-        idx = pd.date_range(start=START, end=END, periods=3)
-        tm.assert_index_equal(idx.shift(periods=0), idx)
-        tm.assert_index_equal(idx.shift(0), idx)
-
     def test_pickle_unpickle(self):
         unpickled = tm.round_trip_pickle(self.rng)
         assert unpickled.freq is not None
diff --git a/pandas/tests/indexes/datetimes/test_shift.py b/pandas/tests/indexes/datetimes/test_shift.py
index 1e21404551fa8..6e53492b71578 100644
--- a/pandas/tests/indexes/datetimes/test_shift.py
+++ b/pandas/tests/indexes/datetimes/test_shift.py
@@ -9,6 +9,8 @@
 from pandas import DatetimeIndex, Series, date_range
 import pandas._testing as tm
 
+START, END = datetime(2009, 1, 1), datetime(2010, 1, 1)
+
 
 class TestDatetimeIndexShift:
 
@@ -115,3 +117,34 @@ def test_dti_shift_near_midnight(self, shift, result_time):
         result = s.shift(shift, freq="H")
         expected = Series(1, index=DatetimeIndex([result_time], tz="EST"))
         tm.assert_series_equal(result, expected)
+
+    def test_shift_periods(self):
+        # GH#22458 : argument 'n' was deprecated in favor of 'periods'
+        idx = pd.date_range(start=START, end=END, periods=3)
+        tm.assert_index_equal(idx.shift(periods=0), idx)
+        tm.assert_index_equal(idx.shift(0), idx)
+
+    @pytest.mark.parametrize("freq", ["B", "C"])
+    def test_shift_bday(self, freq):
+        rng = date_range(START, END, freq=freq)
+        shifted = rng.shift(5)
+        assert shifted[0] == rng[5]
+        assert shifted.freq == rng.freq
+
+        shifted = rng.shift(-5)
+        assert shifted[5] == rng[0]
+        assert shifted.freq == rng.freq
+
+        shifted = rng.shift(0)
+        assert shifted[0] == rng[0]
+        assert shifted.freq == rng.freq
+
+    def test_shift_bmonth(self):
+        rng = date_range(START, END, freq=pd.offsets.BMonthEnd())
+        shifted = rng.shift(1, freq=pd.offsets.BDay())
+        assert shifted[0] == rng[0] + pd.offsets.BDay()
+
+        rng = date_range(START, END, freq=pd.offsets.BMonthEnd())
+        with tm.assert_produces_warning(pd.errors.PerformanceWarning):
+            shifted = rng.shift(1, freq=pd.offsets.CDay())
+            assert shifted[0] == rng[0] + pd.offsets.CDay()
diff --git a/pandas/tests/indexes/datetimes/test_to_period.py b/pandas/tests/indexes/datetimes/test_to_period.py
index ddbb43787abb4..7b75e676a2c12 100644
--- a/pandas/tests/indexes/datetimes/test_to_period.py
+++ b/pandas/tests/indexes/datetimes/test_to_period.py
@@ -147,7 +147,8 @@ def test_to_period_tz_utc_offset_consistency(self, tz):
 
     def test_to_period_nofreq(self):
         idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"])
-        with pytest.raises(ValueError):
+        msg = "You must pass a freq argument as current index has none."
+        with pytest.raises(ValueError, match=msg):
             idx.to_period()
 
         idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="infer")
diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py
index efdd3fc9907a2..1b2bfa8573c21 100644
--- a/pandas/tests/indexes/interval/test_interval.py
+++ b/pandas/tests/indexes/interval/test_interval.py
@@ -863,3 +863,25 @@ def test_dir():
     index = IntervalIndex.from_arrays([0, 1], [1, 2])
     result = dir(index)
     assert "str" not in result
+
+
+@pytest.mark.parametrize("klass", [list, np.array, pd.array, pd.Series])
+def test_searchsorted_different_argument_classes(klass):
+    # https://github.com/pandas-dev/pandas/issues/32762
+    values = IntervalIndex([Interval(0, 1), Interval(1, 2)])
+    result = values.searchsorted(klass(values))
+    expected = np.array([0, 1], dtype=result.dtype)
+    tm.assert_numpy_array_equal(result, expected)
+
+    result = values._data.searchsorted(klass(values))
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arg", [[1, 2], ["a", "b"], [pd.Timestamp("2020-01-01", tz="Europe/London")] * 2]
+)
+def test_searchsorted_invalid_argument(arg):
+    values = IntervalIndex([Interval(0, 1), Interval(1, 2)])
+    msg = "unorderable types"
+    with pytest.raises(TypeError, match=msg):
+        values.searchsorted(arg)
diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py
index a9e02934f27ab..cd98a87459061 100644
--- a/pandas/tests/indexes/multi/test_analytics.py
+++ b/pandas/tests/indexes/multi/test_analytics.py
@@ -57,23 +57,6 @@ def test_truncate():
         index.truncate(3, 1)
 
 
-def test_where():
-    i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
-
-    msg = r"\.where is not supported for MultiIndex operations"
-    with pytest.raises(NotImplementedError, match=msg):
-        i.where(True)
-
-
-@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
-def test_where_array_like(klass):
-    i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
-    cond = [False, True]
-    msg = r"\.where is not supported for MultiIndex operations"
-    with pytest.raises(NotImplementedError, match=msg):
-        i.where(klass(cond))
-
-
 # TODO: reshape
 
 
diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py
index ef549beccda5d..9273de9c20412 100644
--- a/pandas/tests/indexes/multi/test_compat.py
+++ b/pandas/tests/indexes/multi/test_compat.py
@@ -37,7 +37,11 @@ def test_logical_compat(idx, method):
 
 
 def test_boolean_context_compat(idx):
-    with pytest.raises(ValueError):
+    msg = (
+        "The truth value of a MultiIndex is ambiguous. "
+        r"Use a.empty, a.bool\(\), a.item\(\), a.any\(\) or a.all\(\)."
+    )
+    with pytest.raises(ValueError, match=msg):
         bool(idx)
 
 
diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py
index bfc432a18458a..3519c5d0d5a9a 100644
--- a/pandas/tests/indexes/multi/test_conversion.py
+++ b/pandas/tests/indexes/multi/test_conversion.py
@@ -2,16 +2,10 @@
 import pytest
 
 import pandas as pd
-from pandas import DataFrame, MultiIndex, date_range
+from pandas import DataFrame, MultiIndex
 import pandas._testing as tm
 
 
-def test_tolist(idx):
-    result = idx.tolist()
-    exp = list(idx.values)
-    assert result == exp
-
-
 def test_to_numpy(idx):
     result = idx.to_numpy()
     exp = idx.values
@@ -129,47 +123,6 @@ def test_to_frame_resulting_column_order():
     assert result == expected
 
 
-def test_roundtrip_pickle_with_tz():
-    return  # FIXME: this can't be right?
-
-    # GH 8367
-    # round-trip of timezone
-    index = MultiIndex.from_product(
-        [[1, 2], ["a", "b"], date_range("20130101", periods=3, tz="US/Eastern")],
-        names=["one", "two", "three"],
-    )
-    unpickled = tm.round_trip_pickle(index)
-    assert index.equal_levels(unpickled)
-
-
-def test_to_series(idx):
-    # assert that we are creating a copy of the index
-
-    s = idx.to_series()
-    assert s.values is not idx.values
-    assert s.index is not idx
-    assert s.name == idx.name
-
-
-def test_to_series_with_arguments(idx):
-    # GH18699
-
-    # index kwarg
-    s = idx.to_series(index=idx)
-
-    assert s.values is not idx.values
-    assert s.index is idx
-    assert s.name == idx.name
-
-    # name kwarg
-    idx = idx
-    s = idx.to_series(name="__test")
-
-    assert s.values is not idx.values
-    assert s.index is not idx
-    assert s.name != idx.name
-
-
 def test_to_flat_index(idx):
     expected = pd.Index(
         (
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
index 433b631ab9472..e48731b9c8099 100644
--- a/pandas/tests/indexes/multi/test_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -83,12 +83,14 @@ def test_get_unique_index(idx, dropna):
 def test_duplicate_multiindex_codes():
     # GH 17464
     # Make sure that a MultiIndex with duplicate levels throws a ValueError
-    with pytest.raises(ValueError):
+    msg = r"Level values must be unique: \[[A', ]+\] on level 0"
+    with pytest.raises(ValueError, match=msg):
         mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)])
 
     # And that using set_levels with duplicate levels fails
     mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
-    with pytest.raises(ValueError):
+    msg = r"Level values must be unique: \[[AB', ]+\] on level 0"
+    with pytest.raises(ValueError, match=msg):
         mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True)
 
 
diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_formats.py
similarity index 98%
rename from pandas/tests/indexes/multi/test_format.py
rename to pandas/tests/indexes/multi/test_formats.py
index 75499bd79cca0..792dcf4c535e3 100644
--- a/pandas/tests/indexes/multi/test_format.py
+++ b/pandas/tests/indexes/multi/test_formats.py
@@ -58,7 +58,8 @@ def test_repr_with_unicode_data():
 
 def test_repr_roundtrip_raises():
     mi = MultiIndex.from_product([list("ab"), range(3)], names=["first", "second"])
-    with pytest.raises(TypeError):
+    msg = "Must pass both levels and codes"
+    with pytest.raises(TypeError, match=msg):
         eval(repr(mi))
 
 
diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py
index 6f0b23c1ef4a0..1215e72be3c59 100644
--- a/pandas/tests/indexes/multi/test_get_level_values.py
+++ b/pandas/tests/indexes/multi/test_get_level_values.py
@@ -1,4 +1,8 @@
-from pandas import MultiIndex, Timestamp, date_range
+import numpy as np
+
+import pandas as pd
+from pandas import CategoricalIndex, Index, MultiIndex, Timestamp, date_range
+import pandas._testing as tm
 
 
 class TestGetLevelValues:
@@ -11,3 +15,77 @@ def test_get_level_values_box_datetime64(self):
 
         index = MultiIndex(levels=levels, codes=codes)
         assert isinstance(index.get_level_values(0)[0], Timestamp)
+
+
+def test_get_level_values(idx):
+    result = idx.get_level_values(0)
+    expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first")
+    tm.assert_index_equal(result, expected)
+    assert result.name == "first"
+
+    result = idx.get_level_values("first")
+    expected = idx.get_level_values(0)
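+    # lookup by level name and by level position should give identical results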
+    tm.assert_index_equal(result, expected)
+
+    # GH 10460
+    index = MultiIndex(
+        levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])],
+        codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])],
+    )
+
+    exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"])
+    tm.assert_index_equal(index.get_level_values(0), exp)
+    exp = CategoricalIndex([1, 2, 3, 1, 2, 3])
+    tm.assert_index_equal(index.get_level_values(1), exp)
+
+
+def test_get_level_values_all_na():
+    # GH#17924 when level entirely consists of nan
+    arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]]
+    index = pd.MultiIndex.from_arrays(arrays)
+    result = index.get_level_values(0)
+    expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64)
+    tm.assert_index_equal(result, expected)
+
+    result = index.get_level_values(1)
+    expected = pd.Index(["a", np.nan, 1], dtype=object)
+    tm.assert_index_equal(result, expected)
+
+
+def test_get_level_values_int_with_na():
+    # GH#17924
+    arrays = [["a", "b", "b"], [1, np.nan, 2]]
+    index = pd.MultiIndex.from_arrays(arrays)
+    result = index.get_level_values(1)
+    expected = Index([1, np.nan, 2])
+    tm.assert_index_equal(result, expected)
+
+    arrays = [["a", "b", "b"], [np.nan, np.nan, 2]]
+    index = pd.MultiIndex.from_arrays(arrays)
+    result = index.get_level_values(1)
+    expected = Index([np.nan, np.nan, 2])
+    tm.assert_index_equal(result, expected)
+
+
+def test_get_level_values_na():
+    arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]]
+    index = pd.MultiIndex.from_arrays(arrays)
+    result = index.get_level_values(0)
+    expected = pd.Index([np.nan, np.nan, np.nan])
+    tm.assert_index_equal(result, expected)
+
+    result = index.get_level_values(1)
+    expected = pd.Index(["a", np.nan, 1])
+    tm.assert_index_equal(result, expected)
+
+    arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])]
+    index = pd.MultiIndex.from_arrays(arrays)
+    result = index.get_level_values(1)
+    expected = pd.DatetimeIndex([0, 1, pd.NaT])
+    tm.assert_index_equal(result, expected)
+
+    arrays = [[], []]
+    index = pd.MultiIndex.from_arrays(arrays)
+    result = index.get_level_values(0)
+    expected = pd.Index([], dtype=object)
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py
index 675a1e2e832f3..8a3deca0236e4 100644
--- a/pandas/tests/indexes/multi/test_get_set.py
+++ b/pandas/tests/indexes/multi/test_get_set.py
@@ -2,7 +2,7 @@
 import pytest
 
 import pandas as pd
-from pandas import CategoricalIndex, Index, MultiIndex
+from pandas import CategoricalIndex, MultiIndex
 import pandas._testing as tm
 
 
@@ -27,90 +27,6 @@ def test_get_level_number_integer(idx):
         idx._get_level_number("fourth")
 
 
-def test_get_level_values(idx):
-    result = idx.get_level_values(0)
-    expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first")
-    tm.assert_index_equal(result, expected)
-    assert result.name == "first"
-
-    result = idx.get_level_values("first")
-    expected = idx.get_level_values(0)
-    tm.assert_index_equal(result, expected)
-
-    # GH 10460
-    index = MultiIndex(
-        levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])],
-        codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])],
-    )
-
-    exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"])
-    tm.assert_index_equal(index.get_level_values(0), exp)
-    exp = CategoricalIndex([1, 2, 3, 1, 2, 3])
-    tm.assert_index_equal(index.get_level_values(1), exp)
-
-
-def test_get_value_duplicates():
-    index = MultiIndex(
-        levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]],
-        codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
-        names=["tag", "day"],
-    )
-
-    assert index.get_loc("D") == slice(0, 3)
-
-
-def test_get_level_values_all_na():
-    # GH 17924 when level entirely consists of nan
-    arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]]
-    index = pd.MultiIndex.from_arrays(arrays)
-    result = index.get_level_values(0)
-    expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64)
-    tm.assert_index_equal(result, expected)
-
-    result = index.get_level_values(1)
-    expected = pd.Index(["a", np.nan, 1], dtype=object)
-    tm.assert_index_equal(result, expected)
-
-
-def test_get_level_values_int_with_na():
-    # GH 17924
-    arrays = [["a", "b", "b"], [1, np.nan, 2]]
-    index = pd.MultiIndex.from_arrays(arrays)
-    result = index.get_level_values(1)
-    expected = Index([1, np.nan, 2])
-    tm.assert_index_equal(result, expected)
-
-    arrays = [["a", "b", "b"], [np.nan, np.nan, 2]]
-    index = pd.MultiIndex.from_arrays(arrays)
-    result = index.get_level_values(1)
-    expected = Index([np.nan, np.nan, 2])
-    tm.assert_index_equal(result, expected)
-
-
-def test_get_level_values_na():
-    arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]]
-    index = pd.MultiIndex.from_arrays(arrays)
-    result = index.get_level_values(0)
-    expected = pd.Index([np.nan, np.nan, np.nan])
-    tm.assert_index_equal(result, expected)
-
-    result = index.get_level_values(1)
-    expected = pd.Index(["a", np.nan, 1])
-    tm.assert_index_equal(result, expected)
-
-    arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])]
-    index = pd.MultiIndex.from_arrays(arrays)
-    result = index.get_level_values(1)
-    expected = pd.DatetimeIndex([0, 1, pd.NaT])
-    tm.assert_index_equal(result, expected)
-
-    arrays = [[], []]
-    index = pd.MultiIndex.from_arrays(arrays)
-    result = index.get_level_values(0)
-    expected = pd.Index([], dtype=object)
-    tm.assert_index_equal(result, expected)
-
-
 def test_set_name_methods(idx, index_names):
     # so long as these are synonyms, we don't need to test set_names
     assert idx.rename == idx.set_names
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index b7d7b3b459aff..3b3ae074c774a 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -441,6 +441,65 @@ def test_get_loc_with_values_including_missing_values(self):
         expected = slice(2, 4, None)
         assert idx.get_loc((np.nan, 1)) == expected
 
+    def test_get_loc_duplicates2(self):
+        # TODO: de-duplicate with test_get_loc_duplicates above?
+        index = MultiIndex(
+            levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]],
+            codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
+            names=["tag", "day"],
+        )
+
+        assert index.get_loc("D") == slice(0, 3)
+
+
+class TestWhere:
+    def test_where(self):
+        i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
+
+        msg = r"\.where is not supported for MultiIndex operations"
+        with pytest.raises(NotImplementedError, match=msg):
+            i.where(True)
+
+    @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
+    def test_where_array_like(self, klass):
+        i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
+        cond = [False, True]
+        msg = r"\.where is not supported for MultiIndex operations"
+        with pytest.raises(NotImplementedError, match=msg):
+            i.where(klass(cond))
+
+
+class TestContains:
+    def test_contains_top_level(self):
+        midx = MultiIndex.from_product([["A", "B"], [1, 2]])
+        assert "A" in midx
+        assert "A" not in midx._engine
+
+    def test_contains_with_nat(self):
+        # MI with a NaT
+        mi = MultiIndex(
+            levels=[["C"], pd.date_range("2012-01-01", periods=5)],
+            codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
+            names=[None, "B"],
+        )
+        assert ("C", pd.Timestamp("2012-01-01")) in mi
+        for val in mi.values:
+            assert val in mi
+
+    def test_contains(self, idx):
+        assert ("foo", "two") in idx
+        assert ("bar", "two") not in idx
+        assert None not in idx
+
+    def test_contains_with_missing_value(self):
+        # GH#19132
+        idx = MultiIndex.from_arrays([[1, np.nan, 2]])
+        assert np.nan in idx
+
+        idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
+        assert np.nan not in idx
+        assert (1, np.nan) in idx
+
 
 def test_timestamp_multiindex_indexer():
     # https://github.com/pandas-dev/pandas/issues/26944
diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_isin.py
similarity index 78%
rename from pandas/tests/indexes/multi/test_contains.py
rename to pandas/tests/indexes/multi/test_isin.py
index 49aa63210cd5e..122263e6ec198 100644
--- a/pandas/tests/indexes/multi/test_contains.py
+++ b/pandas/tests/indexes/multi/test_isin.py
@@ -3,35 +3,10 @@
 
 from pandas.compat import PYPY
 
-import pandas as pd
 from pandas import MultiIndex
 import pandas._testing as tm
 
 
-def test_contains_top_level():
-    midx = MultiIndex.from_product([["A", "B"], [1, 2]])
-    assert "A" in midx
-    assert "A" not in midx._engine
-
-
-def test_contains_with_nat():
-    # MI with a NaT
-    mi = MultiIndex(
-        levels=[["C"], pd.date_range("2012-01-01", periods=5)],
-        codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
-        names=[None, "B"],
-    )
-    assert ("C", pd.Timestamp("2012-01-01")) in mi
-    for val in mi.values:
-        assert val in mi
-
-
-def test_contains(idx):
-    assert ("foo", "two") in idx
-    assert ("bar", "two") not in idx
-    assert None not in idx
-
-
 @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
 def test_isin_nan_pypy():
     idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
@@ -100,16 +75,6 @@ def test_isin_level_kwarg():
         idx.isin(vals_1, level="C")
 
 
-def test_contains_with_missing_value():
-    # issue 19132
-    idx = MultiIndex.from_arrays([[1, np.nan, 2]])
-    assert np.nan in idx
-
-    idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
-    assert np.nan not in idx
-    assert (1, np.nan) in idx
-
-
 @pytest.mark.parametrize(
     "labels,expected,level",
     [
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index b24f56afee376..c97704e8a2066 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -209,7 +209,8 @@ def test_difference_sort_incomparable():
     # sort=None, the default
     # MultiIndex.difference deviates here from other difference
     # implementations in not catching the TypeError
-    with pytest.raises(TypeError):
+    msg = "'<' not supported between instances of 'Timestamp' and 'int'"
+    with pytest.raises(TypeError, match=msg):
         result = idx.difference(other)
 
     # sort=False
diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py
index 2f10e45193d5d..b286191623ebb 100644
--- a/pandas/tests/indexes/period/test_astype.py
+++ b/pandas/tests/indexes/period/test_astype.py
@@ -27,31 +27,34 @@ def test_astype_raises(self, dtype):
 
     def test_astype_conversion(self):
         # GH#13149, GH#13209
-        idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D")
+        idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D", name="idx")
 
         result = idx.astype(object)
         expected = Index(
             [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3,
             dtype="object",
+            name="idx",
         )
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(np.int64)
-        expected = Int64Index([16937] + [-9223372036854775808] * 3, dtype=np.int64)
+        expected = Int64Index(
+            [16937] + [-9223372036854775808] * 3, dtype=np.int64, name="idx"
+        )
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(str)
-        expected = Index(str(x) for x in idx)
+        expected = Index([str(x) for x in idx], name="idx")
         tm.assert_index_equal(result, expected)
 
-        idx = period_range("1990", "2009", freq="A")
+        idx = period_range("1990", "2009", freq="A", name="idx")
         result = idx.astype("i8")
-        tm.assert_index_equal(result, Index(idx.asi8))
+        tm.assert_index_equal(result, Index(idx.asi8, name="idx"))
         tm.assert_numpy_array_equal(result.values, idx.asi8)
 
     def test_astype_uint(self):
-        arr = period_range("2000", periods=2)
-        expected = UInt64Index(np.array([10957, 10958], dtype="uint64"))
+        arr = period_range("2000", periods=2, name="idx")
+        expected = UInt64Index(np.array([10957, 10958], dtype="uint64"), name="idx")
 
         tm.assert_index_equal(arr.astype("uint64"), expected)
         tm.assert_index_equal(arr.astype("uint32"), expected)
@@ -116,10 +119,10 @@ def test_astype_object2(self):
         assert result_list[2] is NaT
 
     def test_astype_category(self):
-        obj = period_range("2000", periods=2)
+        obj = period_range("2000", periods=2, name="idx")
         result = obj.astype("category")
         expected = CategoricalIndex(
-            [Period("2000-01-01", freq="D"), Period("2000-01-02", freq="D")]
+            [Period("2000-01-01", freq="D"), Period("2000-01-02", freq="D")], name="idx"
         )
         tm.assert_index_equal(result, expected)
 
@@ -128,9 +131,9 @@ def test_astype_array_fallback(self):
         tm.assert_categorical_equal(result, expected)
 
     def test_astype_array_fallback(self):
-        obj = period_range("2000", periods=2)
+        obj = period_range("2000", periods=2, name="idx")
         result = obj.astype(bool)
-        expected = Index(np.array([True, True]))
+        expected = Index(np.array([True, True]), name="idx")
         tm.assert_index_equal(result, expected)
 
         result = obj._data.astype(bool)
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
index df2f85cd7f1e2..a62936655e09c 100644
--- a/pandas/tests/indexes/period/test_period.py
+++ b/pandas/tests/indexes/period/test_period.py
@@ -105,12 +105,6 @@ def test_no_millisecond_field(self):
         with pytest.raises(AttributeError, match=msg):
             DatetimeIndex([]).millisecond
 
-    def test_hash_error(self):
-        index = period_range("20010101", periods=10)
-        msg = f"unhashable type: '{type(index).__name__}'"
-        with pytest.raises(TypeError, match=msg):
-            hash(index)
-
     def test_make_time_series(self):
         index = period_range(freq="A", start="1/1/2001", end="12/1/2009")
         series = Series(1, index=index)
diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py
new file mode 100644
index 0000000000000..f5a2583bf2e10
--- /dev/null
+++ b/pandas/tests/indexes/period/test_searchsorted.py
@@ -0,0 +1,77 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import IncompatibleFrequency
+
+from pandas import NaT, Period, PeriodIndex, Series, array
+import pandas._testing as tm
+
+
+class TestSearchsorted:
+    @pytest.mark.parametrize("freq", ["D", "2D"])
+    def test_searchsorted(self, freq):
+        pidx = PeriodIndex(
+            ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
+            freq=freq,
+        )
+
+        p1 = Period("2014-01-01", freq=freq)
+        assert pidx.searchsorted(p1) == 0
+
+        p2 = Period("2014-01-04", freq=freq)
+        assert pidx.searchsorted(p2) == 3
+
+        assert pidx.searchsorted(NaT) == 0
+
+        msg = "Input has different freq=H from PeriodArray"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            pidx.searchsorted(Period("2014-01-01", freq="H"))
+
+        msg = "Input has different freq=5D from PeriodArray"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            pidx.searchsorted(Period("2014-01-01", freq="5D"))
+
+    @pytest.mark.parametrize("klass", [list, np.array, array, Series])
+    def test_searchsorted_different_argument_classes(self, klass):
+        pidx = PeriodIndex(
+            ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
+            freq="D",
+        )
+        result = pidx.searchsorted(klass(pidx))
+        expected = np.arange(len(pidx), dtype=result.dtype)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = pidx._data.searchsorted(klass(pidx))
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_searchsorted_invalid(self):
+        pidx = PeriodIndex(
+            ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
+            freq="D",
+        )
+
+        other = np.array([0, 1], dtype=np.int64)
+
+        msg = "|".join(
+            [
+                "searchsorted requires compatible dtype or scalar",
+                "Unexpected type for 'value'",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(other)
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(other.astype("timedelta64[ns]"))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.timedelta64(4))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.timedelta64("NaT", "ms"))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.datetime64(4, "ns"))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.datetime64("NaT", "ns"))
diff --git a/pandas/tests/indexes/period/test_shift.py b/pandas/tests/indexes/period/test_shift.py
index b4c9810f3a554..278bb7f07c679 100644
--- a/pandas/tests/indexes/period/test_shift.py
+++ b/pandas/tests/indexes/period/test_shift.py
@@ -63,7 +63,8 @@ def test_shift_corner_cases(self):
         # GH#9903
         idx = PeriodIndex([], name="xxx", freq="H")
 
-        with pytest.raises(TypeError):
+        msg = "`freq` argument is not supported for PeriodArray._time_shift"
+        with pytest.raises(TypeError, match=msg):
             # period shift doesn't accept freq
             idx.shift(1, freq="H")
 
diff --git a/pandas/tests/indexes/period/test_to_timestamp.py b/pandas/tests/indexes/period/test_to_timestamp.py
new file mode 100644
index 0000000000000..23787586cb3d3
--- /dev/null
+++ b/pandas/tests/indexes/period/test_to_timestamp.py
@@ -0,0 +1,101 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DatetimeIndex,
+    NaT,
+    PeriodIndex,
+    Timedelta,
+    Timestamp,
+    date_range,
+    period_range,
+)
+import pandas._testing as tm
+
+
+class TestToTimestamp:
+    def test_to_timestamp_freq(self):
+        idx = period_range("2017", periods=12, freq="A-DEC")
+        result = idx.to_timestamp()
+        expected = date_range("2017", periods=12, freq="AS-JAN")
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timestamp_pi_nat(self):
+        # GH#7228
+        index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx")
+
+        result = index.to_timestamp("D")
+        expected = DatetimeIndex(
+            [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx"
+        )
+        tm.assert_index_equal(result, expected)
+        assert result.name == "idx"
+
+        result2 = result.to_period(freq="M")
+        tm.assert_index_equal(result2, index)
+        assert result2.name == "idx"
+
+        result3 = result.to_period(freq="3M")
+        exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx")
+        tm.assert_index_equal(result3, exp)
+        assert result3.freqstr == "3M"
+
+        msg = "Frequency must be positive, because it represents span: -2A"
+        with pytest.raises(ValueError, match=msg):
+            result.to_period(freq="-2A")
+
+    def test_to_timestamp_preserve_name(self):
+        index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo")
+        assert index.name == "foo"
+
+        conv = index.to_timestamp("D")
+        assert conv.name == "foo"
+
+    def test_to_timestamp_quarterly_bug(self):
+        years = np.arange(1960, 2000).repeat(4)
+        quarters = np.tile(list(range(1, 5)), 40)
+
+        pindex = PeriodIndex(year=years, quarter=quarters)
+
+        stamps = pindex.to_timestamp("D", "end")
+        expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex])
+        tm.assert_index_equal(stamps, expected)
+
+    def test_to_timestamp_pi_mult(self):
+        idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx")
+
+        result = idx.to_timestamp()
+        expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx")
+        tm.assert_index_equal(result, expected)
+
+        result = idx.to_timestamp(how="E")
+        expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx")
+        expected = expected + Timedelta(1, "D") - Timedelta(1, "ns")
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timestamp_pi_combined(self):
+        idx = period_range(start="2011", periods=2, freq="1D1H", name="idx")
+
+        result = idx.to_timestamp()
+        expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx")
+        tm.assert_index_equal(result, expected)
+
+        result = idx.to_timestamp(how="E")
+        expected = DatetimeIndex(
+            ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx"
+        )
+        expected = expected + Timedelta(1, "s") - Timedelta(1, "ns")
+        tm.assert_index_equal(result, expected)
+
+        result = idx.to_timestamp(how="E", freq="H")
+        expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx")
+        expected = expected + Timedelta(1, "h") - Timedelta(1, "ns")
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timestamp_1703(self):
+        index = period_range("1/1/2012", periods=4, freq="D")
+
+        result = index.to_timestamp()
+        assert result[0] == Timestamp("1/1/2012")
diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py
index dae220006ebe0..82c13240c6bf2 100644
--- a/pandas/tests/indexes/period/test_tools.py
+++ b/pandas/tests/indexes/period/test_tools.py
@@ -1,20 +1,7 @@
-from datetime import datetime
-
 import numpy as np
 import pytest
 
-from pandas._libs.tslibs import IncompatibleFrequency
-
-from pandas import (
-    DatetimeIndex,
-    NaT,
-    Period,
-    PeriodIndex,
-    Timedelta,
-    Timestamp,
-    date_range,
-    period_range,
-)
+from pandas import Period, PeriodIndex, period_range
 import pandas._testing as tm
 
@@ -40,63 +27,6 @@ def test_freq(self, freq):
         self._check_freq(freq, "1970-01-01")
 
 
-class TestSearchsorted:
-    @pytest.mark.parametrize("freq", ["D", "2D"])
-    def test_searchsorted(self, freq):
-        pidx = PeriodIndex(
-            ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
-            freq=freq,
-        )
-
-        p1 = Period("2014-01-01", freq=freq)
-        assert pidx.searchsorted(p1) == 0
-
-        p2 = Period("2014-01-04", freq=freq)
-        assert pidx.searchsorted(p2) == 3
-
-        assert pidx.searchsorted(NaT) == 0
-
-        msg = "Input has different freq=H from PeriodArray"
-        with pytest.raises(IncompatibleFrequency, match=msg):
-            pidx.searchsorted(Period("2014-01-01", freq="H"))
-
-        msg = "Input has different freq=5D from PeriodArray"
-        with pytest.raises(IncompatibleFrequency, match=msg):
-            pidx.searchsorted(Period("2014-01-01", freq="5D"))
-
-    def test_searchsorted_invalid(self):
-        pidx = PeriodIndex(
-            ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
-            freq="D",
-        )
-
-        other = np.array([0, 1], dtype=np.int64)
-
-        msg = "|".join(
-            [
-                "searchsorted requires compatible dtype or scalar",
-                "Unexpected type for 'value'",
-            ]
-        )
-        with pytest.raises(TypeError, match=msg):
-            pidx.searchsorted(other)
-
-        with pytest.raises(TypeError, match=msg):
-            pidx.searchsorted(other.astype("timedelta64[ns]"))
-
-        with pytest.raises(TypeError, match=msg):
-            pidx.searchsorted(np.timedelta64(4))
-
-        with pytest.raises(TypeError, match=msg):
-            pidx.searchsorted(np.timedelta64("NaT", "ms"))
-
-        with pytest.raises(TypeError, match=msg):
-            pidx.searchsorted(np.datetime64(4, "ns"))
-
-        with pytest.raises(TypeError, match=msg):
-            pidx.searchsorted(np.datetime64("NaT", "ns"))
-
-
 class TestPeriodIndexConversion:
     def test_tolist(self):
         index = period_range(freq="A", start="1/1/2001", end="12/1/2009")
@@ -106,89 +36,3 @@ def test_tolist(self):
 
         recon = PeriodIndex(rs)
         tm.assert_index_equal(index, recon)
-
-
-class TestToTimestamp:
-    def test_to_timestamp_freq(self):
-        idx = period_range("2017", periods=12, freq="A-DEC")
-        result = idx.to_timestamp()
-        expected = date_range("2017", periods=12, freq="AS-JAN")
-        tm.assert_index_equal(result, expected)
-
-    def test_to_timestamp_pi_nat(self):
-        # GH#7228
-        index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx")
-
-        result = index.to_timestamp("D")
-        expected = DatetimeIndex(
-            [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx"
-        )
-        tm.assert_index_equal(result, expected)
-        assert result.name == "idx"
-
-        result2 = result.to_period(freq="M")
-        tm.assert_index_equal(result2, index)
-        assert result2.name == "idx"
-
-        result3 = result.to_period(freq="3M")
-        exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx")
-        tm.assert_index_equal(result3, exp)
-        assert result3.freqstr == "3M"
-
-        msg = "Frequency must be positive, because it represents span: -2A"
-        with pytest.raises(ValueError, match=msg):
-            result.to_period(freq="-2A")
-
-    def test_to_timestamp_preserve_name(self):
-        index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo")
-        assert index.name == "foo"
-
-        conv = index.to_timestamp("D")
-        assert conv.name == "foo"
-
-    def test_to_timestamp_quarterly_bug(self):
-        years = np.arange(1960, 2000).repeat(4)
-        quarters = np.tile(list(range(1, 5)), 40)
-
-        pindex = PeriodIndex(year=years, quarter=quarters)
-
-        stamps = pindex.to_timestamp("D", "end")
-        expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex])
-        tm.assert_index_equal(stamps, expected)
-
-    def test_to_timestamp_pi_mult(self):
-        idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx")
-
-        result = idx.to_timestamp()
-        expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx")
-        tm.assert_index_equal(result, expected)
-
-        result = idx.to_timestamp(how="E")
-        expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx")
-        expected = expected + Timedelta(1, "D") - Timedelta(1, "ns")
-        tm.assert_index_equal(result, expected)
-
-    def test_to_timestamp_pi_combined(self):
-        idx = period_range(start="2011", periods=2, freq="1D1H", name="idx")
-
-        result = idx.to_timestamp()
-        expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx")
-        tm.assert_index_equal(result, expected)
-
-        result = idx.to_timestamp(how="E")
-        expected = DatetimeIndex(
-            ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx"
-        )
-        expected = expected + Timedelta(1, "s") - Timedelta(1, "ns")
-        tm.assert_index_equal(result, expected)
-
-        result = idx.to_timestamp(how="E", freq="H")
-        expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx")
-        expected = expected + Timedelta(1, "h") - Timedelta(1, "ns")
-        tm.assert_index_equal(result, expected)
-
-    def test_to_timestamp_1703(self):
-        index = period_range("1/1/2012", periods=4, freq="D")
-
-        result = index.to_timestamp()
-        assert result[0] == Timestamp("1/1/2012")
diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py
index ba1de6d551d6b..426341a53a5d1 100644
--- a/pandas/tests/indexes/ranges/test_constructors.py
+++ b/pandas/tests/indexes/ranges/test_constructors.py
@@ -37,28 +37,36 @@ def test_constructor_invalid_args(self):
         with pytest.raises(TypeError, match=msg):
             RangeIndex(name="Foo")
 
-        # invalid args
-        for i in [
+        # we don't allow on a bare Index
+        msg = (
+            r"Index\(\.\.\.\) must be called with a collection of some "
+            r"kind, 0 was passed"
+        )
+        with pytest.raises(TypeError, match=msg):
+            Index(0, 1000)
+
+    @pytest.mark.parametrize(
+        "args",
+        [
             Index(["a", "b"]),
             Series(["a", "b"]),
             np.array(["a", "b"]),
             [],
-            "foo",
-            datetime(2000, 1, 1, 0, 0),
             np.arange(0, 10),
             np.array([1]),
             [1],
-        ]:
-            with pytest.raises(TypeError):
-                RangeIndex(i)
+        ],
+    )
+    def test_constructor_additional_invalid_args(self, args):
+        msg = f"Value needs to be a scalar value, was type {type(args).__name__}"
+        with pytest.raises(TypeError, match=msg):
+            RangeIndex(args)
 
-        # we don't allow on a bare Index
-        msg = (
-            r"Index\(\.\.\.\) must be called with a collection of some "
-            r"kind, 0 was passed"
-        )
+    @pytest.mark.parametrize("args", ["foo", datetime(2000, 1, 1, 0, 0)])
+    def test_constructor_invalid_args_wrong_type(self, args):
+        msg = f"Wrong type {type(args)} for value {args}"
         with pytest.raises(TypeError, match=msg):
-            Index(0, 1000)
+            RangeIndex(args)
 
     def test_constructor_same(self):
 
@@ -81,7 +89,7 @@ def test_constructor_same(self):
 
     def test_constructor_range(self):
 
-        msg = "Value needs to be a scalar value, was type "
+        msg = "Value needs to be a scalar value, was type range"
         with pytest.raises(TypeError, match=msg):
             result = RangeIndex(range(1, 5, 2))
 
diff --git a/pandas/tests/indexes/ranges/test_indexing.py b/pandas/tests/indexes/ranges/test_indexing.py
new file mode 100644
index 0000000000000..238c33c3db6d7
--- /dev/null
+++ b/pandas/tests/indexes/ranges/test_indexing.py
@@ -0,0 +1,79 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import RangeIndex
+import pandas._testing as tm
+
+
+class TestGetIndexer:
+    def test_get_indexer(self):
+        index = RangeIndex(start=0, stop=20, step=2)
+        target = RangeIndex(10)
+        indexer = index.get_indexer(target)
+        expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected)
+
+    def test_get_indexer_pad(self):
+        index = RangeIndex(start=0, stop=20, step=2)
+        target = RangeIndex(10)
+        indexer = index.get_indexer(target, method="pad")
+        expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected)
+
+    def test_get_indexer_backfill(self):
+        index = RangeIndex(start=0, stop=20, step=2)
+        target = RangeIndex(10)
+        indexer = index.get_indexer(target, method="backfill")
+        expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected)
+
+    def test_get_indexer_limit(self):
+        # GH#28631
+        idx = RangeIndex(4)
+        target = RangeIndex(6)
+        result = idx.get_indexer(target, method="pad", limit=1)
+        expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("stop", [0, -1, -2])
+    def test_get_indexer_decreasing(self, stop):
+        # GH#28678
+        index = RangeIndex(7, stop, -3)
+        result = index.get_indexer(range(9))
+        expected = np.array([-1, 2, -1, -1, 1, -1, -1, 0, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+
+class TestTake:
+    def test_take_preserve_name(self):
+        index = RangeIndex(1, 5, name="foo")
+        taken = index.take([3, 0, 1])
+        assert index.name == taken.name
+
+    def test_take_fill_value(self):
+        # GH#12631
+        idx = pd.RangeIndex(1, 4, name="xxx")
+        result = idx.take(np.array([1, 0, -1]))
+        expected = pd.Int64Index([2, 1, 3], name="xxx")
+        tm.assert_index_equal(result, expected)
+
+        # fill_value
+        msg = "Unable to fill values because RangeIndex cannot contain NA"
+        with pytest.raises(ValueError, match=msg):
+            idx.take(np.array([1, 0, -1]), fill_value=True)
+
+        # allow_fill=False
+        result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+        expected = pd.Int64Index([2, 1, 3], name="xxx")
+        tm.assert_index_equal(result, expected)
+
+        msg = "Unable to fill values because RangeIndex cannot contain NA"
+        with pytest.raises(ValueError, match=msg):
+            idx.take(np.array([1, 0, -2]), fill_value=True)
+        with pytest.raises(ValueError, match=msg):
+            idx.take(np.array([1, 0, -5]), fill_value=True)
+
+        msg = "index -5 is out of bounds for (axis 0 with )?size 3"
+        with pytest.raises(IndexError, match=msg):
+            idx.take(np.array([1, -5]))
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 61ac937f5fda0..05422e7b4419f 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -257,43 +257,6 @@ def test_identical(self):
 
         assert not index.copy(dtype=object).identical(index.copy(dtype="int64"))
 
-    def test_get_indexer(self):
-        index = self.create_index()
-        target = RangeIndex(10)
-        indexer = index.get_indexer(target)
-        expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
-        tm.assert_numpy_array_equal(indexer, expected)
-
-    def test_get_indexer_pad(self):
-        index = self.create_index()
-        target = RangeIndex(10)
-        indexer = index.get_indexer(target, method="pad")
-        expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
-        tm.assert_numpy_array_equal(indexer, expected)
-
-    def test_get_indexer_backfill(self):
-        index = self.create_index()
-        target = RangeIndex(10)
-        indexer = index.get_indexer(target, method="backfill")
-        expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
-        tm.assert_numpy_array_equal(indexer, expected)
-
-    def test_get_indexer_limit(self):
-        # GH 28631
-        idx = RangeIndex(4)
-        target = RangeIndex(6)
-        result = idx.get_indexer(target, method="pad", limit=1)
-        expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp)
-        tm.assert_numpy_array_equal(result, expected)
-
-    @pytest.mark.parametrize("stop", [0, -1, -2])
-    def test_get_indexer_decreasing(self, stop):
-        # GH 28678
-        index = RangeIndex(7, stop, -3)
-        result = index.get_indexer(range(9))
-        expected = np.array([-1, 2, -1, -1, 1, -1, -1, 0, -1], dtype=np.intp)
-        tm.assert_numpy_array_equal(result, expected)
-
     def test_nbytes(self):
 
         # memory savings vs int index
@@ -304,14 +267,19 @@ def test_nbytes(self):
         i2 = RangeIndex(0, 10)
         assert i.nbytes == i2.nbytes
 
-    def test_cant_or_shouldnt_cast(self):
-        # can't
-        with pytest.raises(TypeError):
-            RangeIndex("foo", "bar", "baz")
-
-        # shouldn't
-        with pytest.raises(TypeError):
-            RangeIndex("0", "1", "2")
+    @pytest.mark.parametrize(
+        "start,stop,step",
+        [
+            # can't
+            ("foo", "bar", "baz"),
+            # shouldn't
+            ("0", "1", "2"),
+        ],
+    )
+    def test_cant_or_shouldnt_cast(self, start, stop, step):
+        msg = f"Wrong type {type(start)} for value {start}"
+        with pytest.raises(TypeError, match=msg):
+            RangeIndex(start, stop, step)
 
     def test_view_index(self):
         index = self.create_index()
@@ -322,41 +290,6 @@ def test_prevent_casting(self):
         result = index.astype("O")
         assert result.dtype == np.object_
 
-    def test_take_preserve_name(self):
-        index = RangeIndex(1, 5, name="foo")
-        taken = index.take([3, 0, 1])
-        assert index.name == taken.name
-
-    def test_take_fill_value(self):
-        # GH 12631
-        idx = pd.RangeIndex(1, 4, name="xxx")
-        result = idx.take(np.array([1, 0, -1]))
-        expected = pd.Int64Index([2, 1, 3], name="xxx")
-        tm.assert_index_equal(result, expected)
-
-        # fill_value
-        msg = "Unable to fill values because RangeIndex cannot contain NA"
-        with pytest.raises(ValueError, match=msg):
-            idx.take(np.array([1, 0, -1]), fill_value=True)
-
-        # allow_fill=False
-        result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
-        expected = pd.Int64Index([2, 1, 3], name="xxx")
-        tm.assert_index_equal(result, expected)
-
-        msg = "Unable to fill values because RangeIndex cannot contain NA"
-        with pytest.raises(ValueError, match=msg):
-            idx.take(np.array([1, 0, -2]), fill_value=True)
-        with pytest.raises(ValueError, match=msg):
-            idx.take(np.array([1, 0, -5]), fill_value=True)
-
-        with pytest.raises(IndexError):
-            idx.take(np.array([1, -5]))
-
-    def test_print_unicode_columns(self):
-        df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]})
-        repr(df.columns)  # should not raise UnicodeDecodeError
-
     def test_repr_roundtrip(self):
         index = self.create_index()
         tm.assert_index_equal(eval(repr(index)), index)
diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py
index 86881b8984228..8cbea846bc870 100644
--- a/pandas/tests/indexes/test_any_index.py
+++ b/pandas/tests/indexes/test_any_index.py
@@ -5,6 +5,14 @@
 """
 import pytest
 
+import pandas._testing as tm
+
+
+def test_boolean_context_compat(indices):
+    with pytest.raises(ValueError, match="The truth value of a"):
+        if indices:
+            pass
+
+
 def test_sort(indices):
     msg = "cannot sort an Index object in-place, use sort_values instead"
@@ -27,9 +35,58 @@ def test_mutability(indices):
 
 
 def test_wrong_number_names(indices):
+    names = indices.nlevels * ["apple", "banana", "carrot"]
     with pytest.raises(ValueError, match="^Length"):
-        indices.names = ["apple", "banana", "carrot"]
+        indices.names = names
+
+
+class TestConversion:
+    def test_to_series(self, indices):
+        # assert that we are creating a copy of the index
+
+        ser = indices.to_series()
+        assert ser.values is not indices.values
+        assert ser.index is not indices
+        assert ser.name == indices.name
+
+    def test_to_series_with_arguments(self, indices):
+        # GH#18699
+
+        # index kwarg
+        ser = indices.to_series(index=indices)
+
+        assert ser.values is not indices.values
+        assert ser.index is indices
+        assert ser.name == indices.name
+
+        # name kwarg
+        ser = indices.to_series(name="__test")
+
+        assert ser.values is not indices.values
+        assert ser.index is not indices
+        assert ser.name != indices.name
+
+    def test_tolist_matches_list(self, indices):
+        assert indices.tolist() == list(indices)
+
+
+class TestRoundTrips:
+    def test_pickle_roundtrip(self, indices):
+        result = tm.round_trip_pickle(indices)
+        tm.assert_index_equal(result, indices)
+        if result.nlevels > 1:
+            # GH#8367 round-trip with timezone
+            assert indices.equal_levels(result)
+
+
+class TestIndexing:
+    def test_slice_keeps_name(self, indices):
+        assert indices.name == indices[1:].name
 
-def test_tolist_matches_list(indices):
-    assert indices.tolist() == list(indices)
+
+class TestRendering:
+    def test_str(self, indices):
+        # test the string repr
+        indices.name = "foo"
+        assert "'foo'" in str(indices)
+        assert type(indices).__name__ in str(indices)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 5bdbc18769ce5..9bc19be2999df 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -1823,17 +1823,17 @@ def test_isin_level_kwarg(self, level, index):
             index.name = "foobar"
             tm.assert_numpy_array_equal(expected, index.isin(values, level="foobar"))
 
-    @pytest.mark.parametrize("level", [2, 10, -3])
-    def test_isin_level_kwarg_bad_level_raises(self, level, indices):
+    def test_isin_level_kwarg_bad_level_raises(self, indices):
         index = indices
-        with pytest.raises(IndexError, match="Too many levels"):
-            index.isin([], level=level)
+        for level in [10, index.nlevels, -(index.nlevels + 1)]:
+            with pytest.raises(IndexError, match="Too many levels"):
+                index.isin([], level=level)
 
     @pytest.mark.parametrize("label", [1.0, "foobar", "xyzzy", np.nan])
     def test_isin_level_kwarg_bad_label_raises(self, label, indices):
         index = indices
         if isinstance(index, MultiIndex):
-            index = index.rename(["foo", "bar"])
+            index = index.rename(["foo", "bar"] + index.names[2:])
             msg = f"'Level {label} not found'"
         else:
             index = index.rename("foo")
@@ -2263,7 +2263,8 @@ def test_contains_method_removed(self, indices):
         if isinstance(indices, pd.IntervalIndex):
             indices.contains(1)
         else:
-            with pytest.raises(AttributeError):
+            msg = f"'{type(indices).__name__}' object has no attribute 'contains'"
+            with pytest.raises(AttributeError, match=msg):
                 indices.contains(1)
 
 
@@ -2437,10 +2438,6 @@ def test_int_name_format(self, klass):
         result = klass(list(range(3)), index=index)
         assert "0" in repr(result)
 
-    def test_print_unicode_columns(self):
-        df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]})
-        repr(df.columns)  # should not raise UnicodeDecodeError
-
     def test_str_to_bytes_raises(self):
         # GH 26447
         index = Index([str(x) for x in range(10)])
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index a220ae6361b79..01d72670f37aa 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -125,10 +125,6 @@ def test_to_flat_index(self, indices):
         result = indices.to_flat_index()
         tm.assert_index_equal(result, indices)
 
-    def test_wrong_number_names(self, indices):
-        with pytest.raises(ValueError, match="^Length"):
-            indices.names = ["apple", "banana", "carrot"]
-
     def test_set_name_methods(self, indices):
         new_name = "This is the new name for this index"
 
@@ -373,3 +369,29 @@ def test_has_duplicates(self, indices):
         idx = holder([indices[0]] * 5)
         assert idx.is_unique is False
         assert idx.has_duplicates is True
+
+    @pytest.mark.parametrize(
+        "dtype",
+        ["int64", "uint64", "float64", "category", "datetime64[ns]", "timedelta64[ns]"],
+    )
+    @pytest.mark.parametrize("copy", [True, False])
+    def test_astype_preserves_name(self, indices, dtype, copy):
+        # https://github.com/pandas-dev/pandas/issues/32013
+        if isinstance(indices, MultiIndex):
+            indices.names = ["idx" + str(i) for i in range(indices.nlevels)]
+        else:
+            indices.name = "idx"
+
+        try:
+            # Some of these conversions cannot succeed so we use a try / except
+            if copy:
+                result = indices.copy(dtype=dtype)
+            else:
+                result = indices.astype(dtype)
+        except (ValueError, TypeError, NotImplementedError, SystemError):
+            return
+
+        if isinstance(indices, MultiIndex):
+            assert result.names == indices.names
+        else:
+            assert result.name == indices.name
diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py
index 2e53e29c3fab1..cde3fc00eaaaa 100644
--- a/pandas/tests/indexes/test_frozen.py
+++ b/pandas/tests/indexes/test_frozen.py
@@ -17,7 +17,8 @@ def check_mutable_error(self, *args, **kwargs):
         # Pass whatever function you normally would to pytest.raises
         # (after the Exception kind).
         mutable_regex = re.compile("does not support mutable operations")
-        with pytest.raises(TypeError):
+        msg = "'(_s)?re.(SRE_)?Pattern' object is not callable"
+        with pytest.raises(TypeError, match=msg):
             mutable_regex(*args, **kwargs)
 
     def test_no_mutable_funcs(self):
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 23877c2c7607a..49f3060e95388 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -506,7 +506,8 @@ def test_take_fill_value(self):
         with pytest.raises(ValueError, match=msg):
             idx.take(np.array([1, 0, -5]), fill_value=True)
 
-        with pytest.raises(IndexError):
+        msg = "index -5 is out of bounds for (axis 0 with )?size 3"
+        with pytest.raises(IndexError, match=msg):
             idx.take(np.array([1, -5]))
 
 
@@ -645,13 +646,10 @@ def test_take_fill_value(self):
         with pytest.raises(ValueError, match=msg):
             idx.take(np.array([1, 0, -5]), fill_value=True)
 
-        with pytest.raises(IndexError):
+        msg = "index -5 is out of bounds for (axis 0 with )?size 3"
+        with pytest.raises(IndexError, match=msg):
             idx.take(np.array([1, -5]))
 
-    def test_slice_keep_name(self):
-        idx = self._holder([1, 2], name="asdf")
-        assert idx.name == idx[1:].name
-
 
 class TestInt64Index(NumericInt):
     _dtype = "int64"
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index d0cbb2ab75f72..818d5474eddf5 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -2,8 +2,6 @@
 The tests in this package are to ensure the proper resultant dtypes of
 set operations.
 """
-import itertools as it
-
 import numpy as np
 import pytest
 
@@ -13,7 +11,6 @@
 from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index
 import pandas._testing as tm
 from pandas.api.types import pandas_dtype
-from pandas.conftest import indices_dict
 
 COMPATIBLE_INCONSISTENT_PAIRS = {
     (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex),
@@ -23,14 +20,6 @@
 }
 
 
-@pytest.fixture(params=it.combinations(indices_dict, 2), ids="-".join)
-def index_pair(request):
-    """
-    Create all combinations of 2 index types.
-    """
-    return indices_dict[request.param[0]], indices_dict[request.param[1]]
-
-
 def test_union_same_types(indices):
     # Union with a non-unique, non-monotonic index raises error
     # Only needed for bool index factory
     assert idx1.union(idx2).dtype == idx1.dtype
 
 
-def test_union_different_types(index_pair):
+def test_union_different_types(indices, index_fixture2):
+    # This test only considers combinations of indices
     # GH 23525
-    idx1, idx2 = index_pair
+    idx1, idx2 = indices, index_fixture2
     type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x)))
     if type_pair in COMPATIBLE_INCONSISTENT_PAIRS:
         pytest.xfail("This test only considers non compatible indexes.")
 
-    if any(isinstance(idx, pd.MultiIndex) for idx in index_pair):
+    if any(isinstance(idx, pd.MultiIndex) for idx in (idx1, idx2)):
         pytest.xfail("This test doesn't consider multiindixes.")
 
     if is_dtype_equal(idx1.dtype, idx2.dtype):
diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py
index 82c9d995c9c7c..d9f24b4a35520 100644
--- a/pandas/tests/indexes/timedeltas/test_astype.py
+++ b/pandas/tests/indexes/timedeltas/test_astype.py
@@ -47,20 +47,22 @@ def test_astype_object_with_nat(self):
 
     def test_astype(self):
         # GH 13149, GH 13209
-        idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN])
+        idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN], name="idx")
 
         result = idx.astype(object)
-        expected = Index([Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object)
+        expected = Index(
+            [Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object, name="idx"
+        )
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(int)
         expected = Int64Index(
-            [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64
+            [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64, name="idx"
         )
         tm.assert_index_equal(result, expected)
 
         result = idx.astype(str)
-        expected = Index(str(x) for x in idx)
+        expected = Index([str(x) for x in idx], name="idx")
         tm.assert_index_equal(result, expected)
 
         rng = timedelta_range("1 days", periods=10)
diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py
index 8e54561df1624..3e5bb56c3e58e 100644
--- a/pandas/tests/indexes/timedeltas/test_constructors.py
+++ b/pandas/tests/indexes/timedeltas/test_constructors.py
@@ -168,7 +168,11 @@ def test_constructor_coverage(self):
         with pytest.raises(TypeError, match=msg):
             timedelta_range(start="1 days", periods="foo", freq="D")
 
-        with pytest.raises(TypeError):
+        msg = (
+            r"TimedeltaIndex\(\) must be called with a collection of some kind, "
+            "'1 days' was passed"
+        )
+        with pytest.raises(TypeError, match=msg):
             TimedeltaIndex("1 days")
 
         # generator expression
@@ -220,5 +224,6 @@ def test_constructor_no_precision_raises(self):
             pd.Index(["2000"], dtype="timedelta64")
 
     def test_constructor_wrong_precision_raises(self):
-        with pytest.raises(ValueError):
+        msg = r"dtype timedelta64\[us\] cannot be converted to timedelta64\[ns\]"
+        with pytest.raises(ValueError, match=msg):
             pd.TimedeltaIndex(["2000"], dtype="timedelta64[us]")
diff --git a/pandas/tests/indexes/timedeltas/test_delete.py b/pandas/tests/indexes/timedeltas/test_delete.py
new file mode 100644
index 0000000000000..593ed7bb0a1ac
--- /dev/null
+++ b/pandas/tests/indexes/timedeltas/test_delete.py
@@ -0,0 +1,70 @@
+import pytest
+
+from pandas import TimedeltaIndex, timedelta_range
+import pandas._testing as tm
+
+
+class TestTimedeltaIndexDelete:
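+    # deleting at either end of the index preserves freq;
+    # deleting an interior element resets freq to None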
+    def test_delete(self):
+        idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx")
+
+        # preserve freq
+        expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx")
+        expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx")
+
+        # reset freq to None
+        expected_1 = TimedeltaIndex(
+            ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx"
+        )
+
+        cases = {
+            0: expected_0,
+            -5: expected_0,
+            -1: expected_4,
+            4: expected_4,
+            1: expected_1,
+        }
+        for n, expected in cases.items():
+            result = idx.delete(n)
+            tm.assert_index_equal(result, expected)
+            assert result.name == expected.name
+            assert result.freq == expected.freq
+
+        with pytest.raises((IndexError, ValueError)):
+            # either depending on numpy version
+            idx.delete(5)
+
+    def test_delete_slice(self):
+        idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx")
+
+        # preserve freq
+        expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx")
+        expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx")
+
+        # reset freq to None
+        expected_3_5 = TimedeltaIndex(
+            ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx"
+        )
+
+        cases = {
+            (0, 1, 2): expected_0_2,
+            (7, 8, 9): expected_7_9,
+            (3, 4, 5): expected_3_5,
+        }
+        for n, expected in cases.items():
+            result = idx.delete(n)
+            tm.assert_index_equal(result, expected)
+            assert result.name == expected.name
+            assert result.freq == expected.freq
+
+            result = idx.delete(slice(n[0], n[-1] + 1))
+            tm.assert_index_equal(result, expected)
+            assert result.name == expected.name
+            assert result.freq == expected.freq
+
+    def test_delete_doesnt_infer_freq(self):
+        # GH#30655 behavior matches DatetimeIndex
+
+        tdi = TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"])
+        result = tdi.delete(2)
+        assert result.freq is None
diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py
index 5dec799832291..72d7763b549e7 100644
--- a/pandas/tests/indexes/timedeltas/test_indexing.py
+++ b/pandas/tests/indexes/timedeltas/test_indexing.py
@@ -65,6 +65,72 @@ def test_timestamp_invalid_key(self, key):
             tdi.get_loc(key)
 
 
+class TestGetLoc:
+    def test_get_loc(self):
+        idx = pd.to_timedelta(["0 days", "1 days", "2 days"])
+
+        for method in [None, "pad", "backfill", "nearest"]:
+            assert idx.get_loc(idx[1], method) == 1
+            assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1
+            assert idx.get_loc(str(idx[1]), method) == 1
+
+        assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1
+        assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1
+        assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1
+
+        with pytest.raises(ValueError, match="unit abbreviation w/o a number"):
+            idx.get_loc(idx[1], method="nearest", tolerance="foo")
+
+        with pytest.raises(ValueError, match="tolerance size must match"):
+            idx.get_loc(
+                idx[1],
+                method="nearest",
+                tolerance=[
+                    Timedelta(0).to_timedelta64(),
+                    Timedelta(0).to_timedelta64(),
+                ],
+            )
+
+        for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]:
+            assert idx.get_loc("1 day 1 hour", method) == loc
+
+        # GH 16909
+        assert idx.get_loc(idx[1].to_timedelta64()) == 1
+
+        # GH 16896
+        assert idx.get_loc("0 days") == 0
+
+    def test_get_loc_nat(self):
+        tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"])
+
+        assert tidx.get_loc(pd.NaT) == 1
+        assert tidx.get_loc(None) == 1
+        assert tidx.get_loc(float("nan")) == 1
+        assert tidx.get_loc(np.nan) == 1
+
+
+class TestGetIndexer:
+    def test_get_indexer(self):
+        idx = pd.to_timedelta(["0 days", "1 days", "2 days"])
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp)
+        )
+
+        target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"])
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp)
+        )
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp)
+        )
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp)
+        )
+
+        res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour"))
+        tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp))
+
+
 class TestWhere:
     def test_where_doesnt_retain_freq(self):
         tdi = timedelta_range("1 day", periods=3, freq="D", name="idx")
@@ -184,217 +250,6 @@ def test_take_fill_value(self):
         with pytest.raises(ValueError, match=msg):
             idx.take(np.array([1, 0, -5]), fill_value=True)
 
-        with pytest.raises(IndexError):
+        msg = "index -5 is out of bounds for (axis 0 with )?size 3"
+        with pytest.raises(IndexError, match=msg):
             idx.take(np.array([1, -5]))
-
-
-class TestTimedeltaIndex:
-    def test_insert_empty(self):
-        # Corner case inserting with length zero doesnt raise IndexError
-        idx = timedelta_range("1 Day", periods=3)
-        td = idx[0]
-
-        idx[:0].insert(0, td)
-        idx[:0].insert(1, td)
-        idx[:0].insert(-1, td)
-
-    def test_insert(self):
-
-        idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx")
-
-        result = idx.insert(2, timedelta(days=5))
-        exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx")
-        tm.assert_index_equal(result, exp)
-
-        # insertion of non-datetime should coerce to object index
-        result = idx.insert(1, "inserted")
-        expected = Index(
-            [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")],
-            name="idx",
-        )
-        assert not isinstance(result, TimedeltaIndex)
-        tm.assert_index_equal(result, expected)
-        assert result.name == expected.name
-
-        idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx")
-
-        # preserve freq
-        expected_0 = TimedeltaIndex(
-            ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"],
-            name="idx",
-            freq="s",
-        )
-        expected_3 = TimedeltaIndex(
-            ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"],
-            name="idx",
-            freq="s",
-        )
-
-        # reset freq to None
-        expected_1_nofreq = TimedeltaIndex(
-            ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"],
-            name="idx",
-            freq=None,
-        )
-        expected_3_nofreq = TimedeltaIndex(
-            ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"],
-            name="idx",
-            freq=None,
-        )
-
-        cases = [
-            (0, Timedelta("1day"), expected_0),
-            (-3, Timedelta("1day"), expected_0),
-            (3, Timedelta("1day 00:00:04"), expected_3),
-            (1, Timedelta("1day 00:00:01"), expected_1_nofreq),
-            (3, Timedelta("1day 00:00:05"), expected_3_nofreq),
-        ]
-
-        for n, d, expected in cases:
-            result = idx.insert(n, d)
-            tm.assert_index_equal(result, expected)
-            assert result.name == expected.name
-            assert result.freq == expected.freq
-
-    @pytest.mark.parametrize(
-        "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA]
-    )
-    def test_insert_nat(self, null):
-        # GH 18295 (test missing)
-        idx = timedelta_range("1day", "3day")
-        result = idx.insert(1, null)
-        expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"])
-        tm.assert_index_equal(result, expected)
-
-    def test_insert_invalid_na(self):
-        idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx")
-        with
pytest.raises(TypeError, match="incompatible label"): - idx.insert(0, np.datetime64("NaT")) - - def test_insert_dont_cast_strings(self): - # To match DatetimeIndex and PeriodIndex behavior, dont try to - # parse strings to Timedelta - idx = timedelta_range("1day", "3day") - - result = idx.insert(0, "1 Day") - assert result.dtype == object - assert result[0] == "1 Day" - - def test_delete(self): - idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx") - - # preserve freq - expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") - expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") - - # reset freq to None - expected_1 = TimedeltaIndex( - ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" - ) - - cases = { - 0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - with pytest.raises((IndexError, ValueError)): - # either depending on numpy version - idx.delete(5) - - def test_delete_slice(self): - idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") - - # preserve freq - expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") - expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") - - # reset freq to None - expected_3_5 = TimedeltaIndex( - ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" - ) - - cases = { - (0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - result = idx.delete(slice(n[0], n[-1] + 1)) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - def test_get_loc(self): - idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) - - for method in [None, "pad", "backfill", "nearest"]: - assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1 - assert idx.get_loc(str(idx[1]), method) == 1 - - assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1 - assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1 - assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1 - - with pytest.raises(ValueError, match="unit abbreviation w/o a number"): - idx.get_loc(idx[1], method="nearest", tolerance="foo") - - with pytest.raises(ValueError, match="tolerance size must match"): - idx.get_loc( - idx[1], - method="nearest", - tolerance=[ - Timedelta(0).to_timedelta64(), - Timedelta(0).to_timedelta64(), - ], - ) - - for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: - assert idx.get_loc("1 day 1 hour", method) == loc - - # GH 16909 - assert idx.get_loc(idx[1].to_timedelta64()) == 1 - - # GH 16896 - assert idx.get_loc("0 days") == 0 - - def test_get_loc_nat(self): - tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) - - assert tidx.get_loc(pd.NaT) == 1 - assert tidx.get_loc(None) == 1 - assert tidx.get_loc(float("nan")) == 1 - assert tidx.get_loc(np.nan) == 1 - - def test_get_indexer(self): - idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) - tm.assert_numpy_array_equal( - idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) - ) - - target = 
pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) - ) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) - ) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) - ) - - res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) - tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) diff --git a/pandas/tests/indexes/timedeltas/test_insert.py b/pandas/tests/indexes/timedeltas/test_insert.py new file mode 100644 index 0000000000000..b214e009db869 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_insert.py @@ -0,0 +1,101 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, Timedelta, TimedeltaIndex, timedelta_range +import pandas._testing as tm + + +class TestTimedeltaIndexInsert: + def test_insert(self): + + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) + assert not isinstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") + + # preserve freq + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) + + # reset freq to None + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] + ) + def test_insert_nat(self, null): + # GH 18295 (test missing) + idx = timedelta_range("1day", "3day") + result = idx.insert(1, null) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) + tm.assert_index_equal(result, expected) + + def test_insert_invalid_na(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.datetime64("NaT")) + + def test_insert_dont_cast_strings(self): + # To match DatetimeIndex and PeriodIndex behavior, dont try to + # parse strings to Timedelta + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "1 Day") + assert result.dtype == object + assert result[0] == "1 Day" + + def test_insert_empty(self): + # Corner case inserting with 
length zero doesnt raise IndexError + idx = timedelta_range("1 Day", periods=3) + td = idx[0] + + idx[:0].insert(0, td) + idx[:0].insert(1, td) + idx[:0].insert(-1, td) diff --git a/pandas/tests/indexes/timedeltas/test_shift.py b/pandas/tests/indexes/timedeltas/test_shift.py index 98933ff0423ab..c02aa71d97aac 100644 --- a/pandas/tests/indexes/timedeltas/test_shift.py +++ b/pandas/tests/indexes/timedeltas/test_shift.py @@ -71,5 +71,5 @@ def test_tdi_shift_nonstandard_freq(self): def test_shift_no_freq(self): # GH#19147 tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) - with pytest.raises(NullFrequencyError): + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): tdi.shift(2) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 971203d6fc720..fa00b870ca757 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -11,6 +11,7 @@ Series, Timedelta, TimedeltaIndex, + array, date_range, timedelta_range, ) @@ -111,6 +112,26 @@ def test_sort_values(self): tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) + @pytest.mark.parametrize("klass", [list, np.array, array, Series]) + def test_searchsorted_different_argument_classes(self, klass): + idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) + result = idx.searchsorted(klass(idx)) + expected = np.arange(len(idx), dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = idx._data.searchsorted(klass(idx)) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "arg", + [[1, 2], ["a", "b"], [pd.Timestamp("2020-01-01", tz="Europe/London")] * 2], + ) + def test_searchsorted_invalid_argument_dtype(self, arg): + idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) + msg = "searchsorted requires compatible dtype" + with pytest.raises(TypeError, match=msg): + idx.searchsorted(arg) + def test_argmin_argmax(self): idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) assert idx.argmin() == 1 @@ -147,19 +168,6 @@ def test_pass_TimedeltaIndex_to_index(self): tm.assert_numpy_array_equal(idx.values, expected.values) - def test_pickle(self): - - rng = timedelta_range("1 days", periods=10) - rng_p = tm.round_trip_pickle(rng) - tm.assert_index_equal(rng, rng_p) - - def test_hash_error(self): - index = timedelta_range("1 days", periods=10) - with pytest.raises( - TypeError, match=(f"unhashable type: {repr(type(index).__name__)}") - ): - hash(index) - def test_append_numpy_bug_1681(self): td = timedelta_range("1 days", "10 days", freq="2D") @@ -170,13 +178,6 @@ def test_append_numpy_bug_1681(self): result = a.append(c) assert (result["B"] == td).all() - def test_delete_doesnt_infer_freq(self): - # GH#30655 behavior matches DatetimeIndex - - tdi = pd.TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) - result = tdi.delete(2) - assert result.freq is None - def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 1e641760f7e8d..1f19244cf76d3 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -137,7 +137,8 @@ def test_multiindex_setitem(self): tm.assert_frame_equal(df.loc[["bar"]], expected) # raise because these 
have differing levels - with pytest.raises(TypeError): + msg = "cannot align on a multi-index with out specifying the join levels" + with pytest.raises(TypeError, match=msg): df.loc["bar"] *= 2 # from SO @@ -203,10 +204,14 @@ def test_multiindex_assignment(self): tm.assert_series_equal(df.loc[4, "c"], exp) # invalid assignments - with pytest.raises(ValueError): + msg = ( + "cannot set using a multi-index selection indexer " + "with a different length than the value" + ) + with pytest.raises(ValueError, match=msg): df.loc[4, "c"] = [0, 1, 2, 3] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.loc[4, "c"] = [0] # groupby example diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 6fa9d3bd2cdbb..f367a92d0b006 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -111,7 +111,11 @@ def test_per_axis_per_level_getitem(self): expected = df.iloc[[2, 3]] tm.assert_frame_equal(result, expected) - with pytest.raises(ValueError): + msg = ( + "cannot index with a boolean indexer " + "that is not the same length as the index" + ) + with pytest.raises(ValueError, match=msg): df.loc[(slice(None), np.array([True, False])), :] # ambiguous notation @@ -411,7 +415,11 @@ def test_per_axis_per_level_doc_examples(self): tm.assert_frame_equal(result, expected) # not sorted - with pytest.raises(UnsortedIndexError): + msg = ( + "MultiIndex slicing requires the index to be lexsorted: " + r"slicing on levels \[1\], lexsort depth 1" + ) + with pytest.raises(UnsortedIndexError, match=msg): df.loc["A1", ("a", slice("foo"))] # GH 16734: not sorted, but no real slicing @@ -480,14 +488,10 @@ def test_loc_axis_arguments(self): tm.assert_frame_equal(result, expected) # invalid axis - with pytest.raises(ValueError): - df.loc(axis=-1)[:, :, ["C1", "C3"]] - - with pytest.raises(ValueError): - df.loc(axis=2)[:, :, ["C1", "C3"]] - - with pytest.raises(ValueError): - df.loc(axis="foo")[:, :, ["C1", "C3"]] + for i in [-1, 2, "foo"]: + msg = f"No axis named {i} for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.loc(axis=i)[:, :, ["C1", "C3"]] def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): @@ -628,12 +632,14 @@ def test_per_axis_per_level_setitem(self): # not enough values df = df_orig.copy() - with pytest.raises(ValueError): + msg = "setting an array element with a sequence." 
+        with pytest.raises(ValueError, match=msg):
             df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array(
                 [[100], [100, 100]], dtype="int64"
             )
 
-        with pytest.raises(ValueError):
+        msg = "Must have equal len keys and value when setting with an iterable"
+        with pytest.raises(ValueError, match=msg):
             df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array(
                 [100, 100, 100, 100], dtype="int64"
             )
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
index bea8eae9bb850..c390347236ad3 100644
--- a/pandas/tests/indexing/test_coercion.py
+++ b/pandas/tests/indexing/test_coercion.py
@@ -297,7 +297,8 @@ def test_setitem_index_object(self, val, exp_dtype):
 
         if exp_dtype is IndexError:
             temp = obj.copy()
-            with pytest.raises(exp_dtype):
+            msg = "index 5 is out of bounds for axis 0 with size 4"
+            with pytest.raises(exp_dtype, match=msg):
                 temp[5] = 5
         else:
             exp_index = pd.Index(list("abcd") + [val])
diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
index ca853ba5f00f5..e64103bd2cde8 100755
--- a/pandas/tests/io/generate_legacy_storage_files.py
+++ b/pandas/tests/io/generate_legacy_storage_files.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 """
 self-contained to write legacy storage pickle files
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 75b825687209c..f2f7b37170ec9 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
 import os
 import warnings
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index b84fcffe26991..0a096acc9fa6d 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
 import itertools
 import string
diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index 32673b9a0a5cf..7d1cc8bdd3c75 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
 """ Test cases for DataFrame.plot """
 
 from datetime import date, datetime
@@ -1256,6 +1254,16 @@ def test_plot_scatter_with_categorical_data(self, x, y):
 
         _check_plot_works(df.plot.scatter, x=x, y=y)
 
+    @pytest.mark.slow
+    def test_plot_scatter_with_s(self):
+        # this refers to GH 32904
+        df = DataFrame(
+            np.random.random((10, 3)) * 100,
+            columns=["a", "b", "c"],
+        )
+
+        _check_plot_works(df.plot.scatter, x="a", y="b", s="c")
+
     @pytest.mark.slow
     def test_plot_scatter_with_c(self):
         df = DataFrame(
diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py
index 8fec4bb134cb4..238639bd3732d 100644
--- a/pandas/tests/plotting/test_groupby.py
+++ b/pandas/tests/plotting/test_groupby.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
 """ Test cases for GroupBy.plot """
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
index 50ebbc22f2739..fba4f07f6cc0f 100644
--- a/pandas/tests/plotting/test_hist_method.py
+++ b/pandas/tests/plotting/test_hist_method.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
 """ Test cases for .hist method """
 
 import numpy as np
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index 168e8c7de0b83..27039948dfc16 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
 """ Test cases for misc plot functions """
 
 import
numpy as np diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 8463f30bee8f0..5341878d4986e 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ Test cases for Series.plot """ diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index abd99aadfb484..962b105d1e8fc 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -531,13 +531,14 @@ def test_sum_inf(self): res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() + @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"]) @pytest.mark.parametrize("use_bottleneck", [True, False]) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) - def test_empty(self, method, unit, use_bottleneck): + def test_empty(self, method, unit, use_bottleneck, dtype): with pd.option_context("use_bottleneck", use_bottleneck): # GH#9422 / GH#18921 # Entirely empty - s = Series([], dtype=object) + s = Series([], dtype=dtype) # NA by default result = getattr(s, method)() assert result == unit @@ -560,8 +561,14 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(skipna=True, min_count=1) assert pd.isna(result) + result = getattr(s, method)(skipna=False, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=False, min_count=1) + assert pd.isna(result) + # All-NA - s = Series([np.nan]) + s = Series([np.nan], dtype=dtype) # NA by default result = getattr(s, method)() assert result == unit @@ -585,7 +592,7 @@ def test_empty(self, method, unit, use_bottleneck): assert pd.isna(result) # Mix of valid, empty - s = Series([np.nan, 1]) + s = Series([np.nan, 1], dtype=dtype) # Default result = getattr(s, method)() assert result == 1.0 @@ -604,22 +611,22 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(skipna=True, min_count=0) assert result == 1.0 - result = getattr(s, method)(skipna=True, min_count=1) - assert result == 1.0 - # GH#844 (changed in GH#9422) - df = DataFrame(np.empty((10, 0))) + df = DataFrame(np.empty((10, 0)), dtype=dtype) assert (getattr(df, method)(1) == unit).all() - s = pd.Series([1]) + s = pd.Series([1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan]) + result = getattr(s, method)(skipna=False, min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan, 1]) + s = pd.Series([np.nan, 1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 51e6f80df657d..a6a76a1078667 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) - df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) + df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how) expected = pd.DataFrame( [ - 
[1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5], + [0, 0, 0], + [1, 1, 1], + [2, 2, 2], + [np.nan, 3, 3], + [np.nan, 4, 4], + [np.nan, 5, 5], ], columns=["a", "key", "b"], ) @@ -1318,6 +1318,20 @@ def test_merge_right_index_right(self): result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("how", ["left", "right"]) + def test_merge_preserves_row_order(self, how): + # GH 27453 + left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) + result = left_df.merge(right_df, on=["animal", "max_speed"], how=how) + if how == "right": + expected = pd.DataFrame( + {"animal": ["quetzal", "pig"], "max_speed": [80, 11]} + ) + else: + expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + tm.assert_frame_equal(result, expected) + def test_merge_take_missing_values_from_index_of_other_dtype(self): # GH 24212 left = pd.DataFrame( diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index afd8f4178f741..a12395b32ab4e 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1220,13 +1220,17 @@ def test_concat_series_partial_columns_names(self): expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) tm.assert_frame_equal(result, expected) - def test_concat_dict(self): - frames = { - "foo": DataFrame(np.random.randn(4, 3)), - "bar": DataFrame(np.random.randn(4, 3)), - "baz": DataFrame(np.random.randn(4, 3)), - "qux": DataFrame(np.random.randn(4, 3)), - } + @pytest.mark.parametrize("mapping", ["mapping", "dict"]) + def test_concat_mapping(self, mapping, non_dict_mapping_subclass): + constructor = dict if mapping == "dict" else non_dict_mapping_subclass + frames = constructor( + { + "foo": DataFrame(np.random.randn(4, 3)), + "bar": DataFrame(np.random.randn(4, 3)), + "baz": DataFrame(np.random.randn(4, 3)), + "qux": DataFrame(np.random.randn(4, 3)), + } + ) sorted_keys = list(frames.keys()) diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index b51429d0338e3..b21e98827ca92 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -49,7 +49,8 @@ def test_equal(self): assert Interval(0, 1) != 0 def test_comparison(self): - with pytest.raises(TypeError, match="unorderable types"): + msg = "unorderable types" + with pytest.raises(TypeError, match=msg): Interval(0, 1) < 2 assert Interval(0, 1) < Interval(1, 2) @@ -254,6 +255,12 @@ def test_constructor_errors_tz(self, tz_left, tz_right): # GH 18538 left = Timestamp("2017-01-01", tz=tz_left) right = Timestamp("2017-01-02", tz=tz_right) - error = TypeError if com.any_none(tz_left, tz_right) else ValueError - with pytest.raises(error): + + if com.any_none(tz_left, tz_right): + error = TypeError + msg = "Cannot compare tz-naive and tz-aware timestamps" + else: + error = ValueError + msg = "left and right must have the same time zone" + with pytest.raises(error, match=msg): Interval(left, right) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 436810042186a..b9f637c178d53 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -33,7 +33,8 @@ def test_asfreq_near_zero_weekly(self): def test_to_timestamp_out_of_bounds(self): # GH#19643, 
used to incorrectly give Timestamp in 1754 per = Period("0001-01-01", freq="B") - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): per.to_timestamp() def test_asfreq_corner(self): @@ -668,9 +669,10 @@ def test_conv_microsecond(self): assert start.value == per.ordinal * 1000 per2 = Period("2300-01-01", "us") - with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"): + msg = "2300-01-01" + with pytest.raises(OutOfBoundsDatetime, match=msg): per2.start_time - with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"): + with pytest.raises(OutOfBoundsDatetime, match=msg): per2.end_time def test_asfreq_mult(self): diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 1fee40c2a902b..304033f82c7a2 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -79,7 +79,8 @@ def test_construction(self): with pytest.raises(ValueError, match=msg): Period(ordinal=200701) - with pytest.raises(ValueError, match="Invalid frequency: X"): + msg = "Invalid frequency: X" + with pytest.raises(ValueError, match=msg): Period("2007-1-1", freq="X") def test_construction_bday(self): @@ -235,26 +236,34 @@ def test_period_constructor_offsets(self): assert i1 == expected def test_invalid_arguments(self): - with pytest.raises(ValueError): + msg = "Must supply freq for datetime value" + with pytest.raises(ValueError, match=msg): Period(datetime.now()) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Period(datetime.now().date()) - with pytest.raises(ValueError): + msg = "Value must be Period, string, integer, or datetime" + with pytest.raises(ValueError, match=msg): Period(1.6, freq="D") - with pytest.raises(ValueError): + msg = "Ordinal must be an integer" + with pytest.raises(ValueError, match=msg): Period(ordinal=1.6, freq="D") - with pytest.raises(ValueError): + msg = "Only value or ordinal but not both should be given but not both" + with pytest.raises(ValueError, match=msg): Period(ordinal=2, value=1, freq="D") - with pytest.raises(ValueError): + msg = "If value is None, freq cannot be None" + with pytest.raises(ValueError, match=msg): Period(month=1) - with pytest.raises(ValueError): + msg = "Given date string not likely a datetime" + with pytest.raises(ValueError, match=msg): Period("-2000", "A") - with pytest.raises(DateParseError): + msg = "day is out of range for month" + with pytest.raises(DateParseError, match=msg): Period("0", "A") - with pytest.raises(DateParseError): + msg = "Unknown datetime string format, unable to parse" + with pytest.raises(DateParseError, match=msg): Period("1/1/-2000", "A") def test_constructor_corner(self): @@ -1030,7 +1039,8 @@ def test_sub_delta(self): result = left - right assert result == 4 * right.freq - with pytest.raises(IncompatibleFrequency): + msg = r"Input has different freq=M from Period\(freq=A-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): left - Period("2007-01", freq="M") def test_add_integer(self): @@ -1072,10 +1082,14 @@ def test_add_timestamp_raises(self, rbox, lbox): # We may get a different message depending on which class raises # the error. 
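The change that follows replaces a hand-concatenated `"a|b|c"` pattern string with `"|".join([...])`, an idiom this patch applies repeatedly below. The reason it works: `pytest.raises(match=...)` runs `re.search` against the string form of the raised exception, so a regex alternation accepts whichever of several messages the failing code path happens to produce, and building the pattern from a list keeps each alternative on its own line. A small self-contained sketch (the failing operation here is illustrative, not taken from the diff):

```python
import pytest

# Each list entry is one acceptable message fragment; "|".join turns the
# list into a regex alternation, which re.search applies to str(exc).
msg = "|".join(
    [
        "cannot add",
        "unsupported operand",
        "can only concatenate str",
    ]
)

with pytest.raises(TypeError, match=msg):
    None + 1  # "unsupported operand type(s) ..." hits the second alternative
```

One caveat the patch is careful about: alternatives containing regex metacharacters must be escaped, as in the raw string `r'can only concatenate str \(not "Timedelta"\) to str'` used further down.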
- msg = ( - r"cannot add|unsupported operand|" - r"can only operate on a|incompatible type|" - r"ufunc add cannot use operands" + msg = "|".join( + [ + "cannot add", + "unsupported operand", + "can only operate on a", + "incompatible type", + "ufunc add cannot use operands", + ] ) with pytest.raises(TypeError, match=msg): lbox(ts) + rbox(per) @@ -1148,14 +1162,22 @@ def test_add_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p for freq in ["M", "2M", "3M"]: @@ -1175,14 +1197,22 @@ def test_add_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p # freq is Tick @@ -1199,12 +1229,13 @@ def test_add_offset(self): exp = Period("2011-04-03", freq=freq) assert p + np.timedelta64(2, "D") == exp - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): np.timedelta64(2, "D") + p exp = Period("2011-04-02", freq=freq) assert p + np.timedelta64(3600 * 24, "s") == exp - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): np.timedelta64(3600 * 24, "s") + p exp = Period("2011-03-30", freq=freq) @@ -1222,14 +1253,22 @@ def test_add_offset(self): np.timedelta64(4, "h"), timedelta(hours=23), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p for freq in ["H", "2H", "3H"]: @@ -1243,14 +1282,15 @@ def test_add_offset(self): assert p + offsets.Hour(3) == exp assert offsets.Hour(3) + p == exp + msg = "cannot use operands with types" exp = Period("2011-04-01 12:00", freq=freq) assert p + np.timedelta64(3, "h") == exp - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): np.timedelta64(3, "h") + p exp = Period("2011-04-01 10:00", freq=freq) assert p + np.timedelta64(3600, "s") == exp - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): np.timedelta64(3600, "s") + p exp = Period("2011-04-01 11:00", freq=freq) @@ -1268,18 +1308,27 @@ def test_add_offset(self): np.timedelta64(3200, "s"), 
timedelta(hours=23, minutes=30), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p def test_sub_offset(self): # freq is DateOffset + msg = "Input has different freq|Input cannot be converted to Period" for freq in ["A", "2A", "3A"]: p = Period("2011", freq=freq) assert p - offsets.YearEnd(2) == Period("2009", freq=freq) @@ -1291,7 +1340,7 @@ def test_sub_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o for freq in ["M", "2M", "3M"]: @@ -1306,7 +1355,7 @@ def test_sub_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o # freq is Tick @@ -1326,7 +1375,7 @@ def test_sub_offset(self): np.timedelta64(4, "h"), timedelta(hours=23), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o for freq in ["H", "2H", "3H"]: @@ -1349,7 +1398,7 @@ def test_sub_offset(self): np.timedelta64(3200, "s"), timedelta(hours=23, minutes=30), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) @@ -1377,12 +1426,14 @@ def test_period_ops_offset(self): def test_period_immutable(): # see gh-17116 + msg = "not writable" + per = Period("2014Q1") - with pytest.raises(AttributeError): + with pytest.raises(AttributeError, match=msg): per.ordinal = 14 freq = per.freq - with pytest.raises(AttributeError): + with pytest.raises(AttributeError, match=msg): per.freq = 2 * freq diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 07656de2e9062..a0e3f8984fbe4 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -23,10 +23,12 @@ def test_repr(): def test_truthiness(): - with pytest.raises(TypeError): + msg = "boolean value of NA is ambiguous" + + with pytest.raises(TypeError, match=msg): bool(NA) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): not NA @@ -145,7 +147,8 @@ def test_logical_and(): assert False & NA is False assert NA & NA is NA - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): NA & 5 @@ -157,7 +160,8 @@ def test_logical_or(): assert False | NA is NA assert NA | NA is NA - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): NA | 5 @@ -169,7 +173,8 @@ def test_logical_xor(): assert False ^ NA is NA assert NA ^ NA is NA - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): NA ^ 5 @@ -216,7 +221,8 @@ def test_ufunc(): def test_ufunc_raises(): - with pytest.raises(ValueError, match="ufunc method 'at'"): + msg = "ufunc method 'at'" + with pytest.raises(ValueError, match=msg): np.log.at(pd.NA, 0) diff --git a/pandas/tests/scalar/test_nat.py 
b/pandas/tests/scalar/test_nat.py index f94b96b47fc05..0e5414a8b4d2d 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -393,12 +393,14 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): elif val_type == "str": # un-specific check here because the message comes from str # and varies by method - msg = ( - "can only concatenate str|" - "unsupported operand type|" - "can't multiply sequence|" - "Can't convert 'NaTType'|" - "must be str, not NaTType" + msg = "|".join( + [ + "can only concatenate str", + "unsupported operand type", + "can't multiply sequence", + "Can't convert 'NaTType'", + "must be str, not NaTType", + ] ) else: msg = "unsupported operand type" diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 3cb868dd88605..12572648fca9e 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -89,10 +89,11 @@ def test_td_add_datetimelike_scalar(self, op): assert result is NaT def test_td_add_timestamp_overflow(self): - with pytest.raises(OverflowError): + msg = "int too (large|big) to convert" + with pytest.raises(OverflowError, match=msg): Timestamp("1700-01-01") + Timedelta(13 * 19999, unit="D") - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timestamp("1700-01-01") + timedelta(days=13 * 19999) @pytest.mark.parametrize("op", [operator.add, ops.radd]) @@ -180,14 +181,15 @@ def test_td_sub_offset(self): def test_td_add_sub_numeric_raises(self): td = Timedelta(10, unit="d") + msg = "unsupported operand type" for other in [2, 2.0, np.int64(2), np.float64(2)]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td + other - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): other + td - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td - other - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): other - td def test_td_rsub_nat(self): @@ -228,7 +230,8 @@ def test_td_rsub_mixed_most_timedeltalike_object_dtype_array(self): # GH#21980 now = Timestamp.now() arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) - with pytest.raises(TypeError): + msg = r"unsupported operand type\(s\) for \-: 'Timedelta' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): Timedelta("1D") - arr @pytest.mark.parametrize("op", [operator.add, ops.radd]) @@ -322,7 +325,8 @@ class TestTimedeltaMultiplicationDivision: def test_td_mul_nat(self, op, td_nat): # GH#19819 td = Timedelta(10, unit="d") - with pytest.raises(TypeError): + msg = "cannot use operands with types|Cannot multiply Timedelta with NaT" + with pytest.raises(TypeError, match=msg): op(td, td_nat) @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) @@ -349,11 +353,12 @@ def test_td_mul_scalar(self, op): assert op(-1, td).value == -1 * td.value assert op(-1.0, td).value == -1.0 * td.value - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): # timedelta * datetime is gibberish op(td, Timestamp(2016, 1, 2)) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): # invalid multiply with another timedelta op(td, td) @@ -452,10 +457,12 @@ def test_td_rdiv_na_scalar(self): result = np.timedelta64("NaT") / td assert np.isnan(result) - with pytest.raises(TypeError, match="cannot use operands with types dtype"): + msg = "cannot use operands with 
types dtype" + with pytest.raises(TypeError, match=msg): np.datetime64("NaT") / td - with pytest.raises(TypeError, match="Cannot divide float by Timedelta"): + msg = "Cannot divide float by Timedelta" + with pytest.raises(TypeError, match=msg): np.nan / td def test_td_rdiv_ndarray(self): @@ -472,11 +479,13 @@ def test_td_rdiv_ndarray(self): tm.assert_numpy_array_equal(result, expected) arr = np.array([np.nan], dtype=object) - with pytest.raises(TypeError, match="Cannot divide float by Timedelta"): + msg = "Cannot divide float by Timedelta" + with pytest.raises(TypeError, match=msg): arr / td arr = np.array([np.nan], dtype=np.float64) - with pytest.raises(TypeError, match="cannot use operands with types dtype"): + msg = "cannot use operands with types dtype" + with pytest.raises(TypeError, match=msg): arr / td # --------------------------------------------------------------- @@ -509,7 +518,13 @@ def test_td_floordiv_invalid_scalar(self): # GH#18846 td = Timedelta(hours=3, minutes=4) - with pytest.raises(TypeError): + msg = "|".join( + [ + r"Invalid dtype datetime64\[D\] for __floordiv__", + "'dtype' is an invalid keyword argument for this function", + ] + ) + with pytest.raises(TypeError, match=msg): td // np.datetime64("2016-01-01", dtype="datetime64[us]") def test_td_floordiv_numeric_scalar(self): @@ -580,7 +595,8 @@ def test_td_rfloordiv_invalid_scalar(self): td = Timedelta(hours=3, minutes=3) dt64 = np.datetime64("2016-01-01", "us") - with pytest.raises(TypeError): + msg = r"Invalid dtype datetime64\[us\] for __floordiv__" + with pytest.raises(TypeError, match=msg): td.__rfloordiv__(dt64) def test_td_rfloordiv_numeric_scalar(self): @@ -591,11 +607,12 @@ def test_td_rfloordiv_numeric_scalar(self): assert td.__rfloordiv__(3.5) is NotImplemented assert td.__rfloordiv__(2) is NotImplemented - with pytest.raises(TypeError): + msg = "Invalid dtype" + with pytest.raises(TypeError, match=msg): td.__rfloordiv__(np.float64(2.0)) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td.__rfloordiv__(np.uint8(9)) - with pytest.raises(TypeError, match="Invalid dtype"): + with pytest.raises(TypeError, match=msg): # deprecated GH#19761, enforced GH#29797 td.__rfloordiv__(np.int32(2.0)) @@ -620,7 +637,8 @@ def test_td_rfloordiv_intarray(self): # deprecated GH#19761, enforced GH#29797 ints = np.array([1349654400, 1349740800, 1349827200, 1349913600]) * 10 ** 9 - with pytest.raises(TypeError, match="Invalid dtype"): + msg = "Invalid dtype" + with pytest.raises(TypeError, match=msg): ints // Timedelta(1, unit="s") def test_td_rfloordiv_numeric_series(self): @@ -630,7 +648,8 @@ def test_td_rfloordiv_numeric_series(self): res = td.__rfloordiv__(ser) assert res is NotImplemented - with pytest.raises(TypeError, match="Invalid dtype"): + msg = "Invalid dtype" + with pytest.raises(TypeError, match=msg): # Deprecated GH#19761, enforced GH#29797 # TODO: GH-19761. Change to TypeError. 
ser // td @@ -697,11 +716,11 @@ def test_mod_numeric(self): def test_mod_invalid(self): # GH#19365 td = Timedelta(hours=37) - - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): td % Timestamp("2018-01-22") - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td % [] def test_rmod_pytimedelta(self): @@ -723,16 +742,18 @@ def test_rmod_invalid(self): # GH#19365 td = Timedelta(minutes=3) - with pytest.raises(TypeError): + msg = "unsupported operand" + with pytest.raises(TypeError, match=msg): Timestamp("2018-01-22") % td - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): 15 % td - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): 16.0 % td - with pytest.raises(TypeError): + msg = "Invalid dtype int" + with pytest.raises(TypeError, match=msg): np.array([22, 24]) % td # ---------------------------------------------------------------- @@ -783,7 +804,8 @@ def test_divmod_invalid(self): # GH#19365 td = Timedelta(days=2, hours=6) - with pytest.raises(TypeError): + msg = r"unsupported operand type\(s\) for //: 'Timedelta' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): divmod(td, Timestamp("2018-01-22")) def test_rdivmod_pytimedelta(self): @@ -802,17 +824,19 @@ def test_rdivmod_offset(self): def test_rdivmod_invalid(self): # GH#19365 td = Timedelta(minutes=3) + msg = "unsupported operand type" - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): divmod(Timestamp("2018-01-22"), td) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): divmod(15, td) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): divmod(16.0, td) - with pytest.raises(TypeError): + msg = "Invalid dtype int" + with pytest.raises(TypeError, match=msg): divmod(np.array([22, 24]), td) # ---------------------------------------------------------------- @@ -828,7 +852,8 @@ def test_rdivmod_invalid(self): ], ) def test_td_op_timedelta_timedeltalike_array(self, op, arr): - with pytest.raises(TypeError): + msg = "unsupported operand type|cannot use operands with types" + with pytest.raises(TypeError, match=msg): op(arr, Timedelta("1D")) @@ -918,13 +943,14 @@ def __gt__(self, other): def test_compare_unknown_type(self, val): # GH#20829 t = Timedelta("1s") - with pytest.raises(TypeError): + msg = "Cannot compare type Timedelta with type (int|str)" + with pytest.raises(TypeError, match=msg): t >= val - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): t > val - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): t <= val - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): t < val @@ -948,10 +974,18 @@ def test_ops_error_str(): for left, right in [(td, "a"), ("a", td)]: - with pytest.raises(TypeError): + msg = "|".join( + [ + "unsupported operand type", + r'can only concatenate str \(not "Timedelta"\) to str', + "must be str, not Timedelta", + ] + ) + with pytest.raises(TypeError, match=msg): left + right - with pytest.raises(TypeError): + msg = "Cannot compare type" + with pytest.raises(TypeError, match=msg): left > right assert not left == right diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index d32d1994cac74..ec3c6e9e3a326 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -51,6 +51,7 @@ def 
test_construction():
     assert Timedelta("1 milli") == timedelta(milliseconds=1)
     assert Timedelta("1 millisecond") == timedelta(milliseconds=1)
     assert Timedelta("1 us") == timedelta(microseconds=1)
+    assert Timedelta("1 µs") == timedelta(microseconds=1)
     assert Timedelta("1 micros") == timedelta(microseconds=1)
     assert Timedelta("1 microsecond") == timedelta(microseconds=1)
     assert Timedelta("1.5 microsecond") == Timedelta("00:00:00.000001500")
@@ -79,22 +80,26 @@ def test_construction():
 
     # Currently invalid as it has a - on the hh:mm:dd part
     # (only allowed on the days)
-    with pytest.raises(ValueError):
+    msg = "only leading negative signs are allowed"
+    with pytest.raises(ValueError, match=msg):
         Timedelta("-10 days -1 h 1.5m 1s 3us")
 
     # only leading neg signs are allowed
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match=msg):
         Timedelta("10 days -1 h 1.5m 1s 3us")
 
     # no units specified
-    with pytest.raises(ValueError):
+    msg = "no units specified"
+    with pytest.raises(ValueError, match=msg):
         Timedelta("3.1415")
 
     # invalid construction
-    with pytest.raises(ValueError, match="cannot construct a Timedelta"):
+    msg = "cannot construct a Timedelta"
+    with pytest.raises(ValueError, match=msg):
         Timedelta()
 
-    with pytest.raises(ValueError, match="unit abbreviation w/o a number"):
+    msg = "unit abbreviation w/o a number"
+    with pytest.raises(ValueError, match=msg):
         Timedelta("foo")
 
     msg = (
@@ -121,7 +126,8 @@ def test_construction():
         assert result == expected
 
     assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00")
-    with pytest.raises(ValueError):
+    msg = "unit abbreviation w/o a number"
+    with pytest.raises(ValueError, match=msg):
         Timedelta("foo bar")
@@ -177,16 +183,18 @@ def test_td_from_repr_roundtrip(val):
 
 
 def test_overflow_on_construction():
+    msg = "int too (large|big) to convert"
+
     # GH#3374
     value = Timedelta("1day").value * 20169940
-    with pytest.raises(OverflowError):
+    with pytest.raises(OverflowError, match=msg):
         Timedelta(value)
 
     # xref GH#17637
-    with pytest.raises(OverflowError):
+    with pytest.raises(OverflowError, match=msg):
         Timedelta(7 * 19999, unit="D")
 
-    with pytest.raises(OverflowError):
+    with pytest.raises(OverflowError, match=msg):
         Timedelta(timedelta(days=13 * 19999))
 
 
@@ -272,7 +280,8 @@ def test_td_constructor_on_nanoseconds(constructed_td, conversion):
 
 
 def test_td_constructor_value_error():
-    with pytest.raises(TypeError):
+    msg = "Invalid type <class 'str'>. Must be int or float."
+ with pytest.raises(TypeError, match=msg): Timedelta(nanoseconds="abc") diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 0f2486be3a626..38e77321418d1 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -408,9 +408,11 @@ def conv(v): assert Timedelta(" - 10000D ") == -conv(np.timedelta64(10000, "D")) # invalid - with pytest.raises(ValueError): + msg = "invalid unit abbreviation" + with pytest.raises(ValueError, match=msg): Timedelta("1foo") - with pytest.raises(ValueError): + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): Timedelta("foo") def test_full_format_converters(self): @@ -439,7 +441,8 @@ def conv(v): ) # invalid - with pytest.raises(ValueError): + msg = "have leftover units" + with pytest.raises(ValueError, match=msg): Timedelta("- 1days, 00") def test_pickle(self): @@ -476,20 +479,21 @@ def test_implementation_limits(self): # Beyond lower limit, a NAT before the Overflow assert (min_td - Timedelta(1, "ns")) is NaT - with pytest.raises(OverflowError): + msg = "int too (large|big) to convert" + with pytest.raises(OverflowError, match=msg): min_td - Timedelta(2, "ns") - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): max_td + Timedelta(1, "ns") # Same tests using the internal nanosecond values td = Timedelta(min_td.value - 1, "ns") assert td is NaT - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timedelta(min_td.value - 2, "ns") - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timedelta(max_td.value + 1, "ns") def test_total_seconds_precision(self): diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index ccd7bf721430a..ee70d1d0432fc 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -90,7 +90,8 @@ def test_rsub_dtscalars(self, tz_naive_fixture): if tz_naive_fixture is None: assert other.to_datetime64() - ts == td else: - with pytest.raises(TypeError, match="subtraction must have"): + msg = "subtraction must have" + with pytest.raises(TypeError, match=msg): other.to_datetime64() - ts def test_timestamp_sub_datetime(self): @@ -195,7 +196,8 @@ def test_add_int_no_freq_raises(self, ts, other): with pytest.raises(TypeError, match=msg): ts - other - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): other - ts @pytest.mark.parametrize( @@ -215,14 +217,15 @@ def test_add_int_no_freq_raises(self, ts, other): ], ) def test_add_int_with_freq(self, ts, other): - - with pytest.raises(TypeError): + msg = "Addition/subtraction of integers and integer-arrays" + with pytest.raises(TypeError, match=msg): ts + other - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): other + ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts - other - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): other - ts diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index fce4fa6eb1eaa..4581e736b2ea1 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -28,7 +28,8 @@ def test_comparison_object_array(self): # tzaware 
mismatch arr = np.array([naive], dtype=object) - with pytest.raises(TypeError): + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): arr < ts def test_comparison(self): @@ -85,30 +86,31 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture): a = Timestamp("3/12/2012") b = Timestamp("3/12/2012", tz=utc_fixture) - with pytest.raises(TypeError): + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): a == b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a != b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a < b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a <= b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a > b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a >= b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b == a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b != a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b < a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b <= a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b > a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b >= a assert not a == b.to_pydatetime() diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4c75d1ebcd377..770753f42a4c8 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -165,20 +165,25 @@ def test_constructor_with_stringoffset(self): assert result == eval(repr(result)) def test_constructor_invalid(self): - with pytest.raises(TypeError, match="Cannot convert input"): + msg = "Cannot convert input" + with pytest.raises(TypeError, match=msg): Timestamp(slice(2)) - with pytest.raises(ValueError, match="Cannot convert Period"): + msg = "Cannot convert Period" + with pytest.raises(ValueError, match=msg): Timestamp(Period("1000-01-01")) def test_constructor_invalid_tz(self): # GH#17690 - with pytest.raises(TypeError, match="must be a datetime.tzinfo"): + msg = "must be a datetime.tzinfo" + with pytest.raises(TypeError, match=msg): Timestamp("2017-10-22", tzinfo="US/Eastern") - with pytest.raises(ValueError, match="at most one of"): + msg = "at most one of" + with pytest.raises(ValueError, match=msg): Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") - with pytest.raises(ValueError, match="Invalid frequency:"): + msg = "Invalid frequency:" + with pytest.raises(ValueError, match=msg): # GH#5168 # case where user tries to pass tz as an arg, not kwarg, gets # interpreted as a `freq` @@ -189,7 +194,8 @@ def test_constructor_strptime(self): # Test support for Timestamp.strptime fmt = "%Y%m%d-%H%M%S-%f%z" ts = "20190129-235348-000001+0000" - with pytest.raises(NotImplementedError): + msg = r"Timestamp.strptime\(\) is not implemented" + with pytest.raises(NotImplementedError, match=msg): Timestamp.strptime(ts, fmt) def test_constructor_tz_or_tzinfo(self): @@ -206,15 +212,20 @@ def test_constructor_tz_or_tzinfo(self): def test_constructor_positional(self): # see gh-10758 - with pytest.raises(TypeError): + msg = "an integer is required" + with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) - with pytest.raises(ValueError): + + msg = "month must be in 1..12" 
+ with pytest.raises(ValueError, match=msg): Timestamp(2000, 0, 1) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(2000, 13, 1) - with pytest.raises(ValueError): + + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): Timestamp(2000, 1, 0) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(2000, 1, 32) # see gh-11630 @@ -225,15 +236,20 @@ def test_constructor_positional(self): def test_constructor_keyword(self): # GH 10758 - with pytest.raises(TypeError): + msg = "function missing required argument 'day'|Required argument 'day'" + with pytest.raises(TypeError, match=msg): Timestamp(year=2000, month=1) - with pytest.raises(ValueError): + + msg = "month must be in 1..12" + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=0, day=1) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=13, day=1) - with pytest.raises(ValueError): + + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=1, day=0) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=1, day=32) assert repr(Timestamp(year=2015, month=11, day=12)) == repr( @@ -313,7 +329,8 @@ def test_constructor_nanosecond(self, result): @pytest.mark.parametrize("z", ["Z0", "Z00"]) def test_constructor_invalid_Z0_isostring(self, z): # GH 8910 - with pytest.raises(ValueError): + msg = "could not convert string to Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(f"2014-11-02 01:00{z}") @pytest.mark.parametrize( @@ -331,14 +348,17 @@ def test_constructor_invalid_Z0_isostring(self, z): ) def test_invalid_date_kwarg_with_string_input(self, arg): kwarg = {arg: 1} - with pytest.raises(ValueError): + msg = "Cannot pass a date attribute keyword argument" + with pytest.raises(ValueError, match=msg): Timestamp("2010-10-10 12:59:59.999999999", **kwarg) def test_out_of_bounds_integer_value(self): # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError - with pytest.raises(OutOfBoundsDatetime): + msg = str(Timestamp.max.value * 2) + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(Timestamp.max.value * 2) - with pytest.raises(OutOfBoundsDatetime): + msg = str(Timestamp.min.value * 2) + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(Timestamp.min.value * 2) def test_out_of_bounds_value(self): @@ -353,25 +373,28 @@ def test_out_of_bounds_value(self): Timestamp(min_ts_us) Timestamp(max_ts_us) + msg = "Out of bounds" # One us less than the minimum is an error - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(min_ts_us - one_us) # One us more than the maximum is an error - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(max_ts_us + one_us) def test_out_of_bounds_string(self): - with pytest.raises(ValueError): + msg = "Out of bounds" + with pytest.raises(ValueError, match=msg): Timestamp("1676-01-01") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp("2263-01-01") def test_barely_out_of_bounds(self): # GH#19529 # GH#19382 close enough to bounds that dropping nanos would result # in an in-bounds datetime - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp: 2262-04-11 23:47:16" + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp("2262-04-11 23:47:16.854775808") 
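A note on the idiom applied throughout this patch: the match= argument of pytest.raises is a regular expression, not a literal substring; pytest checks it with re.search against str(excinfo.value). Plain fragments such as "Out of bounds" therefore work unescaped, while messages containing regex metacharacters must be escaped, which is why the strptime pattern above is written as a raw string. A minimal self-contained sketch of these semantics, illustrative only and not part of the patch (raise_not_implemented is a hypothetical stand-in for Timestamp.strptime):

    import re

    import pytest


    def raise_not_implemented():
        raise NotImplementedError("Timestamp.strptime() is not implemented")


    def test_match_is_a_regex_search():
        # match= may hit anywhere in the message (re.search semantics), but
        # the parentheses must be escaped: an unescaped "()" parses as an
        # empty regex group and fails to match the literal "()" in the text.
        with pytest.raises(
            NotImplementedError, match=r"strptime\(\) is not implemented"
        ):
            raise_not_implemented()

        # the check pytest performs internally:
        assert re.search(
            r"strptime\(\) is not implemented",
            "Timestamp.strptime() is not implemented",
        )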
def test_bounds_with_different_units(self): @@ -382,7 +405,8 @@ def test_bounds_with_different_units(self): for date_string in out_of_bounds_dates: for unit in time_units: dt64 = np.datetime64(date_string, unit) - with pytest.raises(ValueError): + msg = "Out of bounds" + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(dt64) in_bounds_dates = ("1677-09-23", "2262-04-11") @@ -449,7 +473,8 @@ def test_today(self): def test_disallow_setting_tz(self, tz): # GH 3746 ts = Timestamp("2010") - with pytest.raises(AttributeError): + msg = "Cannot directly set timezone" + with pytest.raises(AttributeError, match=msg): ts.tz = tz @pytest.mark.parametrize("offset", ["+0300", "+0200"]) @@ -476,16 +501,19 @@ def test_construct_timestamp_preserve_original_frequency(self): def test_constructor_invalid_frequency(self): # GH 22311 - with pytest.raises(ValueError, match="Invalid frequency:"): + msg = "Invalid frequency:" + with pytest.raises(ValueError, match=msg): Timestamp("2012-01-01", freq=[]) @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} - with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + msg = "Cannot pass a datetime or Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tz="US/Pacific") - with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + msg = "Cannot pass a datetime or Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) def test_dont_convert_dateutil_utc_to_pytz_utc(self): diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index cfa7da810ada1..9611c827be6fe 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -21,19 +21,20 @@ class TestTimestampTZOperations: # Timestamp.tz_localize def test_tz_localize_pushes_out_of_bounds(self): + msg = "^$" # GH#12677 # tz_localize that pushes away from the boundary is OK pac = Timestamp.min.tz_localize("US/Pacific") assert pac.value > Timestamp.min.value pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp.min.tz_localize("Asia/Tokyo") # tz_localize that pushes away from the boundary is OK tokyo = Timestamp.max.tz_localize("Asia/Tokyo") assert tokyo.value < Timestamp.max.value tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp.max.tz_localize("US/Pacific") def test_tz_localize_ambiguous_bool(self): @@ -43,7 +44,8 @@ def test_tz_localize_ambiguous_bool(self): expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") - with pytest.raises(pytz.AmbiguousTimeError): + msg = "Cannot infer dst time from 2015-11-01 01:00:03" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): ts.tz_localize("US/Central") result = ts.tz_localize("US/Central", ambiguous=True) @@ -58,7 +60,8 @@ def test_tz_localize_ambiguous(self): ts_no_dst = ts.tz_localize("US/Eastern", ambiguous=False) assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 - with pytest.raises(ValueError): + msg = "Cannot infer offset with only one time" + with 
pytest.raises(ValueError, match=msg): ts.tz_localize("US/Eastern", ambiguous="infer") # GH#8025 @@ -82,24 +85,29 @@ def test_tz_localize_ambiguous(self): def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) - with pytest.raises(NonExistentTimeError): + with pytest.raises(NonExistentTimeError, match=stamp): ts.tz_localize(tz) # GH 22644 - with pytest.raises(NonExistentTimeError): + with pytest.raises(NonExistentTimeError, match=stamp): ts.tz_localize(tz, nonexistent="raise") assert ts.tz_localize(tz, nonexistent="NaT") is NaT def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") - with pytest.raises(AmbiguousTimeError): + msg = "Cannot infer dst time from 2015-11-01 01:00:00," + with pytest.raises(AmbiguousTimeError, match=msg): ts.tz_localize("US/Pacific", ambiguous="raise") def test_tz_localize_nonexistent_invalid_arg(self): # GH 22644 tz = "Europe/Warsaw" ts = Timestamp("2015-03-29 02:00:00") - with pytest.raises(ValueError): + msg = ( + "The nonexistent argument must be one of 'raise', 'NaT', " + "'shift_forward', 'shift_backward' or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent="foo") @pytest.mark.parametrize( @@ -117,7 +125,8 @@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): localized = ts.tz_localize(tz) assert localized == Timestamp(stamp, tz=tz) - with pytest.raises(TypeError): + msg = "Cannot localize tz-aware Timestamp" + with pytest.raises(TypeError, match=msg): localized.tz_localize(tz) reset = localized.tz_localize(None) @@ -249,9 +258,14 @@ def test_timestamp_tz_localize_nonexistent_NaT(self, tz): def test_timestamp_tz_localize_nonexistent_raise(self, tz): # GH 8917 ts = Timestamp("2015-03-29 02:20:00") - with pytest.raises(pytz.NonExistentTimeError): + msg = "2015-03-29 02:20:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): ts.tz_localize(tz, nonexistent="raise") - with pytest.raises(ValueError): + msg = ( + "The nonexistent argument must be one of 'raise', 'NaT', " + "'shift_forward', 'shift_backward' or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent="foo") # ------------------------------------------------------------------ @@ -327,14 +341,16 @@ def test_timestamp_constructor_near_dst_boundary(self): expected = Timestamp("2015-10-25 01:00").tz_localize(tz) assert result == expected - with pytest.raises(pytz.AmbiguousTimeError): + msg = "Cannot infer dst time from 2015-10-25 02:00:00" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): Timestamp("2015-10-25 02:00", tz=tz) result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") assert result == expected - with pytest.raises(pytz.NonExistentTimeError): + msg = "2017-03-26 02:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") # GH#11708 @@ -352,7 +368,8 @@ def test_timestamp_constructor_near_dst_boundary(self): expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") assert result == expected - with pytest.raises(pytz.NonExistentTimeError): + msg = "2017-03-26 02:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 
78e795e71cd07..e657559b55d5a 100644
--- a/pandas/tests/scalar/timestamp/test_unary_ops.py
+++ b/pandas/tests/scalar/timestamp/test_unary_ops.py
@@ -166,7 +166,8 @@ def test_round_dst_border_ambiguous(self, method):
         result = getattr(ts, method)("H", ambiguous="NaT")
         assert result is NaT

-        with pytest.raises(pytz.AmbiguousTimeError):
+        msg = "Cannot infer dst time"
+        with pytest.raises(pytz.AmbiguousTimeError, match=msg):
             getattr(ts, method)("H", ambiguous="raise")
@@ -187,7 +188,8 @@ def test_round_dst_border_nonexistent(self, method, ts_str, freq):
         result = getattr(ts, method)(freq, nonexistent="NaT")
         assert result is NaT

-        with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"):
+        msg = "2018-03-11 02:00:00"
+        with pytest.raises(pytz.NonExistentTimeError, match=msg):
             getattr(ts, method)(freq, nonexistent="raise")
@@ -298,14 +300,16 @@ def test_replace_invalid_kwarg(self, tz_aware_fixture):
         tz = tz_aware_fixture
         # GH#14621, GH#7825
         ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz)
-        with pytest.raises(TypeError):
+        msg = r"replace\(\) got an unexpected keyword argument"
+        with pytest.raises(TypeError, match=msg):
             ts.replace(foo=5)

     def test_replace_integer_args(self, tz_aware_fixture):
         tz = tz_aware_fixture
         # GH#14621, GH#7825
         ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz)
-        with pytest.raises(ValueError):
+        msg = "value must be an integer, received"
+        with pytest.raises(ValueError, match=msg):
             ts.replace(hour=0.1)

     def test_replace_tzinfo_equiv_tz_localize_none(self):
diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py
index a3c431696b689..b45f831ff00aa 100644
--- a/pandas/tests/series/indexing/test_alter_index.py
+++ b/pandas/tests/series/indexing/test_alter_index.py
@@ -8,162 +8,6 @@
 import pandas._testing as tm


-@pytest.mark.parametrize(
-    "first_slice,second_slice",
-    [
-        [[2, None], [None, -5]],
-        [[None, 0], [None, -5]],
-        [[None, -5], [None, 0]],
-        [[None, 0], [None, 0]],
-    ],
-)
-@pytest.mark.parametrize("fill", [None, -1])
-def test_align(datetime_series, first_slice, second_slice, join_type, fill):
-    a = datetime_series[slice(*first_slice)]
-    b = datetime_series[slice(*second_slice)]
-
-    aa, ab = a.align(b, join=join_type, fill_value=fill)
-
-    join_index = a.index.join(b.index, how=join_type)
-    if fill is not None:
-        diff_a = aa.index.difference(join_index)
-        diff_b = ab.index.difference(join_index)
-        if len(diff_a) > 0:
-            assert (aa.reindex(diff_a) == fill).all()
-        if len(diff_b) > 0:
-            assert (ab.reindex(diff_b) == fill).all()
-
-    ea = a.reindex(join_index)
-    eb = b.reindex(join_index)
-
-    if fill is not None:
-        ea = ea.fillna(fill)
-        eb = eb.fillna(fill)
-
-    tm.assert_series_equal(aa, ea)
-    tm.assert_series_equal(ab, eb)
-    assert aa.name == "ts"
-    assert ea.name == "ts"
-    assert ab.name == "ts"
-    assert eb.name == "ts"
-
-
-@pytest.mark.parametrize(
-    "first_slice,second_slice",
-    [
-        [[2, None], [None, -5]],
-        [[None, 0], [None, -5]],
-        [[None, -5], [None, 0]],
-        [[None, 0], [None, 0]],
-    ],
-)
-@pytest.mark.parametrize("method", ["pad", "bfill"])
-@pytest.mark.parametrize("limit", [None, 1])
-def test_align_fill_method(
-    datetime_series, first_slice, second_slice, join_type, method, limit
-):
-    a = datetime_series[slice(*first_slice)]
-    b = datetime_series[slice(*second_slice)]
-
-    aa, ab = a.align(b, join=join_type, method=method, limit=limit)
-
-    join_index = a.index.join(b.index, how=join_type)
-    ea = 
a.reindex(join_index) - eb = b.reindex(join_index) - - ea = ea.fillna(method=method, limit=limit) - eb = eb.fillna(method=method, limit=limit) - - tm.assert_series_equal(aa, ea) - tm.assert_series_equal(ab, eb) - - -def test_align_nocopy(datetime_series): - b = datetime_series[:5].copy() - - # do copy - a = datetime_series.copy() - ra, _ = a.align(b, join="left") - ra[:5] = 5 - assert not (a[:5] == 5).any() - - # do not copy - a = datetime_series.copy() - ra, _ = a.align(b, join="left", copy=False) - ra[:5] = 5 - assert (a[:5] == 5).all() - - # do copy - a = datetime_series.copy() - b = datetime_series[:5].copy() - _, rb = a.align(b, join="right") - rb[:3] = 5 - assert not (b[:3] == 5).any() - - # do not copy - a = datetime_series.copy() - b = datetime_series[:5].copy() - _, rb = a.align(b, join="right", copy=False) - rb[:2] = 5 - assert (b[:2] == 5).all() - - -def test_align_same_index(datetime_series): - a, b = datetime_series.align(datetime_series, copy=False) - assert a.index is datetime_series.index - assert b.index is datetime_series.index - - a, b = datetime_series.align(datetime_series, copy=True) - assert a.index is not datetime_series.index - assert b.index is not datetime_series.index - - -def test_align_multiindex(): - # GH 10665 - - midx = pd.MultiIndex.from_product( - [range(2), range(3), range(2)], names=("a", "b", "c") - ) - idx = pd.Index(range(2), name="b") - s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) - s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) - - # these must be the same results (but flipped) - res1l, res1r = s1.align(s2, join="left") - res2l, res2r = s2.align(s1, join="right") - - expl = s1 - tm.assert_series_equal(expl, res1l) - tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) - tm.assert_series_equal(expr, res1r) - tm.assert_series_equal(expr, res2l) - - res1l, res1r = s1.align(s2, join="right") - res2l, res2r = s2.align(s1, join="left") - - exp_idx = pd.MultiIndex.from_product( - [range(2), range(2), range(2)], names=("a", "b", "c") - ) - expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) - tm.assert_series_equal(expl, res1l) - tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) - tm.assert_series_equal(expr, res1r) - tm.assert_series_equal(expr, res2l) - - -@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) -def test_align_method(method): - # GH31788 - ser = pd.Series(range(3), index=range(3)) - df = pd.DataFrame(0.0, index=range(3), columns=range(3)) - - result_ser, result_df = ser.align(df, method=method) - tm.assert_series_equal(result_ser, ser) - tm.assert_frame_equal(result_df, df) - - def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) @@ -477,95 +321,3 @@ def test_reindex_empty_series_tz_dtype(): result = Series(dtype="datetime64[ns, UTC]").reindex([0, 1]) expected = Series([pd.NaT] * 2, dtype="datetime64[ns, UTC]") tm.assert_equal(result, expected) - - -def test_rename(): - # GH 17407 - s = Series(range(1, 6), index=pd.Index(range(2, 7), name="IntIndex")) - result = s.rename(str) - expected = s.rename(lambda i: str(i)) - tm.assert_series_equal(result, expected) - - assert result.name == expected.name - - -@pytest.mark.parametrize( - "data, index, drop_labels, axis, expected_data, expected_index", - [ - # Unique Index - ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), - ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), - ([1, 1, 2], ["one", "two", 
"one"], ["two"], 0, [1, 2], ["one", "one"]), - # GH 5248 Non-Unique Index - ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), - ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]), - ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), - ], -) -def test_drop_unique_and_non_unique_index( - data, index, axis, drop_labels, expected_data, expected_index -): - - s = Series(data=data, index=index) - result = s.drop(drop_labels, axis=axis) - expected = Series(data=expected_data, index=expected_index) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "data, index, drop_labels, axis, error_type, error_desc", - [ - # single string/tuple-like - (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), - # bad axis - (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), - (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), - ], -) -def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): - ser = Series(data, index=index) - with pytest.raises(error_type, match=error_desc): - ser.drop(drop_labels, axis=axis) - - -def test_drop_with_ignore_errors(): - # errors='ignore' - s = Series(range(3), index=list("abc")) - result = s.drop("bc", errors="ignore") - tm.assert_series_equal(result, s) - result = s.drop(["a", "d"], errors="ignore") - expected = s.iloc[1:] - tm.assert_series_equal(result, expected) - - # GH 8522 - s = Series([2, 3], index=[True, False]) - assert s.index.is_object() - result = s.drop(True) - expected = Series([3], index=[False]) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) -@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) -def test_drop_empty_list(index, drop_labels): - # GH 21494 - expected_index = [i for i in index if i not in drop_labels] - series = pd.Series(index=index, dtype=object).drop(drop_labels) - expected = pd.Series(index=expected_index, dtype=object) - tm.assert_series_equal(series, expected) - - -@pytest.mark.parametrize( - "data, index, drop_labels", - [ - (None, [1, 2, 3], [1, 4]), - (None, [1, 2, 2], [1, 4]), - ([2, 3], [0, 1], [False, True]), - ], -) -def test_drop_non_empty_list(data, index, drop_labels): - # GH 21494 and GH 16877 - dtype = object if data is None else None - ser = pd.Series(data=data, index=index, dtype=dtype) - with pytest.raises(KeyError, match="not found in axis"): - ser.drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 9703f5afaf689..6765d9f9d8266 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -435,3 +435,11 @@ def test_where_dt_tz_values(tz_naive_fixture): pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) ) tm.assert_series_equal(exp, result) + + +def test_where_sparse(): + # GH#17198 make sure we dont get an AttributeError for sp_index + ser = pd.Series(pd.arrays.SparseArray([1, 2])) + result = ser.where(ser >= 2, 0) + expected = pd.Series(pd.arrays.SparseArray([0, 2])) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py new file mode 100644 index 0000000000000..974ba5d1e35a7 --- /dev/null +++ b/pandas/tests/series/methods/test_align.py @@ -0,0 +1,182 @@ +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import Series, date_range, period_range +import 
pandas._testing as tm + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("fill", [None, -1]) +def test_align(datetime_series, first_slice, second_slice, join_type, fill): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, fill_value=fill) + + join_index = a.index.join(b.index, how=join_type) + if fill is not None: + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) + if len(diff_a) > 0: + assert (aa.reindex(diff_a) == fill).all() + if len(diff_b) > 0: + assert (ab.reindex(diff_b) == fill).all() + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + assert aa.name == "ts" + assert ea.name == "ts" + assert ab.name == "ts" + assert eb.name == "ts" + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("method", ["pad", "bfill"]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_align_fill_method( + datetime_series, first_slice, second_slice, join_type, method, limit +): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, method=method, limit=limit) + + join_index = a.index.join(b.index, how=join_type) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + + +def test_align_nocopy(datetime_series): + b = datetime_series[:5].copy() + + # do copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left") + ra[:5] = 5 + assert not (a[:5] == 5).any() + + # do not copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left", copy=False) + ra[:5] = 5 + assert (a[:5] == 5).all() + + # do copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right") + rb[:3] = 5 + assert not (b[:3] == 5).any() + + # do not copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right", copy=False) + rb[:2] = 5 + assert (b[:2] == 5).all() + + +def test_align_same_index(datetime_series): + a, b = datetime_series.align(datetime_series, copy=False) + assert a.index is datetime_series.index + assert b.index is datetime_series.index + + a, b = datetime_series.align(datetime_series, copy=True) + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index + + +def test_align_multiindex(): + # GH 10665 + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) + s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = s1.align(s2, join="left") + res2l, res2r = s2.align(s1, join="right") + + expl = s1 + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + 
res1l, res1r = s1.align(s2, join="right") + res2l, res2r = s2.align(s1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + +@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) +def test_align_with_dataframe_method(method): + # GH31788 + ser = pd.Series(range(3), index=range(3)) + df = pd.DataFrame(0.0, index=range(3), columns=range(3)) + + result_ser, result_df = ser.align(df, method=method) + tm.assert_series_equal(result_ser, ser) + tm.assert_frame_equal(result_df, df) + + +def test_align_dt64tzindex_mismatched_tzs(): + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert("US/Central") + # different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + +def test_align_periodindex(join_type): + rng = period_range("1/1/2000", "1/1/2010", freq="A") + ts = Series(np.random.randn(len(rng)), index=rng) + + # TODO: assert something? + ts.align(ts[::2], join=join_type) diff --git a/pandas/tests/series/methods/test_drop.py b/pandas/tests/series/methods/test_drop.py new file mode 100644 index 0000000000000..197fe9ff68df2 --- /dev/null +++ b/pandas/tests/series/methods/test_drop.py @@ -0,0 +1,87 @@ +import pytest + +import pandas as pd +from pandas import Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, expected_data, expected_index", + [ + # Unique Index + ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), + ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), + ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), + # GH 5248 Non-Unique Index + ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), + ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]), + ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), + ], +) +def test_drop_unique_and_non_unique_index( + data, index, axis, drop_labels, expected_data, expected_index +): + + s = Series(data=data, index=index) + result = s.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, error_type, error_desc", + [ + # single string/tuple-like + (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), + # bad axis + (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), + (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + ], +) +def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): + ser = Series(data, index=index) + with pytest.raises(error_type, match=error_desc): + ser.drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): + # errors='ignore' + s = Series(range(3), index=list("abc")) + result = s.drop("bc", errors="ignore") + tm.assert_series_equal(result, s) + result = s.drop(["a", "d"], errors="ignore") + expected = s.iloc[1:] + tm.assert_series_equal(result, expected) + + # GH 8522 + s = Series([2, 3], index=[True, False]) + assert s.index.is_object() + result = 
s.drop(True) + expected = Series([3], index=[False]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = pd.Series(index=index, dtype=object).drop(drop_labels) + expected = pd.Series(index=expected_index, dtype=object) + tm.assert_series_equal(series, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels", + [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]), + ], +) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + dtype = object if data is None else None + ser = pd.Series(data=data, index=index, dtype=dtype) + with pytest.raises(KeyError, match="not found in axis"): + ser.drop(drop_labels) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 60182f509e657..ac07fed7c951a 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -89,3 +89,12 @@ class MyIndexer: s = Series([1, 2, 3]) s.rename(ix, inplace=True) assert s.name is ix + + def test_rename_callable(self): + # GH 17407 + s = Series(range(1, 6), index=Index(range(2, 7), name="IntIndex")) + result = s.rename(str) + expected = s.rename(lambda i: str(i)) + tm.assert_series_equal(result, expected) + + assert result.name == expected.name diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 3c2cb5275f3a8..0661828814888 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -9,7 +9,6 @@ import pandas as pd from pandas import DataFrame, Index, Series, isna import pandas._testing as tm -from pandas.conftest import _get_cython_table_params from pandas.core.base import SpecificationError @@ -356,7 +355,7 @@ def test_non_callable_aggregates(self): @pytest.mark.parametrize( "series, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( Series(dtype=np.float64), [ ("sum", 0), @@ -371,7 +370,7 @@ def test_non_callable_aggregates(self): ("median", np.nan), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series([np.nan, 1, 2, 3]), [ ("sum", 6), @@ -386,7 +385,7 @@ def test_non_callable_aggregates(self): ("median", 2), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series("a b c".split()), [ ("sum", "abc"), @@ -411,21 +410,21 @@ def test_agg_cython_table(self, series, func, expected): @pytest.mark.parametrize( "series, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( Series(dtype=np.float64), [ ("cumprod", Series([], Index([]), dtype=np.float64)), ("cumsum", Series([], Index([]), dtype=np.float64)), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series([np.nan, 1, 2, 3]), [ ("cumprod", Series([np.nan, 1, 2, 6])), ("cumsum", Series([np.nan, 1, 3, 6])), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] ), ), @@ -440,7 +439,7 @@ def test_agg_cython_table_transform(self, series, func, expected): @pytest.mark.parametrize( "series, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( Series("a b c".split()), [ ("mean", TypeError), # mean raises TypeError @@ -631,19 +630,19 @@ class DictWithoutMissing(dict): expected = Series([np.nan, 
np.nan, "three"]) tm.assert_series_equal(result, expected) - def test_map_abc_mapping(self, non_mapping_dict_subclass): + def test_map_abc_mapping(self, non_dict_mapping_subclass): # https://github.com/pandas-dev/pandas/issues/29733 # Check collections.abc.Mapping support as mapper for Series.map s = Series([1, 2, 3]) - not_a_dictionary = non_mapping_dict_subclass({3: "three"}) + not_a_dictionary = non_dict_mapping_subclass({3: "three"}) result = s.map(not_a_dictionary) expected = Series([np.nan, np.nan, "three"]) tm.assert_series_equal(result, expected) - def test_map_abc_mapping_with_missing(self, non_mapping_dict_subclass): + def test_map_abc_mapping_with_missing(self, non_dict_mapping_subclass): # https://github.com/pandas-dev/pandas/issues/29733 # Check collections.abc.Mapping support as mapper for Series.map - class NonDictMappingWithMissing(non_mapping_dict_subclass): + class NonDictMappingWithMissing(non_dict_mapping_subclass): def __missing__(self, key): return "missing" diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 95d04c9a45d25..a6385240537ca 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -1,3 +1,4 @@ +from datetime import timedelta import operator import numpy as np @@ -7,8 +8,9 @@ from pandas._libs.tslibs import IncompatibleFrequency import pandas as pd -from pandas import Series, date_range +from pandas import Categorical, Index, Series, bdate_range, date_range, isna import pandas._testing as tm +from pandas.core import nanops, ops def _permute(obj): @@ -64,6 +66,65 @@ def _constructor(self): result = op(m, 1) assert result.x == 42 + def test_flex_add_scalar_fill_value(self): + # GH12723 + s = Series([0, 1, np.nan, 3, 4, 5]) + + exp = s.fillna(0).add(2) + res = s.add(2, fill_value=0) + tm.assert_series_equal(res, exp) + + pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)] + for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: + fv = 0 + lop = getattr(Series, op) + lequiv = getattr(operator, op) + rop = getattr(Series, "r" + op) + # bind op at definition time... 
+ requiv = lambda x, y, op=op: getattr(operator, op)(y, x) + pairings.append((lop, lequiv, fv)) + pairings.append((rop, requiv, fv)) + + @pytest.mark.parametrize("op, equiv_op, fv", pairings) + def test_operators_combine(self, op, equiv_op, fv): + def _check_fill(meth, op, a, b, fill_value=0): + exp_index = a.index.union(b.index) + a = a.reindex(exp_index) + b = b.reindex(exp_index) + + amask = isna(a) + bmask = isna(b) + + exp_values = [] + for i in range(len(exp_index)): + with np.errstate(all="ignore"): + if amask[i]: + if bmask[i]: + exp_values.append(np.nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(np.nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) + + result = meth(a, b, fill_value=fill_value) + expected = Series(exp_values, exp_index) + tm.assert_series_equal(result, expected) + + a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5)) + b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6)) + + result = op(a, b) + exp = equiv_op(a, b) + tm.assert_series_equal(result, exp) + _check_fill(op, equiv_op, a, b, fill_value=fv) + # should accept axis=0 or axis='rows' + op(a, b, axis=0) + class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted @@ -99,6 +160,100 @@ def test_string_addition(self, target_add, input_value, expected_value): expected = Series(expected_value) tm.assert_series_equal(result, expected) + def test_divmod(self): + # GH#25557 + a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + + result = a.divmod(b) + expected = divmod(a, b) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + result = a.rdivmod(b) + expected = divmod(b, a) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + @pytest.mark.parametrize("index", [None, range(9)]) + def test_series_integer_mod(self, index): + # GH#24396 + s1 = Series(range(1, 10)) + s2 = Series("foo", index=index) + + msg = "not all arguments converted during string formatting" + + with pytest.raises(TypeError, match=msg): + s2 % s1 + + def test_add_with_duplicate_index(self): + # GH14227 + s1 = Series([1, 2], index=[1, 1]) + s2 = Series([10, 10], index=[1, 2]) + result = s1 + s2 + expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_add_na_handling(self): + from decimal import Decimal + from datetime import date + + s = Series( + [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] + ) + + result = s + s.shift(1) + result2 = s.shift(1) + s + assert isna(result[0]) + assert isna(result2[0]) + + def test_add_corner_cases(self, datetime_series): + empty = Series([], index=Index([]), dtype=np.float64) + + result = datetime_series + empty + assert np.isnan(result).all() + + result = empty + empty.copy() + assert len(result) == 0 + + # FIXME: dont leave commented-out + # TODO: this returned NotImplemented earlier, what to do? 
+ # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) + # sub_deltas = deltas[::2] + # deltas5 = deltas * 5 + # deltas = deltas + sub_deltas + + # float + int + int_ts = datetime_series.astype(int)[:-5] + added = datetime_series + int_ts + expected = Series( + datetime_series.values[:-5] + int_ts.values, + index=datetime_series.index[:-5], + name="ts", + ) + tm.assert_series_equal(added[:-5], expected) + + def test_mul_empty_int_corner_case(self): + s1 = Series([], [], dtype=np.int32) + s2 = Series({"x": 0.0}) + tm.assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) + + def test_sub_datetimelike_align(self): + # GH#7500 + # datetimelike ops need to align + dt = Series(date_range("2012-1-1", periods=3, freq="D")) + dt.iloc[2] = np.nan + dt2 = dt[::-1] + + expected = Series([timedelta(0), timedelta(0), pd.NaT]) + # name is reset + result = dt2 - dt + tm.assert_series_equal(result, expected) + + expected = Series(expected, name=0) + result = (dt2.to_frame() - dt.to_frame())[0] + tm.assert_series_equal(result, expected) + # ------------------------------------------------------------------ # Comparisons @@ -131,6 +286,50 @@ def test_comparison_flex_basic(self): with pytest.raises(ValueError, match=msg): getattr(left, op)(right, axis=1) + def test_comparison_flex_alignment(self): + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) + + exp = pd.Series([False, False, True, False], index=list("abcd")) + tm.assert_series_equal(left.eq(right), exp) + + exp = pd.Series([True, True, False, True], index=list("abcd")) + tm.assert_series_equal(left.ne(right), exp) + + exp = pd.Series([False, False, True, False], index=list("abcd")) + tm.assert_series_equal(left.le(right), exp) + + exp = pd.Series([False, False, False, False], index=list("abcd")) + tm.assert_series_equal(left.lt(right), exp) + + exp = pd.Series([False, True, True, False], index=list("abcd")) + tm.assert_series_equal(left.ge(right), exp) + + exp = pd.Series([False, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.gt(right), exp) + + def test_comparison_flex_alignment_fill(self): + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) + + exp = pd.Series([False, False, True, True], index=list("abcd")) + tm.assert_series_equal(left.eq(right, fill_value=2), exp) + + exp = pd.Series([True, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.ne(right, fill_value=2), exp) + + exp = pd.Series([False, False, True, True], index=list("abcd")) + tm.assert_series_equal(left.le(right, fill_value=0), exp) + + exp = pd.Series([False, False, False, True], index=list("abcd")) + tm.assert_series_equal(left.lt(right, fill_value=0), exp) + + exp = pd.Series([True, True, True, False], index=list("abcd")) + tm.assert_series_equal(left.ge(right, fill_value=0), exp) + + exp = pd.Series([True, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.gt(right, fill_value=0), exp) + class TestSeriesComparison: def test_comparison_different_length(self): @@ -205,6 +404,220 @@ def test_ser_cmp_result_names(self, names, op): result = op(ser, cidx) assert result.name == names[2] + def test_comparisons(self): + left = np.random.randn(10) + right = np.random.randn(10) + left[:3] = np.nan + + result = nanops.nangt(left, right) + with np.errstate(invalid="ignore"): + expected = (left > right).astype("O") + expected[:3] = np.nan + + tm.assert_almost_equal(result, expected) + + s = Series(["a", "b", "c"]) + s2 = Series([False, 
True, False]) + + # it works! + exp = Series([False, False, False]) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) + + # ----------------------------------------------------------------- + # Categorical Dtype Comparisons + + def test_categorical_comparisons(self): + # GH#8938 + # allow equality comparisons + a = Series(list("abc"), dtype="category") + b = Series(list("abc"), dtype="object") + c = Series(["a", "b", "cc"], dtype="object") + d = Series(list("acb"), dtype="object") + e = Categorical(list("abc")) + f = Categorical(list("acb")) + + # vs scalar + assert not (a == "a").all() + assert ((a != "a") == ~(a == "a")).all() + + assert not ("a" == a).all() + assert (a == "a")[0] + assert ("a" == a)[0] + assert not ("a" != a)[0] + + # vs list-like + assert (a == a).all() + assert not (a != a).all() + + assert (a == list(a)).all() + assert (a == b).all() + assert (b == a).all() + assert ((~(a == b)) == (a != b)).all() + assert ((~(b == a)) == (b != a)).all() + + assert not (a == c).all() + assert not (c == a).all() + assert not (a == d).all() + assert not (d == a).all() + + # vs a cat-like + assert (a == e).all() + assert (e == a).all() + assert not (a == f).all() + assert not (f == a).all() + + assert (~(a == e) == (a != e)).all() + assert (~(e == a) == (e != a)).all() + assert (~(a == f) == (a != f)).all() + assert (~(f == a) == (f != a)).all() + + # non-equality is not comparable + with pytest.raises(TypeError): + a < b + with pytest.raises(TypeError): + b < a + with pytest.raises(TypeError): + a > b + with pytest.raises(TypeError): + b > a + + def test_unequal_categorical_comparison_raises_type_error(self): + # unequal comparison should raise for unordered cats + cat = Series(Categorical(list("abc"))) + with pytest.raises(TypeError): + cat > "b" + + cat = Series(Categorical(list("abc"), ordered=False)) + with pytest.raises(TypeError): + cat > "b" + + # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 + # and following comparisons with scalars not in categories should raise + # for unequal comps, but not for equal/not equal + cat = Series(Categorical(list("abc"), ordered=True)) + + with pytest.raises(TypeError): + cat < "d" + with pytest.raises(TypeError): + cat > "d" + with pytest.raises(TypeError): + "d" < cat + with pytest.raises(TypeError): + "d" > cat + + tm.assert_series_equal(cat == "d", Series([False, False, False])) + tm.assert_series_equal(cat != "d", Series([True, True, True])) + + # ----------------------------------------------------------------- + + def test_comparison_tuples(self): + # GH#11339 + # comparisons vs tuple + s = Series([(1, 1), (1, 2)]) + + result = s == (1, 2) + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + + result = s != (1, 2) + expected = Series([True, False]) + tm.assert_series_equal(result, expected) + + result = s == (0, 0) + expected = Series([False, False]) + tm.assert_series_equal(result, expected) + + result = s != (0, 0) + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + s = Series([(1, 1), (1, 1)]) + + result = s == (1, 1) + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + result = s != (1, 1) + expected = Series([False, False]) + tm.assert_series_equal(result, expected) + + s = Series([frozenset([1]), frozenset([1, 2])]) + + result = s == frozenset([1]) + expected = Series([True, False]) + tm.assert_series_equal(result, expected) + + def test_comparison_operators_with_nas(self): + ser = 
Series(bdate_range("1/1/2000", periods=10), dtype=object) + ser[::2] = np.nan + + # test that comparisons work + ops = ["lt", "le", "gt", "ge", "eq", "ne"] + for op in ops: + val = ser[5] + + f = getattr(operator, op) + result = f(ser, val) + + expected = f(ser.dropna(), val).reindex(ser.index) + + if op == "ne": + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) + + tm.assert_series_equal(result, expected) + + # FIXME: dont leave commented-out + # fffffffuuuuuuuuuuuu + # result = f(val, s) + # expected = f(val, s.dropna()).reindex(s.index) + # tm.assert_series_equal(result, expected) + + def test_ne(self): + ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) + expected = [True, True, False, True, True] + assert tm.equalContents(ts.index != 5, expected) + assert tm.equalContents(~(ts.index == 5), expected) + + def test_comp_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") + + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") + + for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: + + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + left == right + + with pytest.raises(ValueError, match=msg): + left != right + + with pytest.raises(ValueError, match=msg): + left < right + + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + left.to_frame() == right.to_frame() + + with pytest.raises(ValueError, match=msg): + left.to_frame() != right.to_frame() + + with pytest.raises(ValueError, match=msg): + left.to_frame() < right.to_frame() + + def test_compare_series_interval_keyword(self): + # GH#25338 + s = Series(["IntervalA", "IntervalB", "IntervalC"]) + result = s == "IntervalA" + expected = Series([True, False, False]) + tm.assert_series_equal(result, expected) + # ------------------------------------------------------------------ # Unsorted diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index e4c25f31c4b43..55af7355258a8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1124,9 +1124,9 @@ def test_constructor_dict_tuple_indexer(self): ) tm.assert_series_equal(result, expected) - def test_constructor_mapping(self, non_mapping_dict_subclass): + def test_constructor_mapping(self, non_dict_mapping_subclass): # GH 29788 - ndm = non_mapping_dict_subclass({3: "three"}) + ndm = non_dict_mapping_subclass({3: "three"}) result = Series(ndm) expected = Series(["three"], index=[3]) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 15f1bc8941d47..1687f80e9f3ed 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -448,13 +448,6 @@ def test_fillna_consistency(self): s2[1] = "foo" tm.assert_series_equal(s2, expected) - def test_where_sparse(self): - # GH#17198 make sure we dont get an AttributeError for sp_index - ser = pd.Series(pd.arrays.SparseArray([1, 2])) - result = ser.where(ser >= 2, 0) - expected = pd.Series(pd.arrays.SparseArray([0, 2])) - tm.assert_series_equal(result, expected) - def test_datetime64tz_fillna_round_issue(self): # GH 14872 @@ -940,14 +933,6 @@ def test_dropna_preserve_name(self, datetime_series): ts.dropna(inplace=True) assert ts.name == name - def 
test_fill_value_when_combine_const(self): - # GH12723 - s = Series([0, 1, np.nan, 3, 4, 5]) - - exp = s.fillna(0).add(2) - res = s.add(2, fill_value=0) - tm.assert_series_equal(res, exp) - def test_series_fillna_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index bdd9f92d92d3f..1340f514e31ce 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1,14 +1,13 @@ -from datetime import datetime, timedelta +from datetime import datetime import operator import numpy as np import pytest import pandas as pd -from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna +from pandas import DataFrame, Index, Series, bdate_range import pandas._testing as tm from pandas.core import ops -import pandas.core.nanops as nanops class TestSeriesLogicalOps: @@ -519,409 +518,6 @@ def test_logical_ops_df_compat(self): tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame()) -class TestSeriesComparisons: - def test_comparisons(self): - left = np.random.randn(10) - right = np.random.randn(10) - left[:3] = np.nan - - result = nanops.nangt(left, right) - with np.errstate(invalid="ignore"): - expected = (left > right).astype("O") - expected[:3] = np.nan - - tm.assert_almost_equal(result, expected) - - s = Series(["a", "b", "c"]) - s2 = Series([False, True, False]) - - # it works! - exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) - - def test_categorical_comparisons(self): - # GH 8938 - # allow equality comparisons - a = Series(list("abc"), dtype="category") - b = Series(list("abc"), dtype="object") - c = Series(["a", "b", "cc"], dtype="object") - d = Series(list("acb"), dtype="object") - e = Categorical(list("abc")) - f = Categorical(list("acb")) - - # vs scalar - assert not (a == "a").all() - assert ((a != "a") == ~(a == "a")).all() - - assert not ("a" == a).all() - assert (a == "a")[0] - assert ("a" == a)[0] - assert not ("a" != a)[0] - - # vs list-like - assert (a == a).all() - assert not (a != a).all() - - assert (a == list(a)).all() - assert (a == b).all() - assert (b == a).all() - assert ((~(a == b)) == (a != b)).all() - assert ((~(b == a)) == (b != a)).all() - - assert not (a == c).all() - assert not (c == a).all() - assert not (a == d).all() - assert not (d == a).all() - - # vs a cat-like - assert (a == e).all() - assert (e == a).all() - assert not (a == f).all() - assert not (f == a).all() - - assert (~(a == e) == (a != e)).all() - assert (~(e == a) == (e != a)).all() - assert (~(a == f) == (a != f)).all() - assert (~(f == a) == (f != a)).all() - - # non-equality is not comparable - with pytest.raises(TypeError): - a < b - with pytest.raises(TypeError): - b < a - with pytest.raises(TypeError): - a > b - with pytest.raises(TypeError): - b > a - - def test_comparison_tuples(self): - # GH11339 - # comparisons vs tuple - s = Series([(1, 1), (1, 2)]) - - result = s == (1, 2) - expected = Series([False, True]) - tm.assert_series_equal(result, expected) - - result = s != (1, 2) - expected = Series([True, False]) - tm.assert_series_equal(result, expected) - - result = s == (0, 0) - expected = Series([False, False]) - tm.assert_series_equal(result, expected) - - result = s != (0, 0) - expected = Series([True, True]) - tm.assert_series_equal(result, expected) - - s = Series([(1, 1), (1, 1)]) - - result = s == (1, 1) - expected = Series([True, 
True]) - tm.assert_series_equal(result, expected) - - result = s != (1, 1) - expected = Series([False, False]) - tm.assert_series_equal(result, expected) - - s = Series([frozenset([1]), frozenset([1, 2])]) - - result = s == frozenset([1]) - expected = Series([True, False]) - tm.assert_series_equal(result, expected) - - def test_comparison_operators_with_nas(self): - ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) - ser[::2] = np.nan - - # test that comparisons work - ops = ["lt", "le", "gt", "ge", "eq", "ne"] - for op in ops: - val = ser[5] - - f = getattr(operator, op) - result = f(ser, val) - - expected = f(ser.dropna(), val).reindex(ser.index) - - if op == "ne": - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) - - tm.assert_series_equal(result, expected) - - # FIXME: dont leave commented-out - # fffffffuuuuuuuuuuuu - # result = f(val, s) - # expected = f(val, s.dropna()).reindex(s.index) - # tm.assert_series_equal(result, expected) - - def test_unequal_categorical_comparison_raises_type_error(self): - # unequal comparison should raise for unordered cats - cat = Series(Categorical(list("abc"))) - with pytest.raises(TypeError): - cat > "b" - - cat = Series(Categorical(list("abc"), ordered=False)) - with pytest.raises(TypeError): - cat > "b" - - # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 - # and following comparisons with scalars not in categories should raise - # for unequal comps, but not for equal/not equal - cat = Series(Categorical(list("abc"), ordered=True)) - - with pytest.raises(TypeError): - cat < "d" - with pytest.raises(TypeError): - cat > "d" - with pytest.raises(TypeError): - "d" < cat - with pytest.raises(TypeError): - "d" > cat - - tm.assert_series_equal(cat == "d", Series([False, False, False])) - tm.assert_series_equal(cat != "d", Series([True, True, True])) - - def test_ne(self): - ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) - expected = [True, True, False, True, True] - assert tm.equalContents(ts.index != 5, expected) - assert tm.equalContents(~(ts.index == 5), expected) - - def test_comp_ops_df_compat(self): - # GH 1134 - s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - - s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") - - for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: - - msg = "Can only compare identically-labeled Series objects" - with pytest.raises(ValueError, match=msg): - left == right - - with pytest.raises(ValueError, match=msg): - left != right - - with pytest.raises(ValueError, match=msg): - left < right - - msg = "Can only compare identically-labeled DataFrame objects" - with pytest.raises(ValueError, match=msg): - left.to_frame() == right.to_frame() - - with pytest.raises(ValueError, match=msg): - left.to_frame() != right.to_frame() - - with pytest.raises(ValueError, match=msg): - left.to_frame() < right.to_frame() - - def test_compare_series_interval_keyword(self): - # GH 25338 - s = Series(["IntervalA", "IntervalB", "IntervalC"]) - result = s == "IntervalA" - expected = Series([True, False, False]) - tm.assert_series_equal(result, expected) - - -class TestSeriesFlexComparisonOps: - def test_comparison_flex_alignment(self): - left = Series([1, 3, 2], index=list("abc")) - right = Series([2, 2, 2], index=list("bcd")) - - exp = pd.Series([False, False, True, False], index=list("abcd")) - 
tm.assert_series_equal(left.eq(right), exp) - - exp = pd.Series([True, True, False, True], index=list("abcd")) - tm.assert_series_equal(left.ne(right), exp) - - exp = pd.Series([False, False, True, False], index=list("abcd")) - tm.assert_series_equal(left.le(right), exp) - - exp = pd.Series([False, False, False, False], index=list("abcd")) - tm.assert_series_equal(left.lt(right), exp) - - exp = pd.Series([False, True, True, False], index=list("abcd")) - tm.assert_series_equal(left.ge(right), exp) - - exp = pd.Series([False, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.gt(right), exp) - - def test_comparison_flex_alignment_fill(self): - left = Series([1, 3, 2], index=list("abc")) - right = Series([2, 2, 2], index=list("bcd")) - - exp = pd.Series([False, False, True, True], index=list("abcd")) - tm.assert_series_equal(left.eq(right, fill_value=2), exp) - - exp = pd.Series([True, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.ne(right, fill_value=2), exp) - - exp = pd.Series([False, False, True, True], index=list("abcd")) - tm.assert_series_equal(left.le(right, fill_value=0), exp) - - exp = pd.Series([False, False, False, True], index=list("abcd")) - tm.assert_series_equal(left.lt(right, fill_value=0), exp) - - exp = pd.Series([True, True, True, False], index=list("abcd")) - tm.assert_series_equal(left.ge(right, fill_value=0), exp) - - exp = pd.Series([True, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.gt(right, fill_value=0), exp) - - -class TestSeriesOperators: - def test_operators_empty_int_corner(self): - s1 = Series([], [], dtype=np.int32) - s2 = Series({"x": 0.0}) - tm.assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) - - def test_ops_datetimelike_align(self): - # GH 7500 - # datetimelike ops need to align - dt = Series(date_range("2012-1-1", periods=3, freq="D")) - dt.iloc[2] = np.nan - dt2 = dt[::-1] - - expected = Series([timedelta(0), timedelta(0), pd.NaT]) - # name is reset - result = dt2 - dt - tm.assert_series_equal(result, expected) - - expected = Series(expected, name=0) - result = (dt2.to_frame() - dt.to_frame())[0] - tm.assert_series_equal(result, expected) - - def test_operators_corner(self, datetime_series): - empty = Series([], index=Index([]), dtype=np.float64) - - result = datetime_series + empty - assert np.isnan(result).all() - - result = empty + empty.copy() - assert len(result) == 0 - - # TODO: this returned NotImplemented earlier, what to do? - # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) - # sub_deltas = deltas[::2] - # deltas5 = deltas * 5 - # deltas = deltas + sub_deltas - - # float + int - int_ts = datetime_series.astype(int)[:-5] - added = datetime_series + int_ts - expected = Series( - datetime_series.values[:-5] + int_ts.values, - index=datetime_series.index[:-5], - name="ts", - ) - tm.assert_series_equal(added[:-5], expected) - - pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)] - for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: - fv = 0 - lop = getattr(Series, op) - lequiv = getattr(operator, op) - rop = getattr(Series, "r" + op) - # bind op at definition time... 
-        requiv = lambda x, y, op=op: getattr(operator, op)(y, x)
-        pairings.append((lop, lequiv, fv))
-        pairings.append((rop, requiv, fv))
-
-    @pytest.mark.parametrize("op, equiv_op, fv", pairings)
-    def test_operators_combine(self, op, equiv_op, fv):
-        def _check_fill(meth, op, a, b, fill_value=0):
-            exp_index = a.index.union(b.index)
-            a = a.reindex(exp_index)
-            b = b.reindex(exp_index)
-
-            amask = isna(a)
-            bmask = isna(b)
-
-            exp_values = []
-            for i in range(len(exp_index)):
-                with np.errstate(all="ignore"):
-                    if amask[i]:
-                        if bmask[i]:
-                            exp_values.append(np.nan)
-                            continue
-                        exp_values.append(op(fill_value, b[i]))
-                    elif bmask[i]:
-                        if amask[i]:
-                            exp_values.append(np.nan)
-                            continue
-                        exp_values.append(op(a[i], fill_value))
-                    else:
-                        exp_values.append(op(a[i], b[i]))
-
-            result = meth(a, b, fill_value=fill_value)
-            expected = Series(exp_values, exp_index)
-            tm.assert_series_equal(result, expected)
-
-        a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5))
-        b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6))
-
-        result = op(a, b)
-        exp = equiv_op(a, b)
-        tm.assert_series_equal(result, exp)
-        _check_fill(op, equiv_op, a, b, fill_value=fv)
-        # should accept axis=0 or axis='rows'
-        op(a, b, axis=0)
-
-    def test_operators_na_handling(self):
-        from decimal import Decimal
-        from datetime import date
-
-        s = Series(
-            [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)]
-        )
-
-        result = s + s.shift(1)
-        result2 = s.shift(1) + s
-        assert isna(result[0])
-        assert isna(result2[0])
-
-    def test_op_duplicate_index(self):
-        # GH14227
-        s1 = Series([1, 2], index=[1, 1])
-        s2 = Series([10, 10], index=[1, 2])
-        result = s1 + s2
-        expected = pd.Series([11, 12, np.nan], index=[1, 1, 2])
-        tm.assert_series_equal(result, expected)
-
-    def test_divmod(self):
-        # GH25557
-        a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
-        b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
-
-        result = a.divmod(b)
-        expected = divmod(a, b)
-        tm.assert_series_equal(result[0], expected[0])
-        tm.assert_series_equal(result[1], expected[1])
-
-        result = a.rdivmod(b)
-        expected = divmod(b, a)
-        tm.assert_series_equal(result[0], expected[0])
-        tm.assert_series_equal(result[1], expected[1])
-
-    @pytest.mark.parametrize("index", [None, range(9)])
-    def test_series_integer_mod(self, index):
-        # see gh-24396
-        s1 = Series(range(1, 10))
-        s2 = Series("foo", index=index)
-
-        msg = "not all arguments converted during string formatting"
-
-        with pytest.raises(TypeError, match=msg):
-            s2 % s1
-
-
 class TestSeriesUnaryOps:
     # __neg__, __pos__, __inv__
diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py
index f41245c2872a7..d5a3efcf5757c 100644
--- a/pandas/tests/series/test_period.py
+++ b/pandas/tests/series/test_period.py
@@ -98,12 +98,6 @@ def test_intercept_astype_object(self):
         result = df.values.squeeze()
         assert (result[:, 0] == expected.values).all()

-    def test_align_series(self, join_type):
-        rng = period_range("1/1/2000", "1/1/2010", freq="A")
-        ts = Series(np.random.randn(len(rng)), index=rng)
-
-        ts.align(ts[::2], join=join_type)
-
     @pytest.mark.parametrize(
         "input_vals",
         [
diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py
index ae4fd12abdb88..dfff1d581fe44 100644
--- a/pandas/tests/series/test_timezones.py
+++ b/pandas/tests/series/test_timezones.py
@@ -6,7 +6,6 @@
 from dateutil.tz import tzoffset
 import numpy as np
 import pytest
-import pytz

 from pandas._libs.tslibs import conversion, timezones
@@ -38,16 +37,6 @@ def test_string_index_alias_tz_aware(self, tz):
         result = ser["1/3/2000"]
         tm.assert_almost_equal(result, ser[2])

-    def test_series_align_aware(self):
-        idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
-        ser = Series(np.random.randn(len(idx1)), index=idx1)
-        ser_central = ser.tz_convert("US/Central")
-        # # different timezones convert to UTC
-
-        new1, new2 = ser.align(ser_central)
-        assert new1.index.tz == pytz.UTC
-        assert new2.index.tz == pytz.UTC
-
     @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"])
     def test_getitem_pydatetime_tz(self, tzstr):
         tz = timezones.maybe_get_tz(tzstr)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 122ef1f47968e..57542aa3bc7f6 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -8,6 +8,8 @@
 import numpy as np  # noqa
 import pytest

+import pandas.util._test_decorators as td
+
 from pandas import DataFrame
 import pandas._testing as tm

@@ -47,6 +49,19 @@ def test_xarray(df):
     assert df.to_xarray() is not None


+@td.skip_if_no("cftime")
+@td.skip_if_no("xarray", "0.10.4")
+def test_xarray_cftimeindex_nearest():
+    # https://github.com/pydata/xarray/issues/3751
+    import cftime
+    import xarray
+
+    times = xarray.cftime_range("0001", periods=2)
+    result = times.get_loc(cftime.DatetimeGregorian(2000, 1, 1), method="nearest")
+    expected = 1
+    assert result == expected
+
+
 def test_oo_optimizable():
     # GH 21071
     subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 6abf174aa7fd2..6289c2efea7f1 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right):
     ("join", (",",), {}),
     ("ljust", (10,), {}),
     ("match", ("a",), {}),
+    ("fullmatch", ("a",), {}),
     ("normalize", ("NFC",), {}),
     ("pad", (10,), {}),
     ("partition", (" ",), {"expand": False}),
@@ -1176,9 +1177,9 @@ def test_match(self):
         exp = Series([True, np.nan, False])
         tm.assert_series_equal(result, exp)

-        values = Series(["fooBAD__barBAD", np.nan, "foo"])
+        values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
         result = values.str.match(".*BAD[_]+.*BAD")
-        exp = Series([True, np.nan, False])
+        exp = Series([True, True, np.nan, False])
         tm.assert_series_equal(result, exp)

         # mixed
@@ -1208,6 +1209,22 @@ def test_match(self):
         exp = Series([True, np.nan, np.nan])
         tm.assert_series_equal(exp, res)

+    def test_fullmatch(self):
+        # GH 32806
+        values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
+        result = values.str.fullmatch(".*BAD[_]+.*BAD")
+        exp = Series([True, False, np.nan, False])
+        tm.assert_series_equal(result, exp)
+
+        # Make sure that the new string arrays work
+        string_values = Series(
+            ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string"
+        )
+        result = string_values.str.fullmatch(".*BAD[_]+.*BAD")
+        # Result is nullable boolean with StringDtype
+        string_exp = Series([True, False, np.nan, False], dtype="boolean")
+        tm.assert_series_equal(result, string_exp)
+
     def test_extract_expand_None(self):
         values = Series(["fooBAD__barBAD", np.nan, "foo"])
         with pytest.raises(ValueError, match="expand must be True or False"):
@@ -3384,6 +3401,9 @@ def test_match_findall_flags(self):
         result = data.str.match(pat, flags=re.IGNORECASE)
         assert result[0]

+        result = data.str.fullmatch(pat, flags=re.IGNORECASE)
+        assert result[0]
+
         result = data.str.findall(pat, flags=re.IGNORECASE)
         assert result[0][0] == ("dave", "google", "com")
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
index 077c5046ac44d..22c0f455fa3ac 100644
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -95,19 +95,19 @@ def to_offset(freq) -> Optional[DateOffset]:
     Examples
     --------
-    >>> to_offset('5min')
+    >>> to_offset("5min")
     <5 * Minutes>

-    >>> to_offset('1D1H')
+    >>> to_offset("1D1H")
     <25 * Hours>

-    >>> to_offset(('W', 2))
+    >>> to_offset(("W", 2))
     <2 * Weeks: weekday=6>

-    >>> to_offset((2, 'B'))
+    >>> to_offset((2, "B"))
     <2 * BusinessDays>

-    >>> to_offset(datetime.timedelta(days=1))
+    >>> to_offset(pd.Timedelta(days=1))
     <24 * Hours>

     >>> to_offset(Hour())
     <Hour>
diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py
index fe30130e87c01..8ab37f787bd10 100644
--- a/pandas/tseries/holiday.py
+++ b/pandas/tseries/holiday.py
@@ -157,15 +157,34 @@ class from pandas.tseries.offsets
         --------
         >>> from pandas.tseries.holiday import Holiday, nearest_workday
         >>> from dateutil.relativedelta import MO
-        >>> USMemorialDay = Holiday('Memorial Day', month=5, day=31,
-                                    offset=pd.DateOffset(weekday=MO(-1)))
-        >>> USLaborDay = Holiday('Labor Day', month=9, day=1,
-                                 offset=pd.DateOffset(weekday=MO(1)))
-        >>> July3rd = Holiday('July 3rd', month=7, day=3,)
-        >>> NewYears = Holiday('New Years Day', month=1, day=1,
-                               observance=nearest_workday),
-        >>> July3rd = Holiday('July 3rd', month=7, day=3,
-                              days_of_week=(0, 1, 2, 3))
+
+        >>> USMemorialDay = Holiday(
+        ...     "Memorial Day", month=5, day=31, offset=pd.DateOffset(weekday=MO(-1))
+        ... )
+        >>> USMemorialDay
+        Holiday: Memorial Day (month=5, day=31, offset=<DateOffset: weekday=MO(-1)>)
+
+        >>> USLaborDay = Holiday(
+        ...     "Labor Day", month=9, day=1, offset=pd.DateOffset(weekday=MO(1))
+        ... )
+        >>> USLaborDay
+        Holiday: Labor Day (month=9, day=1, offset=<DateOffset: weekday=MO(+1)>)
+
+        >>> July3rd = Holiday("July 3rd", month=7, day=3)
+        >>> July3rd
+        Holiday: July 3rd (month=7, day=3, )
+
+        >>> NewYears = Holiday(
+        ...     "New Years Day", month=1, day=1, observance=nearest_workday
+        ... )
+        >>> NewYears  # doctest: +SKIP
+        Holiday: New Years Day (
+            month=1, day=1, observance=<function nearest_workday at 0x66545e9bc440>
+        )
+
+        >>> July3rd = Holiday("July 3rd", month=7, day=3, days_of_week=(0, 1, 2, 3))
+        >>> July3rd
+        Holiday: July 3rd (month=7, day=3, )
         """
         if offset is not None and observance is not None:
             raise NotImplementedError("Cannot use both offset and observance.")
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9ee67c56ab8ca..6a2cc7b53615e 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -68,6 +68,7 @@ tables>=3.4.2
 s3fs
 sqlalchemy
 xarray
+cftime
 pyreadstat
 tabulate>=0.8.3
 git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master