diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2d8f6468aca83..aec19e27a33e2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,3 +153,9 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + + # indexing iset related (temporary since other tests don't pass yet) + pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_multi_index --array-manager + pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_listlike_indexer_duplicate_columns --array-manager + pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups --array-manager + pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column --array-manager diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index f3ccd78266ba6..b34373b82af1a 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -170,3 +170,11 @@ jobs: - name: Print skipped tests run: python ci/print_skipped.py + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + files: /tmp/test_coverage.xml + flags: unittests + name: codecov-pandas + fail_ci_if_error: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e0df3434b2906..3a371c8249eba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -180,6 +180,12 @@ repos: language: pygrep types: [python] files: ^pandas/tests/ + - id: title-capitalization + name: Validate correct capitalization among titles in documentation + entry: python scripts/validate_rst_title_capitalization.py + language: python + types: [rst] + files: ^doc/source/(development|reference)/ - repo: https://github.com/asottile/yesqa rev: v1.2.2 hooks: diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py new file mode 100644 index 0000000000000..1c8de6d9f8e56 --- /dev/null +++ b/asv_bench/benchmarks/libs.py @@ -0,0 +1,42 @@ +""" +Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs, +which has its own directory +""" +import numpy as np + +from pandas._libs.lib import ( + is_list_like, + is_scalar, +) + +from pandas import ( + NA, + NaT, +) + +# TODO: share with something in pd._testing? 
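+# For reference, the classifications these benchmarks exercise (illustrative
+# values only; the public entry points live in pandas.api.types):
+#   is_list_like([1, 2, 3])   -> True
+#   is_list_like("foo")       -> False  (strings/bytes are excluded)
+#   is_list_like(np.array(1)) -> False  (zero-dim ndarrays are excluded)
+#   is_scalar(1.0)            -> True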
+scalars = [
+    0,
+    1.0,
+    1 + 2j,
+    True,
+    "foo",
+    b"bar",
+    None,
+    np.datetime64(123, "ns"),
+    np.timedelta64(123, "ns"),
+    NaT,
+    NA,
+]
+zero_dims = [np.array("123")]
+listlikes = [np.array([1, 2, 3]), {0: "foo"}, {1, 2, 3}, [1, 2, 3], (1, 2, 3)]
+
+
+class ScalarListLike:
+    params = scalars + zero_dims + listlikes
+
+    def time_is_list_like(self, param):
+        is_list_like(param)
+
+    def time_is_scalar(self, param):
+        is_scalar(param)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 597aced96eb18..251f450840ea9 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -233,10 +233,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS02,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03
         RET=$(($RET + $?)) ; echo $MSG "DONE"

-    MSG='Validate correct capitalization among titles in documentation' ; echo $MSG
-    $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development $BASE_DIR/doc/source/reference
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
 fi

 ### TYPING ###
diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml
index 7f658fe62d268..0c47b1a72774f 100644
--- a/ci/deps/azure-37-locale_slow.yaml
+++ b/ci/deps/azure-37-locale_slow.yaml
@@ -18,7 +18,7 @@ dependencies:
   - lxml
   - matplotlib=3.0.0
   - numpy=1.16.*
-  - openpyxl=2.6.0
+  - openpyxl=3.0.0
   - python-dateutil
   - python-blosc
   - pytz=2017.3
diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml
index f184ea87c89fe..9cc158b76cd41 100644
--- a/ci/deps/azure-37-minimum_versions.yaml
+++ b/ci/deps/azure-37-minimum_versions.yaml
@@ -19,7 +19,7 @@ dependencies:
   - numba=0.46.0
   - numexpr=2.6.8
   - numpy=1.16.5
-  - openpyxl=2.6.0
+  - openpyxl=3.0.0
   - pytables=3.5.1
   - python-dateutil=2.7.3
   - pytz=2017.3
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index bb89b91954518..4b69d5b0c8c77 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -476,6 +476,14 @@ storing numeric arrays with units. These arrays can be stored inside pandas'
 Series and DataFrame. Operations between Series and DataFrame columns which
 use pint's extension array are then units aware.

+`Text Extensions for Pandas`_
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``Text Extensions for Pandas ``
+provides extension types to cover common data structures for representing natural language
+data, plus library integrations that convert the outputs of popular natural language
+processing libraries into Pandas DataFrames.
+
 .. 
_ecosystem.accessors: Accessors diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 49039f05b889a..06e1af75053d3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -274,7 +274,7 @@ html5lib 1.0.1 HTML parser for read_html (see :ref lxml 4.3.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 2.6.0 Reading / writing for xlsx files +openpyxl 3.0.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 07c856c96426d..4089f9523724f 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -178,6 +178,75 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin` For ``DataFrames``, likewise, ``in`` applies to the column axis, testing for membership in the list of column names. +.. _udf-mutation: + +Mutating with User Defined Function (UDF) methods +------------------------------------------------- + +It is a general rule in programming that one should not mutate a container +while it is being iterated over. Mutation will invalidate the iterator, +causing unexpected behavior. Consider the example: + +.. ipython:: python + + values = [0, 1, 2, 3, 4, 5] + n_removed = 0 + for k, value in enumerate(values): + idx = k - n_removed + if value % 2 == 1: + del values[idx] + n_removed += 1 + else: + values[idx] = value + 1 + values + +One probably would have expected that the result would be ``[1, 3, 5]``. +When using a pandas method that takes a UDF, internally pandas is often +iterating over the +``DataFrame`` or other pandas object. Therefore, if the UDF mutates (changes) +the ``DataFrame``, unexpected behavior can arise. + +Here is a similar example with :meth:`DataFrame.apply`: + +.. ipython:: python + + def f(s): + s.pop("a") + return s + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + try: + df.apply(f, axis="columns") + except Exception as err: + print(repr(err)) + +To resolve this issue, one can make a copy so that the mutation does +not apply to the container being iterated over. + +.. ipython:: python + + values = [0, 1, 2, 3, 4, 5] + n_removed = 0 + for k, value in enumerate(values.copy()): + idx = k - n_removed + if value % 2 == 1: + del values[idx] + n_removed += 1 + else: + values[idx] = value + 1 + values + +.. ipython:: python + + def f(s): + s = s.copy() + s.pop("a") + return s + + df = pd.DataFrame({"a": [1, 2, 3], 'b': [4, 5, 6]}) + df.apply(f, axis="columns") + + ``NaN``, Integer ``NA`` values and ``NA`` type promotions --------------------------------------------------------- diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 781054fc4de7c..490175914cef1 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -176,7 +176,7 @@ New plotting methods Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot types. For example, ``'kde'`` is a new option: -.. ipython:: python +.. 
code-block:: python

    s = pd.Series(
        np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))
    )
diff --git a/doc/source/whatsnew/v1.2.3.rst b/doc/source/whatsnew/v1.2.3.rst
index e675b3ea921d1..4231b6d94b1b9 100644
--- a/doc/source/whatsnew/v1.2.3.rst
+++ b/doc/source/whatsnew/v1.2.3.rst
@@ -15,7 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~

--
+- Fixed regression in :meth:`DataFrame.to_excel` raising ``KeyError`` when given duplicate columns via the ``columns`` argument (:issue:`39695`)
 -

 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 799bc88ffff4e..76bd95c1c5d9d 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -186,7 +186,7 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | numba           | 0.46.0          |         |
 +-----------------+-----------------+---------+
-| openpyxl        | 2.6.0           |         |
+| openpyxl        | 3.0.0           | X       |
 +-----------------+-----------------+---------+
 | pyarrow         | 0.15.0          |         |
 +-----------------+-----------------+---------+
@@ -239,7 +239,7 @@ Deprecations
 - Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`)
 - Deprecated :meth:`core.window.ewm.ExponentialMovingWindow.vol` (:issue:`39220`)
 - Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`)
--
+- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`; pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)

.. 
--------------------------------------------------------------------------- @@ -346,7 +346,9 @@ Indexing - Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, issue:`39619`) - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`) - Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrect casting the datetime64 values to integers (:issue:`39266`) +- Bug in setting ``np.datetime64("NaT")`` into a :class:`Series` with :class:`Datetime64TZDtype` incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`) - Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`) +- Bug in :meth:`DatetimeIndex.insert` when inserting ``np.datetime64("NaT")`` into a timezone-aware index incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`) - Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`) - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3a11e7fbbdf33..5da6c0778fb60 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1044,11 +1044,15 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1: return ( - isinstance(obj, abc.Iterable) + # equiv: `isinstance(obj, abc.Iterable)` + hasattr(obj, "__iter__") and not isinstance(obj, type) # we do not count strings/unicode/bytes as list-like and not isinstance(obj, (str, bytes)) # exclude zero-dimensional numpy arrays, effectively scalars - and not (util.is_array(obj) and obj.ndim == 0) + and not cnp.PyArray_IsZeroDim(obj) + # extra check for numpy-like objects which aren't captured by + # the above + and not (hasattr(obj, "ndim") and obj.ndim == 0) # exclude sets if allow_sets is False and not (allow_sets is False and isinstance(obj, abc.Set)) ) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0b2be53131af6..97a152d9ade1e 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -559,7 +559,7 @@ def makeCustomIndex( "p": makePeriodIndex, }.get(idx_type) if idx_func: - # pandas\_testing.py:2120: error: Cannot call function of unknown type + # error: Cannot call function of unknown type idx = idx_func(nentries) # type: ignore[operator] # but we need to fill in the name if names: diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 5f27b016b68a2..8d387ff5674f7 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -82,9 +82,8 @@ def dec(f): is_decorating = not kwargs and len(args) == 1 and callable(args[0]) if is_decorating: f = args[0] - # pandas\_testing.py:2331: error: Incompatible types in assignment - # (expression has type "List[]", variable has type - # "Tuple[Any, ...]") + # error: Incompatible types in assignment (expression has type + # "List[]", variable 
has type "Tuple[Any, ...]") args = [] # type: ignore[assignment] return dec(f) else: @@ -205,8 +204,7 @@ def wrapper(*args, **kwargs): except Exception as err: errno = getattr(err, "errno", None) if not errno and hasattr(errno, "reason"): - # pandas\_testing.py:2521: error: "Exception" has no attribute - # "reason" + # error: "Exception" has no attribute "reason" errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined] if errno in skip_errnos: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index bcad9f1ddab09..eb2b4caddb7a6 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,7 +17,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "2.6.0", + "openpyxl": "3.0.0", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", diff --git a/pandas/conftest.py b/pandas/conftest.py index bc455092ebe86..79204c8896854 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1565,6 +1565,14 @@ def indexer_si(request): return request.param +@pytest.fixture(params=[tm.setitem, tm.loc]) +def indexer_sl(request): + """ + Parametrize over __setitem__, loc.__setitem__ + """ + return request.param + + @pytest.fixture def using_array_manager(request): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6088837550ecd..337c1910102a7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2208,7 +2208,7 @@ def _sort_mixed(values): return np.concatenate([nums, np.asarray(strs, dtype=object)]) -def _sort_tuples(values: np.ndarray[tuple]): +def _sort_tuples(values: np.ndarray): """ Convert array of tuples (1d) to array or array (2d). We need to keep the columns separately as they contain different types and diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 828b460f84ec6..6e48d4b699977 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -147,18 +147,14 @@ def index(self) -> Index: def apply(self) -> FrameOrSeriesUnion: pass - def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]: + def agg(self) -> Optional[FrameOrSeriesUnion]: """ Provide an implementation for the aggregators. Returns ------- - tuple of result, how. - - Notes - ----- - how can be a string describe the required post-processing, or - None if not required. + Result of aggregation, or None if agg cannot be performed by + this method. 
""" obj = self.obj arg = self.f @@ -171,23 +167,21 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]: result = self.maybe_apply_str() if result is not None: - return result, None + return result if is_dict_like(arg): - return self.agg_dict_like(_axis), True + return self.agg_dict_like(_axis) elif is_list_like(arg): # we require a list, but not a 'str' - return self.agg_list_like(_axis=_axis), None - else: - result = None + return self.agg_list_like(_axis=_axis) if callable(arg): f = obj._get_cython_func(arg) if f and not args and not kwargs: - return getattr(obj, f)(), None + return getattr(obj, f)() # caller can react - return result, True + return None def agg_list_like(self, _axis: int) -> FrameOrSeriesUnion: """ diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index cb185dcf78f63..e17ba45f30d6c 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -228,7 +228,7 @@ def _maybe_fallback(ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): return NotImplemented -def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): +def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any): """ Compatibility with numpy ufuncs. @@ -341,9 +341,7 @@ def reconstruct(result): result = result.__finalize__(self) return result - if self.ndim > 1 and ( - len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] - ): + if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1): # Just give up on preserving types in the complex case. # In theory we could preserve them for them. # * nout>1 is doable if BlockManager.apply took nout and @@ -367,7 +365,7 @@ def reconstruct(result): # Those can have an axis keyword and thus can't be called block-by-block result = getattr(ufunc, method)(np.asarray(inputs[0]), **kwargs) - if ufunc.nout > 1: # type: ignore[attr-defined] + if ufunc.nout > 1: result = tuple(reconstruct(x) for x in result) else: result = reconstruct(result) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 162a69370bc61..3dd170f60a62c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -445,8 +445,7 @@ def _validate_comparison_value(self, other): raise InvalidComparison(other) if isinstance(other, self._recognized_scalars) or other is NaT: - # pandas\core\arrays\datetimelike.py:432: error: Too many arguments - # for "object" [call-arg] + # error: Too many arguments for "object" other = self._scalar_type(other) # type: ignore[call-arg] try: self._check_compatible_with(other) @@ -497,8 +496,7 @@ def _validate_shift_value(self, fill_value): if is_valid_na_for_dtype(fill_value, self.dtype): fill_value = NaT elif isinstance(fill_value, self._recognized_scalars): - # pandas\core\arrays\datetimelike.py:746: error: Too many arguments - # for "object" [call-arg] + # error: Too many arguments for "object" fill_value = self._scalar_type(fill_value) # type: ignore[call-arg] else: # only warn if we're not going to raise @@ -506,8 +504,7 @@ def _validate_shift_value(self, fill_value): # kludge for #31971 since Period(integer) tries to cast to str new_fill = Period._from_ordinal(fill_value, freq=self.freq) else: - # pandas\core\arrays\datetimelike.py:753: error: Too many - # arguments for "object" [call-arg] + # error: Too many arguments for "object" new_fill = self._scalar_type(fill_value) # type: ignore[call-arg] # stacklevel here is chosen to be correct when called from @@ -562,8 +559,14 @@ def _validate_scalar( # GH#18295 value 
= NaT + elif isna(value): + # if we are dt64tz and value is dt64("NaT"), dont cast to NaT, + # or else we'll fail to raise in _unbox_scalar + msg = self._validation_error_message(value, allow_listlike) + raise TypeError(msg) + elif isinstance(value, self._recognized_scalars): - # error: Too many arguments for "object" [call-arg] + # error: Too many arguments for "object" value = self._scalar_type(value) # type: ignore[call-arg] else: @@ -1679,7 +1682,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False): # TODO: overload __getitem__, a slice indexer returns same type as self # error: Incompatible types in assignment (expression has type # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable - # has type "TimelikeOps") [assignment] + # has type "TimelikeOps") uniques = uniques[::-1] # type: ignore[assignment] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 144a7186f5826..70c2015c6d41c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -464,10 +464,8 @@ def _generate_range( def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64: if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timestamp.") - if not isna(value): - self._check_compatible_with(value, setitem=setitem) - return value.asm8 - return np.datetime64(value.value, "ns") + self._check_compatible_with(value, setitem=setitem) + return value.asm8 def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 0f3e028c34c05..c720f4bdacaff 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -9,6 +9,7 @@ from pandas._config import get_option +from pandas._libs import NaT from pandas._libs.interval import ( VALID_CLOSED, Interval, @@ -23,7 +24,8 @@ from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( is_categorical_dtype, - is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_float_dtype, is_integer_dtype, @@ -999,9 +1001,12 @@ def _validate_setitem_value(self, value): if is_integer_dtype(self.dtype.subtype): # can't set NaN on a numpy integer array needs_float_conversion = True - elif is_datetime64_any_dtype(self.dtype.subtype): + elif is_datetime64_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array value = np.datetime64("NaT") + elif is_datetime64tz_dtype(self.dtype.subtype): + # need proper NaT to set directly on the DatetimeArray array + value = NaT elif is_timedelta64_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array value = np.timedelta64("NaT") @@ -1508,7 +1513,7 @@ def isin(self, values) -> np.ndarray: # GH#38353 instead of casting to object, operating on a # complex128 ndarray is much more performant. 
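                 # (the complex128 view pairs each interval's left/right bounds
                 # as the real/imaginary parts of a single complex number)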
- # error: "ArrayLike" has no attribute "view" [attr-defined] + # error: "ArrayLike" has no attribute "view" left = self._combined.view("complex128") # type:ignore[attr-defined] right = values._combined.view("complex128") return np.in1d(left, right) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 65618ce32b6d7..b318757e8978a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -190,9 +190,8 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) - # pandas\core\arrays\string_.py:188: error: Incompatible types in - # assignment (expression has type "StringDtype", variable has type - # "PandasDtype") [assignment] + # error: Incompatible types in assignment (expression has type "StringDtype", + # variable has type "PandasDtype") self._dtype = StringDtype() # type: ignore[assignment] if not isinstance(values, type(self)): self._validate() diff --git a/pandas/core/base.py b/pandas/core/base.py index da8ed8a59f981..3f3b4cd1afec1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -95,8 +95,7 @@ def __sizeof__(self): either a value or Series of values """ if hasattr(self, "memory_usage"): - # pandas\core\base.py:84: error: "PandasObject" has no attribute - # "memory_usage" [attr-defined] + # error: "PandasObject" has no attribute "memory_usage" mem = self.memory_usage(deep=True) # type: ignore[attr-defined] return int(mem if is_scalar(mem) else mem.sum()) @@ -206,17 +205,14 @@ def _selection_list(self): @cache_readonly def _selected_obj(self): - # pandas\core\base.py:195: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" if self._selection is None or isinstance( self.obj, ABCSeries # type: ignore[attr-defined] ): - # pandas\core\base.py:194: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" return self.obj # type: ignore[attr-defined] else: - # pandas\core\base.py:204: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" return self.obj[self._selection] # type: ignore[attr-defined] @cache_readonly @@ -225,29 +221,22 @@ def ndim(self) -> int: @cache_readonly def _obj_with_exclusions(self): - # pandas\core\base.py:209: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" if self._selection is not None and isinstance( self.obj, ABCDataFrame # type: ignore[attr-defined] ): - # pandas\core\base.py:217: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" return self.obj.reindex( # type: ignore[attr-defined] columns=self._selection_list ) - # pandas\core\base.py:207: error: "SelectionMixin" has no attribute - # "exclusions" [attr-defined] + # error: "SelectionMixin" has no attribute "exclusions" if len(self.exclusions) > 0: # type: ignore[attr-defined] - # pandas\core\base.py:208: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - - # pandas\core\base.py:208: error: "SelectionMixin" has no attribute - # "exclusions" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" + # error: "SelectionMixin" has no attribute "exclusions" return self.obj.drop(self.exclusions, axis=1) # type: ignore[attr-defined] else: - # pandas\core\base.py:210: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has 
no attribute "obj" return self.obj # type: ignore[attr-defined] def __getitem__(self, key): @@ -255,13 +244,11 @@ def __getitem__(self, key): raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): - # pandas\core\base.py:217: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" if len( self.obj.columns.intersection(key) # type: ignore[attr-defined] ) != len(key): - # pandas\core\base.py:218: error: "SelectionMixin" has no - # attribute "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" bad_keys = list( set(key).difference(self.obj.columns) # type: ignore[attr-defined] ) @@ -269,13 +256,13 @@ def __getitem__(self, key): return self._gotitem(list(key), ndim=2) elif not getattr(self, "as_index", False): - # error: "SelectionMixin" has no attribute "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" if key not in self.obj.columns: # type: ignore[attr-defined] raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=2) else: - # error: "SelectionMixin" has no attribute "obj" [attr-defined] + # error: "SelectionMixin" has no attribute "obj" if key not in self.obj: # type: ignore[attr-defined] raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=1) @@ -601,8 +588,7 @@ def to_numpy( dtype='datetime64[ns]') """ if is_extension_array_dtype(self.dtype): - # pandas\core\base.py:837: error: Too many arguments for "to_numpy" - # of "ExtensionArray" [call-arg] + # error: Too many arguments for "to_numpy" of "ExtensionArray" return self.array.to_numpy( # type: ignore[call-arg] dtype, copy=copy, na_value=na_value, **kwargs ) @@ -914,13 +900,11 @@ def _map_values(self, mapper, na_action=None): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values - # pandas\core\base.py:893: error: Incompatible types in - # assignment (expression has type "Categorical", variable has - # type "IndexOpsMixin") [assignment] + # error: Incompatible types in assignment (expression has type + # "Categorical", variable has type "IndexOpsMixin") self = cast("Categorical", self) # type: ignore[assignment] - # pandas\core\base.py:894: error: Item "ExtensionArray" of - # "Union[ExtensionArray, Any]" has no attribute "map" - # [union-attr] + # error: Item "ExtensionArray" of "Union[ExtensionArray, Any]" has no + # attribute "map" return self._values.map(mapper) # type: ignore[union-attr] values = self._values @@ -938,8 +922,7 @@ def _map_values(self, mapper, na_action=None): raise NotImplementedError map_f = lambda values, f: values.map(f) else: - # pandas\core\base.py:1142: error: "IndexOpsMixin" has no attribute - # "astype" [attr-defined] + # error: "IndexOpsMixin" has no attribute "astype" values = self.astype(object)._values # type: ignore[attr-defined] if na_action == "ignore": map_f = lambda values, f: lib.map_infer_mask( @@ -1177,8 +1160,7 @@ def memory_usage(self, deep=False): are not components of the array if deep=False or if used on PyPy """ if hasattr(self.array, "memory_usage"): - # pandas\core\base.py:1379: error: "ExtensionArray" has no - # attribute "memory_usage" [attr-defined] + # error: "ExtensionArray" has no attribute "memory_usage" return self.array.memory_usage(deep=deep) # type: ignore[attr-defined] v = self.array.nbytes @@ -1313,8 +1295,7 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: def 
drop_duplicates(self, keep="first"): duplicated = self.duplicated(keep=keep) - # pandas\core\base.py:1507: error: Value of type "IndexOpsMixin" is not - # indexable [index] + # error: Value of type "IndexOpsMixin" is not indexable return self[~duplicated] # type: ignore[index] def duplicated(self, keep="first"): diff --git a/pandas/core/common.py b/pandas/core/common.py index aa24e12bf2cf1..89ba33da92661 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -268,10 +268,6 @@ def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T """ if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized): return list(obj) - # error: Incompatible return value type (got - # "Union[pandas.core.common., - # pandas.core.common.1, T]", expected - # "Union[Collection[T], T]") [return-value] obj = cast(Collection, obj) return obj diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index babf8116a5588..ee91ab4a282cf 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -659,8 +659,7 @@ def visit_Call(self, node, side=None, **kwargs): raise if res is None: - # pandas\core\computation\expr.py:663: error: "expr" has no - # attribute "id" [attr-defined] + # error: "expr" has no attribute "id" raise ValueError( f"Invalid function call {node.func.id}" # type: ignore[attr-defined] ) @@ -684,8 +683,7 @@ def visit_Call(self, node, side=None, **kwargs): for key in node.keywords: if not isinstance(key, ast.keyword): - # pandas\core\computation\expr.py:684: error: "expr" has no - # attribute "id" [attr-defined] + # error: "expr" has no attribute "id" raise ValueError( "keyword error in function call " # type: ignore[attr-defined] f"'{node.func.id}'" diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 7b42b21cadc1f..e8eae710623c5 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -71,8 +71,7 @@ def __init__(self, name: str, is_local: Optional[bool] = None): class Term: def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls - # pandas\core\computation\ops.py:72: error: Argument 2 for "super" not - # an instance of argument 1 [misc] + # error: Argument 2 for "super" not an instance of argument 1 supr_new = super(Term, klass).__new__ # type: ignore[misc] return supr_new(klass) @@ -593,7 +592,7 @@ def __init__(self, func, args): self.func = func def __call__(self, env): - # pandas\core\computation\ops.py:592: error: "Op" not callable [operator] + # error: "Op" not callable operands = [op(env) for op in self.operands] # type: ignore[operator] with np.errstate(all="ignore"): return self.func.func(*operands) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index c2ba7f9892ef0..71d725051977f 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -131,17 +131,14 @@ def __init__( # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - # pandas\core\computation\scope.py:132: error: Incompatible types - # in assignment (expression has type "ChainMap[str, Any]", variable - # has type "DeepChainMap[str, Any]") [assignment] + # error: Incompatible types in assignment (expression has type + # "ChainMap[str, Any]", variable has type "DeepChainMap[str, Any]") self.scope = self.scope.new_child( # type: ignore[assignment] (global_dict or frame.f_globals).copy() ) if not isinstance(local_dict, Scope): - # 
pandas\core\computation\scope.py:134: error: Incompatible - # types in assignment (expression has type "ChainMap[str, - # Any]", variable has type "DeepChainMap[str, Any]") - # [assignment] + # error: Incompatible types in assignment (expression has type + # "ChainMap[str, Any]", variable has type "DeepChainMap[str, Any]") self.scope = self.scope.new_child( # type: ignore[assignment] (local_dict or frame.f_locals).copy() ) @@ -150,8 +147,7 @@ def __init__( # assumes that resolvers are going from outermost scope to inner if isinstance(local_dict, Scope): - # pandas\core\computation\scope.py:140: error: Cannot determine - # type of 'resolvers' [has-type] + # error: Cannot determine type of 'resolvers' resolvers += tuple(local_dict.resolvers.maps) # type: ignore[has-type] self.resolvers = DeepChainMap(*resolvers) self.temps = {} @@ -239,8 +235,7 @@ def swapkey(self, old_key: str, new_key: str, new_value=None): for mapping in maps: if old_key in mapping: - # pandas\core\computation\scope.py:228: error: Unsupported - # target for indexed assignment ("Mapping[Any, Any]") [index] + # error: Unsupported target for indexed assignment ("Mapping[Any, Any]") mapping[new_key] = new_value # type: ignore[index] return @@ -260,10 +255,8 @@ def _get_vars(self, stack, scopes: List[str]): for scope, (frame, _, _, _, _, _) in variables: try: d = getattr(frame, "f_" + scope) - # pandas\core\computation\scope.py:247: error: Incompatible - # types in assignment (expression has type "ChainMap[str, - # Any]", variable has type "DeepChainMap[str, Any]") - # [assignment] + # error: Incompatible types in assignment (expression has type + # "ChainMap[str, Any]", variable has type "DeepChainMap[str, Any]") self.scope = self.scope.new_child(d) # type: ignore[assignment] finally: # won't remove it, but DECREF it @@ -331,13 +324,10 @@ def full_scope(self): vars : DeepChainMap All variables in this scope. 
""" - # pandas\core\computation\scope.py:314: error: Unsupported operand - # types for + ("List[Dict[Any, Any]]" and "List[Mapping[Any, Any]]") - # [operator] - - # pandas\core\computation\scope.py:314: error: Unsupported operand - # types for + ("List[Dict[Any, Any]]" and "List[Mapping[str, Any]]") - # [operator] + # error: Unsupported operand types for + ("List[Dict[Any, Any]]" and + # "List[Mapping[Any, Any]]") + # error: Unsupported operand types for + ("List[Dict[Any, Any]]" and + # "List[Mapping[str, Any]]") maps = ( [self.temps] + self.resolvers.maps # type: ignore[operator] diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e27c519304e2e..74d750288bdeb 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -5,7 +5,7 @@ from __future__ import annotations from contextlib import suppress -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta from typing import ( TYPE_CHECKING, Any, @@ -549,16 +549,46 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): # returns tuple of (dtype, fill_value) if issubclass(dtype.type, np.datetime64): - if isinstance(fill_value, datetime) and fill_value.tzinfo is not None: - # Trying to insert tzaware into tznaive, have to cast to object - dtype = np.dtype(np.object_) - elif is_integer(fill_value) or is_float(fill_value): - dtype = np.dtype(np.object_) - else: + inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True) + if inferred == dtype: + return dtype, fv + + # TODO(2.0): once this deprecation is enforced, this whole case + # becomes equivalent to: + # dta = DatetimeArray._from_sequence([], dtype="M8[ns]") + # try: + # fv = dta._validate_setitem_value(fill_value) + # return dta.dtype, fv + # except (ValueError, TypeError): + # return np.dtype(object), fill_value + if isinstance(fill_value, date) and not isinstance(fill_value, datetime): + # deprecate casting of date object to match infer_dtype_from_scalar + # and DatetimeArray._validate_setitem_value try: - fill_value = Timestamp(fill_value).to_datetime64() - except (TypeError, ValueError): - dtype = np.dtype(np.object_) + fv = Timestamp(fill_value).to_datetime64() + except OutOfBoundsDatetime: + pass + else: + warnings.warn( + "Using a `date` object for fill_value with `datetime64[ns]` " + "dtype is deprecated. In a future version, this will be cast " + "to object dtype. 
Pass `fill_value=Timestamp(date_obj)` instead.", + FutureWarning, + stacklevel=7, + ) + return dtype, fv + elif isinstance(fill_value, str): + try: + # explicitly wrap in str to convert np.str_ + fv = Timestamp(str(fill_value)) + except (ValueError, TypeError): + pass + else: + if fv.tz is None: + return dtype, fv.asm8 + + return np.dtype(object), fill_value + elif issubclass(dtype.type, np.timedelta64): if ( is_integer(fill_value) @@ -723,13 +753,13 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, if val is NaT or val.tz is None: dtype = np.dtype("M8[ns]") + val = val.to_datetime64() else: if pandas_dtype: dtype = DatetimeTZDtype(unit="ns", tz=val.tz) else: # return datetimetz as object return np.dtype(object), val - val = val.value elif isinstance(val, (np.timedelta64, timedelta)): try: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ef645313de614..7ebbbdc9ce7f9 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -604,7 +604,11 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: if not lib.is_scalar(obj) or not isna(obj): return False if dtype.kind == "M": - return not isinstance(obj, np.timedelta64) + if isinstance(dtype, np.dtype): + # i.e. not tzaware + return not isinstance(obj, np.timedelta64) + # we have to rule out tznaive dt64("NaT") + return not isinstance(obj, (np.timedelta64, np.datetime64)) if dtype.kind == "m": return not isinstance(obj, np.datetime64) if dtype.kind in ["i", "u", "f", "c"]: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 63d238da12101..4f3b1357b1000 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1163,8 +1163,8 @@ def __len__(self) -> int: """ return len(self.index) - # pandas/core/frame.py:1146: error: Overloaded function signatures 1 and 2 - # overlap with incompatible return types [misc] + # error: Overloaded function signatures 1 and 2 overlap with incompatible return + # types @overload def dot(self, other: Series) -> Series: # type: ignore[misc] ... @@ -4822,8 +4822,8 @@ def set_index( elif isinstance(col, (Index, Series)): # if Index then not MultiIndex (treated above) - # error: Argument 1 to "append" of "list" has incompatible - # type "Union[Index, Series]"; expected "Index" [arg-type] + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[Index, Series]"; expected "Index" arrays.append(col) # type:ignore[arg-type] names.append(col.name) elif isinstance(col, (list, np.ndarray)): @@ -7686,7 +7686,7 @@ def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): result = None try: - result, how = self._aggregate(func, axis, *args, **kwargs) + result = self._aggregate(func, axis, *args, **kwargs) except TypeError as err: exc = TypeError( "DataFrame constructor called with " @@ -7720,14 +7720,14 @@ def _aggregate(self, arg, axis: Axis = 0, *args, **kwargs): args=args, kwargs=kwargs, ) - result, how = op.agg() + result = op.agg() if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose # only result result = result.T if result is not None else result - return result, how + return result agg = aggregate @@ -7814,6 +7814,12 @@ def apply( DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. 
+ Examples -------- >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) @@ -9723,12 +9729,3 @@ def _reindex_for_setitem(value: FrameOrSeriesUnion, index: Index) -> ArrayLike: "incompatible index of inserted column with frame index" ) from err return reindexed_value - - -def _maybe_atleast_2d(value): - # TODO(EA2D): not needed with 2D EAs - - if is_extension_array_dtype(value): - return value - - return np.atleast_2d(np.asarray(value)) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ec37da66760c3..6413489a74ae6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10530,8 +10530,7 @@ def _add_numeric_operations(cls): def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return NDFrame.any(self, axis, bool_only, skipna, level, **kwargs) - # pandas\core\generic.py:10725: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.any = any # type: ignore[assignment] @doc( @@ -10547,13 +10546,11 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return NDFrame.all(self, axis, bool_only, skipna, level, **kwargs) - # pandas\core\generic.py:10719: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method - # pandas\core\generic.py:10719: error: Incompatible types in assignment - # (expression has type "Callable[[Iterable[object]], bool]", variable - # has type "Callable[[NDFrame, Any, Any, Any, Any, KwArg(Any)], Any]") - # [assignment] + # error: Incompatible types in assignment (expression has type + # "Callable[[Iterable[object]], bool]", variable has type "Callable[[NDFrame, + # Any, Any, Any, Any, KwArg(Any)], Any]") cls.all = all # type: ignore[assignment] # error: Argument 1 to "doc" has incompatible type "Optional[str]"; expected @@ -10571,8 +10568,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): def mad(self, axis=None, skipna=None, level=None): return NDFrame.mad(self, axis, skipna, level) - # pandas\core\generic.py:10736: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.mad = mad # type: ignore[assignment] @doc( @@ -10595,8 +10591,7 @@ def sem( ): return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs) - # pandas\core\generic.py:10758: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.sem = sem # type: ignore[assignment] @doc( @@ -10618,8 +10613,7 @@ def var( ): return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs) - # pandas\core\generic.py:10779: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.var = var # type: ignore[assignment] @doc( @@ -10642,8 +10636,7 @@ def std( ): return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs) - # pandas\core\generic.py:10801: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.std = std # type: ignore[assignment] @doc( @@ -10658,8 +10651,7 @@ def std( def cummin(self, axis=None, skipna=True, *args, **kwargs): return NDFrame.cummin(self, axis, skipna, *args, **kwargs) - # pandas\core\generic.py:10815: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.cummin = cummin # type: ignore[assignment] @doc( @@ -10674,8 +10666,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): def cummax(self, axis=None, skipna=True, *args, **kwargs): return 
NDFrame.cummax(self, axis, skipna, *args, **kwargs) - # pandas\core\generic.py:10829: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.cummax = cummax # type: ignore[assignment] @doc( @@ -10690,8 +10681,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): def cumsum(self, axis=None, skipna=True, *args, **kwargs): return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) - # pandas\core\generic.py:10843: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.cumsum = cumsum # type: ignore[assignment] @doc( @@ -10706,8 +10696,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): def cumprod(self, axis=None, skipna=True, *args, **kwargs): return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) - # pandas\core\generic.py:10857: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.cumprod = cumprod # type: ignore[assignment] @doc( @@ -10734,8 +10723,7 @@ def sum( self, axis, skipna, level, numeric_only, min_count, **kwargs ) - # pandas\core\generic.py:10883: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.sum = sum # type: ignore[assignment] @doc( @@ -10761,8 +10749,7 @@ def prod( self, axis, skipna, level, numeric_only, min_count, **kwargs ) - # pandas\core\generic.py:10908: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.prod = prod # type: ignore[assignment] cls.product = prod @@ -10779,8 +10766,7 @@ def prod( def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:10924: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.mean = mean # type: ignore[assignment] @doc( @@ -10796,8 +10782,7 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:10939: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.skew = skew # type: ignore[assignment] @doc( @@ -10816,8 +10801,7 @@ def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:10957: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.kurt = kurt # type: ignore[assignment] cls.kurtosis = kurt @@ -10836,8 +10820,7 @@ def median( ): return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:10975: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.median = median # type: ignore[assignment] @doc( @@ -10855,8 +10838,7 @@ def median( def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:10992: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.max = max # type: ignore[assignment] @doc( @@ -10874,8 +10856,7 @@ def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): def min(self, axis=None, skipna=None, level=None, 
numeric_only=None, **kwargs): return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:11009: error: Cannot assign to a method - # [assignment] + # error: Cannot assign to a method cls.min = min # type: ignore[assignment] @final diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 594c5899209df..9f1446b359f66 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -57,8 +57,7 @@ def _gotitem(self, key, ndim, subset=None): """ # create a new object to prevent aliasing if subset is None: - # pandas\core\groupby\base.py:52: error: "GotItemMixin" has no - # attribute "obj" [attr-defined] + # error: "GotItemMixin" has no attribute "obj" subset = self.obj # type: ignore[attr-defined] # we need to make a shallow copy of ourselves @@ -70,22 +69,15 @@ def _gotitem(self, key, ndim, subset=None): # Try to select from a DataFrame, falling back to a Series try: - # pandas\core\groupby\base.py:60: error: "GotItemMixin" has no - # attribute "_groupby" [attr-defined] + # error: "GotItemMixin" has no attribute "_groupby" groupby = self._groupby[key] # type: ignore[attr-defined] except IndexError: - # pandas\core\groupby\base.py:62: error: "GotItemMixin" has no - # attribute "_groupby" [attr-defined] + # error: "GotItemMixin" has no attribute "_groupby" groupby = self._groupby # type: ignore[attr-defined] - # pandas\core\groupby\base.py:64: error: Too many arguments for - # "GotItemMixin" [call-arg] - - # pandas\core\groupby\base.py:64: error: Unexpected keyword argument - # "groupby" for "GotItemMixin" [call-arg] - - # pandas\core\groupby\base.py:64: error: Unexpected keyword argument - # "parent" for "GotItemMixin" [call-arg] + # error: Too many arguments for "GotItemMixin" + # error: Unexpected keyword argument "groupby" for "GotItemMixin" + # error: Unexpected keyword argument "parent" for "GotItemMixin" self = type(self)( subset, groupby=groupby, parent=self, **kwargs # type: ignore[call-arg] ) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a7297923f1034..9369dc61ca5f6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -46,6 +46,7 @@ ensure_platform_int, is_bool, is_categorical_dtype, + is_dict_like, is_integer_dtype, is_interval_dtype, is_numeric_dtype, @@ -580,6 +581,12 @@ def filter(self, func, dropna=True, *args, **kwargs): dropna : Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. + Examples -------- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', @@ -962,8 +969,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) func = maybe_mangle_lambdas(func) op = GroupByApply(self, func, args, kwargs) - result, how = op.agg() - if how is None: + result = op.agg() + if not is_dict_like(func) and result is not None: return result if result is None: @@ -982,7 +989,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # try to treat as if we are passing a list try: - result, _ = GroupByApply( + result = GroupByApply( self, [func], args=(), kwargs={"_axis": self.axis} ).agg() @@ -1506,6 +1513,10 @@ def filter(self, func, dropna=True, *args, **kwargs): Each subframe is endowed the attribute 'name' in case you need to know which group you are working on. 
+ Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. + Examples -------- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5758762c13984..66e7bc78b2f81 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -344,7 +344,7 @@ class providing the base-class of operations. in the subframe. If f also supports application to the entire subframe, then a fast path is used starting from the second chunk. * f must not mutate groups. Mutation is not supported and may - produce unexpected results. + produce unexpected results. See :ref:`udf-mutation` for more details. When using ``engine='numba'``, there will be no "fall back" behavior internally. The group data and group index will be passed as numpy arrays to the JITed @@ -447,6 +447,10 @@ class providing the base-class of operations. The group data and group index will be passed as numpy arrays to the JITed user defined function, and no alternative execution attempts will be tried. {examples} + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`udf-mutation` +for more details. """ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c7dc6d021a4c3..6a789bc26cabc 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -281,9 +281,8 @@ def _get_grouper(self, obj, validate: bool = True): a tuple of binner, grouper, obj (possibly sorted) """ self._set_grouper(obj) - # pandas\core\groupby\grouper.py:310: error: Value of type variable - # "FrameOrSeries" of "get_grouper" cannot be "Optional[Any]" - # [type-var] + # error: Value of type variable "FrameOrSeries" of "get_grouper" cannot be + # "Optional[Any]" self.grouper, _, self.obj = get_grouper( # type: ignore[type-var] self.obj, [self.key], @@ -370,8 +369,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): @final @property def groups(self): - # pandas\core\groupby\grouper.py:382: error: Item "None" of - # "Optional[Any]" has no attribute "groups" [union-attr] + # error: Item "None" of "Optional[Any]" has no attribute "groups" return self.grouper.groups # type: ignore[union-attr] @final diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 789ca04b894cd..f12a4fd382e7a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6071,14 +6071,14 @@ def ensure_index( if hasattr(index_like, "name"): # https://github.com/python/mypy/issues/1424 # error: Item "ExtensionArray" of "Union[ExtensionArray, - # Sequence[Any]]" has no attribute "name" [union-attr] + # Sequence[Any]]" has no attribute "name" # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" - # has no attribute "name" [union-attr] - # error: "Sequence[Any]" has no attribute "name" [attr-defined] + # has no attribute "name" + # error: "Sequence[Any]" has no attribute "name" # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no - # attribute "name" [union-attr] + # attribute "name" # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no - # attribute "name" [union-attr] + # attribute "name" name = index_like.name # type: ignore[union-attr, attr-defined] return Index(index_like, name=name, copy=copy) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 
265170dd28a3b..7e6d7d911b065 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -318,8 +318,7 @@ def _format_attrs(self): "categories", ibase.default_pprint(self.categories, max_seq_items=max_categories), ), - # pandas\core\indexes\category.py:315: error: "CategoricalIndex" - # has no attribute "ordered" [attr-defined] + # error: "CategoricalIndex" has no attribute "ordered" ("ordered", self.ordered), # type: ignore[attr-defined] ] if self.name is not None: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 00f47c0aaf538..2e6519a3b73ad 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -714,7 +714,7 @@ def _intersection(self, other: Index, sort=False) -> Index: left_chunk = left._values[lslice] # error: Argument 1 to "_simple_new" of "DatetimeIndexOpsMixin" has # incompatible type "Union[ExtensionArray, Any]"; expected - # "Union[DatetimeArray, TimedeltaArray, PeriodArray]" [arg-type] + # "Union[DatetimeArray, TimedeltaArray, PeriodArray]" result = type(self)._simple_new(left_chunk) # type: ignore[arg-type] return self._wrap_setop_result(other, result) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index ea3678a7e15d9..a0e783a74df3c 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -226,8 +226,8 @@ def __getitem__(self, key): if result.ndim == 1: return type(self)(result, name=self.name) # Unpack to ndarray for MPL compat - # pandas\core\indexes\extension.py:220: error: "ExtensionArray" has - # no attribute "_data" [attr-defined] + + # error: "ExtensionArray" has no attribute "_data" result = result._data # type: ignore[attr-defined] # Includes cases where we get a 2D ndarray back for MPL compat diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index af353cf3fb5f7..70adcd841a57d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -533,6 +533,10 @@ def _maybe_convert_i8(self, key): key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) if lib.is_period(key): key_i8 = key.ordinal + elif isinstance(key_i8, Timestamp): + key_i8 = key_i8.value + elif isinstance(key_i8, (np.datetime64, np.timedelta64)): + key_i8 = key_i8.view("i8") else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 26d59db1b08fd..1fdffcf8e5980 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1448,8 +1448,7 @@ def _set_names(self, names, level=None, validate=True): raise TypeError( f"{type(self).__name__}.name must be a hashable type" ) - # pandas\core\indexes\multi.py:1448: error: Cannot determine type - # of '__setitem__' [has-type] + # error: Cannot determine type of '__setitem__' self._names[lev] = name # type: ignore[has-type] # If .levels has been accessed, the names in our cache will be stale. 
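For context on the ``_maybe_convert_i8`` branches added above: since ``infer_dtype_from_scalar`` now returns ``np.datetime64`` values for naive ``Timestamp`` scalars, both scalar forms must reduce to the same nanosecond integer ("i8") key. A minimal sketch of that equivalence (illustrative, not part of the patch):

    import numpy as np
    import pandas as pd

    ts = pd.Timestamp("2021-02-13 09:00")
    dt64 = np.datetime64("2021-02-13T09:00:00.000000000")

    # both reduce to nanoseconds since the Unix epoch
    assert ts.value == dt64.view("i8")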
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index a9561cc477d4a..9664f41362c8a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -166,21 +166,21 @@ def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: return DatetimeIndex._simple_new(arr, name=self.name) # https://github.com/python/mypy/issues/1362 - # error: Decorated property not supported [misc] + # error: Decorated property not supported @property # type:ignore[misc] @doc(PeriodArray.hour.fget) def hour(self) -> Int64Index: return Int64Index(self._data.hour, name=self.name) # https://github.com/python/mypy/issues/1362 - # error: Decorated property not supported [misc] + # error: Decorated property not supported @property # type:ignore[misc] @doc(PeriodArray.minute.fget) def minute(self) -> Int64Index: return Int64Index(self._data.minute, name=self.name) # https://github.com/python/mypy/issues/1362 - # error: Decorated property not supported [misc] + # error: Decorated property not supported @property # type:ignore[misc] @doc(PeriodArray.second.fget) def second(self) -> Int64Index:
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a8493e647f39a..083f32488acd4 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -659,24 +659,53 @@ def idelete(self, indexer): def iset(self, loc: Union[int, slice, np.ndarray], value): """ - Set new item in-place. Does not consolidate. Adds new Block if not - contained in the current set of items + Set new column(s). + + This changes the ArrayManager in-place, but replaces (an) existing + column(s); it does not change column values in-place. + + Parameters + ---------- + loc : integer, slice or boolean mask + Positional location (already bounds checked) + value : array-like """ + # single column -> single integer index if lib.is_integer(loc): - # TODO normalize array -> this should in theory not be needed? + # TODO: this extract_array call should in theory not be needed? value = extract_array(value, extract_numpy=True) + + # TODO can we avoid needing to unpack this here?
That means converting + # DataFrame into 1D array when loc is an integer if isinstance(value, np.ndarray) and value.ndim == 2: + assert value.shape[1] == 1 value = value[0, :] assert isinstance(value, (np.ndarray, ExtensionArray)) - # value = np.asarray(value) - # assert isinstance(value, np.ndarray) + assert value.ndim == 1 assert len(value) == len(self._axes[0]) self.arrays[loc] = value return - # TODO - raise Exception + # multiple columns -> convert slice or array to integer indices + elif isinstance(loc, slice): + indices = range( + loc.start if loc.start is not None else 0, + loc.stop if loc.stop is not None else self.shape_proper[1], + loc.step if loc.step is not None else 1, + ) + else: + assert isinstance(loc, np.ndarray) + assert loc.dtype == "bool" + indices = np.nonzero(loc)[0] + + assert value.ndim == 2 + assert value.shape[0] == len(self._axes[0]) + + for value_idx, mgr_idx in enumerate(indices): + value_arr = value[:, value_idx] + self.arrays[mgr_idx] = value_arr + return def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): """
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8ba6018e743bb..06bf2e5d7b18e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,6 +8,7 @@ from pandas._libs import ( Interval, + NaT, Period, Timestamp, algos as libalgos, @@ -32,14 +33,11 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( - DT64NS_DTYPE, - TD64NS_DTYPE, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_integer, is_list_like, is_object_dtype, is_sparse, @@ -144,7 +142,8 @@ def __init__(self, values, placement, ndim: int): f"placement implies {len(self.mgr_locs)}" ) - def _maybe_coerce_values(self, values): + @classmethod + def _maybe_coerce_values(cls, values): """ Ensure we have correctly-typed values. @@ -442,8 +441,7 @@ def fillna( if self._can_hold_element(value): nb = self if inplace else self.copy() putmask_inplace(nb.values, mask, value) - # TODO: should be nb._maybe_downcast? - return self._maybe_downcast([nb], downcast) + return nb._maybe_downcast([nb], downcast) if noop: # we can't process the value, but nothing to do @@ -722,6 +720,9 @@ def copy(self, deep: bool = True): values = values.copy() return self.make_block_same_class(values, ndim=self.ndim) + # --------------------------------------------------------------------- + # Replace + def replace( self, to_replace, @@ -872,6 +873,57 @@ def _replace_list( rb = new_rb return rb + def _replace_coerce( + self, + to_replace, + value, + mask: np.ndarray, + inplace: bool = True, + regex: bool = False, + ) -> List[Block]: + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + mask : np.ndarray[bool] + True indicates the corresponding element is ignored. + inplace : bool, default True + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. 
+ + Returns + ------- + List[Block] + """ + if mask.any(): + if not regex: + nb = self.coerce_to_target_dtype(value) + if nb is self and not inplace: + nb = nb.copy() + putmask_inplace(nb.values, mask, value) + return [nb] + else: + regex = should_use_regex(regex, to_replace) + if regex: + return self._replace_regex( + to_replace, + value, + inplace=inplace, + convert=False, + mask=mask, + ) + return self.replace(to_replace, value, inplace=inplace, regex=False) + return [self] + + # --------------------------------------------------------------------- + def setitem(self, indexer, value): """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -1402,55 +1454,6 @@ def quantile( return make_block(result, placement=self.mgr_locs, ndim=2) - def _replace_coerce( - self, - to_replace, - value, - mask: np.ndarray, - inplace: bool = True, - regex: bool = False, - ) -> List[Block]: - """ - Replace value corresponding to the given boolean array with another - value. - - Parameters - ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - mask : np.ndarray[bool] - True indicate corresponding element is ignored. - inplace : bool, default True - Perform inplace modification. - regex : bool, default False - If true, perform regular expression substitution. - - Returns - ------- - List[Block] - """ - if mask.any(): - if not regex: - nb = self.coerce_to_target_dtype(value) - if nb is self and not inplace: - nb = nb.copy() - putmask_inplace(nb.values, mask, value) - return [nb] - else: - regex = should_use_regex(regex, to_replace) - if regex: - return self._replace_regex( - to_replace, - value, - inplace=inplace, - convert=False, - mask=mask, - ) - return self.replace(to_replace, value, inplace=inplace, regex=False) - return [self] - class ExtensionBlock(Block): """ @@ -1543,7 +1546,8 @@ def putmask(self, mask, new) -> List[Block]: new_values[mask] = new return [self.make_block(values=new_values)] - def _maybe_coerce_values(self, values): + @classmethod + def _maybe_coerce_values(cls, values): """ Unbox to an extension array. @@ -1934,13 +1938,39 @@ def to_native_types( class DatetimeLikeBlockMixin(HybridMixin, Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" - @property - def _holder(self): - return DatetimeArray + _dtype: np.dtype + _holder: Type[Union[DatetimeArray, TimedeltaArray]] - @property - def fill_value(self): - return np.datetime64("NaT", "ns") + @classmethod + def _maybe_coerce_values(cls, values): + """ + Input validation for values passed to __init__. Ensure that + we have nanosecond datetime64/timedelta64, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64/timedelta64 + + Returns + ------- + values : ndarray[datetime64ns/timedelta64ns] + + Overridden by DatetimeTZBlock. 
+ """ + if values.dtype != cls._dtype: + # non-nano we will convert to nano + if values.dtype.kind != cls._dtype.kind: + # caller is responsible for ensuring td64/dt64 dtype + raise TypeError(values.dtype) # pragma: no cover + + values = cls._holder._from_sequence(values)._data + + if isinstance(values, cls._holder): + values = values._data + + assert isinstance(values, np.ndarray), type(values) + return values def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ @@ -2036,36 +2066,14 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True + fill_value = np.datetime64("NaT", "ns") + _dtype = fill_value.dtype + _holder = DatetimeArray @property def _can_hold_na(self): return True - def _maybe_coerce_values(self, values): - """ - Input validation for values passed to __init__. Ensure that - we have datetime64ns, coercing if necessary. - - Parameters - ---------- - values : array-like - Must be convertible to datetime64 - - Returns - ------- - values : ndarray[datetime64ns] - - Overridden by DatetimeTZBlock. - """ - if values.dtype != DT64NS_DTYPE: - values = conversion.ensure_datetime64ns(values) - - if isinstance(values, DatetimeArray): - values = values._data - - assert isinstance(values, np.ndarray), type(values) - return values - def set_inplace(self, locs, values): """ See Block.set.__doc__ @@ -2084,21 +2092,20 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_datetimetz = True is_extension = True + _holder = DatetimeArray + internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types diff = DatetimeBlock.diff - fill_value = np.datetime64("NaT", "ns") + fill_value = NaT where = DatetimeBlock.where putmask = DatetimeLikeBlockMixin.putmask array_values = ExtensionBlock.array_values - @property - def _holder(self): - return DatetimeArray - - def _maybe_coerce_values(self, values): + @classmethod + def _maybe_coerce_values(cls, values): """ Input validation for values passed to __init__. Ensure that we have datetime64TZ, coercing if necessary. @@ -2112,8 +2119,8 @@ def _maybe_coerce_values(self, values): ------- values : DatetimeArray """ - if not isinstance(values, self._holder): - values = self._holder(values) + if not isinstance(values, cls._holder): + values = cls._holder(values) if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") @@ -2160,10 +2167,8 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: def external_values(self): # NB: this is different from np.asarray(self.values), since that # return an object-dtype ndarray of Timestamps. 
- if self.is_datetimetz: - # avoid FutureWarning in .astype in casting from dt64t to dt64 - return self.values._data - return np.asarray(self.values.astype("datetime64[ns]", copy=False)) + # avoid FutureWarning in .astype in casting from dt64t to dt64 + return self.values._data def fillna( self, value, limit=None, inplace: bool = False, downcast=None @@ -2206,38 +2211,17 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin): is_timedelta = True _can_hold_na = True is_numeric = False + _holder = TimedeltaArray fill_value = np.timedelta64("NaT", "ns") - - def _maybe_coerce_values(self, values): - if values.dtype != TD64NS_DTYPE: - # non-nano we will convert to nano - if values.dtype.kind != "m": - # caller is responsible for ensuring timedelta64 dtype - raise TypeError(values.dtype) # pragma: no cover - - values = TimedeltaArray._from_sequence(values)._data - if isinstance(values, TimedeltaArray): - values = values._data - assert isinstance(values, np.ndarray), type(values) - return values - - @property - def _holder(self): - return TimedeltaArray + _dtype = fill_value.dtype def fillna( self, value, limit=None, inplace: bool = False, downcast=None ) -> List[Block]: - # TODO(EA2D): if we operated on array_values, TDA.fillna would handle - # raising here. - if is_integer(value): - # Deprecation GH#24694, GH#19233 - raise TypeError( - "Passing integers to fillna for timedelta64[ns] dtype is no " - "longer supported. To obtain the old behavior, pass " - "`pd.Timedelta(seconds=n)` instead." - ) - return super().fillna(value, limit=limit, inplace=inplace, downcast=downcast) + values = self.array_values() + values = values if inplace else values.copy() + new_values = values.fillna(value=value, limit=limit) + return [self.make_block_same_class(values=new_values)] class ObjectBlock(Block): @@ -2245,7 +2229,8 @@ class ObjectBlock(Block): is_object = True _can_hold_na = True - def _maybe_coerce_values(self, values): + @classmethod + def _maybe_coerce_values(cls, values): if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) return values @@ -2475,6 +2460,7 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. 
+ # error: "ExtensionArray" has no attribute "reshape" values = values.reshape(tuple((1,) + shape)) # type: ignore[attr-defined] return values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1c45b39ba990a..ccac2696b34c5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1140,8 +1140,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False if value.ndim == 2: value = value.T - - if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): + elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): # TODO(EA2D): special case not needed with 2D EAs value = safe_reshape(value, (1,) + value.shape) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 68f791ac0a837..670c2f9f6da6c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -96,9 +96,8 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.as_index = True self.exclusions = set() self.binner = None - # pandas\core\resample.py:96: error: Incompatible types in assignment - # (expression has type "None", variable has type "BaseGrouper") - # [assignment] + # error: Incompatible types in assignment (expression has type "None", variable + # has type "BaseGrouper") self.grouper = None # type: ignore[assignment] if self.groupby is not None: @@ -301,7 +300,7 @@ def pipe( def aggregate(self, func, *args, **kwargs): self._set_binner() - result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: how = func grouper = None @@ -419,8 +418,7 @@ def _apply_loffset(self, result): result : Series or DataFrame the result of resample """ - # pandas\core\resample.py:409: error: Cannot determine type of - # 'loffset' [has-type] + # error: Cannot determine type of 'loffset' needs_offset = ( isinstance( self.loffset, # type: ignore[has-type] @@ -431,8 +429,7 @@ def _apply_loffset(self, result): ) if needs_offset: - # pandas\core\resample.py:415: error: Cannot determine type of - # 'loffset' [has-type] + # error: Cannot determine type of 'loffset' result.index = result.index + self.loffset # type: ignore[has-type] self.loffset = None @@ -869,8 +866,7 @@ def std(self, ddof=1, *args, **kwargs): Standard deviation of values within each group. """ nv.validate_resampler_func("std", args, kwargs) - # pandas\core\resample.py:850: error: Unexpected keyword argument - # "ddof" for "_downsample" [call-arg] + # error: Unexpected keyword argument "ddof" for "_downsample" return self._downsample("std", ddof=ddof) # type: ignore[call-arg] def var(self, ddof=1, *args, **kwargs): @@ -888,8 +884,7 @@ def var(self, ddof=1, *args, **kwargs): Variance of values within each group. """ nv.validate_resampler_func("var", args, kwargs) - # pandas\core\resample.py:867: error: Unexpected keyword argument - # "ddof" for "_downsample" [call-arg] + # error: Unexpected keyword argument "ddof" for "_downsample" return self._downsample("var", ddof=ddof) # type: ignore[call-arg] @doc(GroupBy.size) @@ -948,11 +943,8 @@ def quantile(self, q=0.5, **kwargs): Return a DataFrame, where the coulmns are groupby columns, and the values are its quantiles. 
""" - # pandas\core\resample.py:920: error: Unexpected keyword argument "q" - # for "_downsample" [call-arg] - - # pandas\core\resample.py:920: error: Too many arguments for - # "_downsample" [call-arg] + # error: Unexpected keyword argument "q" for "_downsample" + # error: Too many arguments for "_downsample" return self._downsample("quantile", q=q, **kwargs) # type: ignore[call-arg] @@ -1005,8 +997,7 @@ def __init__(self, obj, *args, **kwargs): for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) - # pandas\core\resample.py:972: error: Too many arguments for "__init__" - # of "object" [call-arg] + # error: Too many arguments for "__init__" of "object" super().__init__(None) # type: ignore[call-arg] self._groupby = groupby self._groupby.mutated = True @@ -1070,8 +1061,8 @@ def _downsample(self, how, **kwargs): return obj # do we have a regular frequency - # pandas\core\resample.py:1037: error: "BaseGrouper" has no - # attribute "binlabels" [attr-defined] + + # error: "BaseGrouper" has no attribute "binlabels" if ( (ax.freq is not None or ax.inferred_freq is not None) and len(self.grouper.binlabels) > len(ax) # type: ignore[attr-defined] diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8704d757c3289..963d071dc2768 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -996,9 +996,8 @@ def _get_merge_keys(self): """ left_keys = [] right_keys = [] - # pandas\core\reshape\merge.py:966: error: Need type annotation for - # 'join_names' (hint: "join_names: List[] = ...") - # [var-annotated] + # error: Need type annotation for 'join_names' (hint: "join_names: List[] + # = ...") join_names = [] # type: ignore[var-annotated] right_drop = [] left_drop = [] diff --git a/pandas/core/series.py b/pandas/core/series.py index 7d97c9f6189f3..c98fc98db5116 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3973,7 +3973,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): func = dict(kwargs.items()) op = series_apply(self, func, args=args, kwargs=kwargs) - result, how = op.agg() + result = op.agg() if result is None: # we can be called from an inner function which @@ -4044,6 +4044,12 @@ def apply( Series.agg: Only perform aggregating type operations. Series.transform: Only perform transforming type operations. + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. + Examples -------- Create a series with typical summer temperatures for each city. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index ad2eafe7295b0..49eb87a3bc8ba 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -41,6 +41,10 @@ ----- `agg` is an alias for `aggregate`. Use the alias. +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`udf-mutation` +for more details. + A passed user-defined-function will be passed a Series for evaluation. {examples}""" @@ -296,6 +300,12 @@ {klass}.agg : Only perform aggregating type operations. {klass}.apply : Invoke function on a {klass}. +Notes +----- +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`udf-mutation` +for more details. 
+ Examples -------- >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b5714dbcd9e91..1538975b260c0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -510,7 +510,7 @@ def calc(x): return self._apply_tablewise(homogeneous_func, name) def aggregate(self, func, *args, **kwargs): - result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -994,7 +994,7 @@ def calc(x): axis="", ) def aggregate(self, func, *args, **kwargs): - result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f12a530ea6c34..8902d45144c56 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -893,8 +893,8 @@ def check_extension(cls, ext: str): """ if ext.startswith("."): ext = ext[1:] - # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__" - # (not iterable) [attr-defined] + # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__" (not + # iterable) if not any( ext in extension for extension in cls.supported_extensions # type: ignore[attr-defined] diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 3a753a707166e..15f49660dc6dc 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,13 +1,12 @@ from __future__ import annotations -from distutils.version import LooseVersion import mmap from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions -from pandas.compat._optional import get_version, import_optional_dependency +from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -509,40 +508,20 @@ def get_sheet_by_index(self, index: int): def _convert_cell(self, cell, convert_float: bool) -> Scalar: - from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC + from openpyxl.cell.cell import TYPE_ERROR, TYPE_NUMERIC if cell.value is None: return "" # compat with xlrd - elif cell.is_date: - return cell.value elif cell.data_type == TYPE_ERROR: return np.nan - elif cell.data_type == TYPE_BOOL: - return bool(cell.value) - elif cell.data_type == TYPE_NUMERIC: - # GH5394 - if convert_float: - val = int(cell.value) - if val == cell.value: - return val - else: - return float(cell.value) + elif not convert_float and cell.data_type == TYPE_NUMERIC: + return float(cell.value) return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: - # GH 39001 - # Reading of excel file depends on dimension data being correct but - # writers sometimes omit or get it wrong - import openpyxl - - version = LooseVersion(get_version(openpyxl)) - - # There is no good way of determining if a sheet is read-only - # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 - is_readonly = hasattr(sheet, "reset_dimensions") - if version >= "3.0.0" and is_readonly: + if self.book.read_only: sheet.reset_dimensions() data: List[List[Scalar]] = [] @@ -556,7 +535,7 @@ def get_sheet_data(self, sheet, 
convert_float: bool) -> List[List[Scalar]]: # Trim trailing empty rows data = data[: last_row_with_data + 1] - if version >= "3.0.0" and is_readonly and len(data) > 0: + if self.book.read_only and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index ea291bcbfa44c..bdd2b3d6e4c6a 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -69,8 +69,7 @@ def check_main(): return not hasattr(main, "__file__") or get_option("mode.sim_interactive") try: - # pandas\io\formats\console.py:72: error: Name '__IPYTHON__' is not - # defined [name-defined] + # error: Name '__IPYTHON__' is not defined return __IPYTHON__ or check_main() # type: ignore[name-defined] except NameError: return check_main() @@ -85,8 +84,7 @@ def in_ipython_frontend(): bool """ try: - # pandas\io\formats\console.py:86: error: Name 'get_ipython' is not - # defined [name-defined] + # error: Name 'get_ipython' is not defined ip = get_ipython() # type: ignore[name-defined] return "zmq" in str(type(ip)).lower() except NameError: diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b027d8139f24b..099688b2449db 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -475,7 +475,7 @@ def __init__( if not len(Index(cols).intersection(df.columns)): raise KeyError("passes columns are not ALL present dataframe") - if len(Index(cols).intersection(df.columns)) != len(cols): + if len(Index(cols).intersection(df.columns)) != len(set(cols)): # Deprecated in GH#17295, enforced in 1.0.0 raise KeyError("Not all names specified in 'columns' are found") @@ -613,9 +613,8 @@ def _format_header(self) -> Iterable[ExcelCell]: "" ] * len(self.columns) if reduce(lambda x, y: x and y, map(lambda x: x != "", row)): - # pandas\io\formats\excel.py:618: error: Incompatible types in - # assignment (expression has type "Generator[ExcelCell, None, - # None]", variable has type "Tuple[]") [assignment] + # error: Incompatible types in assignment (expression has type + # "Generator[ExcelCell, None, None]", variable has type "Tuple[]") gen2 = ( # type: ignore[assignment] ExcelCell(self.rowcounter, colindex, val, self.header_style) for colindex, val in enumerate(row) @@ -819,9 +818,8 @@ def write( if isinstance(writer, ExcelWriter): need_save = False else: - # pandas\io\formats\excel.py:808: error: Cannot instantiate - # abstract class 'ExcelWriter' with abstract attributes 'engine', - # 'save', 'supported_extensions' and 'write_cells' [abstract] + # error: Cannot instantiate abstract class 'ExcelWriter' with abstract + # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' writer = ExcelWriter( # type: ignore[abstract] writer, engine=engine, storage_options=storage_options ) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 05d94366e6623..48b2fae8c6de5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1347,11 +1347,9 @@ def _value_formatter( def base_formatter(v): assert float_format is not None # for mypy - # pandas\io\formats\format.py:1411: error: "str" not callable - # [operator] - - # pandas\io\formats\format.py:1411: error: Unexpected keyword - # argument "value" for "__call__" of "EngFormatter" [call-arg] + # error: "str" not callable + # error: Unexpected keyword argument "value" for "__call__" of + # "EngFormatter" return ( 
float_format(value=v) # type: ignore[operator,call-arg] if notna(v) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 735fb345363c7..8322587a096d4 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -158,13 +158,12 @@ def __init__( uuid_len: int = 5, ): # validate ordered args - if not isinstance(data, (pd.Series, pd.DataFrame)): - raise TypeError("``data`` must be a Series or DataFrame") - if data.ndim == 1: + if isinstance(data, pd.Series): data = data.to_frame() + if not isinstance(data, DataFrame): + raise TypeError("``data`` must be a Series or DataFrame") if not data.index.is_unique or not data.columns.is_unique: raise ValueError("style is not supported for non-unique indices.") - assert isinstance(data, DataFrame) self.data: DataFrame = data self.index: pd.Index = data.index self.columns: pd.Index = data.columns @@ -1740,8 +1739,8 @@ def from_custom_template(cls, searchpath, name): loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader]) # mypy doesn't like dynamically-defined classes - # error: Variable "cls" is not valid as a type [valid-type] - # error: Invalid base class "cls" [misc] + # error: Variable "cls" is not valid as a type + # error: Invalid base class "cls" class MyStyler(cls): # type:ignore[valid-type,misc] env = jinja2.Environment(loader=loader) template = env.get_template(name) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index d1d77c5e044be..27f06dc84a275 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -25,29 +25,23 @@ def __init__(self, src: FilePathOrBuffer, **kwds): for key in ("storage_options", "encoding", "memory_map", "compression"): kwds.pop(key, None) if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): - # pandas\io\parsers.py:1861: error: Item "IO[Any]" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase, + # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - # pandas\io\parsers.py:1861: error: Item "RawIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + # error: Item "RawIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, + # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - # pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + # error: Item "BufferedIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, + # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - # pandas\io\parsers.py:1861: error: Item "TextIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + # error: Item "TextIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, + # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - # pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + # error: Item "TextIOWrapper" of "Union[IO[Any], RawIOBase, BufferedIOBase, + # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - # pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, 
TextIOWrapper, mmap]" has - # no attribute "mmap" [union-attr] + # error: Item "mmap" of "Union[IO[Any], RawIOBase, BufferedIOBase, + # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] try: diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 223acdea80ca6..cfd648636753d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -217,10 +217,9 @@ def _read(): reader = _read() - # pandas\io\parsers.py:2427: error: Incompatible types in assignment - # (expression has type "_reader", variable has type "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap, None]") - # [assignment] + # error: Incompatible types in assignment (expression has type "_reader", + # variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, + # TextIOWrapper, mmap, None]") self.data = reader # type: ignore[assignment] def read(self, rows=None): @@ -278,8 +277,7 @@ def _exclude_implicit_index(self, alldata): # legacy def get_chunk(self, size=None): if size is None: - # pandas\io\parsers.py:2528: error: "PythonParser" has no attribute - # "chunksize" [attr-defined] + # error: "PythonParser" has no attribute "chunksize" size = self.chunksize # type: ignore[attr-defined] return self.read(rows=size) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8917be1f558b2..077686d9bd642 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3410,8 +3410,8 @@ def queryables(self) -> Dict[str, Any]: (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) ] - # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" - # and "List[Tuple[str, None]]") + # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" and + # "List[Tuple[str, None]]") return dict(d1 + d2 + d3) # type: ignore[operator] def index_cols(self): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 7d743075674f1..2b6dd5347c3e7 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -594,17 +594,14 @@ def _make_legend(self): if self.legend: if self.legend == "reverse": - # pandas\plotting\_matplotlib\core.py:578: error: - # Incompatible types in assignment (expression has type + # error: Incompatible types in assignment (expression has type # "Iterator[Any]", variable has type "List[Any]") - # [assignment] self.legend_handles = reversed( # type: ignore[assignment] self.legend_handles ) - # pandas\plotting\_matplotlib\core.py:579: error: - # Incompatible types in assignment (expression has type + # error: Incompatible types in assignment (expression has type # "Iterator[Optional[Hashable]]", variable has type - # "List[Optional[Hashable]]") [assignment] + # "List[Optional[Hashable]]") self.legend_labels = reversed( # type: ignore[assignment] self.legend_labels ) @@ -1149,10 +1146,9 @@ def _make_plot(self): it = self._iter_data(data=data, keep_index=True) else: x = self._get_xticks(convert_period=True) - # pandas\plotting\_matplotlib\core.py:1100: error: Incompatible - # types in assignment (expression has type "Callable[[Any, Any, - # Any, Any, Any, Any, KwArg(Any)], Any]", variable has type - # "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]") [assignment] + # error: Incompatible types in assignment (expression has type + # "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has + # type "Callable[[Any, Any, Any, 
Any, KwArg(Any)], Any]") plotf = self._plot # type: ignore[assignment] it = self._iter_data() @@ -1601,9 +1597,8 @@ def blank_labeler(label, value): if labels is not None: blabels = [blank_labeler(left, value) for left, value in zip(labels, y)] else: - # pandas\plotting\_matplotlib\core.py:1546: error: Incompatible - # types in assignment (expression has type "None", variable has - # type "List[Any]") [assignment] + # error: Incompatible types in assignment (expression has type "None", + # variable has type "List[Any]") blabels = None # type: ignore[assignment] results = ax.pie(y, labels=blabels, **kwds) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 58f44104b99d6..e0a860b9d8709 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -530,8 +530,7 @@ def reset(self): ------- None """ - # pandas\plotting\_misc.py:533: error: Cannot access "__init__" - # directly [misc] + # error: Cannot access "__init__" directly self.__init__() # type: ignore[misc] def _get_canonical_key(self, key): diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index a47c5555d3e9f..2b18d110346e4 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -105,13 +105,11 @@ def test_infer_from_scalar_tz(tz, pandas_dtype): if pandas_dtype: exp_dtype = f"datetime64[ns, {tz}]" - exp_val = dt.value else: exp_dtype = np.object_ - exp_val = dt assert dtype == exp_dtype - assert val == exp_val + assert val == dt @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 08303fc601b3e..786944816bcf6 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -24,6 +24,7 @@ from pandas.core.dtypes.missing import isna import pandas as pd +import pandas._testing as tm @pytest.fixture( @@ -403,7 +404,13 @@ def test_maybe_promote_any_with_datetime64( expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + warn = None + if type(fill_value) is datetime.date and dtype.kind == "M": + # Casting date to dt64 is deprecated + warn = FutureWarning + + with tm.assert_produces_warning(warn, check_stacklevel=False): + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0f4cef772458f..f3cb3f45a167f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -55,6 +55,51 @@ from pandas.core.arrays import IntegerArray +class MockNumpyLikeArray: + """ + A class which is numpy-like (e.g. Pint's Quantity) but not actually numpy + The key is that it is not actually a numpy array so + ``util.is_array(mock_numpy_like_array_instance)`` returns ``False``. Other + important properties are that the class defines a :meth:`__iter__` method + (so that ``isinstance(abc.Iterable)`` returns ``True``) and has a + :meth:`ndim` property which can be used as a check for whether it is a + scalar or not. 
+ """ + + def __init__(self, values): + self._values = values + + def __iter__(self): + iter_values = iter(self._values) + + def it_outer(): + yield from iter_values + + return it_outer() + + def __len__(self): + return len(self._values) + + def __array__(self, t=None): + return self._values + + @property + def ndim(self): + return self._values.ndim + + @property + def dtype(self): + return self._values.dtype + + @property + def size(self): + return self._values.size + + @property + def shape(self): + return self._values.shape + + @pytest.fixture(params=[True, False], ids=str) def coerce(request): return request.param @@ -94,6 +139,15 @@ def coerce(request): (np.ndarray((2,) * 4), True, "ndarray-4d"), (np.array([[[[]]]]), True, "ndarray-4d-empty"), (np.array(2), False, "ndarray-0d"), + (MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"), + (MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"), + (MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"), + (MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"), + (MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"), + (MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"), (1, False, "int"), (b"123", False, "bytes"), (b"", False, "bytes-empty"), @@ -154,6 +208,8 @@ def test_is_array_like(): assert inference.is_array_like(Series([1, 2])) assert inference.is_array_like(np.array(["a", "b"])) assert inference.is_array_like(Index(["2016-01-01"])) + assert inference.is_array_like(np.array([2, 3])) + assert inference.is_array_like(MockNumpyLikeArray(np.array([2, 3]))) class DtypeList(list): dtype = "special" diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 4282db6933371..7c48c412fd694 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -10,6 +10,7 @@ MultiIndex, Series, Timestamp, + concat, get_dummies, period_range, ) @@ -176,6 +177,87 @@ def test_getitem_bool_mask_categorical_index(self): with pytest.raises(TypeError, match=msg): df4[df4.index > 1] + @pytest.mark.parametrize( + "data1,data2,expected_data", + ( + ( + [[1, 2], [3, 4]], + [[0.5, 6], [7, 8]], + [[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]], + ), + ( + [[1, 2], [3, 4]], + [[5, 6], [7, 8]], + [[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]], + ), + ), + ) + def test_getitem_bool_mask_duplicate_columns_mixed_dtypes( + self, + data1, + data2, + expected_data, + ): + # GH#31954 + + df1 = DataFrame(np.array(data1)) + df2 = DataFrame(np.array(data2)) + df = concat([df1, df2], axis=1) + + result = df[df > 2] + + exdict = {i: np.array(col) for i, col in enumerate(expected_data)} + expected = DataFrame(exdict).rename(columns={2: 0, 3: 1}) + tm.assert_frame_equal(result, expected) + + @pytest.fixture + def df_dup_cols(self): + dups = ["A", "A", "C", "D"] + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") + return df + + def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_cols): + # `df.A > 6` is a DataFrame with a different shape from df + + # boolean with the duplicate raises + df = df_dup_cols + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df[df.A > 6] + + def 
test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols): # boolean indexing # GH#4879 df = DataFrame( np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" ) expected = df[df.C > 6] expected.columns = df_dup_cols.columns df = df_dup_cols result = df[df.C > 6] tm.assert_frame_equal(result, expected) result.dtypes str(result) def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols): # where df = DataFrame( np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" ) # `df > 6` is a DataFrame with the same shape+alignment as df expected = df[df > 6] expected.columns = df_dup_cols.columns df = df_dup_cols result = df[df > 6] tm.assert_frame_equal(result, expected) result.dtypes str(result) class TestGetitemSlice: def test_getitem_slice_float64(self, frame_or_series):
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 9318764a1b5ad..4dfbc0b918aaa 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -2,18 +2,26 @@ import pytest from pandas.core.dtypes.base import registry as ea_registry +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_interval_dtype, + is_object_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype from pandas import ( Categorical, DataFrame, + DatetimeIndex, Index, Interval, + IntervalIndex, NaT, Period, PeriodIndex, Series, Timestamp, + cut, date_range, notna, period_range, @@ -395,6 +403,90 @@ def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self): with pytest.raises(ValueError, match=msg): df[["a", "b"]] = rhs + def test_setitem_intervals(self): + + df = DataFrame({"A": range(10)}) + ser = cut(df["A"], 5) + assert isinstance(ser.cat.categories, IntervalIndex) + + # B & D end up as Categoricals + # the remainder are converted to in-line objects + # containing an IntervalIndex.values + df["B"] = ser + df["C"] = np.array(ser) + df["D"] = ser.values + df["E"] = np.array(ser.values) + + assert is_categorical_dtype(df["B"].dtype) + assert is_interval_dtype(df["B"].cat.categories) + assert is_categorical_dtype(df["D"].dtype) + assert is_interval_dtype(df["D"].cat.categories) + + assert is_object_dtype(df["C"]) + assert is_object_dtype(df["E"]) + + # they compare equal as Index + # when converted to numpy objects + c = lambda x: Index(np.array(x)) + tm.assert_index_equal(c(df.B), c(df.B)) + tm.assert_index_equal(c(df.B), c(df.C), check_names=False) + tm.assert_index_equal(c(df.B), c(df.D), check_names=False) + tm.assert_index_equal(c(df.C), c(df.D), check_names=False) + + # B & D are the same Series + tm.assert_series_equal(df["B"], df["B"]) + tm.assert_series_equal(df["B"], df["D"], check_names=False) + + # C & E are the same Series + tm.assert_series_equal(df["C"], df["C"]) + tm.assert_series_equal(df["C"], df["E"], check_names=False) + + +class TestSetitemTZAwareValues: + @pytest.fixture + def idx(self): + naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B") + idx = naive.tz_localize("US/Pacific") + return idx + + @pytest.fixture + def expected(self, idx): + expected = Series(np.array(idx.tolist(), dtype="object"), name="B") + assert expected.dtype == idx.dtype + return expected + + def test_setitem_dt64series(self, idx, expected): + # convert to utc + df = DataFrame(np.random.randn(2, 1), columns=["A"]) + df["B"] = idx + + with 
tm.assert_produces_warning(FutureWarning) as m: + df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) + msg = "do 'idx.tz_convert(None)' before calling" + assert msg in str(m[0].message) + + result = df["B"] + comp = Series(idx.tz_convert("UTC").tz_localize(None), name="B") + tm.assert_series_equal(result, comp) + + def test_setitem_datetimeindex(self, idx, expected): + # setting a DataFrame column with a tzaware DTI retains the dtype + df = DataFrame(np.random.randn(2, 1), columns=["A"]) + + # assign to frame + df["B"] = idx + result = df["B"] + tm.assert_series_equal(result, expected) + + def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): + # setting a DataFrame column with a tzaware DTI retains the dtype + df = DataFrame(np.random.randn(2, 1), columns=["A"]) + + # object array of datetimes with a tz + df["B"] = idx.to_pydatetime() + result = df["B"] + tm.assert_series_equal(result, expected) + class TestDataFrameSetItemWithExpansion: def test_setitem_listlike_views(self): diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index e4e2656f4337c..5b7d096d0ab99 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -21,10 +21,69 @@ import pandas.core.common as com +class TestReindexSetIndex: + # Tests that check both reindex and set_index + + def test_dti_set_index_reindex_datetimeindex(self): + # GH#6631 + df = DataFrame(np.random.random(6)) + idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.reindex(idx2) + tm.assert_index_equal(df.index, idx2) + + def test_dti_set_index_reindex_freq_with_tz(self): + # GH#11314 with tz + index = date_range( + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + ) + df = DataFrame(np.random.randn(24, 1), columns=["a"], index=index) + new_index = date_range( + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + ) + + result = df.set_index(new_index) + assert result.index.freq == index.freq + + def test_set_reset_index_intervalindex(self): + + df = DataFrame({"A": range(10)}) + ser = pd.cut(df.A, 5) + df["B"] = ser + df = df.set_index("B") + + df = df.reset_index() + + class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing + def test_reindex_date_fill_value(self): + # passing date to dt64 is deprecated + arr = date_range("2016-01-01", periods=6).values.reshape(3, 2) + df = DataFrame(arr, columns=["A", "B"], index=range(3)) + + ts = df.iloc[0, 0] + fv = ts.date() + + with tm.assert_produces_warning(FutureWarning): + res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv) + + expected = DataFrame( + {"A": df["A"].tolist() + [ts], "B": df["B"].tolist() + [ts], "C": [ts] * 4} + ) + tm.assert_frame_equal(res, expected) + + # same with a datetime-castable str + res = df.reindex( + index=range(4), columns=["A", "B", "C"], fill_value="2016-01-01" + ) + tm.assert_frame_equal(res, expected) + def test_reindex_with_multi_index(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests for reindexing a multi-indexed DataFrame with a new MultiIndex diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index b66a95bae51c5..70232dfd1d79a 100644 --- 
a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -1,3 +1,7 @@ +""" +See also: test_reindex.py:TestReindexSetIndex +""" + from datetime import datetime, timedelta import numpy as np diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 862f5b87785f5..c68171ab254c7 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,111 +1,13 @@ from datetime import datetime -import numpy as np -import pytest import pytz -from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_interval_dtype, - is_object_dtype, -) - -from pandas import ( - DataFrame, - DatetimeIndex, - Index, - IntervalIndex, - Series, - Timestamp, - cut, - date_range, -) +from pandas import DataFrame import pandas._testing as tm class TestDataFrameAlterAxes: - @pytest.fixture - def idx_expected(self): - idx = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B").tz_localize( - "US/Pacific" - ) - - expected = Series( - np.array( - [ - Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), - Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), - ], - dtype="object", - ), - name="B", - ) - assert expected.dtype == idx.dtype - return idx, expected - - def test_to_series_keep_tz_deprecated_true(self, idx_expected): - # convert to series while keeping the timezone - idx, expected = idx_expected - - msg = "stop passing 'keep_tz'" - with tm.assert_produces_warning(FutureWarning) as m: - result = idx.to_series(keep_tz=True, index=[0, 1]) - assert msg in str(m[0].message) - - tm.assert_series_equal(result, expected) - - def test_to_series_keep_tz_deprecated_false(self, idx_expected): - idx, expected = idx_expected - - with tm.assert_produces_warning(FutureWarning) as m: - result = idx.to_series(keep_tz=False, index=[0, 1]) - tm.assert_series_equal(result, expected.dt.tz_convert(None)) - msg = "do 'idx.tz_convert(None)' before calling" - assert msg in str(m[0].message) - - def test_setitem_dt64series(self, idx_expected): - # convert to utc - idx, expected = idx_expected - df = DataFrame(np.random.randn(2, 1), columns=["A"]) - df["B"] = idx - - with tm.assert_produces_warning(FutureWarning) as m: - df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) - msg = "do 'idx.tz_convert(None)' before calling" - assert msg in str(m[0].message) - - result = df["B"] - comp = Series(idx.tz_convert("UTC").tz_localize(None), name="B") - tm.assert_series_equal(result, comp) - - def test_setitem_datetimeindex(self, idx_expected): - # setting a DataFrame column with a tzaware DTI retains the dtype - idx, expected = idx_expected - df = DataFrame(np.random.randn(2, 1), columns=["A"]) - - # assign to frame - df["B"] = idx - result = df["B"] - tm.assert_series_equal(result, expected) - - def test_setitem_object_array_of_tzaware_datetimes(self, idx_expected): - # setting a DataFrame column with a tzaware DTI retains the dtype - idx, expected = idx_expected - df = DataFrame(np.random.randn(2, 1), columns=["A"]) - - # object array of datetimes with a tz - df["B"] = idx.to_pydatetime() - result = df["B"] - tm.assert_series_equal(result, expected) - - def test_constructor_from_tzaware_datetimeindex(self, idx_expected): - # don't cast a DatetimeIndex WITH a tz, leave as object - # GH 6032 - idx, expected = idx_expected - - # convert index to series - result = Series(idx) - tm.assert_series_equal(result, expected) + # Tests for setting index/columns attributes directly (i.e. 
__setattr__)
     def test_set_axis_setattr_index(self):
         # GH 6785
@@ -117,31 +19,6 @@ def test_set_axis_setattr_index(self):
         df.pop("ts")
         tm.assert_frame_equal(df, expected)
 
-    def test_dti_set_index_reindex(self):
-        # GH 6631
-        df = DataFrame(np.random.random(6))
-        idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern")
-        idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo")
-
-        df = df.set_index(idx1)
-        tm.assert_index_equal(df.index, idx1)
-        df = df.reindex(idx2)
-        tm.assert_index_equal(df.index, idx2)
-
-    def test_dti_set_index_reindex_with_tz(self):
-        # GH 11314
-        # with tz
-        index = date_range(
-            datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern"
-        )
-        df = DataFrame(np.random.randn(24, 1), columns=["a"], index=index)
-        new_index = date_range(
-            datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern"
-        )
-
-        result = df.set_index(new_index)
-        assert result.index.freq == index.freq
-
     # Renaming
 
     def test_assign_columns(self, float_frame):
@@ -151,52 +28,3 @@ def test_assign_columns(self, float_frame):
         df.columns = ["foo", "bar", "baz", "quux", "foo2"]
         tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False)
         tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False)
-
-
-class TestIntervalIndex:
-    def test_setitem(self):
-
-        df = DataFrame({"A": range(10)})
-        ser = cut(df["A"], 5)
-        assert isinstance(ser.cat.categories, IntervalIndex)
-
-        # B & D end up as Categoricals
-        # the remainer are converted to in-line objects
-        # contining an IntervalIndex.values
-        df["B"] = ser
-        df["C"] = np.array(ser)
-        df["D"] = ser.values
-        df["E"] = np.array(ser.values)
-
-        assert is_categorical_dtype(df["B"].dtype)
-        assert is_interval_dtype(df["B"].cat.categories)
-        assert is_categorical_dtype(df["D"].dtype)
-        assert is_interval_dtype(df["D"].cat.categories)
-
-        assert is_object_dtype(df["C"])
-        assert is_object_dtype(df["E"])
-
-        # they compare equal as Index
-        # when converted to numpy objects
-        c = lambda x: Index(np.array(x))
-        tm.assert_index_equal(c(df.B), c(df.B))
-        tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
-        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
-        tm.assert_index_equal(c(df.C), c(df.D), check_names=False)
-
-        # B & D are the same Series
-        tm.assert_series_equal(df["B"], df["B"])
-        tm.assert_series_equal(df["B"], df["D"], check_names=False)
-
-        # C & E are the same Series
-        tm.assert_series_equal(df["C"], df["C"])
-        tm.assert_series_equal(df["C"], df["E"], check_names=False)
-
-    def test_set_reset_index(self):
-
-        df = DataFrame({"A": range(10)})
-        s = cut(df.A, 5)
-        df["B"] = s
-        df = df.set_index("B")
-
-        df = df.reset_index()
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 9ec745932514f..5fcab5200e305 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -20,6 +20,7 @@
     Categorical,
     CategoricalIndex,
     DataFrame,
+    DatetimeIndex,
     Index,
     Interval,
     MultiIndex,
@@ -48,6 +49,19 @@ class TestDataFrameConstructors:
+    def test_constructor_from_tzaware_datetimeindex(self):
+        # don't cast a DatetimeIndex WITH a tz, leave as object
+        # GH#6032
+        naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B")
+        idx = naive.tz_localize("US/Pacific")
+
+        expected = Series(np.array(idx.tolist(), dtype="object"), name="B")
+        assert expected.dtype == idx.dtype
+
+        # convert index to series
+        result = Series(idx)
+        tm.assert_series_equal(result, expected)
+
     def test_array_of_dt64_nat_with_td64dtype_raises(self, frame_or_series):
         # GH#39462
         nat = np.datetime64("NaT", "ns")
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
index 8dcf6f2188058..1f892c3a03e85 100644
--- a/pandas/tests/frame/test_nonunique_indexes.py
+++ b/pandas/tests/frame/test_nonunique_indexes.py
@@ -33,6 +33,7 @@ def test_column_dups_operations(self):
         expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
         check(df, expected)
 
+    def test_insert_with_duplicate_columns(self):
         # insert
         df = DataFrame(
             [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
@@ -119,6 +120,7 @@ def test_column_dups_operations(self):
         )
         tm.assert_frame_equal(df, expected)
 
+    def test_dup_across_dtypes(self):
         # dup across dtypes
         df = DataFrame(
             [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
@@ -155,12 +157,14 @@ def test_column_dups_operations(self):
         )
         check(df, expected)
 
+    def test_values_with_duplicate_columns(self):
         # values
         df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
         result = df.values
         expected = np.array([[1, 2.5], [3, 4.5]])
         assert (result == expected).all().all()
 
+    def test_rename_with_duplicate_columns(self):
         # rename, GH 4403
         df4 = DataFrame(
             {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
@@ -201,6 +205,8 @@ def test_column_dups_operations(self):
         ).set_index(["STK_ID", "RPT_Date"], drop=False)
         tm.assert_frame_equal(result, expected)
 
+    def test_reindex_with_duplicate_columns(self):
+
         # reindex is invalid!
         df = DataFrame(
             [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
         )
@@ -211,6 +217,8 @@ def test_column_dups_operations(self):
         with pytest.raises(ValueError, match=msg):
             df.reindex(columns=["bar", "foo"])
 
+    def test_drop_with_duplicate_columns(self):
+
         # drop
         df = DataFrame(
             [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
         )
@@ -221,6 +229,7 @@ def test_column_dups_operations(self):
         result = df.drop("a", axis=1)
         check(result, expected)
 
+    def test_describe_with_duplicate_columns(self):
         # describe
         df = DataFrame(
             [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
@@ -232,6 +241,7 @@ def test_column_dups_operations(self):
         expected = pd.concat([s, s, s], keys=df.columns, axis=1)
         check(result, expected)
 
+    def test_column_dups_indexes(self):
         # check column dups with index equal and not equal to df's index
         df = DataFrame(
             np.random.randn(5, 3),
@@ -248,6 +258,8 @@ def test_column_dups_operations(self):
         this_df["A"] = index
         check(this_df, expected_df)
 
+    def test_arithmetic_with_dups(self):
+
         # operations
         for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
             df = DataFrame({"A": np.arange(10), "B": np.random.rand(10)})
@@ -257,6 +269,7 @@ def test_column_dups_operations(self):
             result = getattr(df, op)(df)
             check(result, expected)
 
+    def test_changing_dtypes_with_duplicate_columns(self):
         # multiple assignments that change dtypes
         # the location indexer is a slice
         # GH 6120
@@ -272,7 +285,7 @@ def test_column_dups_operations(self):
         df["that"] = 1
         check(df, expected)
 
-    def test_column_dups2(self):
+    def test_column_dups_drop(self):
 
         # drop buggy GH 6240
         df = DataFrame(
@@ -289,6 +302,7 @@ def test_column_dups2(self):
         result = df2.drop("C", axis=1)
         tm.assert_frame_equal(result, expected)
 
+    def test_column_dups_dropna(self):
         # dropna
         df = DataFrame(
             {
@@ -310,43 +324,6 @@ def test_column_dups2(self):
         result = df.dropna(subset=["A", "C"], how="all")
         tm.assert_frame_equal(result, expected)
 
-    def test_getitem_boolean_series_with_duplicate_columns(self):
-        # boolean indexing
-        # GH 4879
-        dups = ["A", "A", "C", "D"]
-        df = DataFrame(
-            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
-        )
-        expected = df[df.C > 6]
-        expected.columns = dups
-        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
-        result = df[df.C > 6]
-        check(result, expected)
-
-    def test_getitem_boolean_frame_with_duplicate_columns(self):
-        dups = ["A", "A", "C", "D"]
-
-        # where
-        df = DataFrame(
-            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
-        )
-        # `df > 6` is a DataFrame with the same shape+alignment as df
-        expected = df[df > 6]
-        expected.columns = dups
-        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
-        result = df[df > 6]
-        check(result, expected)
-
-    def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self):
-        # `df.A > 6` is a DataFrame with a different shape from df
-        dups = ["A", "A", "C", "D"]
-
-        # boolean with the duplicate raises
-        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
-        msg = "cannot reindex from a duplicate axis"
-        with pytest.raises(ValueError, match=msg):
-            df[df.A > 6]
-
     def test_column_dups_indexing(self):
 
         # dup aligning operations should work
@@ -357,6 +334,7 @@ def test_column_dups_indexing(self):
         result = df1.sub(df2)
         tm.assert_frame_equal(result, expected)
 
+    def test_dup_columns_comparisons(self):
         # equality
         df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
         df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
@@ -374,6 +352,7 @@ def test_column_dups_indexing(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_mixed_column_selection(self):
         # mixed column selection
         # GH 5639
         dfbool = DataFrame(
@@ -387,6 +366,7 @@ def test_column_dups_indexing(self):
         result = dfbool[["one", "three", "one"]]
         check(result, expected)
 
+    def test_multi_axis_dups(self):
         # multi-axis dups
         # GH 6121
         df = DataFrame(
@@ -422,6 +402,7 @@ def test_columns_with_dups(self):
         expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
         tm.assert_frame_equal(df, expected)
 
+    def test_columns_with_dup_index(self):
         # with a dup index
         df = DataFrame([[1, 2]], columns=["a", "a"])
         df.columns = ["b", "b"]
@@ -429,6 +410,7 @@ def test_columns_with_dups(self):
         expected = DataFrame([[1, 2]], columns=["b", "b"])
         tm.assert_frame_equal(df, expected)
 
+    def test_multi_dtype(self):
         # multi-dtype
         df = DataFrame(
             [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
@@ -441,12 +423,14 @@ def test_columns_with_dups(self):
         )
         tm.assert_frame_equal(df, expected)
 
+    def test_multi_dtype2(self):
+
         df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
         df.columns = ["a", "a.1", "a.2", "a.3"]
         str(df)
         expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
         tm.assert_frame_equal(df, expected)
 
+    def test_dups_across_blocks(self):
         # dups across blocks
         df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
         df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
@@ -464,6 +448,7 @@ def test_columns_with_dups(self):
         for i in range(len(df.columns)):
             df.iloc[:, i]
 
+    def test_dup_columns_across_dtype(self):
         # dup columns across dtype GH 2079/2194
         vals = [[1, -1, 2.0], [2, -2, 3.0]]
         rs = DataFrame(vals, columns=["A", "A", "B"])
@@ -486,36 +471,3 @@ def test_set_value_by_index(self):
         df.iloc[:, 0] = 3
         tm.assert_series_equal(df.iloc[:, 1], expected)
-
-    @pytest.mark.parametrize(
-        "data1,data2,expected_data",
-        (
-            (
-                [[1, 2], [3, 4]],
-                [[0.5, 6], [7, 8]],
-                [[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]],
-            ),
-            (
-                [[1, 2], [3, 4]],
-                [[5, 6], [7, 8]],
-                [[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]],
-            ),
-        ),
-    )
-    def test_masking_duplicate_columns_mixed_dtypes(
-        self,
-        data1,
-        data2,
-        expected_data,
-    ):
-        # GH31954
-
-        df1 = DataFrame(np.array(data1))
-        df2 = DataFrame(np.array(data2))
-        df = pd.concat([df1, df2], axis=1)
-
-        result = df[df > 2]
-        expected = DataFrame(
-            {i: np.array(col) for i, col in enumerate(expected_data)}
-        ).rename(columns={2: 0, 3: 1})
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 3f04f0f1163e7..04eb2f42d745b 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -769,6 +769,18 @@ def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request):
     comp_func(result, exp)
 
 
+def test_transform_ffill():
+    # GH 24211
+    data = [["a", 0.0], ["a", float("nan")], ["b", 1.0], ["b", float("nan")]]
+    df = DataFrame(data, columns=["key", "values"])
+    result = df.groupby("key").transform("ffill")
+    expected = DataFrame({"values": [0.0, 0.0, 1.0, 1.0]})
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("key")["values"].transform("ffill")
+    expected = Series([0.0, 0.0, 1.0, 1.0], name="values")
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("mix_groupings", [True, False])
 @pytest.mark.parametrize("as_series", [True, False])
 @pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)])
diff --git a/pandas/tests/indexes/datetimes/methods/test_to_series.py b/pandas/tests/indexes/datetimes/methods/test_to_series.py
new file mode 100644
index 0000000000000..5998fc0dde499
--- /dev/null
+++ b/pandas/tests/indexes/datetimes/methods/test_to_series.py
@@ -0,0 +1,37 @@
+import numpy as np
+import pytest
+
+from pandas import DatetimeIndex, Series
+import pandas._testing as tm
+
+
+class TestToSeries:
+    @pytest.fixture
+    def idx_expected(self):
+        naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B")
+        idx = naive.tz_localize("US/Pacific")
+
+        expected = Series(np.array(idx.tolist(), dtype="object"), name="B")
+
+        assert expected.dtype == idx.dtype
+        return idx, expected
+
+    def test_to_series_keep_tz_deprecated_true(self, idx_expected):
+        # convert to series while keeping the timezone
+        idx, expected = idx_expected
+
+        msg = "stop passing 'keep_tz'"
+        with tm.assert_produces_warning(FutureWarning) as m:
+            result = idx.to_series(keep_tz=True, index=[0, 1])
+        assert msg in str(m[0].message)
+
+        tm.assert_series_equal(result, expected)
+
+    def test_to_series_keep_tz_deprecated_false(self, idx_expected):
+        idx, expected = idx_expected
+
+        with tm.assert_produces_warning(FutureWarning) as m:
+            result = idx.to_series(keep_tz=False, index=[0, 1])
+        tm.assert_series_equal(result, expected.dt.tz_convert(None))
+        msg = "do 'idx.tz_convert(None)' before calling"
+        assert msg in str(m[0].message)
diff --git a/pandas/tests/indexes/datetimes/test_insert.py b/pandas/tests/indexes/datetimes/test_insert.py
index 684c6b813b48f..6dbd1287b7306 100644
--- a/pandas/tests/indexes/datetimes/test_insert.py
+++ b/pandas/tests/indexes/datetimes/test_insert.py
@@ -13,8 +13,12 @@ class TestInsert:
     @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"])
     def test_insert_nat(self, tz, null):
         # GH#16537, GH#18295 (test missing)
+        idx = DatetimeIndex(["2017-01-01"], tz=tz)
         expected = DatetimeIndex(["NaT", "2017-01-01"], tz=tz)
+        if tz is not None and isinstance(null, np.datetime64):
+            expected = Index([null, idx[0]], dtype=object)
+
         res = idx.insert(0, null)
         tm.assert_index_equal(res, expected)
diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py
index f4e7296598d54..95f5115a8c28b 100644
--- a/pandas/tests/indexing/interval/test_interval.py
+++ b/pandas/tests/indexing/interval/test_interval.py
@@ -7,87 +7,83 @@ class TestIntervalIndex:
-    def setup_method(self, method):
-        self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
+    @pytest.fixture
+    def series_with_interval_index(self):
+        return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
 
-    def test_getitem_with_scalar(self):
+    def test_getitem_with_scalar(self, series_with_interval_index, indexer_sl):
 
-        s = self.s
+        ser = series_with_interval_index.copy()
 
-        expected = s.iloc[:3]
-        tm.assert_series_equal(expected, s[:3])
-        tm.assert_series_equal(expected, s[:2.5])
-        tm.assert_series_equal(expected, s[0.1:2.5])
+        expected = ser.iloc[:3]
+        tm.assert_series_equal(expected, indexer_sl(ser)[:3])
+        tm.assert_series_equal(expected, indexer_sl(ser)[:2.5])
+        tm.assert_series_equal(expected, indexer_sl(ser)[0.1:2.5])
+        if indexer_sl is tm.loc:
+            tm.assert_series_equal(expected, ser.loc[-1:3])
 
-        expected = s.iloc[1:4]
-        tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]])
-        tm.assert_series_equal(expected, s[[2, 3, 4]])
-        tm.assert_series_equal(expected, s[[1.5, 3, 4]])
+        expected = ser.iloc[1:4]
+        tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 2.5, 3.5]])
+        tm.assert_series_equal(expected, indexer_sl(ser)[[2, 3, 4]])
+        tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 3, 4]])
 
-        expected = s.iloc[2:5]
-        tm.assert_series_equal(expected, s[s >= 2])
+        expected = ser.iloc[2:5]
+        tm.assert_series_equal(expected, indexer_sl(ser)[ser >= 2])
 
     @pytest.mark.parametrize("direction", ["increasing", "decreasing"])
-    def test_nonoverlapping_monotonic(self, direction, closed):
+    def test_nonoverlapping_monotonic(self, direction, closed, indexer_sl):
         tpls = [(0, 1), (2, 3), (4, 5)]
         if direction == "decreasing":
             tpls = tpls[::-1]
 
         idx = IntervalIndex.from_tuples(tpls, closed=closed)
-        s = Series(list("abc"), idx)
+        ser = Series(list("abc"), idx)
 
-        for key, expected in zip(idx.left, s):
+        for key, expected in zip(idx.left, ser):
             if idx.closed_left:
-                assert s[key] == expected
-                assert s.loc[key] == expected
+                assert indexer_sl(ser)[key] == expected
             else:
                 with pytest.raises(KeyError, match=str(key)):
-                    s[key]
-                with pytest.raises(KeyError, match=str(key)):
-                    s.loc[key]
+                    indexer_sl(ser)[key]
 
-        for key, expected in zip(idx.right, s):
+        for key, expected in zip(idx.right, ser):
             if idx.closed_right:
-                assert s[key] == expected
-                assert s.loc[key] == expected
+                assert indexer_sl(ser)[key] == expected
             else:
                 with pytest.raises(KeyError, match=str(key)):
-                    s[key]
-                with pytest.raises(KeyError, match=str(key)):
-                    s.loc[key]
+                    indexer_sl(ser)[key]
 
-        for key, expected in zip(idx.mid, s):
-            assert s[key] == expected
-            assert s.loc[key] == expected
+        for key, expected in zip(idx.mid, ser):
+            assert indexer_sl(ser)[key] == expected
 
-    def test_non_matching(self):
-        s = self.s
+    def test_non_matching(self, series_with_interval_index, indexer_sl):
+        ser = series_with_interval_index.copy()
 
         # this is a departure from our current
         # indexing scheme, but simpler
         with pytest.raises(KeyError, match=r"^\[-1\]$"):
-            s.loc[[-1, 3, 4, 5]]
+            indexer_sl(ser)[[-1, 3, 4, 5]]
 
         with pytest.raises(KeyError, match=r"^\[-1\]$"):
-            s.loc[[-1, 3]]
+            indexer_sl(ser)[[-1, 3]]
 
     @pytest.mark.arm_slow
     def test_large_series(self):
-        s = Series(
+        ser = Series(
             np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001))
         )
 
-        result1 = s.loc[:80000]
-        result2 = s.loc[0:80000]
-        result3 = s.loc[0:80000:1]
+        result1 = ser.loc[:80000]
+        result2 = ser.loc[0:80000]
+        result3 = ser.loc[0:80000:1]
         tm.assert_series_equal(result1, result2)
         tm.assert_series_equal(result1, result3)
 
     def test_loc_getitem_frame(self):
         # CategoricalIndex with IntervalIndex categories
         df = DataFrame({"A": range(10)})
-        s = pd.cut(df.A, 5)
-        df["B"] = s
+        ser = pd.cut(df.A, 5)
+        df["B"] = ser
         df = df.set_index("B")
 
         result = df.loc[4]
diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py
index a9512bc97d9de..8935eb94c1c49 100644
--- a/pandas/tests/indexing/interval/test_interval_new.py
+++ b/pandas/tests/indexing/interval/test_interval_new.py
@@ -8,89 +8,65 @@ class TestIntervalIndex:
-    def setup_method(self, method):
-        self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
+    @pytest.fixture
+    def series_with_interval_index(self):
+        return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
 
-    def test_loc_with_interval(self):
+    def test_loc_with_interval(self, series_with_interval_index, indexer_sl):
 
         # loc with single label / list of labels:
 
         #   - Intervals: only exact matches
         #   - scalars: those that contain it
 
-        s = self.s
+        ser = series_with_interval_index.copy()
 
         expected = 0
-        result = s.loc[Interval(0, 1)]
-        assert result == expected
-        result = s[Interval(0, 1)]
+        result = indexer_sl(ser)[Interval(0, 1)]
         assert result == expected
 
-        expected = s.iloc[3:5]
-        result = s.loc[[Interval(3, 4), Interval(4, 5)]]
-        tm.assert_series_equal(expected, result)
-        result = s[[Interval(3, 4), Interval(4, 5)]]
+        expected = ser.iloc[3:5]
+        result = indexer_sl(ser)[[Interval(3, 4), Interval(4, 5)]]
         tm.assert_series_equal(expected, result)
 
         # missing or not exact
         with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")):
-            s.loc[Interval(3, 5, closed="left")]
-
-        with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")):
-            s[Interval(3, 5, closed="left")]
-
-        with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")):
-            s[Interval(3, 5)]
+            indexer_sl(ser)[Interval(3, 5, closed="left")]
 
         with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")):
-            s.loc[Interval(3, 5)]
-
-        with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")):
-            s[Interval(3, 5)]
-
-        with pytest.raises(
-            KeyError, match=re.escape("Interval(-2, 0, closed='right')")
-        ):
-            s.loc[Interval(-2, 0)]
+            indexer_sl(ser)[Interval(3, 5)]
 
         with pytest.raises(
             KeyError, match=re.escape("Interval(-2, 0, closed='right')")
         ):
-            s[Interval(-2, 0)]
-
-        with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")):
-            s.loc[Interval(5, 6)]
+            indexer_sl(ser)[Interval(-2, 0)]
 
         with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")):
-            s[Interval(5, 6)]
+            indexer_sl(ser)[Interval(5, 6)]
 
-    def test_loc_with_scalar(self):
+    def test_loc_with_scalar(self, series_with_interval_index, indexer_sl):
 
         # loc with single label / list of labels:
 
         #   - Intervals: only exact matches
         #   - scalars: those that contain it
 
-        s = self.s
+        ser = series_with_interval_index.copy()
 
-        assert s.loc[1] == 0
-        assert s.loc[1.5] == 1
-        assert s.loc[2] == 1
+        assert indexer_sl(ser)[1] == 0
+        assert indexer_sl(ser)[1.5] == 1
+        assert indexer_sl(ser)[2] == 1
 
-        assert s[1] == 0
-        assert s[1.5] == 1
-        assert s[2] == 1
+        expected = ser.iloc[1:4]
+        tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 2.5, 3.5]])
+        tm.assert_series_equal(expected, indexer_sl(ser)[[2, 3, 4]])
+        tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 3, 4]])
 
-        expected = s.iloc[1:4]
-        tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]])
-        tm.assert_series_equal(expected, s.loc[[2, 3, 4]])
-        tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]])
+        expected = ser.iloc[[1, 1, 2, 1]]
+        tm.assert_series_equal(expected, indexer_sl(ser)[[1.5, 2, 2.5, 1.5]])
 
-        expected = s.iloc[[1, 1, 2, 1]]
-        tm.assert_series_equal(expected, s.loc[[1.5, 2, 2.5, 1.5]])
+        expected = ser.iloc[2:5]
+        tm.assert_series_equal(expected, indexer_sl(ser)[ser >= 2])
 
-        expected = s.iloc[2:5]
-        tm.assert_series_equal(expected, s.loc[s >= 2])
-
-    def test_loc_with_slices(self):
+    def test_loc_with_slices(self, series_with_interval_index, indexer_sl):
 
         # loc with slices:
 
         #   - Interval objects: only works with exact matches
@@ -99,178 +75,130 @@ def test_loc_with_slices(self):
         #     contains them:
         #       (slice_loc(start, stop) == (idx.get_loc(start), idx.get_loc(stop))
 
-        s = self.s
+        ser = series_with_interval_index.copy()
 
         # slice of interval
 
-        expected = s.iloc[:3]
-        result = s.loc[Interval(0, 1) : Interval(2, 3)]
-        tm.assert_series_equal(expected, result)
-        result = s[Interval(0, 1) : Interval(2, 3)]
+        expected = ser.iloc[:3]
+        result = indexer_sl(ser)[Interval(0, 1) : Interval(2, 3)]
         tm.assert_series_equal(expected, result)
 
-        expected = s.iloc[3:]
-        result = s.loc[Interval(3, 4) :]
-        tm.assert_series_equal(expected, result)
-        result = s[Interval(3, 4) :]
+        expected = ser.iloc[3:]
+        result = indexer_sl(ser)[Interval(3, 4) :]
         tm.assert_series_equal(expected, result)
 
         msg = "Interval objects are not currently supported"
         with pytest.raises(NotImplementedError, match=msg):
-            s.loc[Interval(3, 6) :]
+            indexer_sl(ser)[Interval(3, 6) :]
 
         with pytest.raises(NotImplementedError, match=msg):
-            s[Interval(3, 6) :]
-
-        with pytest.raises(NotImplementedError, match=msg):
-            s.loc[Interval(3, 4, closed="left") :]
-
-        with pytest.raises(NotImplementedError, match=msg):
-            s[Interval(3, 4, closed="left") :]
-
-        # slice of scalar
+            indexer_sl(ser)[Interval(3, 4, closed="left") :]
 
-        expected = s.iloc[:3]
-        tm.assert_series_equal(expected, s.loc[:3])
-        tm.assert_series_equal(expected, s.loc[:2.5])
-        tm.assert_series_equal(expected, s.loc[0.1:2.5])
-        tm.assert_series_equal(expected, s.loc[-1:3])
-
-        tm.assert_series_equal(expected, s[:3])
-        tm.assert_series_equal(expected, s[:2.5])
-        tm.assert_series_equal(expected, s[0.1:2.5])
-
-    def test_slice_step_ne1(self):
+    def test_slice_step_ne1(self, series_with_interval_index):
         # GH#31658 slice of scalar with step != 1
-        s = self.s
-        expected = s.iloc[0:4:2]
+        ser = series_with_interval_index.copy()
+        expected = ser.iloc[0:4:2]
 
-        result = s[0:4:2]
+        result = ser[0:4:2]
         tm.assert_series_equal(result, expected)
 
-        result2 = s[0:4][::2]
+        result2 = ser[0:4][::2]
         tm.assert_series_equal(result2, expected)
 
-    def test_slice_float_start_stop(self):
+    def test_slice_float_start_stop(self, series_with_interval_index):
         # GH#31658 slicing with integers is positional, with floats is not
         #  supported
-        ser = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
+        ser = series_with_interval_index.copy()
 
         msg = "label-based slicing with step!=1 is not supported for IntervalIndex"
         with pytest.raises(ValueError, match=msg):
            ser[1.5:9.5:2]
 
-    def test_slice_interval_step(self):
+    def test_slice_interval_step(self, series_with_interval_index):
         # GH#31658 allows for integer step!=1, not Interval step
-        s = self.s
+        ser = series_with_interval_index.copy()
 
         msg = "label-based slicing with step!=1 is not supported for IntervalIndex"
         with pytest.raises(ValueError, match=msg):
-            s[0 : 4 : Interval(0, 1)]
+            ser[0 : 4 : Interval(0, 1)]
 
-    def test_loc_with_overlap(self):
+    def test_loc_with_overlap(self, indexer_sl):
 
         idx = IntervalIndex.from_tuples([(1, 5), (3, 7)])
-        s = Series(range(len(idx)), index=idx)
+        ser = Series(range(len(idx)), index=idx)
 
         # scalar
-        expected = s
-        result = s.loc[4]
-        tm.assert_series_equal(expected, result)
-
-        result = s[4]
-        tm.assert_series_equal(expected, result)
-
-        result = s.loc[[4]]
+        expected = ser
+        result = indexer_sl(ser)[4]
         tm.assert_series_equal(expected, result)
 
-        result = s[[4]]
+        result = indexer_sl(ser)[[4]]
         tm.assert_series_equal(expected, result)
 
         # interval
         expected = 0
-        result = s.loc[Interval(1, 5)]
-        result == expected
+        result = indexer_sl(ser)[Interval(1, 5)]
+        assert result == expected
 
-        result = s[Interval(1, 5)]
-        result == expected
-
-        expected = s
-        result = s.loc[[Interval(1, 5), Interval(3, 7)]]
-        tm.assert_series_equal(expected, result)
-
-        result = s[[Interval(1, 5), Interval(3, 7)]]
+        expected = ser
+        result = indexer_sl(ser)[[Interval(1, 5), Interval(3, 7)]]
         tm.assert_series_equal(expected, result)
 
         with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")):
-            s.loc[Interval(3, 5)]
+            indexer_sl(ser)[Interval(3, 5)]
 
         with pytest.raises(KeyError, match=r"^\[Interval\(3, 5, closed='right'\)\]$"):
-            s.loc[[Interval(3, 5)]]
-
-        with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")):
-            s[Interval(3, 5)]
-
-        with pytest.raises(KeyError, match=r"^\[Interval\(3, 5, closed='right'\)\]$"):
-            s[[Interval(3, 5)]]
+            indexer_sl(ser)[[Interval(3, 5)]]
 
         # slices with interval (only exact matches)
-        expected = s
-        result = s.loc[Interval(1, 5) : Interval(3, 7)]
-        tm.assert_series_equal(expected, result)
-
-        result = s[Interval(1, 5) : Interval(3, 7)]
+        expected = ser
+        result = indexer_sl(ser)[Interval(1, 5) : Interval(3, 7)]
         tm.assert_series_equal(expected, result)
 
         msg = (
             "'can only get slices from an IntervalIndex if bounds are"
             " non-overlapping and all monotonic increasing or decreasing'"
         )
         with pytest.raises(KeyError, match=msg):
-            s.loc[Interval(1, 6) : Interval(3, 8)]
+            indexer_sl(ser)[Interval(1, 6) : Interval(3, 8)]
 
-        with pytest.raises(KeyError, match=msg):
-            s[Interval(1, 6) : Interval(3, 8)]
-
-        # slices with scalar raise for overlapping intervals
-        # TODO KeyError is the appropriate error?
-        with pytest.raises(KeyError, match=msg):
-            s.loc[1:4]
+        if indexer_sl is tm.loc:
+            # slices with scalar raise for overlapping intervals
+            # TODO: is KeyError the appropriate error?
+            with pytest.raises(KeyError, match=msg):
+                ser.loc[1:4]
 
-    def test_non_unique(self):
+    def test_non_unique(self, indexer_sl):
 
         idx = IntervalIndex.from_tuples([(1, 3), (3, 7)])
-        s = Series(range(len(idx)), index=idx)
+        ser = Series(range(len(idx)), index=idx)
 
-        result = s.loc[Interval(1, 3)]
+        result = indexer_sl(ser)[Interval(1, 3)]
        assert result == 0
 
-        result = s.loc[[Interval(1, 3)]]
-        expected = s.iloc[0:1]
+        result = indexer_sl(ser)[[Interval(1, 3)]]
+        expected = ser.iloc[0:1]
         tm.assert_series_equal(expected, result)
 
-    def test_non_unique_moar(self):
+    def test_non_unique_moar(self, indexer_sl):
 
         idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)])
-        s = Series(range(len(idx)), index=idx)
-
-        expected = s.iloc[[0, 1]]
-        result = s.loc[Interval(1, 3)]
-        tm.assert_series_equal(expected, result)
+        ser = Series(range(len(idx)), index=idx)
 
-        expected = s
-        result = s.loc[Interval(1, 3) :]
+        expected = ser.iloc[[0, 1]]
+        result = indexer_sl(ser)[Interval(1, 3)]
         tm.assert_series_equal(expected, result)
 
-        expected = s
-        result = s[Interval(1, 3) :]
+        expected = ser
+        result = indexer_sl(ser)[Interval(1, 3) :]
         tm.assert_series_equal(expected, result)
 
-        expected = s.iloc[[0, 1]]
-        result = s[[Interval(1, 3)]]
+        expected = ser.iloc[[0, 1]]
+        result = indexer_sl(ser)[[Interval(1, 3)]]
         tm.assert_series_equal(expected, result)
 
-    def test_missing_key_error_message(self, frame_or_series):
+    def test_missing_key_error_message(
+        self, frame_or_series, series_with_interval_index
+    ):
         # GH#27365
-        obj = frame_or_series(
-            np.arange(5), index=IntervalIndex.from_breaks(np.arange(6))
-        )
+        ser = series_with_interval_index.copy()
+        obj = frame_or_series(ser)
         with pytest.raises(KeyError, match=r"\[6\]"):
             obj.loc[[4, 5, 6]]
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 1b9b6452b2e33..3b6bc42544c51 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -322,6 +322,7 @@ def test_loc_listlike_dtypes(self):
         with pytest.raises(KeyError, match=re.escape(msg)):
             df.loc[["a", "x"]]
 
+    def test_loc_listlike_dtypes_duplicated_categories_and_codes(self):
         # duplicated categories and codes
         index = CategoricalIndex(["a", "b", "a"])
         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
@@ -341,9 +342,11 @@ def test_loc_listlike_dtypes(self):
         )
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
+        msg = "The following labels were missing: Index(['x'], dtype='object')"
         with pytest.raises(KeyError, match=re.escape(msg)):
             df.loc[["a", "x"]]
 
+    def test_loc_listlike_dtypes_unused_category(self):
         # contains unused category
         index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
         df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
@@ -363,6 +366,7 @@ def test_loc_listlike_dtypes(self):
         )
         tm.assert_frame_equal(res, exp, check_index_type=True)
 
+        msg = "The following labels were missing: Index(['x'], dtype='object')"
         with pytest.raises(KeyError, match=re.escape(msg)):
             df.loc[["a", "x"]]
 
@@ -405,6 +409,8 @@ def test_ix_categorical_index(self):
         expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
         tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
 
+    def test_ix_categorical_index_non_unique(self):
+
         # non-unique
         df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX"))
         cdf = df.copy()
diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
index 1ac2a16660f93..25d4692e4cd1d 100644
--- a/pandas/tests/indexing/test_chaining_and_caching.py
+++ b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -373,6 +373,7 @@ def test_setting_with_copy_bug(self):
         with pytest.raises(com.SettingWithCopyError, match=msg):
             df[["c"]][mask] = df[["b"]][mask]
 
+    def test_setting_with_copy_bug_no_warning(self):
         # invalid warning as we are returning a new object
         # GH 8730
         df1 = DataFrame({"x": Series(["a", "b", "c"]), "y": Series(["d", "e", "f"])})
diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py
index 44a5e2ae6d9e9..9f58f4af0ba55 100644
--- a/pandas/tests/indexing/test_datetime.py
+++ b/pandas/tests/indexing/test_datetime.py
@@ -37,6 +37,7 @@ def test_indexing_with_datetime_tz(self):
         )
         tm.assert_series_equal(result, expected)
 
+    def test_indexing_fast_xs(self):
         # indexing - fast_xs
         df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")})
         result = df.iloc[5]
@@ -53,6 +54,7 @@ def test_indexing_with_datetime_tz(self):
         expected = df.iloc[4:]
         tm.assert_frame_equal(result, expected)
 
+    def test_setitem_with_expansion(self):
         # indexing - setting an element
         df = DataFrame(
             data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]),
@@ -234,21 +236,23 @@ def test_loc_setitem_with_existing_dst(self):
 
     def test_getitem_millisecond_resolution(self, frame_or_series):
         # GH#33589
+
+        keys = [
+            "2017-10-25T16:25:04.151",
+            "2017-10-25T16:25:04.252",
+            "2017-10-25T16:50:05.237",
+            "2017-10-25T16:50:05.238",
+        ]
         obj = frame_or_series(
             [1, 2, 3, 4],
-            index=[
-                Timestamp("2017-10-25T16:25:04.151"),
-                Timestamp("2017-10-25T16:25:04.252"),
-                Timestamp("2017-10-25T16:50:05.237"),
-                Timestamp("2017-10-25T16:50:05.238"),
-            ],
+            index=[Timestamp(x) for x in keys],
         )
-        result = obj["2017-10-25T16:25:04.252":"2017-10-25T16:50:05.237"]
+        result = obj[keys[1] : keys[2]]
         expected = frame_or_series(
             [2, 3],
             index=[
-                Timestamp("2017-10-25T16:25:04.252"),
-                Timestamp("2017-10-25T16:50:05.237"),
+                Timestamp(keys[1]),
+                Timestamp(keys[2]),
             ],
         )
         tm.assert_equal(result, expected)
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index dcd073681cecf..63313589d64f7 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -125,6 +125,7 @@ def test_inf_upcast(self):
         expected = pd.Float64Index([1, 2, np.inf])
         tm.assert_index_equal(result, expected)
 
+    def test_inf_upcast_empty(self):
         # Test with np.inf in columns
         df = DataFrame()
         df.loc[0, 0] = 1
@@ -148,6 +149,9 @@ def test_setitem_dtype_upcast(self):
         )
         tm.assert_frame_equal(df, expected)
 
+    @pytest.mark.parametrize("val", [3.14, "wxyz"])
+    def test_setitem_dtype_upcast2(self, val):
+        # GH10280
         df = DataFrame(
             np.arange(6, dtype="int64").reshape(2, 3),
@@ -155,19 +159,19 @@ def test_setitem_dtype_upcast(self):
             columns=["foo", "bar", "baz"],
         )
 
-        for val in [3.14, "wxyz"]:
-            left = df.copy()
-            left.loc["a", "bar"] = val
-            right = DataFrame(
-                [[0, val, 2], [3, 4, 5]],
-                index=list("ab"),
-                columns=["foo", "bar", "baz"],
-            )
+        left = df.copy()
+        left.loc["a", "bar"] = val
+        right = DataFrame(
+            [[0, val, 2], [3, 4, 5]],
+            index=list("ab"),
+            columns=["foo", "bar", "baz"],
+        )
 
-            tm.assert_frame_equal(left, right)
-            assert is_integer_dtype(left["foo"])
-            assert is_integer_dtype(left["baz"])
+        tm.assert_frame_equal(left, right)
+        assert is_integer_dtype(left["foo"])
+        assert is_integer_dtype(left["baz"])
 
+    def test_setitem_dtype_upcast3(self):
         left = DataFrame(
             np.arange(6, dtype="int64").reshape(2, 3) / 10.0,
             index=list("ab"),
@@ -195,6 +199,8 @@ def test_dups_fancy_indexing(self):
         expected = Index(["b", "a", "a"])
         tm.assert_index_equal(result, expected)
 
+    def test_dups_fancy_indexing_across_dtypes(self):
+
         # across dtypes
         df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa"))
         df.head()
@@ -208,6 +214,7 @@ def test_dups_fancy_indexing(self):
 
         tm.assert_frame_equal(df, result)
 
+    def test_dups_fancy_indexing_not_in_order(self):
         # GH 3561, dups not in selected order
         df = DataFrame(
             {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
@@ -232,6 +239,8 @@ def test_dups_fancy_indexing(self):
         with pytest.raises(KeyError, match="with any missing labels"):
             df.loc[rows]
 
+    def test_dups_fancy_indexing_only_missing_label(self):
+
         # List containing only missing label
         dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
         with pytest.raises(
@@ -244,6 +253,8 @@ def test_dups_fancy_indexing(self):
 
         # ToDo: check_index_type can be True after GH 11497
 
+    def test_dups_fancy_indexing_missing_label(self):
+
         # GH 4619; duplicate indexer with missing label
         df = DataFrame({"A": [0, 1, 2]})
         with pytest.raises(KeyError, match="with any missing labels"):
@@ -253,6 +264,8 @@ def test_dups_fancy_indexing(self):
         with pytest.raises(KeyError, match="with any missing labels"):
             df.loc[[0, 8, 0]]
 
+    def test_dups_fancy_indexing_non_unique(self):
+
         # non unique with non unique selector
         df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
         with pytest.raises(KeyError, match="with any missing labels"):
@@ -447,6 +460,7 @@ def test_multi_assign(self):
         df2.loc[mask, cols] = dft.loc[mask, cols].values
         tm.assert_frame_equal(df2, expected)
 
+    def test_multi_assign_broadcasting_rhs(self):
         # broadcasting on the rhs is required
         df = DataFrame(
             {
@@ -781,14 +795,16 @@ def test_non_reducing_slice(self, slc):
         tslice_ = non_reducing_slice(slc)
         assert isinstance(df.loc[tslice_], DataFrame)
 
-    def test_list_slice(self):
+    @pytest.mark.parametrize("box", [list, Series, np.array])
+    def test_list_slice(self, box):
         # like dataframe getitem
-        slices = [["A"], Series(["A"]), np.array(["A"])]
+        subset = box(["A"])
+
         df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"])
         expected = pd.IndexSlice[:, ["A"]]
-        for subset in slices:
-            result = non_reducing_slice(subset)
-            tm.assert_frame_equal(df.loc[result], df.loc[expected])
+
+        result = non_reducing_slice(subset)
+        tm.assert_frame_equal(df.loc[result], df.loc[expected])
 
     def test_maybe_numeric_slice(self):
         df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]})
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 0a50ef2831534..3203b7fa1893d 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -1233,8 +1233,8 @@ def check_frame_setitem(self, elem, index: Index, inplace: bool):
 
         if inplace:
             # assertion here implies setting was done inplace
-            # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]"
-            # has no attribute "blocks"  [union-attr]
+            # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no
+            # attribute "blocks"
             assert df._mgr.blocks[0].values is arr  # type:ignore[union-attr]
         else:
             assert df.dtypes[0] == object
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
index 0962b719efd4d..8128e958141e2 100644
--- a/pandas/tests/io/excel/test_openpyxl.py
+++ b/pandas/tests/io/excel/test_openpyxl.py
@@ -1,11 +1,8 @@
-from distutils.version import LooseVersion
 from pathlib import Path
 
 import numpy as np
 import pytest
 
-from pandas.compat._optional import get_version
-
 import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
@@ -157,10 +154,6 @@ def test_read_with_bad_dimension(
     datapath, ext, header, expected_data, filename, read_only, request
 ):
     # GH 38956, 39001 - no/incorrect dimension information
-    version = LooseVersion(get_version(openpyxl))
-    if (read_only or read_only is None) and version < "3.0.0":
-        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
    path = datapath("io", "data", "excel", f"{filename}{ext}")
    if read_only is None:
        result = pd.read_excel(path, header=header)
@@ -195,10 +188,6 @@ def test_append_mode_file(ext):
 @pytest.mark.parametrize("read_only", [True, False, None])
 def test_read_with_empty_trailing_rows(datapath, ext, read_only, request):
     # GH 39181
-    version = LooseVersion(get_version(openpyxl))
-    if (read_only or read_only is None) and version < "3.0.0":
-        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
     path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
     if read_only is None:
         result = pd.read_excel(path)
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index 0c61a8a18e153..0aebcda83993d 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -1305,6 +1305,15 @@ def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path):
         with pytest.raises(ValueError, match="Excel does not support"):
             df.to_excel(path)
 
+    def test_excel_duplicate_columns_with_names(self, path):
+        # GH#39695
+        df = DataFrame({"A": [0, 1], "B": [10, 11]})
+        df.to_excel(path, columns=["A", "B", "A"], index=False)
+
+        result = pd.read_excel(path)
+        expected = DataFrame([[0, 10, 0], [1, 11, 1]], columns=["A", "B", "A.1"])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestExcelWriterEngineTests:
     @pytest.mark.parametrize(
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index d97410562083c..e047317acd24d 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -559,7 +559,7 @@ def test_dt64_series_assign_nat(nat_val, tz, indexer_sli):
     base = Series(dti)
     expected = Series([pd.NaT] + list(dti[1:]), dtype=dti.dtype)
 
-    should_cast = nat_val is pd.NaT or base.dtype.kind == nat_val.dtype.kind
+    should_cast = nat_val is pd.NaT or base.dtype == nat_val.dtype
     if not should_cast:
         expected = expected.astype(object)
 
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
index 3a9ec0948b29a..36948c3dc05f3 100644
--- a/pandas/tests/series/indexing/test_setitem.py
+++ b/pandas/tests/series/indexing/test_setitem.py
@@ -671,6 +671,43 @@ def key(self):
         return 0
 
 
+class TestSetitemNADatetime64Dtype(SetitemCastingEquivalents):
+    # some nat-like values should be cast to datetime64 when inserting
+    # into a datetime64 series.  Others should coerce to object
+    # and retain their dtypes.
+
+    @pytest.fixture(params=[None, "UTC", "US/Central"])
+    def obj(self, request):
+        tz = request.param
+        dti = date_range("2016-01-01", periods=3, tz=tz)
+        return Series(dti)
+
+    @pytest.fixture(
+        params=[NaT, np.timedelta64("NaT", "ns"), np.datetime64("NaT", "ns")]
+    )
+    def val(self, request):
+        return request.param
+
+    @pytest.fixture
+    def is_inplace(self, val, obj):
+        if obj._values.tz is None:
+            # cast to object iff val is timedelta64("NaT")
+            return val is NaT or val.dtype.kind == "M"
+
+        # otherwise we have to exclude tznaive dt64("NaT")
+        return val is NaT
+
+    @pytest.fixture
+    def expected(self, obj, val, is_inplace):
+        dtype = obj.dtype if is_inplace else object
+        expected = Series([val] + list(obj[1:]), dtype=dtype)
+        return expected
+
+    @pytest.fixture
+    def key(self):
+        return 0
+
+
 class TestSetitemMismatchedTZCastsToObject(SetitemCastingEquivalents):
     # GH#24024
     @pytest.fixture
diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py
index 0bcb37d4880a6..7e33b766a1413 100644
--- a/pandas/tests/series/methods/test_fillna.py
+++ b/pandas/tests/series/methods/test_fillna.py
@@ -204,8 +204,9 @@ def test_timedelta_fillna(self, frame_or_series):
         expected = frame_or_series(expected)
         tm.assert_equal(result, expected)
 
-        # interpreted as seconds, deprecated
-        with pytest.raises(TypeError, match="Passing integers to fillna"):
+        # interpreted as seconds, no longer supported
+        msg = "value should be a 'Timedelta', 'NaT', or array of those. Got 'int'"
+        with pytest.raises(TypeError, match=msg):
             obj.fillna(1)
 
         result = obj.fillna(Timedelta(seconds=1))
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
index a13fb1ce57f6c..268c636ab9353 100644
--- a/pandas/util/_decorators.py
+++ b/pandas/util/_decorators.py
@@ -78,8 +78,8 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]:
         {dedent(doc)}"""
         )
 
-    # error: Incompatible return value type (got "Callable[[VarArg(Any),
-    # KwArg(Any)], Callable[...,Any]]", expected "Callable[[F], F]")
+    # error: Incompatible return value type (got "Callable[[VarArg(Any), KwArg(Any)],
+    # Callable[...,Any]]", expected "Callable[[F], F]")
     return wrapper  # type: ignore[return-value]
 
 
@@ -362,10 +362,10 @@ def decorator(decorated: F) -> F:
 
     for docstring in docstrings:
         if hasattr(docstring, "_docstring_components"):
-            # error: Item "str" of "Union[str, Callable[..., Any]]" has no
-            # attribute "_docstring_components"  [union-attr]
-            # error: Item "function" of "Union[str, Callable[..., Any]]"
-            # has no attribute "_docstring_components"  [union-attr]
+            # error: Item "str" of "Union[str, Callable[..., Any]]" has no attribute
+            # "_docstring_components"
+            # error: Item "function" of "Union[str, Callable[..., Any]]" has no
+            # attribute "_docstring_components"
             docstring_components.extend(
                 docstring._docstring_components  # type: ignore[union-attr]
             )
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
index d521f2ee421be..f991b16fea192 100755
--- a/scripts/validate_rst_title_capitalization.py
+++ b/scripts/validate_rst_title_capitalization.py
@@ -1,17 +1,17 @@
-#!/usr/bin/env python3
 """
 Validate that the titles in the rst files follow the proper capitalization convention.
 
 Print the titles that do not follow the convention.
 
 Usage::
 
-./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
-./scripts/validate_rst_title_capitalization.py doc/source/
+As pre-commit hook (recommended):
+    pre-commit run title-capitalization --all-files
+
+From the command-line:
+    python scripts/validate_rst_title_capitalization.py <rst file>
 """
 import argparse
-import glob
-import os
 import re
 import sys
 from typing import Iterable, List, Tuple
@@ -233,36 +233,7 @@ def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]:
         previous_line = line
 
 
-def find_rst_files(source_paths: List[str]) -> Iterable[str]:
-    """
-    Given the command line arguments of directory paths, this method
-    yields the strings of the .rst file directories that these paths contain.
-
-    Parameters
-    ----------
-    source_paths : str
-        List of directories to validate, provided through command line arguments.
-
-    Yields
-    -------
-    str
-        Directory address of a .rst files found in command line argument directories.
-    """
-
-    for directory_address in source_paths:
-        if not os.path.exists(directory_address):
-            raise ValueError(
-                "Please enter a valid path, pointing to a valid file/directory."
-            )
-        elif directory_address.endswith(".rst"):
-            yield directory_address
-        else:
-            yield from glob.glob(
-                pathname=f"{directory_address}/**/*.rst", recursive=True
-            )
-
-
-def main(source_paths: List[str], output_format: str) -> int:
+def main(source_paths: List[str]) -> int:
     """
     The main method to print all headings with incorrect capitalization.
 
@@ -270,8 +241,6 @@ def main(source_paths: List[str], output_format: str) -> int:
     ----------
     source_paths : str
         List of directories to validate, provided through command line arguments.
-    output_format : str
-        Output format of the script.
 
     Returns
     -------
@@ -281,7 +250,7 @@ def main(source_paths: List[str], output_format: str) -> int:
 
     number_of_errors: int = 0
 
-    for filename in find_rst_files(source_paths):
+    for filename in source_paths:
         for title, line_number in find_titles(filename):
             if title != correct_title_capitalization(title):
                 print(
@@ -297,16 +266,9 @@ def main(source_paths: List[str], output_format: str) -> int:
 
     parser = argparse.ArgumentParser(description="Validate heading capitalization")
 
     parser.add_argument(
-        "paths", nargs="+", default=".", help="Source paths of file/directory to check."
-    )
-
-    parser.add_argument(
-        "--format",
-        "-f",
-        default="{source_path}:{line_number}:{msg}:{heading}:{correct_heading}",
-        help="Output format of incorrectly capitalized titles",
+        "paths", nargs="*", help="Source paths of the .rst files to check."
     )
 
     args = parser.parse_args()
 
-    sys.exit(main(args.paths, args.format))
+    sys.exit(main(args.paths))
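
Note: the indexing-test changes above lean on a shared `indexer_sl` fixture instead of spelling out each assertion twice, once for `ser[key]` and once for `ser.loc[key]`. The fixture itself lives in pandas/conftest.py and is not part of this diff; the sketch below is only a best guess at its shape (the exact parametrization is an assumption) to illustrate the pattern the refactored tests rely on.

import pytest

import pandas._testing as tm


@pytest.fixture(params=[tm.setitem, tm.loc])
def indexer_sl(request):
    # tm.setitem and tm.loc are small wrapper classes in pandas._testing:
    # tm.setitem(obj)[key] forwards to obj[key], while tm.loc(obj)[key]
    # forwards to obj.loc[key], so a test body written once as
    # indexer_sl(ser)[key] runs under both indexing styles.
    return request.param

With such a fixture in place, a single assertion like `assert indexer_sl(ser)[key] == expected` covers both `ser[key]` and `ser.loc[key]`, which is why roughly half of each rewritten test body could be deleted in this diff.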