diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f3b005b704014..f4a6ed5f26c89 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -118,12 +118,29 @@ def setup(self): self.a = pd.Categorical(list("aabbcd") * N) self.b = pd.Categorical(list("bbcdjk") * N) + self.idx_a = pd.CategoricalIndex(range(N), range(N)) + self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1)) + self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a) + self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b) + def time_concat(self): pd.concat([self.s, self.s]) def time_union(self): union_categoricals([self.a, self.b]) + def time_append_overlapping_index(self): + self.idx_a.append(self.idx_a) + + def time_append_non_overlapping_index(self): + self.idx_a.append(self.idx_b) + + def time_concat_overlapping_index(self): + pd.concat([self.df_a, self.df_a]) + + def time_concat_non_overlapping_index(self): + pd.concat([self.df_a, self.df_b]) + class ValueCounts: diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index 5227ad0f53a04..3743882b936e2 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent): self.s.isin(self.values_outside) +class UniqueForLargePyObjectInts: + def setup(self): + lst = [x << 32 for x in range(5000)] + self.arr = np.array(lst, dtype=np.object_) + + def time_unique(self): + pd.unique(self.arr) + + class IsinWithRandomFloat: params = [ [np.float64, np.object], diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index d4219296f5795..9a8a95bec66ad 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -329,21 +329,11 @@ Each data structure has several *constructor properties* for returning a new data structure as the result of an operation. By overriding these properties, you can retain subclasses through ``pandas`` data manipulations. -There are 3 constructor properties to be defined: +There are 3 possible constructor properties to be defined on a subclass: -* ``_constructor``: Used when a manipulation result has the same dimensions as the original. -* ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -* ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``. - -Following table shows how ``pandas`` data structures define constructor properties by default. - -=========================== ======================= ============= -Property Attributes ``Series`` ``DataFrame`` -=========================== ======================= ============= -``_constructor`` ``Series`` ``DataFrame`` -``_constructor_sliced`` ``NotImplementedError`` ``Series`` -``_constructor_expanddim`` ``DataFrame`` ``NotImplementedError`` -=========================== ======================= ============= +* ``DataFrame/Series._constructor``: Used when a manipulation result has the same dimension as the original. +* ``DataFrame._constructor_sliced``: Used when a ``DataFrame`` (sub-)class manipulation result should be a ``Series`` (sub-)class. +* ``Series._constructor_expanddim``: Used when a ``Series`` (sub-)class manipulation result should be a ``DataFrame`` (sub-)class, e.g. ``Series.to_frame()``. 
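A minimal sketch of how a subclass typically wires up these hooks (``SubclassedSeries``/``SubclassedDataFrame`` are illustrative names here, matching the fuller example that the next paragraph introduces)::

    import pandas as pd

    class SubclassedSeries(pd.Series):
        @property
        def _constructor(self):
            # e.g. ser + 1 stays a SubclassedSeries
            return SubclassedSeries

        @property
        def _constructor_expanddim(self):
            # e.g. ser.to_frame() becomes a SubclassedDataFrame
            return SubclassedDataFrame

    class SubclassedDataFrame(pd.DataFrame):
        @property
        def _constructor(self):
            # e.g. df.copy() stays a SubclassedDataFrame
            return SubclassedDataFrame

        @property
        def _constructor_sliced(self):
            # e.g. df["a"] becomes a SubclassedSeries
            return SubclassedSeries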
Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 974f84d3b244a..8ea0d72356acf 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -17,11 +17,14 @@ Fixed regressions - Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`) - Fixed regression in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`) -- Fixed regression in :class:`DataFrame.astype` and :class:`Series.astype` not casting to bytes dtype (:issue:`39474`) +- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`) - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) +- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`) +- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`) - Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`) +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 17d8c79994dbe..75bca020fd78f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -219,7 +219,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. -- +- Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - .. --------------------------------------------------------------------------- @@ -253,6 +253,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) - Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) +- Performance improvement in :func:`unique` for object data type (:issue:`37615`) .. 
---------------------------------------------------------------------------
@@ -304,6 +305,7 @@ Numeric
 - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
 - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
 - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
+- Bug in :func:`select_dtypes` behaving differently between Windows and Linux with ``include="int"`` (:issue:`36569`)
 -
 
 Conversion
@@ -338,7 +340,7 @@ Indexing
 - Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`)
 - Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`)
 - Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing with existing strings an :class:`Index` with milliseconds (:issue:`33589`)
-- Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`)
+- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, :issue:`39619`)
 - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`)
 - Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrect casting the datetime64 values to integers (:issue:`39266`)
 - Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`)
@@ -445,10 +447,11 @@ Other
 - Bug in :class:`Index` constructor sometimes silently ignorning a specified ``dtype`` (:issue:`38879`)
 - Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
 - Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
+- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
 - :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
 - Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
 - Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`)
--
+- :class:`Styler` rendered HTML output now has minor alterations to support the w3 good code standard (:issue:`39626`)
 -

..
--------------------------------------------------------------------------- diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index e1ea1fbf9bd46..e5026ce2fa292 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -34,10 +34,14 @@ cdef class {{name}}Engine(IndexEngine): cdef _make_hash_table(self, Py_ssize_t n): return _hash.{{name}}HashTable(n) - {{if name not in {'Float64', 'Float32'} }} cdef _check_type(self, object val): + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) + {{else}} + if util.is_bool_object(val): + # avoid casting to True -> 1.0 + raise KeyError(val) {{endif}} cdef void _call_map_locations(self, values): diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 0073aaf0195c7..aee018262e3a6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { return result; } -// For PyObject_Hash holds: -// hash(0.0) == 0 == hash(-0.0) -// hash(X) == 0 if X is a NaN-value -// so it is OK to use it directly -#define kh_python_hash_func(key) (PyObject_Hash(key)) + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ + // For PyObject_Hash holds: + // hash(0.0) == 0 == hash(-0.0) + // hash(X) == 0 if X is a NaN-value + // so it is OK to use it directly for doubles + Py_hash_t hash = PyObject_Hash(key); + if (hash == -1) { + PyErr_Clear(); + return 0; + } + #if SIZEOF_PY_HASH_T == 4 + // it is already 32bit value + return hash; + #else + // for 64bit builds, + // we need information of the upper 32bits as well + // see GH 37615 + khuint64_t as_uint = (khuint64_t) hash; + // uints avoid undefined behavior of signed ints + return (as_uint>>32)^as_uint; + #endif +} + + #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index e3f159346cd51..4dbce8f75898f 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -457,7 +457,7 @@ def transform( # Functions that transform may return empty Series/DataFrame # when the dtype is not appropriate - if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: + if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty: raise ValueError("Transform function failed") if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( obj.index diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2b0d3f5aa8862..cdbef673643e8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -324,7 +324,8 @@ def unique(values): Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. - Significantly faster than numpy.unique. Includes NA values. + Significantly faster than numpy.unique for long enough sequences. + Includes NA values. 
Parameters ---------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 533190e692891..828b460f84ec6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -63,7 +63,7 @@ def frame_apply( raw: bool = False, result_type: Optional[str] = None, args=None, - kwds=None, + kwargs=None, ) -> FrameApply: """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -79,7 +79,7 @@ def frame_apply( raw=raw, result_type=result_type, args=args, - kwds=kwds, + kwargs=kwargs, ) @@ -88,14 +88,14 @@ def series_apply( func: AggFuncType, convert_dtype: bool = True, args=None, - kwds=None, + kwargs=None, ) -> SeriesApply: return SeriesApply( obj, func, convert_dtype, args, - kwds, + kwargs, ) @@ -109,12 +109,12 @@ def __init__( raw: bool, result_type: Optional[str], args, - kwds, + kwargs, ): self.obj = obj self.raw = raw self.args = args or () - self.kwds = kwds or {} + self.kwargs = kwargs or {} if result_type not in [None, "reduce", "broadcast", "expand"]: raise ValueError( @@ -126,13 +126,13 @@ def __init__( # curry if needed if ( - (kwds or args) + (kwargs or args) and not isinstance(func, (np.ufunc, str)) and not is_list_like(func) ): def f(x): - return func(x, *args, **kwds) + return func(x, *args, **kwargs) else: f = func @@ -163,7 +163,7 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]: obj = self.obj arg = self.f args = self.args - kwargs = self.kwds + kwargs = self.kwargs _axis = kwargs.pop("_axis", None) if _axis is None: @@ -413,10 +413,10 @@ def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]: if callable(func): sig = inspect.getfullargspec(func) if "axis" in sig.args: - self.kwds["axis"] = self.axis + self.kwargs["axis"] = self.axis elif self.axis != 0: raise ValueError(f"Operation {f} does not support axis=1") - return self.obj._try_aggregate_string_function(f, *self.args, **self.kwds) + return self.obj._try_aggregate_string_function(f, *self.args, **self.kwargs) def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: """ @@ -430,7 +430,7 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: # Note: dict-likes are list-like if not is_list_like(self.f): return None - return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwds) + return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) class FrameApply(Apply): @@ -806,7 +806,7 @@ def __init__( func: AggFuncType, convert_dtype: bool, args, - kwds, + kwargs, ): self.convert_dtype = convert_dtype @@ -816,7 +816,7 @@ def __init__( raw=False, result_type=None, args=args, - kwds=kwds, + kwargs=kwargs, ) def apply(self) -> FrameOrSeriesUnion: @@ -877,17 +877,17 @@ def __init__( obj: Union[SeriesGroupBy, DataFrameGroupBy], func: AggFuncType, args, - kwds, + kwargs, ): - kwds = kwds.copy() - self.axis = obj.obj._get_axis_number(kwds.get("axis", 0)) + kwargs = kwargs.copy() + self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0)) super().__init__( obj, func, raw=False, result_type=None, args=args, - kwds=kwds, + kwargs=kwargs, ) def apply(self): @@ -903,7 +903,7 @@ def __init__( obj: Union[Resampler, BaseWindow], func: AggFuncType, args, - kwds, + kwargs, ): super().__init__( obj, @@ -911,7 +911,7 @@ def __init__( raw=False, result_type=None, args=args, - kwds=kwds, + kwargs=kwargs, ) def apply(self): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 640c8d66807ad..8aa3d7900e8e9 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -588,9 +588,13 @@ def 
_try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo
     Otherwise an object array is returned.
     """
     # perf shortcut as this is the most common case
-    if isinstance(arr, np.ndarray):
-        if maybe_castable(arr) and not copy and dtype is None:
-            return arr
+    if (
+        isinstance(arr, np.ndarray)
+        and maybe_castable(arr.dtype)
+        and not copy
+        and dtype is None
+    ):
+        return arr
 
     if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)):
         # create an extension array from its dtype
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 1cb592f18dd2c..ed36beb80986e 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1331,20 +1331,18 @@ def convert_dtypes(
     return inferred_dtype
 
 
-def maybe_castable(arr: np.ndarray) -> bool:
+def maybe_castable(dtype: np.dtype) -> bool:
     # return False to force a non-fastpath
 
-    assert isinstance(arr, np.ndarray)  # GH 37024
-
     # check datetime64[ns]/timedelta64[ns] are valid
     # otherwise try to coerce
-    kind = arr.dtype.kind
+    kind = dtype.kind
     if kind == "M":
-        return is_datetime64_ns_dtype(arr.dtype)
+        return is_datetime64_ns_dtype(dtype)
     elif kind == "m":
-        return is_timedelta64_ns_dtype(arr.dtype)
+        return is_timedelta64_ns_dtype(dtype)
 
-    return arr.dtype.name not in POSSIBLY_CAST_DTYPES
+    return dtype.name not in POSSIBLY_CAST_DTYPES
 
 
 def maybe_infer_to_datetimelike(
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 6357b8feb348b..f3f899f9fd90a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -490,23 +490,14 @@ class DataFrame(NDFrame, OpsMixin):
     _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
     _typ = "dataframe"
     _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
+    _accessors: Set[str] = {"sparse"}
+    _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([])
 
     @property
     def _constructor(self) -> Type[DataFrame]:
         return DataFrame
 
     _constructor_sliced: Type[Series] = Series
-    _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([])
-    _accessors: Set[str] = {"sparse"}
-
-    @property
-    def _constructor_expanddim(self):
-        # GH#31549 raising NotImplementedError on a property causes trouble
-        # for `inspect`
-        def constructor(*args, **kwargs):
-            raise NotImplementedError("Not supported for DataFrames!")
-
-        return constructor
 
     # ----------------------------------------------------------------------
     # Constructors
@@ -3786,8 +3777,21 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
             raise ValueError("at least one of include or exclude must be nonempty")
 
         # convert the myriad valid dtypes object to a single representation
-        include = frozenset(infer_dtype_from_object(x) for x in include)
-        exclude = frozenset(infer_dtype_from_object(x) for x in exclude)
+        def check_int_infer_dtype(dtypes):
+            converted_dtypes = []
+            for dtype in dtypes:
+                # Numpy maps int to different types (int32, int64) on Windows and Linux
+                # see https://github.com/numpy/numpy/issues/9464
+                if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
+                    converted_dtypes.append(np.int32)
+                    converted_dtypes.append(np.int64)
+                else:
+                    converted_dtypes.append(infer_dtype_from_object(dtype))
+            return frozenset(converted_dtypes)
+
+        include = check_int_infer_dtype(include)
+        exclude = check_int_infer_dtype(exclude)
+
         for dtypes in (include, exclude):
             invalidate_string_dtypes(dtypes)
@@ -7718,7 +7722,7 @@ def _aggregate(self, arg, axis: Axis = 0, *args, **kwargs):
             func=arg,
             axis=0,
             args=args,
-            kwds=kwargs,
+
kwargs=kwargs, ) result, how = op.agg() @@ -7750,7 +7754,7 @@ def apply( raw: bool = False, result_type=None, args=(), - **kwds, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -7798,7 +7802,7 @@ def apply( args : tuple Positional arguments to pass to `func` in addition to the array/series. - **kwds + **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -7892,7 +7896,7 @@ def apply( raw=raw, result_type=result_type, args=args, - kwds=kwds, + kwargs=kwargs, ) return op.apply() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 96b35f1aaab9c..e1271cfec2bde 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -375,22 +375,6 @@ def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]: """ raise AbstractMethodError(self) - @property - def _constructor_sliced(self): - """ - Used when a manipulation result has one lower dimension(s) as the - original, such as DataFrame single columns slicing. - """ - raise AbstractMethodError(self) - - @property - def _constructor_expanddim(self): - """ - Used when a manipulation result has one higher dimension as the - original, such as Series.to_frame() - """ - raise NotImplementedError - # ---------------------------------------------------------------------- # Internals @@ -8891,32 +8875,11 @@ def _where( if isinstance(other, (np.ndarray, ExtensionArray)): if other.shape != self.shape: - - if self.ndim == 1: - - icond = cond._values - - # GH 2745 / GH 4192 - # treat like a scalar - if len(other) == 1: - other = other[0] - - # GH 3235 - # match True cond to other - elif len(cond[icond]) == len(other): - - # try to not change dtype at first - new_other = self._values - new_other = new_other.copy() - new_other[icond] = other - other = new_other - - else: - raise ValueError( - "Length of replacements must equal series length" - ) - - else: + if self.ndim != 1: + # In the ndim == 1 case we may have + # other length 1, which we treat as scalar (GH#2745, GH#4192) + # or len(other) == icond.sum(), which we treat like + # __setitem__ (GH#3235) raise ValueError( "other must be the same shape as self when an ndarray" ) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 12698efa86b28..7b6eb4c8fe2f9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -983,7 +983,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # try to treat as if we are passing a list try: result, _ = GroupByApply( - self, [func], args=(), kwds={"_axis": self.axis} + self, [func], args=(), kwargs={"_axis": self.axis} ).agg() # select everything except for the last level, which is the one diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f2fd5ca9c62c7..5c1fabd67bc8d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4550,11 +4550,16 @@ def putmask(self, mask, value): return self.astype(dtype).putmask(mask, value) values = self._values.copy() - if isinstance(converted, np.timedelta64) and self.dtype == object: + dtype, _ = infer_dtype_from(converted, pandas_dtype=True) + if dtype.kind in ["m", "M"]: # https://github.com/numpy/numpy/issues/12550 # timedelta64 will incorrectly cast to int - converted = [converted] * mask.sum() - values[mask] = converted + if not is_list_like(converted): + converted = [converted] * mask.sum() + values[mask] = converted + else: + converted = list(converted) + np.putmask(values, mask, converted) else: np.putmask(values, mask, 
converted) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 777fc1c7c4ad2..d6427aed6edf3 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,4 +1,4 @@ -from typing import Any, Hashable, Optional +from typing import Hashable, Optional import warnings import numpy as np @@ -9,7 +9,6 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - is_bool, is_dtype_equal, is_extension_array_dtype, is_float, @@ -336,13 +335,6 @@ def _convert_slice_indexer(self, key: slice, kind: str): # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) - @doc(Index.get_loc) - def get_loc(self, key, method=None, tolerance=None): - if is_bool(key): - # Catch this to avoid accidentally casting to 1.0 - raise KeyError(key) - return super().get_loc(key, method=method, tolerance=tolerance) - # ---------------------------------------------------------------- def _format_native_types( @@ -359,10 +351,3 @@ def _format_native_types( fixed_width=False, ) return formatter.get_result_as_array() - - def __contains__(self, other: Any) -> bool: - hash(other) - if super().__contains__(other): - return True - - return is_float(other) and np.isnan(other) and self.hasnans diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9314666acdaad..75814cb2bae3d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1031,7 +1031,8 @@ def putmask(self, mask, new) -> List[Block]: elif not mask.any(): return [self] - elif isinstance(new, np.timedelta64): + dtype, _ = infer_dtype_from(new) + if dtype.kind in ["m", "M"]: # using putmask with object dtype will incorrect cast to object # Having excluded self._can_hold_element, we know we cannot operate # in-place, so we are safe using `where` @@ -1317,10 +1318,15 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: blocks = block.where(orig_other, cond, errors=errors, axis=axis) return self._maybe_downcast(blocks, "infer") - elif isinstance(other, np.timedelta64): - # expressions.where will cast np.timedelta64 to int - result = self.values.copy() - result[~cond] = [other] * (~cond).sum() + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + if dtype.kind in ["m", "M"] and dtype.kind != values.dtype.kind: + # expressions.where would cast np.timedelta64 to int + if not is_list_like(other): + other = [other] * (~cond).sum() + else: + other = list(other) + result = values.copy() + np.putmask(result, ~cond, other) else: # convert datetime to datetime64, timedelta to timedelta64 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 965de2e04bf40..34b7838d2280c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -301,7 +301,7 @@ def pipe( def aggregate(self, func, *args, **kwargs): self._set_binner() - result, how = ResamplerWindowApply(self, func, args=args, kwds=kwargs).agg() + result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: how = func grouper = None diff --git a/pandas/core/series.py b/pandas/core/series.py index 8bd325beede65..559b27aeb7e50 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -180,8 +180,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to RangeIndex (0, 1, 2, ..., n) if not provided. 
If data is dict-like
-        and index is None, then the values in the index are used to
-        reindex the Series after it is created using the keys in the data.
+        and index is None, then the keys in the data are used as the index. If the
+        index is not None, the resulting Series is reindexed with the index values.
     dtype : str, numpy.dtype, or ExtensionDtype, optional
         Data type for the output Series. If not specified, this will be
         inferred from `data`.
@@ -190,6 +190,33 @@
         The name to give to the Series.
     copy : bool, default False
         Copy input data.
+
+    Examples
+    --------
+    Constructing Series from a dictionary with an Index specified
+
+    >>> d = {'a': 1, 'b': 2, 'c': 3}
+    >>> ser = pd.Series(data=d, index=['a', 'b', 'c'])
+    >>> ser
+    a    1
+    b    2
+    c    3
+    dtype: int64
+
+    The keys of the dictionary match the Index values, hence the Index
+    values have no effect.
+
+    >>> d = {'a': 1, 'b': 2, 'c': 3}
+    >>> ser = pd.Series(data=d, index=['x', 'y', 'z'])
+    >>> ser
+    x   NaN
+    y   NaN
+    z   NaN
+    dtype: float64
+
+    Note that the Index is first built with the keys from the dictionary.
+    After this the Series is reindexed with the given Index values, hence we
+    get all NaN as a result.
     """
 
     _typ = "series"
@@ -403,6 +430,10 @@ def _constructor(self) -> Type[Series]:
 
     @property
     def _constructor_expanddim(self) -> Type[DataFrame]:
+        """
+        Used when a manipulation result has one higher dimension than the
+        original, such as Series.to_frame().
+        """
         from pandas.core.frame import DataFrame
 
         return DataFrame
@@ -3940,7 +3971,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs):
         if func is None:
             func = dict(kwargs.items())
 
-        op = series_apply(self, func, args=args, kwds=kwargs)
+        op = series_apply(self, func, args=args, kwargs=kwargs)
         result, how = op.agg()
         if result is None:
@@ -3981,7 +4012,7 @@ def apply(
         func: AggFuncType,
         convert_dtype: bool = True,
         args: Tuple[Any, ...] = (),
-        **kwds,
+        **kwargs,
     ) -> FrameOrSeriesUnion:
         """
         Invoke function on values of Series.
@@ -3998,7 +4029,7 @@ def apply(
             False, leave as dtype=object.
         args : tuple
             Positional arguments passed to func after the series value.
-        **kwds
+        **kwargs
             Additional keyword arguments passed to func.
Returns
@@ -4079,7 +4110,7 @@ def apply(
         Helsinki    2.484907
         dtype: float64
         """
-        op = series_apply(self, func, convert_dtype, args, kwds)
+        op = series_apply(self, func, convert_dtype, args, kwargs)
         return op.apply()
 
     def _reduce(
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 9a68e470201c7..bec6cfb375716 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -510,7 +510,7 @@ def calc(x):
         return self._apply_tablewise(homogeneous_func, name)
 
     def aggregate(self, func, *args, **kwargs):
-        result, how = ResamplerWindowApply(self, func, args=args, kwds=kwargs).agg()
+        result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
         if result is None:
             return self.apply(func, raw=False, args=args, kwargs=kwargs)
         return result
@@ -994,7 +994,7 @@ def calc(x):
         axis="",
     )
     def aggregate(self, func, *args, **kwargs):
-        result, how = ResamplerWindowApply(self, func, args=args, kwds=kwargs).agg()
+        result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
         if result is None:
 
             # these must apply directly
diff --git a/pandas/io/common.py b/pandas/io/common.py
index e5a1f58ec6cd2..429df94271693 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -726,6 +726,12 @@ def __init__(
         self.archive_name = archive_name
         self.multiple_write_buffer: Optional[Union[StringIO, BytesIO]] = None
 
+        # infer the inner file name from the zip file name, like gz or bz2 do
+        if archive_name is None and isinstance(file, (os.PathLike, str)):
+            archive_name = os.path.basename(file)
+            if archive_name.endswith(".zip"):
+                self.archive_name = archive_name[:-4]
+
         kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
         kwargs_zip.update(kwargs)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 213be7c05b370..84b5cae09acce 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1069,26 +1069,37 @@ def __init__(
 
         xlrd_version = LooseVersion(get_version(xlrd))
 
-        if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
-            ext = "xls"
-        else:
-            ext = inspect_excel_format(
-                content_or_path=path_or_buffer, storage_options=storage_options
-            )
-
+        ext = None
         if engine is None:
+            # Only determine ext if it is needed
+            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
+                ext = "xls"
+            else:
+                ext = inspect_excel_format(
+                    content_or_path=path_or_buffer, storage_options=storage_options
+                )
+
             # ext will always be valid, otherwise inspect_excel_format would raise
             engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
             if engine == "auto":
                 engine = get_default_engine(ext, mode="reader")
 
-        if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
-            if xlrd_version >= "2":
+        if engine == "xlrd" and xlrd_version is not None:
+            if ext is None:
+                # Need ext in order to determine whether to raise or warn
+                if isinstance(path_or_buffer, xlrd.Book):
+                    ext = "xls"
+                else:
+                    ext = inspect_excel_format(
+                        path_or_buffer, storage_options=storage_options
+                    )
+
+            if ext != "xls" and xlrd_version >= "2":
                 raise ValueError(
                     f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                     f"only the xls format is supported. Install openpyxl instead."
                )
-            else:
+            elif ext != "xls":
                 caller = inspect.stack()[1]
                 if (
                     caller.filename.endswith(
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index 64c64b5009b0c..81303bc1b6674 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
+import mmap
 from typing import TYPE_CHECKING, Dict, List, Optional
 
 import numpy as np
@@ -40,6 +41,7 @@ def __init__(
             from openpyxl import load_workbook
 
             self.book = load_workbook(self.handles.handle)
+            self.handles.handle.seek(0)
         else:
             # Create workbook object with default optimized_write=True.
             self.book = Workbook()
@@ -52,6 +54,9 @@ def save(self):
         Save workbook to disk.
         """
         self.book.save(self.handles.handle)
+        if "r+" in self.mode and not isinstance(self.handles.handle, mmap.mmap):
+            # truncate file to the written content
+            self.handles.handle.truncate()
 
     @classmethod
     def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, Serialisable]:
@@ -533,7 +538,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
 
         version = LooseVersion(get_version(openpyxl))
 
-        if version >= "3.0.0":
+        # There is no good way of determining if a sheet is read-only
+        # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
+        is_readonly = hasattr(sheet, "reset_dimensions")
+
+        if version >= "3.0.0" and is_readonly:
             sheet.reset_dimensions()
 
         data: List[List[Scalar]] = []
@@ -541,7 +550,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
             converted_row = [self._convert_cell(cell, convert_float) for cell in row]
             data.append(converted_row)
 
-        if version >= "3.0.0" and len(data) > 0:
+        if version >= "3.0.0" and is_readonly and len(data) > 0:
             # With dimension reset, openpyxl no longer pads rows
             max_width = max(len(data_row) for data_row in data)
             if min(len(data_row) for data_row in data) < max_width:
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 6eac9ba87c73d..3d9eb4e96f78a 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -391,9 +391,6 @@ def _translate(self):
         BLANK_CLASS = "blank"
         BLANK_VALUE = ""
 
-        def format_attr(pair):
-            return f"{pair['key']}={pair['value']}"
-
         # for sparsifying a MultiIndex
         idx_lengths = _get_level_lengths(self.index)
         col_lengths = _get_level_lengths(self.columns, hidden_columns)
@@ -462,9 +459,7 @@ def format_attr(pair):
                 }
                 colspan = col_lengths.get((r, c), 0)
                 if colspan > 1:
-                    es["attributes"] = [
-                        format_attr({"key": "colspan", "value": f'"{colspan}"'})
-                    ]
+                    es["attributes"] = [f'colspan="{colspan}"']
                 row_es.append(es)
             head.append(row_es)
 
@@ -508,9 +503,7 @@ def format_attr(pair):
                 }
                 rowspan = idx_lengths.get((c, r), 0)
                 if rowspan > 1:
-                    es["attributes"] = [
-                        format_attr({"key": "rowspan", "value": f'"{rowspan}"'})
-                    ]
+                    es["attributes"] = [f'rowspan="{rowspan}"']
                 row_es.append(es)
 
             for c, col in enumerate(self.data.columns):
diff --git a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl
index 97bfda9af089d..b315c57a65cdf 100644
--- a/pandas/io/formats/templates/html.tpl
+++ b/pandas/io/formats/templates/html.tpl
@@ -1,70 +1,70 @@
 {# Update the template_structure.html document too #}
 {%- block before_style -%}{%- endblock before_style -%}
 {% block style %}
-<style  type="text/css" >
-</style>
-{%- endblock style %}
-{%- block before_table %}{% endblock before_table %}
-{%- block table %}
-<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
-{%- block caption %}
-{%- if caption -%}
-    <caption>{{caption}}</caption>
-{%- endif -%}
-{%- endblock caption %}
-{%- block thead %}
-    <thead>
-    {%- block before_head_rows %}{% endblock %}
-    {%- for r in head %}
-    {%- block head_tr scoped %}
+<style type="text/css">
+</style>
+{% endblock style %}
+{% block before_table %}{% endblock before_table %}
+{% block table %}
+<table id="T_{{uuid}}" {% if table_attributes %}{{table_attributes}}{% endif %}>
+{% block caption %}
+{% if caption %}
+  <caption>{{caption}}</caption>
+{% endif %}
+{% endblock caption %}
+{% block thead %}
+  <thead>
+{% block before_head_rows %}{% endblock %}
+{% for r in head %}
+{% block head_tr scoped %}
-        <tr>
-        {%- for c in r %}
-        {%- if c.is_visible != False %}
-            <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
-        {%- endif %}
-        {%- endfor %}
-        </tr>
+    <tr>
+{% for c in r %}
+{% if c.is_visible != False %}
+      <{{c.type}} class="{{c.class}}" {{c.attributes|join(" ")}}>{{c.value}}</{{c.type}}>
+{% endif %}
+{% endfor %}
+    </tr>
-    {%- endblock head_tr %}
-    {%- endfor %}
-    {%- block after_head_rows %}{% endblock %}
-    </thead>
-{%- endblock thead %}
-{%- block tbody %}
-    <tbody>
-    {% block before_rows %}{% endblock before_rows %}
-    {% for r in body %}
-    {% block tr scoped %}
+{% endblock head_tr %}
+{% endfor %}
+{% block after_head_rows %}{% endblock %}
+  </thead>
+{% endblock thead %}
+{% block tbody %}
+  <tbody>
+{% block before_rows %}{% endblock before_rows %}
+{% for r in body %}
+{% block tr scoped %}
-        <tr>
-        {% for c in r %}
-        {% if c.is_visible != False %}
-            <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
-        {% endif %}
-        {%- endfor %}
-        </tr>
+    <tr>
+{% for c in r %}
+{% if c.is_visible != False %}
+      <{{c.type}} {% if c.id is defined -%} id="T_{{uuid}}{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes|join(" ")}}>{{c.display_value}}</{{c.type}}>
+{% endif %}
+{% endfor %}
+    </tr>
-    {% endblock tr %}
-    {%- endfor %}
-    {%- block after_rows %}{%- endblock after_rows %}
-    </tbody>
-{%- endblock tbody %}
-</table>
+{% endblock tr %}
+{% endfor %}
+{% block after_rows %}{% endblock after_rows %}
+  </tbody>
+{% endblock tbody %}
+</table>
-{%- endblock table %}
-{%- block after_table %}{% endblock after_table %}
+{% endblock table %}
+{% block after_table %}{% endblock after_table %}
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
index bff0306a50ee6..c4959ee2c8962 100644
--- a/pandas/tests/apply/test_frame_transform.py
+++ b/pandas/tests/apply/test_frame_transform.py
@@ -274,3 +274,13 @@ def test_transform_mixed_column_name_dtypes():
     msg = r"Column\(s\) \[1, 'b'\] do not exist"
     with pytest.raises(SpecificationError, match=msg):
         df.transform({"a": int, 1: str, "b": int})
+
+
+def test_transform_empty_dataframe():
+    # https://github.com/pandas-dev/pandas/issues/39636
+    df = DataFrame([], columns=["col1", "col2"])
+    result = df.transform(lambda x: x + 10)
+    tm.assert_frame_equal(result, df)
+
+    result = df["col1"].transform(lambda x: x + 10)
+    tm.assert_series_equal(result, df["col1"])
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
index 434df5ccccaf7..2a94b18b806f8 100644
--- a/pandas/tests/frame/methods/test_select_dtypes.py
+++ b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -110,7 +110,7 @@ def test_select_dtypes_exclude_include_using_list_like(self):
             {
                 "a": list("abc"),
                 "b": list(range(1, 4)),
-                "c": np.arange(3, 6).astype("u1"),
+                "c": np.arange(3, 6, dtype="u1"),
                 "d": np.arange(4.0, 7.0, dtype="float64"),
                 "e": [True, False, True],
                 "f": pd.date_range("now", periods=3).values,
@@ -128,6 +128,26 @@ def test_select_dtypes_exclude_include_using_list_like(self):
         e = df[["b", "e"]]
         tm.assert_frame_equal(r, e)
 
+    @pytest.mark.parametrize(
+        "include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)]
+    )
+    def test_select_dtypes_exclude_include_int(self, include):
+        # Fix select_dtypes(include='int') for Windows, FYI #36596
+        df = DataFrame(
+            {
+                "a": list("abc"),
+                "b": list(range(1, 4)),
+                "c": np.arange(3, 6, dtype="int32"),
+                "d": np.arange(4.0, 7.0, dtype="float64"),
+                "e": [True, False, True],
+                "f": pd.date_range("now", periods=3).values,
+            }
+        )
+        exclude = (np.datetime64,)
+        result = df.select_dtypes(include=include, exclude=exclude)
+        expected = df[["b", "c", "e"]]
+        tm.assert_frame_equal(result, expected)
+
     def test_select_dtypes_include_using_scalars(self):
         df = DataFrame(
             {
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index 29a2d9c17202e..6b8284908213a 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -321,12 +321,14 @@ def test_set_flags(self, allows_duplicate_labels, frame_or_series):
             result.iloc[key] = 10
         assert obj.iloc[key] == 0
 
-    def test_constructor_expanddim_lookup(self):
-        # GH#33628 accessing _constructor_expanddim should not
-        # raise NotImplementedError
+    def test_constructor_expanddim(self):
+        # GH#33628 accessing _constructor_expanddim should not raise NotImplementedError
+        # GH38782 pandas has no container higher than DataFrame (two-dim), so
+        # DataFrame._constructor_expanddim doesn't make sense, so it was removed.
df = DataFrame()
-        with pytest.raises(NotImplementedError, match="Not supported for DataFrames!"):
+        msg = "'DataFrame' object has no attribute '_constructor_expanddim'"
+        with pytest.raises(AttributeError, match=msg):
             df._constructor_expanddim(np.arange(27).reshape(3, 3, 3))
 
     @skip_if_no("jinja2")
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
index 640501baffc62..04a484c3edc0d 100644
--- a/pandas/tests/io/excel/test_openpyxl.py
+++ b/pandas/tests/io/excel/test_openpyxl.py
@@ -1,4 +1,5 @@
 from distutils.version import LooseVersion
+from pathlib import Path
 
 import numpy as np
 import pytest
@@ -122,6 +123,17 @@ def test_to_excel_with_openpyxl_engine(ext):
         styled.to_excel(filename, engine="openpyxl")
 
 
+@pytest.mark.parametrize("read_only", [True, False])
+def test_read_workbook(datapath, ext, read_only):
+    # GH 39528
+    filename = datapath("io", "data", "excel", "test1" + ext)
+    wb = openpyxl.load_workbook(filename, read_only=read_only)
+    result = pd.read_excel(wb, engine="openpyxl")
+    wb.close()
+    expected = pd.read_excel(filename)
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "header, expected_data",
     [
@@ -139,13 +151,41 @@ def test_to_excel_with_openpyxl_engine(ext):
 @pytest.mark.parametrize(
     "filename", ["dimension_missing", "dimension_small", "dimension_large"]
 )
-@pytest.mark.xfail(
-    LooseVersion(get_version(openpyxl)) < "3.0.0",
-    reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
-)
-def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
+# When read_only is None, use read_excel instead of a workbook
+@pytest.mark.parametrize("read_only", [True, False, None])
+def test_read_with_bad_dimension(
+    datapath, ext, header, expected_data, filename, read_only, request
+):
     # GH 38956, 39001 - no/incorrect dimension information
+    version = LooseVersion(get_version(openpyxl))
+    if (read_only or read_only is None) and version < "3.0.0":
+        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
     path = datapath("io", "data", "excel", f"{filename}{ext}")
-    result = pd.read_excel(path, header=header)
+    if read_only is None:
+        result = pd.read_excel(path, header=header)
+    else:
+        wb = openpyxl.load_workbook(path, read_only=read_only)
+        result = pd.read_excel(wb, engine="openpyxl", header=header)
+        wb.close()
     expected = DataFrame(expected_data)
     tm.assert_frame_equal(result, expected)
+
+
+def test_append_mode_file(ext):
+    # GH 39576
+    df = DataFrame()
+
+    with tm.ensure_clean(ext) as f:
+        df.to_excel(f, engine="openpyxl")
+
+        with ExcelWriter(f, mode="a", engine="openpyxl") as writer:
+            df.to_excel(writer)
+
+        # make sure the zip files were not concatenated: "docProps/app.xml"
+        # must occur exactly twice in the resulting file
+        data = Path(f).read_bytes()
+        first = data.find(b"docProps/app.xml")
+        second = data.find(b"docProps/app.xml", first + 1)
+        third = data.find(b"docProps/app.xml", second + 1)
+        assert second != -1 and third == -1
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index b2e87de5580e6..a594718bd62d9 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -2,6 +2,7 @@
 from functools import partial
 import os
 from urllib.error import URLError
+from zipfile import BadZipFile
 
 import numpy as np
 import pytest
@@ -685,7 +686,13 @@ def test_missing_file_raises(self, read_ext):
     def
test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(ValueError, match="File is not a recognized excel file"): + if engine is None or engine == "xlrd": + error = ValueError + msg = "File is not a recognized excel file" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): pd.read_excel(bad_stream) @tm.network diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index d2c5b5b9d0b2c..0bd0c5bd87761 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -140,8 +140,8 @@ def test_multiple_render(self): s = Styler(self.df, uuid_len=0).applymap(lambda x: "color: red;", subset=["A"]) s.render() # do 2 renders to ensure css styles not duplicated assert ( - '" in s.render() + '" in s.render() ) def test_render_empty_dfs(self): @@ -1794,11 +1794,11 @@ def test_column_and_row_styling(self): df = DataFrame(data=[[0, 1], [1, 2]], columns=["A", "B"]) s = Styler(df, uuid_len=0) s = s.set_table_styles({"A": [{"selector": "", "props": [("color", "blue")]}]}) - assert "#T__ .col0 {\n color: blue;\n }" in s.render() + assert "#T__ .col0 {\n color: blue;\n}" in s.render() s = s.set_table_styles( {0: [{"selector": "", "props": [("color", "blue")]}]}, axis=1 ) - assert "#T__ .row0 {\n color: blue;\n }" in s.render() + assert "#T__ .row0 {\n color: blue;\n}" in s.render() def test_colspan_w3(self): # GH 36223 @@ -1855,12 +1855,12 @@ def test_tooltip_render(self, ttips): s = Styler(df, uuid_len=0).set_tooltips(ttips).render() # test tooltip table level class - assert "#T__ .pd-t {\n visibility: hidden;\n" in s + assert "#T__ .pd-t {\n visibility: hidden;\n" in s # test 'Min' tooltip added assert ( - "#T__ #T__row0_col0:hover .pd-t {\n visibility: visible;\n } " - + ' #T__ #T__row0_col0 .pd-t::after {\n content: "Min";\n }' + "#T__ #T__row0_col0:hover .pd-t {\n visibility: visible;\n}\n" + + '#T__ #T__row0_col0 .pd-t::after {\n content: "Min";\n}' in s ) assert ( @@ -1871,8 +1871,8 @@ def test_tooltip_render(self, ttips): # test 'Max' tooltip added assert ( - "#T__ #T__row0_col1:hover .pd-t {\n visibility: visible;\n } " - + ' #T__ #T__row0_col1 .pd-t::after {\n content: "Max";\n }' + "#T__ #T__row0_col1:hover .pd-t {\n visibility: visible;\n}\n" + + '#T__ #T__row0_col1 .pd-t::after {\n content: "Max";\n}' in s ) assert ( @@ -1892,16 +1892,16 @@ def test_tooltip_reindex(self): index=[0, 2], ) s = Styler(df, uuid_len=0).set_tooltips(DataFrame(ttips)).render() - assert '#T__ #T__row0_col0 .pd-t::after {\n content: "Mi";\n }' in s - assert '#T__ #T__row0_col2 .pd-t::after {\n content: "Ma";\n }' in s - assert '#T__ #T__row2_col0 .pd-t::after {\n content: "Mu";\n }' in s - assert '#T__ #T__row2_col2 .pd-t::after {\n content: "Mo";\n }' in s + assert '#T__ #T__row0_col0 .pd-t::after {\n content: "Mi";\n}' in s + assert '#T__ #T__row0_col2 .pd-t::after {\n content: "Ma";\n}' in s + assert '#T__ #T__row2_col0 .pd-t::after {\n content: "Mu";\n}' in s + assert '#T__ #T__row2_col2 .pd-t::after {\n content: "Mo";\n}' in s def test_tooltip_ignored(self): # GH 21266 df = DataFrame(data=[[0, 1], [2, 3]]) s = Styler(df).set_tooltips_class("pd-t").render() # no set_tooltips() - assert '' in s + assert '' in s assert '' not in s def test_tooltip_class(self): @@ -1913,11 +1913,8 @@ def test_tooltip_class(self): .set_tooltips_class(name="other-class", properties=[("color", "green")]) .render() ) - assert "#T__ .other-class {\n color: green;\n" in s - assert ( - '#T__ 
#T__row0_col0 .other-class::after {\n content: "tooltip";\n'
-            in s
-        )
+        assert "#T__ .other-class {\n color: green;\n" in s
+        assert '#T__ #T__row0_col0 .other-class::after {\n content: "tooltip";\n' in s
 
         # GH 39563
         s = (
@@ -1926,10 +1923,50 @@ def test_tooltip_class(self):
             .set_tooltips_class(name="other-class", properties="color:green;color:red;")
             .render()
         )
-        assert (
-            "#T__ .other-class {\n color: green;\n color: red;\n "
-            in s
-        )
+        assert "#T__ .other-class {\n color: green;\n color: red;\n}" in s
+
+    def test_w3_html_format(self):
+        s = (
+            Styler(
+                DataFrame([[2.61], [2.69]], index=["a", "b"], columns=["A"]),
+                uuid_len=0,
+            )
+            .set_table_styles([{"selector": "th", "props": "att2:v2;"}])
+            .applymap(lambda x: "att1:v1;")
+            .set_table_attributes('class="my-cls1" style="attr3:v3;"')
+            .set_td_classes(DataFrame(["my-cls2"], index=["a"], columns=["A"]))
+            .format("{:.1f}")
+            .set_caption("A comprehensive test")
+        )
+        expected = """<style type="text/css">
+#T__ th {
+  att2: v2;
+}
+#T__row0_col0, #T__row1_col0 {
+  att1: v1;
+}
+</style>
+<table id="T__" class="my-cls1" style="attr3:v3;">
+  <caption>A comprehensive test</caption>
+  <thead>
+    <tr>
+      <th class="blank level0" ></th>
+      <th class="col_heading level0 col0" >A</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th id="T__level0_row0" class="row_heading level0 row0" >a</th>
+      <td id="T__row0_col0" class="data row0 col0 my-cls2" >2.6</td>
+    </tr>
+    <tr>
+      <th id="T__level0_row1" class="row_heading level0 row1" >b</th>
+      <td id="T__row1_col0" class="data row1 col0" >2.7</td>
+    </tr>
+  </tbody>
+</table>
+""" + assert expected == s.render() @td.skip_if_no_mpl diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 767b61e31698b..3a993f544b64a 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -74,18 +74,6 @@ def test_setitem_tuple_with_datetimetz_values(self): tm.assert_series_equal(result, expected) -class TestSetitemPeriodDtype: - @pytest.mark.parametrize("na_val", [None, np.nan]) - def test_setitem_na_period_dtype_casts_to_nat(self, na_val): - ser = Series(period_range("2000-01-01", periods=10, freq="D")) - - ser[3] = na_val - assert ser[3] is NaT - - ser[3:5] = na_val - assert ser[4] is NaT - - class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) @@ -259,29 +247,6 @@ def test_setitem_callable_other(self): class TestSetitemCasting: - @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) - def test_setitem_dt64_into_int_series(self, dtype): - # dont cast dt64 to int when doing this setitem - orig = Series([1, 2, 3]) - - val = np.datetime64("2021-01-18 13:25:00", "ns") - if dtype == "m8[ns]": - val = val - val - - ser = orig.copy() - ser[:-1] = val - expected = Series([val, val, 3], dtype=object) - tm.assert_series_equal(ser, expected) - assert isinstance(ser[0], type(val)) - - ser = orig.copy() - ser[:-1] = [val, val] - tm.assert_series_equal(ser, expected) - - ser = orig.copy() - ser[:-1] = np.array([val, val]) - tm.assert_series_equal(ser, expected) - @pytest.mark.parametrize("unique", [True, False]) @pytest.mark.parametrize("val", [3, 3.0, "3"], ids=type) def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): @@ -599,3 +564,70 @@ def is_inplace(self): Indicate we do _not_ expect the setting to be done inplace. 
""" return False + + +class TestSetitemDT64IntoInt(SetitemCastingEquivalents): + # GH#39619 dont cast dt64 to int when doing this setitem + + @pytest.fixture(params=["M8[ns]", "m8[ns]"]) + def dtype(self, request): + return request.param + + @pytest.fixture + def scalar(self, dtype): + val = np.datetime64("2021-01-18 13:25:00", "ns") + if dtype == "m8[ns]": + val = val - val + return val + + @pytest.fixture + def expected(self, scalar): + expected = Series([scalar, scalar, 3], dtype=object) + assert isinstance(expected[0], type(scalar)) + return expected + + @pytest.fixture + def obj(self): + return Series([1, 2, 3]) + + @pytest.fixture + def key(self): + return slice(None, -1) + + @pytest.fixture(params=[None, list, np.array]) + def val(self, scalar, request): + box = request.param + if box is None: + return scalar + return box([scalar, scalar]) + + @pytest.fixture + def is_inplace(self): + return False + + +class TestSetitemNAPeriodDtype(SetitemCastingEquivalents): + # Setting compatible NA values into Series with PeriodDtype + + @pytest.fixture + def expected(self, key): + exp = Series(period_range("2000-01-01", periods=10, freq="D")) + exp._values.view("i8")[key] = NaT.value + assert exp[key] is NaT or all(x is NaT for x in exp[key]) + return exp + + @pytest.fixture + def obj(self): + return Series(period_range("2000-01-01", periods=10, freq="D")) + + @pytest.fixture(params=[3, slice(3, 5)]) + def key(self, request): + return request.param + + @pytest.fixture(params=[None, np.nan]) + def val(self, request): + return request.param + + @pytest.fixture + def is_inplace(self): + return True diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 2f255d92d86e3..4dd91b942474a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -1,8 +1,11 @@ +import inspect import pydoc import numpy as np import pytest +from pandas.util._test_decorators import skip_if_no + import pandas as pd from pandas import DataFrame, Index, Series, date_range import pandas._testing as tm @@ -167,3 +170,10 @@ def test_attrs(self): s.attrs["version"] = 1 result = s + 1 assert result.attrs == {"version": 1} + + @skip_if_no("jinja2") + def test_inspect_getmembers(self): + # GH38782 + ser = Series() + with tm.assert_produces_warning(None): + inspect.getmembers(ser)