diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f3b005b704014..f4a6ed5f26c89 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -118,12 +118,29 @@ def setup(self): self.a = pd.Categorical(list("aabbcd") * N) self.b = pd.Categorical(list("bbcdjk") * N) + self.idx_a = pd.CategoricalIndex(range(N), range(N)) + self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1)) + self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a) + self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b) + def time_concat(self): pd.concat([self.s, self.s]) def time_union(self): union_categoricals([self.a, self.b]) + def time_append_overlapping_index(self): + self.idx_a.append(self.idx_a) + + def time_append_non_overlapping_index(self): + self.idx_a.append(self.idx_b) + + def time_concat_overlapping_index(self): + pd.concat([self.df_a, self.df_a]) + + def time_concat_non_overlapping_index(self): + pd.concat([self.df_a, self.df_b]) + class ValueCounts: diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index 5227ad0f53a04..3743882b936e2 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent): self.s.isin(self.values_outside) +class UniqueForLargePyObjectInts: + def setup(self): + lst = [x << 32 for x in range(5000)] + self.arr = np.array(lst, dtype=np.object_) + + def time_unique(self): + pd.unique(self.arr) + + class IsinWithRandomFloat: params = [ [np.float64, np.object], diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index d4219296f5795..9a8a95bec66ad 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -329,21 +329,11 @@ Each data structure has several *constructor properties* for returning a new data structure as the result of an operation. By overriding these properties, you can retain subclasses through ``pandas`` data manipulations. -There are 3 constructor properties to be defined: +There are 3 possible constructor properties to be defined on a subclass: -* ``_constructor``: Used when a manipulation result has the same dimensions as the original. -* ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -* ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``. - -Following table shows how ``pandas`` data structures define constructor properties by default. - -=========================== ======================= ============= -Property Attributes ``Series`` ``DataFrame`` -=========================== ======================= ============= -``_constructor`` ``Series`` ``DataFrame`` -``_constructor_sliced`` ``NotImplementedError`` ``Series`` -``_constructor_expanddim`` ``DataFrame`` ``NotImplementedError`` -=========================== ======================= ============= +* ``DataFrame/Series._constructor``: Used when a manipulation result has the same dimension as the original. +* ``DataFrame._constructor_sliced``: Used when a ``DataFrame`` (sub-)class manipulation result should be a ``Series`` (sub-)class. +* ``Series._constructor_expanddim``: Used when a ``Series`` (sub-)class manipulation result should be a ``DataFrame`` (sub-)class, e.g. ``Series.to_frame()``. 
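For instance, a minimal sketch of the pattern (an illustration only, assuming the usual subclassing setup; the full documented example follows below):

.. code-block:: python

   import pandas as pd

   class SubclassedSeries(pd.Series):
       @property
       def _constructor(self):
           return SubclassedSeries

       @property
       def _constructor_expanddim(self):
           return SubclassedDataFrame

   class SubclassedDataFrame(pd.DataFrame):
       @property
       def _constructor(self):
           return SubclassedDataFrame

       @property
       def _constructor_sliced(self):
           return SubclassedSeries

With these overrides, selecting a single column of a ``SubclassedDataFrame`` yields a ``SubclassedSeries``, and ``SubclassedSeries.to_frame()`` yields a ``SubclassedDataFrame``.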
The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 974f84d3b244a..8ea0d72356acf 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -17,11 +17,14 @@ Fixed regressions - Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`) - Fixed regression in :class:`DataFrame` constructor reordering elements when constructed from a datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`) -- Fixed regression in :class:`DataFrame.astype` and :class:`Series.astype` not casting to bytes dtype (:issue:`39474`) +- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`) - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) +- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`) +- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`) - Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`) +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 17d8c79994dbe..75bca020fd78f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -219,7 +219,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None``) objects will no longer compare as equal to fully initialized dtype objects. -- +- Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raises an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - .. --------------------------------------------------------------------------- @@ -253,6 +253,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) - Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) +- Performance improvement in :func:`unique` for object data type (:issue:`37615`) ..
--------------------------------------------------------------------------- @@ -304,6 +305,7 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` with ``np.inf`` and a mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`) +- Bug in :func:`select_dtypes` causing different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) - Conversion @@ -338,7 +340,7 @@ Indexing - Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`) - Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`) - Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing an :class:`Index` with milliseconds using existing strings (:issue:`33589`) -- Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`) +- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, :issue:`39619`) - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`) - Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrectly casting the datetime64 values to integers (:issue:`39266`) - Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`) @@ -445,10 +447,11 @@ Other - Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) - Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) - Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`) +- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`) - :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) - Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) - Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`) -- +- :class:`Styler` rendered HTML output now includes minor alterations to support the w3 good code standard (:issue:`39626`) - ..
--------------------------------------------------------------------------- diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index e1ea1fbf9bd46..e5026ce2fa292 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -34,10 +34,14 @@ cdef class {{name}}Engine(IndexEngine): cdef _make_hash_table(self, Py_ssize_t n): return _hash.{{name}}HashTable(n) - {{if name not in {'Float64', 'Float32'} }} cdef _check_type(self, object val): + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) + {{else}} + if util.is_bool_object(val): + # avoid casting to True -> 1.0 + raise KeyError(val) {{endif}} cdef void _call_map_locations(self, values): diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 0073aaf0195c7..aee018262e3a6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { return result; } -// For PyObject_Hash holds: -// hash(0.0) == 0 == hash(-0.0) -// hash(X) == 0 if X is a NaN-value -// so it is OK to use it directly -#define kh_python_hash_func(key) (PyObject_Hash(key)) + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ + // For PyObject_Hash the following holds: + // hash(0.0) == 0 == hash(-0.0) + // hash(X) == 0 if X is a NaN-value + // so it is OK to use it directly for doubles + Py_hash_t hash = PyObject_Hash(key); + if (hash == -1) { + PyErr_Clear(); + return 0; + } + #if SIZEOF_PY_HASH_T == 4 + // it is already a 32bit value + return hash; + #else + // for 64bit builds, + // we need information from the upper 32bits as well + // see GH 37615 + khuint64_t as_uint = (khuint64_t) hash; + // uints avoid undefined behavior of signed ints + return (as_uint>>32)^as_uint; + #endif +} + + #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index e3f159346cd51..4dbce8f75898f 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -457,7 +457,7 @@ def transform( # Functions that transform may return empty Series/DataFrame # when the dtype is not appropriate - if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: + if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty: raise ValueError("Transform function failed") if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( obj.index diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2b0d3f5aa8862..cdbef673643e8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -324,7 +324,8 @@ def unique(values): Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. - Significantly faster than numpy.unique. Includes NA values. + Significantly faster than numpy.unique for long enough sequences. + Includes NA values.
Parameters ---------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 533190e692891..828b460f84ec6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -63,7 +63,7 @@ def frame_apply( raw: bool = False, result_type: Optional[str] = None, args=None, - kwds=None, + kwargs=None, ) -> FrameApply: """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -79,7 +79,7 @@ def frame_apply( raw=raw, result_type=result_type, args=args, - kwds=kwds, + kwargs=kwargs, ) @@ -88,14 +88,14 @@ def series_apply( func: AggFuncType, convert_dtype: bool = True, args=None, - kwds=None, + kwargs=None, ) -> SeriesApply: return SeriesApply( obj, func, convert_dtype, args, - kwds, + kwargs, ) @@ -109,12 +109,12 @@ def __init__( raw: bool, result_type: Optional[str], args, - kwds, + kwargs, ): self.obj = obj self.raw = raw self.args = args or () - self.kwds = kwds or {} + self.kwargs = kwargs or {} if result_type not in [None, "reduce", "broadcast", "expand"]: raise ValueError( @@ -126,13 +126,13 @@ def __init__( # curry if needed if ( - (kwds or args) + (kwargs or args) and not isinstance(func, (np.ufunc, str)) and not is_list_like(func) ): def f(x): - return func(x, *args, **kwds) + return func(x, *args, **kwargs) else: f = func @@ -163,7 +163,7 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]: obj = self.obj arg = self.f args = self.args - kwargs = self.kwds + kwargs = self.kwargs _axis = kwargs.pop("_axis", None) if _axis is None: @@ -413,10 +413,10 @@ def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]: if callable(func): sig = inspect.getfullargspec(func) if "axis" in sig.args: - self.kwds["axis"] = self.axis + self.kwargs["axis"] = self.axis elif self.axis != 0: raise ValueError(f"Operation {f} does not support axis=1") - return self.obj._try_aggregate_string_function(f, *self.args, **self.kwds) + return self.obj._try_aggregate_string_function(f, *self.args, **self.kwargs) def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: """ @@ -430,7 +430,7 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: # Note: dict-likes are list-like if not is_list_like(self.f): return None - return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwds) + return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) class FrameApply(Apply): @@ -806,7 +806,7 @@ def __init__( func: AggFuncType, convert_dtype: bool, args, - kwds, + kwargs, ): self.convert_dtype = convert_dtype @@ -816,7 +816,7 @@ def __init__( raw=False, result_type=None, args=args, - kwds=kwds, + kwargs=kwargs, ) def apply(self) -> FrameOrSeriesUnion: @@ -877,17 +877,17 @@ def __init__( obj: Union[SeriesGroupBy, DataFrameGroupBy], func: AggFuncType, args, - kwds, + kwargs, ): - kwds = kwds.copy() - self.axis = obj.obj._get_axis_number(kwds.get("axis", 0)) + kwargs = kwargs.copy() + self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0)) super().__init__( obj, func, raw=False, result_type=None, args=args, - kwds=kwds, + kwargs=kwargs, ) def apply(self): @@ -903,7 +903,7 @@ def __init__( obj: Union[Resampler, BaseWindow], func: AggFuncType, args, - kwds, + kwargs, ): super().__init__( obj, @@ -911,7 +911,7 @@ def __init__( raw=False, result_type=None, args=args, - kwds=kwds, + kwargs=kwargs, ) def apply(self): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 640c8d66807ad..8aa3d7900e8e9 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -588,9 +588,13 @@ def 
_try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo Otherwise an object array is returned. """ # perf shortcut as this is the most common case - if isinstance(arr, np.ndarray): - if maybe_castable(arr) and not copy and dtype is None: - return arr + if ( + isinstance(arr, np.ndarray) + and maybe_castable(arr.dtype) + and not copy + and dtype is None + ): + return arr if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): # create an extension array from its dtype diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1cb592f18dd2c..ed36beb80986e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1331,20 +1331,18 @@ def convert_dtypes( return inferred_dtype -def maybe_castable(arr: np.ndarray) -> bool: +def maybe_castable(dtype: np.dtype) -> bool: # return False to force a non-fastpath - assert isinstance(arr, np.ndarray) # GH 37024 - # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce - kind = arr.dtype.kind + kind = dtype.kind if kind == "M": - return is_datetime64_ns_dtype(arr.dtype) + return is_datetime64_ns_dtype(dtype) elif kind == "m": - return is_timedelta64_ns_dtype(arr.dtype) + return is_timedelta64_ns_dtype(dtype) - return arr.dtype.name not in POSSIBLY_CAST_DTYPES + return dtype.name not in POSSIBLY_CAST_DTYPES def maybe_infer_to_datetimelike( diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6357b8feb348b..f3f899f9fd90a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -490,23 +490,14 @@ class DataFrame(NDFrame, OpsMixin): _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) + _accessors: Set[str] = {"sparse"} + _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) @property def _constructor(self) -> Type[DataFrame]: return DataFrame _constructor_sliced: Type[Series] = Series - _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) - _accessors: Set[str] = {"sparse"} - - @property - def _constructor_expanddim(self): - # GH#31549 raising NotImplementedError on a property causes trouble - # for `inspect` - def constructor(*args, **kwargs): - raise NotImplementedError("Not supported for DataFrames!") - - return constructor # ---------------------------------------------------------------------- # Constructors @@ -3786,8 +3777,21 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: raise ValueError("at least one of include or exclude must be nonempty") # convert the myriad valid dtypes object to a single representation - include = frozenset(infer_dtype_from_object(x) for x in include) - exclude = frozenset(infer_dtype_from_object(x) for x in exclude) + def check_int_infer_dtype(dtypes): + converted_dtypes = [] + for dtype in dtypes: + # NumPy maps int to different types (int32, int64) on Windows and Linux + # see https://github.com/numpy/numpy/issues/9464 + if (isinstance(dtype, str) and dtype == "int") or (dtype is int): + converted_dtypes.append(np.int32) + converted_dtypes.append(np.int64) + else: + converted_dtypes.append(infer_dtype_from_object(dtype)) + return frozenset(converted_dtypes) + + include = check_int_infer_dtype(include) + exclude = check_int_infer_dtype(exclude) + for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) @@ -7718,7 +7722,7 @@ def _aggregate(self, arg, axis: Axis = 0, *args, **kwargs): func=arg, axis=0, args=args, - kwds=kwargs, +
kwargs=kwargs, ) result, how = op.agg() @@ -7750,7 +7754,7 @@ def apply( raw: bool = False, result_type=None, args=(), - **kwds, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -7798,7 +7802,7 @@ def apply( args : tuple Positional arguments to pass to `func` in addition to the array/series. - **kwds + **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -7892,7 +7896,7 @@ def apply( raw=raw, result_type=result_type, args=args, - kwds=kwds, + kwargs=kwargs, ) return op.apply() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 96b35f1aaab9c..e1271cfec2bde 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -375,22 +375,6 @@ def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]: """ raise AbstractMethodError(self) - @property - def _constructor_sliced(self): - """ - Used when a manipulation result has one lower dimension(s) as the - original, such as DataFrame single columns slicing. - """ - raise AbstractMethodError(self) - - @property - def _constructor_expanddim(self): - """ - Used when a manipulation result has one higher dimension as the - original, such as Series.to_frame() - """ - raise NotImplementedError - # ---------------------------------------------------------------------- # Internals @@ -8891,32 +8875,11 @@ def _where( if isinstance(other, (np.ndarray, ExtensionArray)): if other.shape != self.shape: - - if self.ndim == 1: - - icond = cond._values - - # GH 2745 / GH 4192 - # treat like a scalar - if len(other) == 1: - other = other[0] - - # GH 3235 - # match True cond to other - elif len(cond[icond]) == len(other): - - # try to not change dtype at first - new_other = self._values - new_other = new_other.copy() - new_other[icond] = other - other = new_other - - else: - raise ValueError( - "Length of replacements must equal series length" - ) - - else: + if self.ndim != 1: + # In the ndim == 1 case we may have + # other length 1, which we treat as scalar (GH#2745, GH#4192) + # or len(other) == icond.sum(), which we treat like + # __setitem__ (GH#3235) raise ValueError( "other must be the same shape as self when an ndarray" ) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 12698efa86b28..7b6eb4c8fe2f9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -983,7 +983,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # try to treat as if we are passing a list try: result, _ = GroupByApply( - self, [func], args=(), kwds={"_axis": self.axis} + self, [func], args=(), kwargs={"_axis": self.axis} ).agg() # select everything except for the last level, which is the one diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f2fd5ca9c62c7..5c1fabd67bc8d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4550,11 +4550,16 @@ def putmask(self, mask, value): return self.astype(dtype).putmask(mask, value) values = self._values.copy() - if isinstance(converted, np.timedelta64) and self.dtype == object: + dtype, _ = infer_dtype_from(converted, pandas_dtype=True) + if dtype.kind in ["m", "M"]: # https://github.com/numpy/numpy/issues/12550 # timedelta64 will incorrectly cast to int - converted = [converted] * mask.sum() - values[mask] = converted + if not is_list_like(converted): + converted = [converted] * mask.sum() + values[mask] = converted + else: + converted = list(converted) + np.putmask(values, mask, converted) else: np.putmask(values, mask, 
converted) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 777fc1c7c4ad2..d6427aed6edf3 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,4 +1,4 @@ -from typing import Any, Hashable, Optional +from typing import Hashable, Optional import warnings import numpy as np @@ -9,7 +9,6 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - is_bool, is_dtype_equal, is_extension_array_dtype, is_float, @@ -336,13 +335,6 @@ def _convert_slice_indexer(self, key: slice, kind: str): # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) - @doc(Index.get_loc) - def get_loc(self, key, method=None, tolerance=None): - if is_bool(key): - # Catch this to avoid accidentally casting to 1.0 - raise KeyError(key) - return super().get_loc(key, method=method, tolerance=tolerance) - # ---------------------------------------------------------------- def _format_native_types( @@ -359,10 +351,3 @@ def _format_native_types( fixed_width=False, ) return formatter.get_result_as_array() - - def __contains__(self, other: Any) -> bool: - hash(other) - if super().__contains__(other): - return True - - return is_float(other) and np.isnan(other) and self.hasnans diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9314666acdaad..75814cb2bae3d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1031,7 +1031,8 @@ def putmask(self, mask, new) -> List[Block]: elif not mask.any(): return [self] - elif isinstance(new, np.timedelta64): + dtype, _ = infer_dtype_from(new) + if dtype.kind in ["m", "M"]: # using putmask with object dtype will incorrectly cast to object # Having excluded self._can_hold_element, we know we cannot operate # in-place, so we are safe using `where` @@ -1317,10 +1318,15 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: blocks = block.where(orig_other, cond, errors=errors, axis=axis) return self._maybe_downcast(blocks, "infer") - elif isinstance(other, np.timedelta64): - # expressions.where will cast np.timedelta64 to int - result = self.values.copy() - result[~cond] = [other] * (~cond).sum() + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + if dtype.kind in ["m", "M"] and dtype.kind != values.dtype.kind: + # expressions.where would cast np.timedelta64 to int + if not is_list_like(other): + other = [other] * (~cond).sum() + else: + other = list(other) + result = values.copy() + np.putmask(result, ~cond, other) else: # convert datetime to datetime64, timedelta to timedelta64 diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 965de2e04bf40..34b7838d2280c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -301,7 +301,7 @@ def pipe( def aggregate(self, func, *args, **kwargs): self._set_binner() - result, how = ResamplerWindowApply(self, func, args=args, kwds=kwargs).agg() + result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: how = func grouper = None diff --git a/pandas/core/series.py b/pandas/core/series.py index 8bd325beede65..559b27aeb7e50 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -180,8 +180,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to RangeIndex (0, 1, 2, ..., n) if not provided.
If data is dict-like - and index is None, then the values in the index are used to - reindex the Series after it is created using the keys in the data. + and index is None, then the keys in the data are used as the index. If the + index is not None, the resulting Series is reindexed with the index values. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. @@ -190,6 +190,33 @@ The name to give to the Series. copy : bool, default False Copy input data. + + Examples + -------- + Constructing Series from a dictionary with an Index specified + + >>> d = {'a': 1, 'b': 2, 'c': 3} + >>> ser = pd.Series(data=d, index=['a', 'b', 'c']) + >>> ser + a 1 + b 2 + c 3 + dtype: int64 + + The keys of the dictionary match the Index values, hence the Index + values have no effect. + + >>> d = {'a': 1, 'b': 2, 'c': 3} + >>> ser = pd.Series(data=d, index=['x', 'y', 'z']) + >>> ser + x NaN + y NaN + z NaN + dtype: float64 + + Note that the Index is first built with the keys from the dictionary. + After this, the Series is reindexed with the given Index values, hence we + get all NaN as a result. """ _typ = "series" @@ -403,6 +430,10 @@ def _constructor(self) -> Type[Series]: @property def _constructor_expanddim(self) -> Type[DataFrame]: + """ + Used when a manipulation result has one higher dimension than the + original, such as Series.to_frame() + """ from pandas.core.frame import DataFrame return DataFrame @@ -3940,7 +3971,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if func is None: func = dict(kwargs.items()) - op = series_apply(self, func, args=args, kwds=kwargs) + op = series_apply(self, func, args=args, kwargs=kwargs) result, how = op.agg() if result is None: @@ -3981,7 +4012,7 @@ def apply( func: AggFuncType, convert_dtype: bool = True, args: Tuple[Any, ...] = (), - **kwds, + **kwargs, ) -> FrameOrSeriesUnion: """ Invoke function on values of Series. @@ -3998,7 +4029,7 @@ def apply( False, leave as dtype=object. args : tuple Positional arguments passed to func after the series value. - **kwds + **kwargs Additional keyword arguments passed to func.
Returns @@ -4079,7 +4110,7 @@ def apply( Helsinki 2.484907 dtype: float64 """ - op = series_apply(self, func, convert_dtype, args, kwds) + op = series_apply(self, func, convert_dtype, args, kwargs) return op.apply() def _reduce( diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9a68e470201c7..bec6cfb375716 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -510,7 +510,7 @@ def calc(x): return self._apply_tablewise(homogeneous_func, name) def aggregate(self, func, *args, **kwargs): - result, how = ResamplerWindowApply(self, func, args=args, kwds=kwargs).agg() + result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -994,7 +994,7 @@ def calc(x): axis="", ) def aggregate(self, func, *args, **kwargs): - result, how = ResamplerWindowApply(self, func, args=args, kwds=kwargs).agg() + result, how = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly diff --git a/pandas/io/common.py b/pandas/io/common.py index e5a1f58ec6cd2..429df94271693 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -726,6 +726,12 @@ def __init__( self.archive_name = archive_name self.multiple_write_buffer: Optional[Union[StringIO, BytesIO]] = None + # if no explicit archive_name is given, name the file inside the + # archive after the zip file itself, dropping the ".zip" extension + if archive_name is None and isinstance(file, (os.PathLike, str)): + archive_name = os.path.basename(file) + if archive_name.endswith(".zip"): + self.archive_name = archive_name[:-4] + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} kwargs_zip.update(kwargs) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 213be7c05b370..84b5cae09acce 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1069,26 +1069,37 @@ def __init__( xlrd_version = LooseVersion(get_version(xlrd)) - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: - ext = inspect_excel_format( - content_or_path=path_or_buffer, storage_options=storage_options - ) - + ext = None if engine is None: + # Only determine ext if it is needed + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content_or_path=path_or_buffer, storage_options=storage_options + ) + # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": + if engine == "xlrd" and xlrd_version is not None: + if ext is None: + # Need to determine ext in order to raise/warn + if isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + path_or_buffer, storage_options=storage_options + ) + + if ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead."
) - else: + elif ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 64c64b5009b0c..81303bc1b6674 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,6 +1,7 @@ from __future__ import annotations from distutils.version import LooseVersion +import mmap from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np @@ -40,6 +41,7 @@ def __init__( from openpyxl import load_workbook self.book = load_workbook(self.handles.handle) + self.handles.handle.seek(0) else: # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -52,6 +54,9 @@ def save(self): Save workbook to disk. """ self.book.save(self.handles.handle) + if "r+" in self.mode and not isinstance(self.handles.handle, mmap.mmap): + # truncate file to the written content + self.handles.handle.truncate() @classmethod def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, Serialisable]: @@ -533,7 +538,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: version = LooseVersion(get_version(openpyxl)) - if version >= "3.0.0": + # There is no good way of determining if a sheet is read-only + # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 + is_readonly = hasattr(sheet, "reset_dimensions") + + if version >= "3.0.0" and is_readonly: sheet.reset_dimensions() data: List[List[Scalar]] = [] @@ -541,7 +550,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: converted_row = [self._convert_cell(cell, convert_float) for cell in row] data.append(converted_row) - if version >= "3.0.0" and len(data) > 0: + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6eac9ba87c73d..3d9eb4e96f78a 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -391,9 +391,6 @@ def _translate(self): BLANK_CLASS = "blank" BLANK_VALUE = "" - def format_attr(pair): - return f"{pair['key']}={pair['value']}" - # for sparsifying a MultiIndex idx_lengths = _get_level_lengths(self.index) col_lengths = _get_level_lengths(self.columns, hidden_columns) @@ -462,9 +459,7 @@ def format_attr(pair): } colspan = col_lengths.get((r, c), 0) if colspan > 1: - es["attributes"] = [ - format_attr({"key": "colspan", "value": f'"{colspan}"'}) - ] + es["attributes"] = [f'colspan="{colspan}"'] row_es.append(es) head.append(row_es) @@ -508,9 +503,7 @@ def format_attr(pair): } rowspan = idx_lengths.get((c, r), 0) if rowspan > 1: - es["attributes"] = [ - format_attr({"key": "rowspan", "value": f'"{rowspan}"'}) - ] + es["attributes"] = [f'rowspan="{rowspan}"'] row_es.append(es) for c, col in enumerate(self.data.columns): diff --git a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl index 97bfda9af089d..b315c57a65cdf 100644 --- a/pandas/io/formats/templates/html.tpl +++ b/pandas/io/formats/templates/html.tpl @@ -1,70 +1,70 @@ {# Update the template_structure.html document too #} {%- block before_style -%}{%- endblock before_style -%} {% block style %} - -{%- endblock style %} -{%- block before_table %}{% endblock before_table %} -{%- block table %} -
[diff truncated: the remaining hunk rendered an example HTML table with one column "A" and rows a = 2.6, b = 2.7]
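As a closing illustration of the ``khash_python.h`` change above (a sketch, not part of the diff; ``fold_hash`` is a made-up name): CPython hashes a small non-negative int to itself, so keys of the form ``x << 32`` (exactly the ``UniqueForLargePyObjectInts`` benchmark added here) all have zero lower 32 bits, and plain truncation would put every key in the same hash bucket. Xor-folding the upper bits keeps them distinct (GH 37615):

def fold_hash(h: int) -> int:
    # emulate the C code: cast to uint64, then xor-fold the upper 32 bits
    as_uint = h & 0xFFFFFFFFFFFFFFFF
    return ((as_uint >> 32) ^ as_uint) & 0xFFFFFFFF

values = [x << 32 for x in range(5)]
print([hash(v) & 0xFFFFFFFF for v in values])  # [0, 0, 0, 0, 0] -> all collide
print([fold_hash(hash(v)) for v in values])    # [0, 1, 2, 3, 4] -> distinct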