diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 42a039af46e94..d042bda77d4e8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,4 +1,10 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml +trigger: +- master + +pr: +- master + jobs: # Mac and Linux use the same template - template: ci/azure/posix.yml diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index c9a2e4eefd19d..437cc9b161e8a 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -38,11 +38,11 @@ jobs: LC_ALL: "it_IT.utf8" EXTRA_APT: "language-pack-it xsel" - py36_32bit: - ENV_FILE: ci/deps/azure-36-32bit.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network and not clipboard" - BITS32: "yes" + #py36_32bit: + # ENV_FILE: ci/deps/azure-36-32bit.yaml + # CONDA_PY: "36" + # PATTERN: "not slow and not network and not clipboard" + # BITS32: "yes" py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 200d567a62732..d7f7690f8c3d0 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -397,6 +397,28 @@ The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided). +.. _basics.dataframe.from_list_dataclasses: + +From a list of dataclasses +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.1.0 + +Data Classes as introduced in `PEP557 <https://www.python.org/dev/peps/pep-0557>`__ can be passed into the DataFrame constructor. +Passing a list of dataclasses is equivalent to passing a list of dictionaries. + +Please be aware that all values in the list should be dataclasses; mixing +types in the list will result in a ``TypeError``. + +.. ipython:: python + + from dataclasses import make_dataclass + + Point = make_dataclass("Point", [("x", int), ("y", int)]) + + pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + **Missing data** Much more will be said on this topic in the :ref:`Missing data <missing_data>` diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5b6f70be478c2..21e59805fa143 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -168,6 +168,37 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss .. --------------------------------------------------------------------------- +.. _whatsnew_110.api_breaking.assignment_to_multiple_columns: + +Assignment to multiple columns of a DataFrame when some columns do not exist +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns will be constructed with the right values (:issue:`13658`). + +.. ipython:: python + + df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df[['a', 'c']] = 1 + In [4]: df + Out[4]: + a b + 0 1 1 + 1 1 1 + 2 1 1 + +*New behavior*: + +.. ipython:: python + + df[['a', 'c']] = 1 + df + .. _whatsnew_110.deprecations: Deprecations ~~~~~~~~~~~~ - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated and will raise in a future version.
Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) - Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) -- +- Accepting short names for ``orient`` in :meth:`DataFrame.to_dict` is deprecated; use the full option names instead (:issue:`32515`) +- :meth:`Categorical.to_dense` is deprecated and will be removed in a future version; use ``np.asarray(cat)`` instead (:issue:`32639`) .. --------------------------------------------------------------------------- @@ -190,7 +222,7 @@ Performance improvements - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of - existing indexes (:issue:`28584`, :issue:`32640`) + existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) .. --------------------------------------------------------------------------- @@ -216,6 +248,7 @@ Datetimelike - Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`) - Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`) - :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`) +- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`) Timedelta ^^^^^^^^^ @@ -241,7 +274,7 @@ Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) - Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`) -- +- Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`) Strings ^^^^^^^ @@ -306,6 +339,7 @@ I/O - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) +- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) Plotting @@ -335,6 +369,8 @@ Reshaping - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type.
Previously the ``replace`` would fail silently (:issue:`18634`) +- Bug in :meth:`DataFrame.apply` where the callback was called with a :class:`Series` parameter even though ``raw=True`` was requested (:issue:`32423`) +- Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`) Sparse @@ -356,6 +392,7 @@ Other instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) - Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) +- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 437406cbbd819..5545302fcbfc4 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,4 +1,5 @@ import cython +from collections import defaultdict from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx @@ -7,7 +8,9 @@ cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np -from numpy cimport int64_t +cimport numpy as cnp +from numpy cimport NPY_INT64, int64_t +cnp.import_array() from pandas._libs.algos import ensure_int64 @@ -105,7 +108,9 @@ cdef class BlockPlacement: Py_ssize_t start, stop, end, _ if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) - self._as_array = np.arange(start, stop, step, dtype=np.int64) + # NOTE: this is the C-optimized equivalent of + # np.arange(start, stop, step, dtype=np.int64) + self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) + self._has_array = True return self._as_array @@ -369,8 +374,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): Py_ssize_t i, start, stop, n, diff object blkno - list group_order - dict group_dict + object group_dict = defaultdict(list) int64_t[:] res_view n = blknos.shape[0] @@ -391,28 +395,16 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): yield cur_blkno, slice(start, n) else: - group_order = [] - group_dict = {} - for i in range(1, n): if blknos[i] != cur_blkno: - if cur_blkno not in group_dict: - group_order.append(cur_blkno) - group_dict[cur_blkno] = [(start, i)] - else: - group_dict[cur_blkno].append((start, i)) + group_dict[cur_blkno].append((start, i)) start = i cur_blkno = blknos[i] - if cur_blkno not in group_dict: - group_order.append(cur_blkno) - group_dict[cur_blkno] = [(start, n)] - else: - group_dict[cur_blkno].append((start, n)) + group_dict[cur_blkno].append((start, n)) - for blkno in group_order: - slices = group_dict[blkno] + for blkno, slices in group_dict.items(): if len(slices) == 1: yield blkno, slice(slices[0][0], slices[0][1]) else: diff --git a/pandas/conftest.py b/pandas/conftest.py index dcfc523315c8b..d8f96021cdb15 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -425,6 +425,15 @@ def nselect_method(request): return request.param +@pytest.fixture(params=["first", "last", False]) +def keep(request): + """ + Valid values for the 'keep' parameter used in + .duplicated or .drop_duplicates + """ + return request.param + + @pytest.fixture(params=["left", "right", "both", "neither"]) def closed(request): """ diff --git a/pandas/core/algorithms.py
b/pandas/core/algorithms.py index f9059054ba59f..5b324bc5753ec 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,6 +11,7 @@ from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT +from pandas._typing import AnyArrayLike from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -45,10 +46,14 @@ is_unsigned_integer_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.missing import isna, na_value_for_dtype -import pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices @@ -384,7 +389,7 @@ def unique(values): unique1d = unique -def isin(comps, values) -> np.ndarray: +def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: """ Compute the isin boolean array. @@ -409,15 +414,14 @@ def isin(comps, values) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps._values.isin(values) - - comps = com.values_from_object(comps) + return comps.isin(values) # type: ignore comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) @@ -2021,9 +2025,7 @@ def sort_mixed(values): ) codes = ensure_platform_int(np.asarray(codes)) - from pandas import Index - - if not assume_unique and not Index(values).is_unique: + if not assume_unique and not len(unique(values)) == len(values): raise ValueError("values should be unique if codes is not None") if sorter is None: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 70e0a129c055f..ceb45bc71326e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -179,7 +179,7 @@ def get_result(self): return self.apply_empty_result() # raw - elif self.raw and not self.obj._is_mixed_type: + elif self.raw: return self.apply_raw() return self.apply_standard() diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bcb3fa53e311b..8284a89a29b52 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1675,6 +1675,12 @@ def to_dense(self): ------- dense : array """ + warn( + "Categorical.to_dense is deprecated and will be removed in " + "a future version. 
Use np.asarray(cat) instead.", + FutureWarning, + stacklevel=2, + ) return np.asarray(self) def fillna(self, value=None, method=None, limit=None): @@ -1733,12 +1739,17 @@ def fillna(self, value=None, method=None, limit=None): # If value is a dict or a Series (a dict value has already # been converted to a Series) - if isinstance(value, ABCSeries): - if not value[~value.isin(self.categories)].isna().all(): + if isinstance(value, (np.ndarray, Categorical, ABCSeries)): + # We get ndarray or Categorical if called via Series.fillna, + # where it will unwrap another aligned Series before getting here + + mask = ~algorithms.isin(value, self.categories) + if not isna(value[mask]).all(): raise ValueError("fill value must be in categories") values_codes = _get_codes_for_values(value, self.categories) indexer = np.where(codes == -1) + codes = codes.copy() codes[indexer] = values_codes[indexer] # If value is not a dict or Series it should be a scalar diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7223eda22b3d9..2110f782330fb 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -23,6 +23,7 @@ from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, + is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype, @@ -1903,7 +1904,11 @@ def maybe_convert_dtype(data, copy): ------ TypeError : PeriodDType data is passed """ - if is_float_dtype(data): + if not hasattr(data, "dtype"): + # e.g. collections.deque + return data, copy + + if is_float_dtype(data.dtype): # Note: we must cast to datetime64[ns] here in order to treat these # as wall-times instead of UTC timestamps. data = data.astype(_NS_DTYPE) @@ -1911,24 +1916,24 @@ def maybe_convert_dtype(data, copy): # TODO: deprecate this behavior to instead treat symmetrically # with integer dtypes. See discussion in GH#23675 - elif is_timedelta64_dtype(data): + elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype): # GH#29794 enforcing deprecation introduced in GH#23539 raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") - elif is_period_dtype(data): + elif is_period_dtype(data.dtype): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails raise TypeError( "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" ) - elif is_categorical_dtype(data): + elif is_categorical_dtype(data.dtype): # GH#18664 preserve tz in going DTI->Categorical->DTI # TODO: cases where we need to do another pass through this func, # e.g. 
the categories are timedelta64s data = data.categories.take(data.codes, fill_value=NaT)._values copy = False - elif is_extension_array_dtype(data) and not is_datetime64tz_dtype(data): + elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype): # Includes categorical # TODO: We have no tests for these data = np.array(data, dtype=np.object_) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 3b5e6b2d2bcd3..93091555201e8 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -27,6 +27,7 @@ is_array_like, is_bool_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_integer, is_object_dtype, @@ -42,7 +43,7 @@ from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import sanitize_array +from pandas.core.construction import extract_array, sanitize_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d import pandas.core.ops as ops @@ -312,7 +313,7 @@ def __init__( dtype = dtype.subtype if index is not None and not is_scalar(data): - raise Exception("must only pass scalars with an index ") + raise Exception("must only pass scalars with an index") if is_scalar(data): if index is not None: @@ -367,6 +368,19 @@ def __init__( sparse_index = data._sparse_index sparse_values = np.asarray(data.sp_values, dtype=dtype) elif sparse_index is None: + data = extract_array(data, extract_numpy=True) + if not isinstance(data, np.ndarray): + # EA + if is_datetime64tz_dtype(data.dtype): + warnings.warn( + f"Creating SparseArray from {data.dtype} data " + "loses timezone information. Cast to object before " + "sparse to retain timezone information.", + UserWarning, + stacklevel=2, + ) + data = np.asarray(data, dtype="datetime64[ns]") + data = np.asarray(data) sparse_values, sparse_index, fill_value = make_sparse( data, kind=kind, fill_value=fill_value, dtype=dtype ) @@ -1497,7 +1511,7 @@ def _formatter(self, boxed=False): SparseArray._add_unary_ops() -def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): +def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format @@ -1513,7 +1527,7 @@ def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): ------- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) """ - arr = com.values_from_object(arr) + assert isinstance(arr, np.ndarray) if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e2d8fba8d4148..c9754ff588896 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -5,6 +5,7 @@ These should not depend on core.internals. 
""" +from collections import abc from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast import numpy as np @@ -446,6 +447,8 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) + elif isinstance(data, abc.Set): + raise TypeError("Set type is unordered") else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1afe7edf2641b..f5997a13e785d 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -24,6 +24,7 @@ is_array_like, is_bool, is_complex, + is_dataclass, is_decimal, is_dict_like, is_file_like, diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 56b880dca1241..d1607b5ede6c3 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -386,3 +386,39 @@ def is_sequence(obj) -> bool: return not isinstance(obj, (str, bytes)) except (TypeError, AttributeError): return False + + +def is_dataclass(item): + """ + Checks if the object is a data-class instance + + Parameters + ---------- + item : object + + Returns + -------- + is_dataclass : bool + True if the item is an instance of a data-class, + will return false if you pass the data class itself + + Examples + -------- + >>> from dataclasses import dataclass + >>> @dataclass + ... class Point: + ... x: int + ... y: int + + >>> is_dataclass(Point) + False + >>> is_dataclass(Point(0,2)) + True + + """ + try: + from dataclasses import is_dataclass + + return is_dataclass(item) and not isinstance(item, type) + except ImportError: + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e153fdaac16e2..fc5d071d7a9a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,6 +77,7 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, + is_dataclass, is_datetime64_any_dtype, is_dict_like, is_dtype_equal, @@ -117,6 +118,7 @@ from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( arrays_to_mgr, + dataclasses_to_dicts, get_names_from_index, init_dict, init_ndarray, @@ -474,6 +476,8 @@ def __init__( if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields @@ -1401,11 +1405,45 @@ def to_dict(self, orient="dict", into=dict): ) # GH16122 into_c = com.standardize_mapping(into) - if orient.lower().startswith("d"): + + orient = orient.lower() + # GH32515 + if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { + "dict", + "list", + "series", + "split", + "records", + "index", + }: + warnings.warn( + "Using short name for 'orient' is deprecated. Only the " + "options: ('dict', list, 'series', 'split', 'records', 'index') " + "will be used in a future version. 
Use one of the above " + "to silence this warning.", + FutureWarning, + ) + + if orient.startswith("d"): + orient = "dict" + elif orient.startswith("l"): + orient = "list" + elif orient.startswith("sp"): + orient = "split" + elif orient.startswith("s"): + orient = "series" + elif orient.startswith("r"): + orient = "records" + elif orient.startswith("i"): + orient = "index" + + if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) - elif orient.lower().startswith("l"): + + elif orient == "list": return into_c((k, v.tolist()) for k, v in self.items()) - elif orient.lower().startswith("sp"): + + elif orient == "split": return into_c( ( ("index", self.index.tolist()), @@ -1419,9 +1457,11 @@ def to_dict(self, orient="dict", into=dict): ), ) ) - elif orient.lower().startswith("s"): + + elif orient == "series": return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) - elif orient.lower().startswith("r"): + + elif orient == "records": columns = self.columns.tolist() rows = ( dict(zip(columns, row)) @@ -1431,13 +1471,15 @@ def to_dict(self, orient="dict", into=dict): into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] - elif orient.lower().startswith("i"): + + elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") return into_c( (t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples(name=None) ) + else: raise ValueError(f"orient '{orient}' not understood") @@ -1773,7 +1815,9 @@ def to_records( else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns] + arrays = ix_vals + [ + np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) + ] count = 0 index_names = list(self.index.names) @@ -1788,7 +1832,7 @@ def to_records( names = [str(name) for name in itertools.chain(index_names, self.columns)] else: - arrays = [self[c]._internal_get_values() for c in self.columns] + arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] names = [str(c) for c in self.columns] index_names = [] @@ -2687,6 +2731,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: + self.loc._ensure_listlike_indexer(key, axis=1) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] @@ -8020,6 +8065,35 @@ def idxmin(self, axis=0, skipna=True) -> Series: Notes ----- This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. 
+ + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object """ axis = self._get_axis_number(axis) indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e9811e0f60c7d..8d56311331d4d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6089,6 +6089,8 @@ def fillna( value = create_series_with_explicit_dtype( value, dtype_if_empty=object ) + value = value.reindex(self.index, copy=False) + value = value._values elif not is_list_like(value): pass else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 23e68802eb126..31966489403f4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5195,9 +5195,11 @@ def insert(self, loc: int, item): ------- new_index : Index """ - _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._ndarray_values - idx = np.concatenate((_self[:loc], item, _self[loc:])) + # Note: this method is overridden by all ExtensionIndex subclasses, + # so self is never backed by an EA. + arr = np.asarray(self) + item = self._coerce_scalar_to_index(item)._values + idx = np.concatenate((arr[:loc], item, arr[loc:])) return self._shallow_copy_with_infer(idx) def drop(self, labels, errors: str_t = "raise"): diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 4984fc27516ff..6d5f0dbb830f9 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -6,6 +6,7 @@ import numpy as np from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( @@ -248,6 +249,10 @@ def repeat(self, repeats, axis=None): result = self._data.repeat(repeats, axis=axis) return self._shallow_copy(result) + def insert(self, loc: int, item): + # ExtensionIndex subclasses must override Index.insert + raise AbstractMethodError(self) + def _concat_same_dtype(self, to_concat, name): arr = type(self._data)._concat_same_type(to_concat) return type(self)._simple_new(arr, name=name) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0bb88145646ed..5bffc4ec552af 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -276,6 +276,7 @@ def __new__( raise ValueError("Must pass non-zero number of levels/codes") result = object.__new__(MultiIndex) + result._cache = {} # we've already validated levels and codes, so shortcut here result._set_levels(levels, copy=copy, validate=False) @@ -564,6 +565,7 @@ def from_product(cls, iterables, sortorder=None, names=lib.no_default): if names is lib.no_default: names = [getattr(it, "name", None) for it in iterables] + # codes are all ndarrays, so cartesian_product is lossless codes = cartesian_product(codes) return MultiIndex(levels, codes, sortorder=sortorder, names=names) @@ -991,7 +993,13 @@ def _shallow_copy(self, values=None, **kwargs): # discards freq kwargs.pop("freq", None) return MultiIndex.from_tuples(values, names=names, **kwargs) - return self.copy(**kwargs) + + result = self.copy(**kwargs) + result._cache = self._cache.copy() + # GH32669 + if "levels" in result._cache: + del result._cache["levels"] + return result def _shallow_copy_with_infer(self, values, **kwargs): # On equal MultiIndexes the difference is empty.
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8ae792d3f63b5..f83234f1aac0b 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -251,11 +251,13 @@ def _has_complex_internals(self): def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name - + cache = self._cache.copy() if values is None else {} if values is None: values = self._data - return self._simple_new(values, name=name) + result = self._simple_new(values, name=name) + result._cache = cache + return result def _maybe_convert_timedelta(self, other): """ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c21d8df2476b3..2c038564f4e6f 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -141,7 +141,7 @@ def _simple_new(cls, values: range, name: Label = None) -> "RangeIndex": result._range = values result.name = name - + result._cache = {} result._reset_identity() return result @@ -391,7 +391,9 @@ def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is None: - return self._simple_new(self._range, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache.copy() + return result else: return Int64Index._simple_new(values, name=name) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c9362a0527c06..7bd25814a12ee 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -8,6 +8,7 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( + is_hashable, is_integer, is_iterator, is_list_like, @@ -581,6 +582,9 @@ def _get_setitem_indexer(self, key): """ Convert a potentially-label-based key into a positional indexer. """ + if self.name == "loc": + self._ensure_listlike_indexer(key) + if self.axis is not None: return self._convert_tuple(key, is_setter=True) @@ -611,6 +615,42 @@ def _get_setitem_indexer(self, key): raise raise IndexingError(key) from e + def _ensure_listlike_indexer(self, key, axis=None): + """ + Ensure that a list-like of column labels are all present by adding them if + they do not already exist. + + Parameters + ---------- + key : _LocIndexer key or list-like of column labels + Target labels. 
+ axis : key axis if known + """ + column_axis = 1 + + # columns only exist in a 2-dimensional DataFrame + if self.ndim != 2: + return + + if isinstance(key, tuple): + # key may be a tuple if key is a _LocIndexer key + # in that case, set key to the column part of key + key = key[column_axis] + axis = column_axis + + if ( + axis == column_axis + and not isinstance(self.obj.columns, ABCMultiIndex) + and is_list_like_indexer(key) + and not com.is_bool_indexer(key) + and all(is_hashable(k) for k in key) + ): + for k in key: + try: + self.obj[k] + except KeyError: + self.obj[k] = np.nan + def __setitem__(self, key, value): if isinstance(key, tuple): key = tuple(com.apply_if_callable(x, self.obj) for x in key) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f07fa99fe57d6..1a92a9486e9e4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -605,7 +605,9 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # astype formatting else: - values = self.get_values() + # Because we have neither is_extension nor is_datelike, + # self.values already has the correct shape + values = self.values else: values = self.get_values(dtype=dtype) @@ -663,7 +665,7 @@ def _can_hold_element(self, element: Any) -> bool: def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ - values = self.get_values() + values = self.values if slicer is not None: values = values[:, slicer] @@ -1739,6 +1741,10 @@ def __init__(self, values, placement, ndim=None): values = self._maybe_coerce_values(values) super().__init__(values, placement, ndim) + if self.ndim == 2 and len(self.mgr_locs) != 1: + # TODO(2DEA): check unnecessary with 2D EAs + raise AssertionError("block.size != values.size") + def _maybe_coerce_values(self, values): """ Unbox to an extension array. diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7570f6eddbd9c..6839d138fbf73 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -217,7 +217,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): else: # No dtype upcasting is done here, it will be performed during # concatenation itself. - values = self.block.get_values() + values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ab363e10eb098..c4416472d451c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -429,6 +429,33 @@ def _get_axes(N, K, index, columns): return index, columns +def dataclasses_to_dicts(data): + """ Converts a list of dataclass instances to a list of dictionaries + + Parameters + ---------- + data : list of dataclass instances + + Returns + ------- + list_dict : List[dict] + + Examples + -------- + >>> @dataclass + ... class Point: + ... x: int + ...
y: int + + >>> dataclasses_to_dicts([Point(1,2), Point(2,3)]) + [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}] + + """ + from dataclasses import asdict + + return list(map(asdict, data)) + + # --------------------------------------------------------------------- # Conversion of Inputs to Arrays diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b434f8dbde4eb..93d4b02310d54 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -391,6 +391,7 @@ def apply(self: T, f, filter=None, **kwargs) -> T: BlockManager """ result_blocks = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned # filter kwarg is used in replace-* family of methods if filter is not None: @@ -415,11 +416,6 @@ def apply(self: T, f, filter=None, **kwargs) -> T: align_keys = ["new", "mask"] else: align_keys = ["mask"] - elif f == "fillna": - # fillna internally does putmask, maybe it's better to do this - # at mgr, not block level? - align_copy = False - align_keys = ["value"] else: align_keys = [] diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index d8652c9b4fac9..7abb14303f8cc 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -2,8 +2,6 @@ from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com - def cartesian_product(X): """ @@ -51,9 +49,20 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) - return [ - np.tile( - np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i]) - ) - for i, x in enumerate(X) - ] + return [_tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] + + +def _tile_compat(arr, num: int): + """ + Index compat for np.tile. + + Notes + ----- + Does not support multi-dimensional `num`. + """ + if isinstance(arr, np.ndarray): + return np.tile(arr, num) + + # Otherwise we have an Index + taker = np.tile(np.arange(len(arr)), num) + return arr.take(taker) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2d8eb9b29498a..e120695cc83e8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2666,9 +2666,9 @@ def combine(self, other, func, fill_value=None) -> "Series": new_values = [func(lv, other) for lv in self._values] new_name = self.name - if is_categorical_dtype(self.values): + if is_categorical_dtype(self.dtype): pass - elif is_extension_array_dtype(self.values): + elif is_extension_array_dtype(self.dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA.
new_values = try_cast_to_ea(self._values, new_values) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c32b4d81c0988..8e38332e67439 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -359,7 +359,18 @@ def _convert_listlike_datetimes( # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg - arg, _ = maybe_convert_dtype(arg, copy=False) + try: + arg, _ = maybe_convert_dtype(arg, copy=False) + except TypeError: + if errors == "coerce": + result = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) + return DatetimeIndex(result, name=name) + elif errors == "ignore": + from pandas import Index + + result = Index(arg, name=name) + return result + raise arg = ensure_object(arg) require_iso8601 = False diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 50b5db0274aa5..52783b3a9e134 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2273,11 +2273,15 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - ( - self.columns, - self.num_original_columns, - self.unnamed_cols, - ) = self._infer_columns() + try: + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + except (TypeError, ValueError): + self.close() + raise # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index f49f70f5acf77..b99e172674f66 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -247,7 +247,7 @@ def test_set_categories(self): tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.to_dense(), exp) + tm.assert_numpy_array_equal(np.asarray(c), exp) # all "pointers" to '4' must be changed from 3 to 0,... 
c = c.set_categories([4, 3, 2, 1]) @@ -260,7 +260,7 @@ def test_set_categories(self): # output is the same exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.to_dense(), exp) + tm.assert_numpy_array_equal(np.asarray(c), exp) assert c.min() == 4 assert c.max() == 1 @@ -268,13 +268,19 @@ c2 = c.set_categories([4, 3, 2, 1], ordered=False) assert not c2.ordered - tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) # set_categories should pass thru the ordering c2 = c.set_ordered(False).set_categories([4, 3, 2, 1]) assert not c2.ordered - tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) + + def test_to_dense_deprecated(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + + with tm.assert_produces_warning(FutureWarning): + cat.to_dense() @pytest.mark.parametrize( "values, categories, new_categories", diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 8889f45a84237..9eb3c8b3a8c48 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -82,3 +82,18 @@ def test_fillna_iterable_category(self, named): expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) tm.assert_categorical_equal(result, expected) + + def test_fillna_array(self): + # accept Categorical or ndarray value if it holds appropriate values + cat = Categorical(["A", "B", "C", None, None]) + + other = cat.fillna("C") + result = cat.fillna(other) + tm.assert_categorical_equal(result, other) + assert isna(cat[-1]) # didn't modify original inplace + + other = np.array(["A", "B", "C", "B", "A"]) + result = cat.fillna(other) + expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype) + tm.assert_categorical_equal(result, expected) + assert isna(cat[-1]) # didn't modify original inplace diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index baca18239b929..4dab86166e13c 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -96,6 +96,22 @@ def test_constructor_na_dtype(self, dtype): with pytest.raises(ValueError, match="Cannot convert"): SparseArray([0, 1, np.nan], dtype=dtype) + def test_constructor_warns_when_losing_timezone(self): + # GH#32501 warn when losing timezone information + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(dti) + + tm.assert_sp_array_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(pd.Series(dti)) + + tm.assert_sp_array_equal(result, expected) + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 2c26e72a245f7..7d80ad3d8c6be 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -89,11 +89,26 @@ def test_non_array_raises(self): with pytest.raises(ValueError, match="list"): DatetimeArray([1, 2, 3]) - def test_other_type_raises(self): + def test_bool_dtype_raises(self): + arr = np.array([1, 2, 3], dtype="bool") + with
pytest.raises( ValueError, match="The dtype of 'values' is incorrect.*bool" ): - DatetimeArray(np.array([1, 2, 3], dtype="bool")) + DatetimeArray(arr) + + msg = r"dtype bool cannot be converted to datetime64\[ns\]" + with pytest.raises(TypeError, match=msg): + DatetimeArray._from_sequence(arr) + + with pytest.raises(TypeError, match=msg): + sequence_to_dt64ns(arr) + + with pytest.raises(TypeError, match=msg): + pd.DatetimeIndex(arr) + + with pytest.raises(TypeError, match=msg): + pd.to_datetime(arr) def test_incorrect_dtype_raises(self): with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index 5fb5072e5c9d9..5a78e5d6352a9 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -570,108 +570,6 @@ def test_factorize(self, index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques) - def test_duplicated_drop_duplicates_index(self): - # GH 4060 - for original in self.objs: - if isinstance(original, Index): - - # special case - if original.is_boolean(): - result = original.drop_duplicates() - expected = Index([False, True], name="a") - tm.assert_index_equal(result, expected) - continue - - # original doesn't have duplicates - expected = np.array([False] * len(original), dtype=bool) - duplicated = original.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = original.drop_duplicates() - tm.assert_index_equal(result, original) - assert result is not original - - # has_duplicates - assert not original.has_duplicates - - # create repeated values, 3rd and 5th values are duplicated - idx = original[list(range(len(original))) + [5, 3]] - expected = np.array([False] * len(original) + [True, True], dtype=bool) - duplicated = idx.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - tm.assert_index_equal(idx.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep="last") - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep="last") - tm.assert_index_equal(result, idx[~expected]) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep=False) - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep=False) - tm.assert_index_equal(result, idx[~expected]) - - with pytest.raises( - TypeError, - match=r"drop_duplicates\(\) got an unexpected keyword argument", - ): - idx.drop_duplicates(inplace=True) - - else: - expected = Series( - [False] * len(original), index=original.index, name="a" - ) - tm.assert_series_equal(original.duplicated(), expected) - result = original.drop_duplicates() - tm.assert_series_equal(result, original) - assert result is not original - - idx = original.index[list(range(len(original))) + [5, 3]] - values = original._values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx, name="a") - - expected = Series( - [False] * len(original) + [True, True], index=idx, name="a" - ) - tm.assert_series_equal(s.duplicated(), expected) - tm.assert_series_equal(s.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - 
expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep="last"), expected) - tm.assert_series_equal( - s.drop_duplicates(keep="last"), s[~np.array(base)] - ) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep=False), expected) - tm.assert_series_equal( - s.drop_duplicates(keep=False), s[~np.array(base)] - ) - - s.drop_duplicates(inplace=True) - tm.assert_series_equal(s, original) - def test_drop_duplicates_series_vs_dataframe(self): # GH 14192 df = pd.DataFrame( @@ -834,23 +732,6 @@ def test_access_by_position(self, indices): with pytest.raises(IndexError, match=msg): series.iloc[size] - @pytest.mark.parametrize("indexer_klass", [list, pd.Index]) - @pytest.mark.parametrize( - "indexer", - [ - [True] * 10, - [False] * 10, - [True, False, True, True, False, False, True, True, False, True], - ], - ) - def test_bool_indexing(self, indexer_klass, indexer): - # GH 22533 - for idx in self.indexes: - exp_idx = [i for i in range(len(indexer)) if indexer[i]] - tm.assert_index_equal(idx[indexer_klass(indexer)], idx[exp_idx]) - s = pd.Series(idx) - tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) - def test_get_indexer_non_unique_dtype_mismatch(self): # GH 25459 indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 7892030a6727e..923447889d04c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -215,6 +215,63 @@ def test_setitem_list_of_tuples(self, float_frame): expected = Series(tuples, index=float_frame.index, name="tuples") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "columns,box,expected", + [ + ( + ["A", "B", "C", "D"], + 7, + pd.DataFrame( + [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "D"], + [7, 8], + pd.DataFrame( + [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "B", "C"], + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"] + ), + ), + ( + ["B", "C", "D"], + [[7, 8, 9], [10, 11, 12], [13, 14, 15]], + pd.DataFrame( + [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "A", "D"], + np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64), + pd.DataFrame( + [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "C"], + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_setitem_list_missing_columns(self, columns, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df[columns] = box + tm.assert_frame_equal(df, expected) + def test_setitem_multi_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns @@ -459,13 +516,6 @@ def test_setitem(self, float_frame): float_frame["col6"] = series tm.assert_series_equal(series, float_frame["col6"], check_names=False) - msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the " - r"\[columns\]\"" - ) - with pytest.raises(KeyError, match=msg): - 
float_frame[np.random.randn(len(float_frame) + 1)] = 1 - # set ndarray arr = np.random.randn(len(float_frame)) float_frame["col9"] = arr diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index cd9bd169322fd..f1656b46cf356 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -70,8 +70,17 @@ def test_to_dict_invalid_orient(self): with pytest.raises(ValueError, match=msg): df.to_dict(orient="xinvalid") + @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"]) + def test_to_dict_short_orient_warns(self, orient): + # GH#32515 + df = DataFrame({"A": [0, 1]}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.to_dict(orient=orient) + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) def test_to_dict(self, mapping): + # orient= should only take the listed options + # see GH#32515 test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} # GH#16122 @@ -81,19 +90,19 @@ def test_to_dict(self, mapping): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("l", mapping) + recons_data = DataFrame(test_data).to_dict("list", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][int(k2) - 1] - recons_data = DataFrame(test_data).to_dict("s", mapping) + recons_data = DataFrame(test_data).to_dict("series", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("sp", mapping) + recons_data = DataFrame(test_data).to_dict("split", mapping) expected_split = { "columns": ["A", "B"], "index": ["1", "2", "3"], @@ -101,7 +110,7 @@ def test_to_dict(self, mapping): } tm.assert_dict_equal(recons_data, expected_split) - recons_data = DataFrame(test_data).to_dict("r", mapping) + recons_data = DataFrame(test_data).to_dict("records", mapping) expected_records = [ {"A": 1.0, "B": "1"}, {"A": 2.0, "B": "2"}, @@ -113,7 +122,7 @@ def test_to_dict(self, mapping): tm.assert_dict_equal(l, r) # GH#10844 - recons_data = DataFrame(test_data).to_dict("i") + recons_data = DataFrame(test_data).to_dict("index") for k, v in test_data.items(): for k2, v2 in v.items(): @@ -121,7 +130,7 @@ def test_to_dict(self, mapping): df = DataFrame(test_data) df["duped"] = df[df.columns[0]] - recons_data = df.to_dict("i") + recons_data = df.to_dict("index") comp_data = test_data.copy() comp_data["duped"] = comp_data[df.columns[0]] for k, v in comp_data.items(): diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index d0181f0309af1..34b323e55d8cd 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -3,7 +3,14 @@ import numpy as np import pytest -from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range +from pandas import ( + CategoricalDtype, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, +) import pandas._testing as tm @@ -18,6 +25,17 @@ def test_to_records_dt64(self): result = df.to_records()["index"][0] assert expected == result + def test_to_records_dt64tz_column(self): + # GH#32535 don't lose tz in to_records + df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")}) + + result = df.to_records() + + assert result.dtype["A"] == object + val = result[0][1] + assert isinstance(val, Timestamp) + assert val == df.loc[0, "A"]
+ def test_to_records_with_multindex(self): # GH#3189 index = [ diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 11705cd77a325..ee3cd59c27b44 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -235,7 +235,14 @@ def test_apply_broadcast_error(self, int_frame_const_col): with pytest.raises(ValueError): df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") - def test_apply_raw(self, float_frame): + def test_apply_raw(self, float_frame, mixed_type_frame): + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 + + float_frame.apply(_assert_raw, raw=True) + float_frame.apply(_assert_raw, axis=1, raw=True) + result0 = float_frame.apply(np.mean, raw=True) result1 = float_frame.apply(np.mean, axis=1, raw=True) @@ -250,6 +257,10 @@ def test_apply_raw(self, float_frame): expected = float_frame * 2 tm.assert_frame_equal(result, expected) + # Mixed dtype (GH-32423) + mixed_type_frame.apply(_assert_raw, raw=True) + mixed_type_frame.apply(_assert_raw, axis=1, raw=True) + def test_apply_axis1(self, float_frame): d = float_frame.index[0] tapplied = float_frame.apply(np.mean, axis=1) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 172800e74d181..95f812a99c579 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -9,7 +9,7 @@ import pytest import pytz -from pandas.compat import is_platform_little_endian +from pandas.compat import PY37, is_platform_little_endian from pandas.compat.numpy import _is_numpy_dev from pandas.core.dtypes.common import is_integer_dtype @@ -47,15 +47,15 @@ class TestDataFrameConstructors: def test_series_with_name_not_matching_column(self): # GH#9232 - x = pd.Series(range(5), name=1) - y = pd.Series(range(5), name=0) + x = Series(range(5), name=1) + y = Series(range(5), name=0) - result = pd.DataFrame(x, columns=[0]) - expected = pd.DataFrame([], columns=[0]) + result = DataFrame(x, columns=[0]) + expected = DataFrame([], columns=[0]) tm.assert_frame_equal(result, expected) - result = pd.DataFrame(y, columns=[1]) - expected = pd.DataFrame([], columns=[1]) + result = DataFrame(y, columns=[1]) + expected = DataFrame([], columns=[1]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -126,7 +126,7 @@ def test_constructor_cast_failure(self): def test_constructor_dtype_copy(self): orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) - new_df = pd.DataFrame(orig_df, dtype=float, copy=True) + new_df = DataFrame(orig_df, dtype=float, copy=True) new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 @@ -220,10 +220,10 @@ def test_constructor_rec(self, float_frame): index = float_frame.index df = DataFrame(rec) - tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df.columns, Index(rec.dtype.names)) df2 = DataFrame(rec, index=index) - tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df2.columns, Index(rec.dtype.names)) tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] @@ -298,7 +298,7 @@ def test_constructor_dict(self): tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) - exp = pd.Series( + exp = Series( np.concatenate([[np.nan] * 5, datetime_series_short.values]), index=datetime_series.index, name="col2", @@ -325,7 +325,7 @@ def test_constructor_dict(self): # Length-one dict micro-optimization frame = DataFrame({"A": {"1": 1, "2": 2}}) - 
tm.assert_index_equal(frame.index, pd.Index(["1", "2"])) + tm.assert_index_equal(frame.index, Index(["1", "2"])) # empty dict plus index idx = Index([0, 1, 2]) @@ -418,8 +418,8 @@ def test_constructor_dict_order_insertion(self): def test_constructor_dict_nan_key_and_columns(self): # GH 16894 - result = pd.DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) - expected = pd.DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) + result = DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) + expected = DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) tm.assert_frame_equal(result, expected) def test_constructor_multi_index(self): @@ -428,29 +428,29 @@ def test_constructor_multi_index(self): tuples = [(2, 3), (3, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) - assert pd.isna(df).values.ravel().all() + assert isna(df).values.ravel().all() tuples = [(3, 3), (2, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) - assert pd.isna(df).values.ravel().all() + assert isna(df).values.ravel().all() def test_constructor_2d_index(self): # GH 25416 # handling of 2d index in construction - df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2]) - expected = pd.DataFrame( + df = DataFrame([[1]], columns=[[1]], index=[1, 2]) + expected = DataFrame( [1, 1], index=pd.Int64Index([1, 2], dtype="int64"), - columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) - df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]]) - expected = pd.DataFrame( + df = DataFrame([[1]], columns=[[1]], index=[[1, 2]]) + expected = DataFrame( [1, 1], - index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), - columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + index=MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), + columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) @@ -471,7 +471,7 @@ def test_constructor_error_msgs(self): DataFrame( np.arange(12).reshape((4, 3)), columns=["foo", "bar", "baz"], - index=pd.date_range("2000-01-01", periods=3), + index=date_range("2000-01-01", periods=3), ) arr = np.array([[4, 5, 6]]) @@ -713,14 +713,12 @@ def test_constructor_period(self): # PeriodIndex a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") - df = pd.DataFrame({"a": a, "b": b}) + df = DataFrame({"a": a, "b": b}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype # list of periods - df = pd.DataFrame( - {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()} - ) + df = DataFrame({"a": a.astype(object).tolist(), "b": b.astype(object).tolist()}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype @@ -882,8 +880,8 @@ def test_constructor_maskedarray_nonfloat(self): def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) - expected = pd.DataFrame( + result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = DataFrame( {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, columns=["A", "B"], index=[1, 2], @@ -892,8 +890,8 @@ def test_constructor_maskedarray_hardened(self): tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, 
columns=["A", "B"], index=[1, 2]) - expected = pd.DataFrame( + result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = DataFrame( {"A": [1.0, 1.0], "B": [1.0, 1.0]}, columns=["A", "B"], index=[1, 2], @@ -907,8 +905,8 @@ def test_constructor_maskedrecarray_dtype(self): np.ma.zeros(5, dtype=[("date", " False and 1-> True + # any other value would be duplicated + tc = tc[:2] + expected = expected[:2] + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + + result_dropped = tc.drop_duplicates(keep=keep) + tm.assert_series_equal(result_dropped, tc) + + # validate shallow copy + assert result_dropped is not tc + + class TestSeriesDropDuplicates: @pytest.mark.parametrize( "dtype", diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a1de9c435c9ba..ad7028702ec8c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -760,6 +760,16 @@ def test_categorical_from_codes(self): result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) + def test_categorical_isin(self): + vals = np.array([0, 1, 2, 0]) + cats = ["a", "b", "c"] + cat = Categorical(1).from_codes(vals, cats) + other = Categorical(1).from_codes(np.array([0, 1]), cats) + + expected = np.array([True, True, False, True]) + result = algos.isin(cat, other) + tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in(self): # GH 22160 # nan is special, because from " a is b" doesn't follow "a == b" diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 19385e797467c..e0dfeac4ab475 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -627,3 +627,13 @@ def test_non_coerce_uint64_conflict(errors, exp): else: result = to_numeric(ser, errors=errors) tm.assert_series_equal(result, ser) + + +def test_failure_to_convert_uint64_string_to_NaN(): + # GH 32394 + result = to_numeric("uint64", errors="coerce") + assert np.isnan(result) + + ser = Series([32, 64, np.nan]) + result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce") + tm.assert_series_equal(result, ser)