From 3b265b0c4fc8b529d224c82875d03a0e0165cf73 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Feb 2021 07:51:08 -0800 Subject: [PATCH 01/10] BUG: raise on RangeIndex.array --- pandas/_testing/__init__.py | 14 ++++++++++++-- pandas/core/indexes/range.py | 22 +++++++++++++++++++--- pandas/core/reshape/merge.py | 12 +++++++++--- pandas/core/series.py | 17 ++++++++++++++--- pandas/core/sorting.py | 9 ++++++--- pandas/tests/arithmetic/test_numeric.py | 1 + 6 files changed, 61 insertions(+), 14 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0b2be53131af6..75df3c58f9bdf 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -99,7 +99,13 @@ use_numexpr, with_csv_dialect, ) -from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray, period_array +from pandas.core.arrays import ( + DatetimeArray, + PandasArray, + PeriodArray, + TimedeltaArray, + period_array, +) if TYPE_CHECKING: from pandas import PeriodIndex, TimedeltaIndex @@ -197,7 +203,11 @@ def box_expected(expected, box_cls, transpose=True): subclass of box_cls """ if box_cls is pd.array: - expected = pd.array(expected) + if isinstance(expected, pd.RangeIndex): + # pd.array would return an IntegerArray + expected = PandasArray(expected._values) + else: + expected = pd.array(expected) elif box_cls is pd.Index: expected = pd.Index(expected) elif box_cls is pd.Series: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ee0b49aac3f79..3d283f5a4d0b0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -171,6 +171,13 @@ def _data(self): """ return np.arange(self.start, self.stop, self.step, dtype=np.int64) + @property + def array(self): + raise ValueError( + f"{type(self).__name__} has no single backing array. Use " + f"'{type(self).__name__}.to_numpy()' to get a NumPy array." 
+ ) + @cache_readonly def _cached_int64index(self) -> Int64Index: return Int64Index._simple_new(self._data, name=self.name) @@ -485,12 +492,17 @@ def argsort(self, *args, **kwargs) -> np.ndarray: -------- numpy.ndarray.argsort """ + ascending = kwargs.pop("ascending", True) # EA compat nv.validate_argsort(args, kwargs) if self._range.step > 0: - return np.arange(len(self)) + result = np.arange(len(self)) else: - return np.arange(len(self) - 1, -1, -1) + result = np.arange(len(self) - 1, -1, -1) + + if not ascending: + result = result[::-1] + return result def factorize( self, sort: bool = False, na_sentinel: Optional[int] = -1 @@ -870,7 +882,11 @@ def _arith_method(self, other, op): if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]: step = op - other = extract_array(other, extract_numpy=True) + if isinstance(other, RangeIndex): + # TODO: in some cases we can likely be more efficient, especially add/sub + other = other._values + else: + other = extract_array(other, extract_numpy=True) attrs = self._get_attributes_dict() left, right = self, other diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8704d757c3289..127570bc5bf47 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -47,7 +47,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype -from pandas import Categorical, Index, MultiIndex +from pandas import Categorical, Index, MultiIndex, RangeIndex from pandas.core import groupby import pandas.core.algorithms as algos import pandas.core.common as com @@ -2032,8 +2032,14 @@ def _factorize_keys( (array([0, 1, 2]), array([0, 1]), 3) """ # Some pre-processing for non-ndarray lk / rk - lk = extract_array(lk, extract_numpy=True) - rk = extract_array(rk, extract_numpy=True) + if not isinstance(lk, RangeIndex): + lk = extract_array(lk, extract_numpy=True) + else: + lk = np.array(lk) # TODO: more efficient option? + if not isinstance(rk, RangeIndex): + rk = extract_array(rk, extract_numpy=True) + else: + rk = np.array(rk) # TODO: more efficient option? 
if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): # Extract the ndarray (UTC-localized) values diff --git a/pandas/core/series.py b/pandas/core/series.py index 7d97c9f6189f3..386d216ac9bdd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -94,6 +94,7 @@ Float64Index, Index, MultiIndex, + RangeIndex, ensure_index, ) import pandas.core.indexes.base as ibase @@ -4994,7 +4995,10 @@ def _cmp_method(self, other, op): raise ValueError("Can only compare identically-labeled Series objects") lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) + if isinstance(other, RangeIndex): + rvalues = other._values + else: + rvalues = extract_array(other, extract_numpy=True) res_values = ops.comparison_op(lvalues, rvalues, op) @@ -5005,7 +5009,10 @@ def _logical_method(self, other, op): self, other = ops.align_method_SERIES(self, other, align_asobject=True) lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) + if isinstance(other, RangeIndex): + rvalues = other._values + else: + rvalues = extract_array(other, extract_numpy=True) res_values = ops.logical_op(lvalues, rvalues, op) return self._construct_result(res_values, name=res_name) @@ -5015,7 +5022,11 @@ def _arith_method(self, other, op): self, other = ops.align_method_SERIES(self, other) lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) + if isinstance(other, RangeIndex): + rvalues = other._values + else: + rvalues = extract_array(other, extract_numpy=True) + result = ops.arithmetic_op(lvalues, rvalues, op) return self._construct_result(result, name=res_name) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index cfbabab491ae4..e722c2e434e0b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -25,7 +25,7 @@ ensure_platform_int, is_extension_array_dtype, ) -from pandas.core.dtypes.generic import ABCMultiIndex +from pandas.core.dtypes.generic import ABCMultiIndex, ABCRangeIndex from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algorithms @@ -361,9 +361,12 @@ def nargsort( mask=mask, ) - items = extract_array(items) + if isinstance(items, ABCRangeIndex): + return items.argsort(ascending=ascending) # TODO: test coverage with key? + elif not isinstance(items, ABCMultiIndex): + items = extract_array(items) if mask is None: - mask = np.asarray(isna(items)) + mask = np.asarray(isna(items)) # TODO: does this exclude MultiIndex too? 
if is_extension_array_dtype(items): return items.argsort(ascending=ascending, kind=kind, na_position=na_position) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f4f258b559939..44f214269a3fb 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -308,6 +308,7 @@ def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box_with_array): "Concatenation operation is not implemented for NumPy arrays", # pd.array vs np.datetime64 case r"operand type\(s\) all returned NotImplemented from __array_ufunc__", + "can only perform ops with numeric values", ] ) with pytest.raises(TypeError, match=msg): From bd50f1854323e6e6508e97e222b233f3c969f3ce Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Feb 2021 13:35:01 -0800 Subject: [PATCH 02/10] make RangeIndex check part of extract_array --- pandas/core/construction.py | 10 +++++++++- pandas/core/indexes/range.py | 7 ++----- pandas/core/reshape/merge.py | 12 +++--------- pandas/core/series.py | 16 +++------------- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 0c0084f2492d3..9aa1c620fe1d9 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -60,6 +60,7 @@ ABCExtensionArray, ABCIndex, ABCPandasArray, + ABCRangeIndex, ABCSeries, ) from pandas.core.dtypes.missing import isna @@ -368,7 +369,9 @@ def array( return PandasArray._from_sequence(data, dtype=dtype, copy=copy) -def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayLike]: +def extract_array( + obj: object, extract_numpy: bool = False, range_compat: bool = False +) -> Union[Any, ArrayLike]: """ Extract the ndarray or ExtensionArray from a Series or Index. @@ -383,6 +386,9 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL extract_numpy : bool, default False Whether to extract the ndarray from a PandasArray + range_compat : bool, default False + If we have a RangeIndex, return range._values if True, otherwise raise. 
+ Returns ------- arr : object @@ -411,6 +417,8 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL array([1, 2, 3]) """ if isinstance(obj, (ABCIndex, ABCSeries)): + if range_compat and isinstance(obj, ABCRangeIndex): + return obj._values obj = obj.array if extract_numpy and isinstance(obj, ABCPandasArray): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index f73707d36d478..3615d85273f99 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -896,11 +896,8 @@ def _arith_method(self, other, op): if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]: step = op - if isinstance(other, RangeIndex): - # TODO: in some cases we can likely be more efficient, especially add/sub - other = other._values - else: - other = extract_array(other, extract_numpy=True) + # TODO: if other is a RangeIndex we may have more efficient options + other = extract_array(other, extract_numpy=True, range_compat=True) attrs = self._get_attributes_dict() left, right = self, other diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9ee3b0a21747f..ce06fc55ee8e6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -70,7 +70,6 @@ Categorical, Index, MultiIndex, - RangeIndex, ) from pandas.core import groupby import pandas.core.algorithms as algos @@ -2059,14 +2058,9 @@ def _factorize_keys( (array([0, 1, 2]), array([0, 1]), 3) """ # Some pre-processing for non-ndarray lk / rk - if not isinstance(lk, RangeIndex): - lk = extract_array(lk, extract_numpy=True) - else: - lk = np.array(lk) # TODO: more efficient option? - if not isinstance(rk, RangeIndex): - rk = extract_array(rk, extract_numpy=True) - else: - rk = np.array(rk) # TODO: more efficient option? + lk = extract_array(lk, extract_numpy=True, range_compat=True) + rk = extract_array(rk, extract_numpy=True, range_compat=True) + # TODO: if either is a RangeIndex, we can likely factorize more efficiently? 
if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): # Extract the ndarray (UTC-localized) values diff --git a/pandas/core/series.py b/pandas/core/series.py index 6ec6b49995d10..3f43b27cd88ce 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -116,7 +116,6 @@ Float64Index, Index, MultiIndex, - RangeIndex, ensure_index, ) import pandas.core.indexes.base as ibase @@ -5011,10 +5010,7 @@ def _cmp_method(self, other, op): raise ValueError("Can only compare identically-labeled Series objects") lvalues = extract_array(self, extract_numpy=True) - if isinstance(other, RangeIndex): - rvalues = other._values - else: - rvalues = extract_array(other, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True, range_compat=True) res_values = ops.comparison_op(lvalues, rvalues, op) @@ -5025,10 +5021,7 @@ def _logical_method(self, other, op): self, other = ops.align_method_SERIES(self, other, align_asobject=True) lvalues = extract_array(self, extract_numpy=True) - if isinstance(other, RangeIndex): - rvalues = other._values - else: - rvalues = extract_array(other, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True, range_compat=True) res_values = ops.logical_op(lvalues, rvalues, op) return self._construct_result(res_values, name=res_name) @@ -5038,10 +5031,7 @@ def _arith_method(self, other, op): self, other = ops.align_method_SERIES(self, other) lvalues = extract_array(self, extract_numpy=True) - if isinstance(other, RangeIndex): - rvalues = other._values - else: - rvalues = extract_array(other, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True, range_compat=True) result = ops.arithmetic_op(lvalues, rvalues, op) From 92ccdfef14ffc4f6904fe221bf0d3fd686560634 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 14:46:56 -0800 Subject: [PATCH 03/10] merge master --- .github/workflows/ci.yml | 5 +- .pre-commit-config.yaml | 14 +- asv_bench/benchmarks/algorithms.py | 30 +- asv_bench/benchmarks/algos/isin.py | 6 + asv_bench/benchmarks/attrs_caching.py | 18 - asv_bench/benchmarks/dtypes.py | 22 - asv_bench/benchmarks/gil.py | 1 + asv_bench/benchmarks/groupby.py | 33 +- asv_bench/benchmarks/indexing_engines.py | 7 + asv_bench/benchmarks/inference.py | 216 +++ asv_bench/benchmarks/libs.py | 66 +- asv_bench/benchmarks/reindex.py | 20 +- asv_bench/benchmarks/rolling.py | 2 +- asv_bench/benchmarks/timedelta.py | 36 - asv_bench/benchmarks/timeseries.py | 157 +- asv_bench/benchmarks/tslibs/normalize.py | 5 + asv_bench/benchmarks/tslibs/period.py | 5 + asv_bench/benchmarks/tslibs/resolution.py | 31 +- asv_bench/benchmarks/tslibs/timestamp.py | 25 +- asv_bench/benchmarks/tslibs/tslib.py | 14 +- asv_bench/benchmarks/tslibs/tz_convert.py | 8 +- conda.recipe/bld.bat | 2 - conda.recipe/build.sh | 2 - conda.recipe/meta.yaml | 40 - doc/source/getting_started/install.rst | 9 + doc/source/reference/io.rst | 7 + doc/source/user_guide/io.rst | 456 ++++++ doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.2.3.rst | 27 +- doc/source/whatsnew/v1.2.4.rst | 48 + doc/source/whatsnew/v1.3.0.rst | 82 +- environment.yml | 2 +- pandas/__init__.py | 1 + pandas/_libs/groupby.pyx | 14 +- pandas/_libs/lib.pyx | 4 +- pandas/_libs/tslib.pyx | 15 +- pandas/_libs/tslibs/offsets.pyx | 2 +- pandas/_libs/tslibs/timedeltas.pyx | 16 +- pandas/_libs/window/aggregations.pyx | 130 +- pandas/core/algorithms.py | 42 +- pandas/core/apply.py | 70 +- pandas/core/arrays/categorical.py | 37 +- pandas/core/arrays/datetimelike.py | 19 +- 
pandas/core/arrays/datetimes.py | 67 +- pandas/core/arrays/integer.py | 9 - pandas/core/arrays/interval.py | 4 +- pandas/core/arrays/numeric.py | 9 + pandas/core/arrays/string_arrow.py | 21 +- pandas/core/arrays/timedeltas.py | 13 +- pandas/core/common.py | 18 + pandas/core/construction.py | 9 +- pandas/core/dtypes/cast.py | 351 +++-- pandas/core/dtypes/concat.py | 4 +- pandas/core/frame.py | 284 +++- pandas/core/generic.py | 8 +- pandas/core/groupby/generic.py | 6 + pandas/core/groupby/groupby.py | 3 +- pandas/core/groupby/ops.py | 142 +- pandas/core/indexes/base.py | 25 +- pandas/core/indexes/datetimelike.py | 40 +- pandas/core/indexes/extension.py | 22 +- pandas/core/indexes/multi.py | 7 +- pandas/core/indexes/range.py | 4 +- pandas/core/indexing.py | 27 +- pandas/core/internals/array_manager.py | 70 +- pandas/core/internals/blocks.py | 90 +- pandas/core/internals/construction.py | 242 +-- pandas/core/internals/managers.py | 9 + pandas/core/missing.py | 66 +- pandas/core/nanops.py | 3 +- pandas/core/reshape/reshape.py | 5 +- pandas/core/series.py | 44 +- pandas/core/sorting.py | 9 +- pandas/core/tools/datetimes.py | 256 ++-- pandas/core/window/ewm.py | 73 +- pandas/io/api.py | 1 + pandas/io/formats/format.py | 135 +- pandas/io/formats/style.py | 45 +- pandas/io/formats/xml.py | 618 ++++++++ pandas/io/xml.py | 944 ++++++++++++ pandas/tests/api/test_api.py | 1 + pandas/tests/apply/conftest.py | 18 + pandas/tests/apply/test_frame_apply.py | 301 +--- .../apply/test_frame_apply_relabeling.py | 10 - pandas/tests/apply/test_frame_transform.py | 78 +- pandas/tests/apply/test_invalid_arg.py | 284 ++++ pandas/tests/apply/test_series_apply.py | 25 - pandas/tests/apply/test_series_transform.py | 37 +- .../arrays/categorical/test_constructors.py | 6 + .../tests/arrays/floating/test_arithmetic.py | 21 + .../tests/arrays/integer/test_arithmetic.py | 40 +- pandas/tests/arrays/masked/test_arithmetic.py | 12 +- pandas/tests/arrays/test_datetimelike.py | 20 +- pandas/tests/base/test_constructors.py | 4 +- .../dtypes/cast/test_construct_ndarray.py | 10 + pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/dtypes/test_inference.py | 34 +- pandas/tests/dtypes/test_missing.py | 28 +- pandas/tests/extension/base/groupby.py | 6 + pandas/tests/extension/test_boolean.py | 3 + pandas/tests/extension/test_string.py | 48 +- .../frame/constructors/test_from_records.py | 37 +- pandas/tests/frame/indexing/test_setitem.py | 13 +- pandas/tests/frame/methods/test_astype.py | 8 - pandas/tests/frame/methods/test_fillna.py | 11 +- pandas/tests/frame/methods/test_rename.py | 1 + pandas/tests/frame/methods/test_replace.py | 7 + pandas/tests/frame/methods/test_sort_index.py | 17 + pandas/tests/frame/test_constructors.py | 74 +- pandas/tests/groupby/test_allowlist.py | 5 +- pandas/tests/groupby/test_apply.py | 13 +- pandas/tests/groupby/test_categorical.py | 10 +- pandas/tests/groupby/test_function.py | 8 + pandas/tests/groupby/test_groupby.py | 3 + pandas/tests/groupby/test_quantile.py | 5 + pandas/tests/groupby/test_sample.py | 10 + .../tests/groupby/transform/test_transform.py | 11 +- .../tests/indexes/categorical/test_append.py | 62 + .../indexes/categorical/test_category.py | 67 +- .../tests/indexes/categorical/test_formats.py | 6 + .../datetimelike_/test_drop_duplicates.py | 80 + .../tests/indexes/datetimelike_/test_nat.py | 54 + .../indexes/datetimelike_/test_sort_values.py | 317 ++++ .../datetimelike_/test_value_counts.py | 103 ++ .../indexes/datetimes/methods/test_repeat.py | 78 + 
.../tests/indexes/datetimes/test_datetime.py | 22 - .../tests/indexes/datetimes/test_indexing.py | 7 + pandas/tests/indexes/datetimes/test_misc.py | 17 + pandas/tests/indexes/datetimes/test_ops.py | 254 ---- .../indexes/datetimes/test_partial_slicing.py | 6 - .../indexes/period/methods/test_is_full.py | 23 + .../indexes/period/methods/test_repeat.py | 26 + pandas/tests/indexes/period/test_join.py | 2 +- pandas/tests/indexes/period/test_ops.py | 276 ---- pandas/tests/indexes/period/test_period.py | 106 -- .../tests/indexes/period/test_period_range.py | 8 + pandas/tests/indexes/test_common.py | 4 +- pandas/tests/indexes/test_index_new.py | 7 + .../indexes/timedeltas/methods/test_repeat.py | 34 + .../tests/indexes/timedeltas/test_indexing.py | 28 +- pandas/tests/indexes/timedeltas/test_ops.py | 186 --- .../timedeltas/test_partial_slicing.py | 42 - .../indexes/timedeltas/test_timedelta.py | 25 - pandas/tests/indexing/test_categorical.py | 52 +- pandas/tests/indexing/test_iloc.py | 14 + pandas/tests/indexing/test_loc.py | 88 ++ pandas/tests/io/data/xml/baby_names.xml | 53 + pandas/tests/io/data/xml/books.xml | 21 + pandas/tests/io/data/xml/cta_rail_lines.kml | 92 ++ pandas/tests/io/data/xml/flatten_doc.xsl | 18 + pandas/tests/io/data/xml/row_field_output.xsl | 19 + pandas/tests/io/excel/test_writers.py | 4 +- pandas/tests/io/formats/style/test_style.py | 13 + pandas/tests/io/formats/test_format.py | 64 +- pandas/tests/io/formats/test_to_csv.py | 2 +- pandas/tests/io/formats/test_to_html.py | 4 +- pandas/tests/io/json/test_pandas.py | 70 +- pandas/tests/io/json/test_readlines.py | 26 +- .../tests/io/parser/common/test_chunksize.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 5 +- pandas/tests/io/pytables/test_append.py | 4 +- pandas/tests/io/pytables/test_errors.py | 3 +- .../tests/io/pytables/test_file_handling.py | 9 +- pandas/tests/io/pytables/test_read.py | 8 +- pandas/tests/io/pytables/test_select.py | 2 +- pandas/tests/io/pytables/test_store.py | 24 +- pandas/tests/io/pytables/test_timezones.py | 4 +- pandas/tests/io/test_clipboard.py | 9 +- pandas/tests/io/test_common.py | 10 +- pandas/tests/io/test_feather.py | 8 +- pandas/tests/io/test_parquet.py | 8 +- pandas/tests/io/test_pickle.py | 2 +- pandas/tests/io/test_sql.py | 14 +- pandas/tests/io/test_stata.py | 27 +- pandas/tests/io/xml/test_to_xml.py | 1301 +++++++++++++++++ pandas/tests/io/xml/test_xml.py | 1097 ++++++++++++++ pandas/tests/resample/test_base.py | 3 + pandas/tests/reshape/concat/test_concat.py | 49 +- pandas/tests/reshape/concat/test_dataframe.py | 38 +- pandas/tests/reshape/concat/test_datetimes.py | 60 +- pandas/tests/reshape/concat/test_empty.py | 32 +- pandas/tests/reshape/concat/test_index.py | 14 +- pandas/tests/reshape/concat/test_series.py | 7 +- pandas/tests/reshape/merge/test_join.py | 8 +- pandas/tests/reshape/merge/test_merge.py | 110 +- pandas/tests/reshape/merge/test_merge_asof.py | 150 +- pandas/tests/reshape/merge/test_multi.py | 14 +- pandas/tests/reshape/test_crosstab.py | 19 +- pandas/tests/reshape/test_cut.py | 4 +- pandas/tests/reshape/test_melt.py | 10 +- pandas/tests/reshape/test_pivot.py | 75 +- .../tests/scalar/timedelta/test_arithmetic.py | 3 +- .../scalar/timedelta/test_constructors.py | 2 +- .../series/accessors/test_dt_accessor.py | 16 +- pandas/tests/series/indexing/test_datetime.py | 2 +- pandas/tests/series/indexing/test_getitem.py | 34 +- pandas/tests/series/indexing/test_indexing.py | 7 +- pandas/tests/series/indexing/test_where.py | 2 
+- .../tests/series/methods/test_interpolate.py | 6 +- pandas/tests/series/methods/test_shift.py | 4 +- .../tests/series/methods/test_sort_index.py | 14 + pandas/tests/series/test_constructors.py | 89 +- pandas/tests/series/test_repr.py | 2 +- pandas/tests/series/test_unary.py | 59 +- pandas/tests/tools/test_to_datetime.py | 17 +- pandas/tests/window/test_ewm.py | 23 + pandas/util/_exceptions.py | 2 +- pandas/util/_validators.py | 48 +- requirements-dev.txt | 2 +- setup.cfg | 2 +- 211 files changed, 9128 insertions(+), 3369 deletions(-) delete mode 100644 conda.recipe/bld.bat delete mode 100644 conda.recipe/build.sh delete mode 100644 conda.recipe/meta.yaml create mode 100644 doc/source/whatsnew/v1.2.4.rst create mode 100644 pandas/io/formats/xml.py create mode 100644 pandas/io/xml.py create mode 100644 pandas/tests/apply/conftest.py create mode 100644 pandas/tests/indexes/categorical/test_append.py create mode 100644 pandas/tests/indexes/datetimelike_/test_drop_duplicates.py create mode 100644 pandas/tests/indexes/datetimelike_/test_nat.py create mode 100644 pandas/tests/indexes/datetimelike_/test_sort_values.py create mode 100644 pandas/tests/indexes/datetimelike_/test_value_counts.py create mode 100644 pandas/tests/indexes/datetimes/methods/test_repeat.py create mode 100644 pandas/tests/indexes/period/methods/test_is_full.py create mode 100644 pandas/tests/indexes/period/methods/test_repeat.py create mode 100644 pandas/tests/indexes/timedeltas/methods/test_repeat.py delete mode 100644 pandas/tests/indexes/timedeltas/test_partial_slicing.py create mode 100644 pandas/tests/io/data/xml/baby_names.xml create mode 100644 pandas/tests/io/data/xml/books.xml create mode 100644 pandas/tests/io/data/xml/cta_rail_lines.kml create mode 100644 pandas/tests/io/data/xml/flatten_doc.xsl create mode 100644 pandas/tests/io/data/xml/row_field_output.xsl create mode 100644 pandas/tests/io/xml/test_to_xml.py create mode 100644 pandas/tests/io/xml/test_xml.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6bb9753fcea65..c03722e32fea9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,11 +153,14 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/frame/test_constructors.py --array-manager + pytest pandas/tests/frame/constructors/ --array-manager pytest pandas/tests/frame/test_reductions.py --array-manager pytest pandas/tests/reductions/ --array-manager pytest pandas/tests/generic/test_generic.py --array-manager pytest pandas/tests/arithmetic/ --array-manager - pytest pandas/tests/groupby/aggregate/ --array-manager + pytest pandas/tests/groupby/ --array-manager + pytest pandas/tests/resample/ --array-manager pytest pandas/tests/reshape/merge --array-manager # indexing subset (temporary since other tests don't pass yet) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 47a9ae592f940..3966e8931162c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,13 +29,15 @@ repos: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.7.1 + rev: v1.8.0 hooks: - id: rst-backticks - id: rst-directive-colons - types: [text] + types: [text] # overwrite types: [rst] + types_or: [python, rst] - id: rst-inline-touching-normal - types: [text] + types: [text] # overwrite types: [rst] + types_or: [python, rst] - repo: local hooks: - id: pip_to_conda @@ -212,8 +214,8 @@ repos: rev: v0.1.7 hooks: - id: 
no-string-hints -- repo: https://github.com/MarcoGorelli/abs-imports - rev: v0.1.2 +- repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.2.1 hooks: - - id: abs-imports + - id: absolufy-imports files: ^pandas/ diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..aecc609df574e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -2,8 +2,6 @@ import numpy as np -from pandas._libs import lib - import pandas as pd from .pandas_vb_common import tm @@ -16,19 +14,6 @@ pass -class MaybeConvertObjects: - def setup(self): - N = 10 ** 5 - - data = list(range(N)) - data[0] = pd.NaT - data = np.array(data) - self.data = data - - def time_maybe_convert_objects(self): - lib.maybe_convert_objects(self.data) - - class Factorize: params = [ @@ -43,23 +28,36 @@ class Factorize: "datetime64[ns, tz]", "Int64", "boolean", + "string_arrow", ], ] param_names = ["unique", "sort", "dtype"] def setup(self, unique, sort, dtype): N = 10 ** 5 + string_index = tm.makeStringIndex(N) + try: + from pandas.core.arrays.string_arrow import ArrowStringDtype + + string_arrow = pd.array(string_index, dtype=ArrowStringDtype()) + except ImportError: + string_arrow = None + + if dtype == "string_arrow" and not string_arrow: + raise NotImplementedError + data = { "int": pd.Int64Index(np.arange(N)), "uint": pd.UInt64Index(np.arange(N)), "float": pd.Float64Index(np.random.randn(N)), - "string": tm.makeStringIndex(N), + "string": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), "Int64": pd.array(np.arange(N), dtype="Int64"), "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), + "string_arrow": string_arrow, }[dtype] if not unique: data = data.repeat(5) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 75a96e5b691ca..a8b8a193dbcfc 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -273,6 +273,7 @@ class IsInLongSeriesLookUpDominates: def setup(self, dtype, MaxNumber, series_type): N = 10 ** 7 + # https://github.com/pandas-dev/pandas/issues/39844 if not np_version_under1p20 and dtype in ("Int64", "Float64"): raise NotImplementedError @@ -303,6 +304,11 @@ class IsInLongSeriesValuesDominate: def setup(self, dtype, series_type): N = 10 ** 7 + + # https://github.com/pandas-dev/pandas/issues/39844 + if not np_version_under1p20 and dtype in ("Int64", "Float64"): + raise NotImplementedError + if series_type == "random": np.random.seed(42) vals = np.random.randint(0, 10 * N, N) diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index 9c7b107b478d4..d4366c42f96aa 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -3,11 +3,6 @@ import pandas as pd from pandas import DataFrame -try: - from pandas.util import cache_readonly -except ImportError: - from pandas.util.decorators import cache_readonly - try: from pandas.core.construction import extract_array except ImportError: @@ -53,17 +48,4 @@ def time_extract_array_numpy(self, dtype): extract_array(self.series, extract_numpy=True) -class CacheReadonly: - def setup(self): - class Foo: - @cache_readonly - def prop(self): - return 5 - - self.obj = Foo() - - def time_cache_readonly(self): - self.obj.prop - - from .pandas_vb_common import setup # noqa: F401 
isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 9209e851289bb..c561b80ed1ca6 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -13,7 +13,6 @@ from .pandas_vb_common import ( datetime_dtypes, extension_dtypes, - lib, numeric_dtypes, string_dtypes, ) @@ -49,27 +48,6 @@ def time_pandas_dtype_invalid(self, dtype): pass -class InferDtypes: - param_names = ["dtype"] - data_dict = { - "np-object": np.array([1] * 100000, dtype="O"), - "py-object": [1] * 100000, - "np-null": np.array([1] * 50000 + [np.nan] * 50000), - "py-null": [1] * 50000 + [None] * 50000, - "np-int": np.array([1] * 100000, dtype=int), - "np-floating": np.array([1.0] * 100000, dtype=float), - "empty": [], - "bytes": [b"a"] * 100000, - } - params = list(data_dict.keys()) - - def time_infer_skipna(self, dtype): - lib.infer_dtype(self.data_dict[dtype], skipna=True) - - def time_infer(self, dtype): - lib.infer_dtype(self.data_dict[dtype], skipna=False) - - class SelectDtypes: params = [ diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 410668ca3c7cf..459046d2decfb 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -125,6 +125,7 @@ def time_take1d(self, dtype): class ParallelKth: + # This depends exclusively on code in _libs/, could go in libs.py number = 1 repeat = 5 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index fb08c6fdeaedf..9930c61e34b15 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -68,9 +68,18 @@ def time_groupby_apply_dict_return(self): class Apply: - def setup_cache(self): - N = 10 ** 4 - labels = np.random.randint(0, 2000, size=N) + + param_names = ["factor"] + params = [4, 5] + + def setup(self, factor): + N = 10 ** factor + # two cases: + # - small groups: small data (N**4) + many labels (2000) -> average group + # size of 5 (-> larger overhead of slicing method) + # - larger groups: larger data (N**5) + fewer labels (20) -> average group + # size of 5000 + labels = np.random.randint(0, 2000 if factor == 4 else 20, size=N) labels2 = np.random.randint(0, 3, size=N) df = DataFrame( { @@ -80,13 +89,13 @@ def setup_cache(self): "value2": ["foo", "bar", "baz", "qux"] * (N // 4), } ) - return df + self.df = df - def time_scalar_function_multi_col(self, df): - df.groupby(["key", "key2"]).apply(lambda x: 1) + def time_scalar_function_multi_col(self, factor): + self.df.groupby(["key", "key2"]).apply(lambda x: 1) - def time_scalar_function_single_col(self, df): - df.groupby("key").apply(lambda x: 1) + def time_scalar_function_single_col(self, factor): + self.df.groupby("key").apply(lambda x: 1) @staticmethod def df_copy_function(g): @@ -94,11 +103,11 @@ def df_copy_function(g): g.name return g.copy() - def time_copy_function_multi_col(self, df): - df.groupby(["key", "key2"]).apply(self.df_copy_function) + def time_copy_function_multi_col(self, factor): + self.df.groupby(["key", "key2"]).apply(self.df_copy_function) - def time_copy_overhead_single_col(self, df): - df.groupby("key").apply(self.df_copy_function) + def time_copy_overhead_single_col(self, factor): + self.df.groupby("key").apply(self.df_copy_function) class Groups: diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 44a22dfa77791..30ef7f63dc0dc 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,3 +1,10 @@ +""" +Benchmarks in this fiel depend exclusively on 
code in _libs/ + +If a PR does not edit anything in _libs, it is very unlikely that benchmarks +in this file will be affected. +""" + import numpy as np from pandas._libs import index as libindex diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index b6808ace629db..0aa924dabd469 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,8 +1,20 @@ +""" +The functions benchmarked in this file depend _almost_ exclusively on +_libs, but not in a way that is easy to formalize. + +If a PR does not change anything in pandas/_libs/ or pandas/core/tools/, then +it is likely that these benchmarks will be unaffected. +""" + import numpy as np from pandas import ( + NaT, Series, + date_range, + to_datetime, to_numeric, + to_timedelta, ) from .pandas_vb_common import ( @@ -69,6 +81,9 @@ def time_downcast(self, dtype, downcast): class MaybeConvertNumeric: + # maybe_convert_numeric depends _exclusively_ on _libs, could + # go in benchmarks/libs.py + def setup_cache(self): N = 10 ** 6 arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64") @@ -81,4 +96,205 @@ def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) +class MaybeConvertObjects: + # maybe_convert_objects depends _almost_ exclusively on _libs, but + # does have some run-time imports from outside of _libs + + def setup(self): + N = 10 ** 5 + + data = list(range(N)) + data[0] = NaT + data = np.array(data) + self.data = data + + def time_maybe_convert_objects(self): + lib.maybe_convert_objects(self.data) + + +class ToDatetimeFromIntsFloats: + def setup(self): + self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64") + self.ts_sec_float = self.ts_sec.astype("float64") + + self.ts_nanosec = 1_000_000 * self.ts_sec + self.ts_nanosec_float = self.ts_nanosec.astype("float64") + + # speed of int64 and float64 paths should be comparable + + def time_nanosec_int64(self): + to_datetime(self.ts_nanosec, unit="ns") + + def time_nanosec_float64(self): + to_datetime(self.ts_nanosec_float, unit="ns") + + def time_sec_int64(self): + to_datetime(self.ts_sec, unit="s") + + def time_sec_float64(self): + to_datetime(self.ts_sec_float, unit="s") + + +class ToDatetimeYYYYMMDD: + def setup(self): + rng = date_range(start="1/1/2000", periods=10000, freq="D") + self.stringsD = Series(rng.strftime("%Y%m%d")) + + def time_format_YYYYMMDD(self): + to_datetime(self.stringsD, format="%Y%m%d") + + +class ToDatetimeCacheSmallCount: + + params = ([True, False], [50, 500, 5000, 100000]) + param_names = ["cache", "count"] + + def setup(self, cache, count): + rng = date_range(start="1/1/1971", periods=count) + self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist() + + def time_unique_date_strings(self, cache, count): + to_datetime(self.unique_date_strings, cache=cache) + + +class ToDatetimeISO8601: + def setup(self): + rng = date_range(start="1/1/2000", periods=20000, freq="H") + self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() + self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() + self.strings_tz_space = [ + x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng + ] + + def time_iso8601(self): + to_datetime(self.strings) + + def time_iso8601_nosep(self): + to_datetime(self.strings_nosep) + + def time_iso8601_format(self): + to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S") + + def time_iso8601_format_no_sep(self): + to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S") + + def time_iso8601_tz_spaceformat(self): + 
to_datetime(self.strings_tz_space) + + +class ToDatetimeNONISO8601: + def setup(self): + N = 10000 + half = N // 2 + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" + self.same_offset = [ts_string_1] * N + self.diff_offset = [ts_string_1] * half + [ts_string_2] * half + + def time_same_offset(self): + to_datetime(self.same_offset) + + def time_different_offset(self): + to_datetime(self.diff_offset) + + +class ToDatetimeFormatQuarters: + def setup(self): + self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000) + + def time_infer_quarter(self): + to_datetime(self.s) + + +class ToDatetimeFormat: + def setup(self): + N = 100000 + self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) + self.s2 = self.s.str.replace(":\\S+$", "") + + self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N + self.diff_offset = [ + f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10) + ] * (N // 10) + + def time_exact(self): + to_datetime(self.s2, format="%d%b%y") + + def time_no_exact(self): + to_datetime(self.s, format="%d%b%y", exact=False) + + def time_same_offset(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_different_offset(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_same_offset_to_utc(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + + def time_different_offset_to_utc(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + + +class ToDatetimeCache: + + params = [True, False] + param_names = ["cache"] + + def setup(self, cache): + N = 10000 + self.unique_numeric_seconds = list(range(N)) + self.dup_numeric_seconds = [1000] * N + self.dup_string_dates = ["2000-02-11"] * N + self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N + + def time_unique_seconds_and_unit(self, cache): + to_datetime(self.unique_numeric_seconds, unit="s", cache=cache) + + def time_dup_seconds_and_unit(self, cache): + to_datetime(self.dup_numeric_seconds, unit="s", cache=cache) + + def time_dup_string_dates(self, cache): + to_datetime(self.dup_string_dates, cache=cache) + + def time_dup_string_dates_and_format(self, cache): + to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache) + + def time_dup_string_tzoffset_dates(self, cache): + to_datetime(self.dup_string_with_tz, cache=cache) + + +class ToTimedelta: + def setup(self): + self.ints = np.random.randint(0, 60, size=10000) + self.str_days = [] + self.str_seconds = [] + for i in self.ints: + self.str_days.append(f"{i} days") + self.str_seconds.append(f"00:00:{i:02d}") + + def time_convert_int(self): + to_timedelta(self.ints, unit="s") + + def time_convert_string_days(self): + to_timedelta(self.str_days) + + def time_convert_string_seconds(self): + to_timedelta(self.str_seconds) + + +class ToTimedeltaErrors: + + params = ["coerce", "ignore"] + param_names = ["errors"] + + def setup(self, errors): + ints = np.random.randint(0, 60, size=10000) + self.arr = [f"{i} days" for i in ints] + self.arr[-1] = "apple" + + def time_convert(self, errors): + to_timedelta(self.arr, errors=errors) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py index f5c2397945cea..4e3f938a33eb1 100644 --- a/asv_bench/benchmarks/libs.py +++ b/asv_bench/benchmarks/libs.py @@ -1,10 +1,14 @@ """ Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs, -which has its own directory +which has its own directory. 
+ +If a PR does not edit anything in _libs/, then it is unlikely that thes +benchmarks will be affected. """ import numpy as np from pandas._libs.lib import ( + infer_dtype, is_list_like, is_scalar, ) @@ -14,6 +18,17 @@ NaT, ) +from .pandas_vb_common import ( + lib, + tm, +) + +try: + from pandas.util import cache_readonly +except ImportError: + from pandas.util.decorators import cache_readonly + + # TODO: share with something in pd._testing? scalars = [ 0, @@ -40,3 +55,52 @@ def time_is_list_like(self, param): def time_is_scalar(self, param): is_scalar(param) + + +class FastZip: + def setup(self): + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + col_array = np.vstack([key1, key2, np.random.randn(N * K)]) + col_array2 = col_array.copy() + col_array2[:, :10000] = np.nan + self.col_array_list = list(col_array) + + def time_lib_fast_zip(self): + lib.fast_zip(self.col_array_list) + + +class InferDtype: + param_names = ["dtype"] + data_dict = { + "np-object": np.array([1] * 100000, dtype="O"), + "py-object": [1] * 100000, + "np-null": np.array([1] * 50000 + [np.nan] * 50000), + "py-null": [1] * 50000 + [None] * 50000, + "np-int": np.array([1] * 100000, dtype=int), + "np-floating": np.array([1.0] * 100000, dtype=float), + "empty": [], + "bytes": [b"a"] * 100000, + } + params = list(data_dict.keys()) + + def time_infer_dtype_skipna(self, dtype): + infer_dtype(self.data_dict[dtype], skipna=True) + + def time_infer_dtype(self, dtype): + infer_dtype(self.data_dict[dtype], skipna=False) + + +class CacheReadonly: + def setup(self): + class Foo: + @cache_readonly + def prop(self): + return 5 + + self.obj = Foo() + + def time_cache_readonly(self): + self.obj.prop diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 65392f2cea65b..5181b983c9f7a 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -9,10 +9,7 @@ period_range, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import tm class Reindex: @@ -155,19 +152,4 @@ def time_align_series_irregular_string(self): self.x + self.y -class LibFastZip: - def setup(self): - N = 10000 - K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) - col_array = np.vstack([key1, key2, np.random.randn(N * K)]) - col_array2 = col_array.copy() - col_array2[:, :10000] = np.nan - self.col_array_list = list(col_array) - - def time_lib_fast_zip(self): - lib.fast_zip(self.col_array_list) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 0c23aa59c4608..d35770b720f7a 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -114,7 +114,7 @@ def time_ewm(self, constructor, window, dtype, method): getattr(self.ewm, method)() def time_ewm_times(self, constructor, window, dtype, method): - self.ewm.mean() + self.ewm_times.mean() class VariableWindowMethods(Methods): diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 9e221ee030e6d..cb0e4455e1a56 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -3,49 +3,13 @@ benchmarks.tslibs.timedelta for benchmarks that rely only on tslibs. 
""" -import numpy as np - from pandas import ( DataFrame, Series, timedelta_range, - to_timedelta, ) -class ToTimedelta: - def setup(self): - self.ints = np.random.randint(0, 60, size=10000) - self.str_days = [] - self.str_seconds = [] - for i in self.ints: - self.str_days.append(f"{i} days") - self.str_seconds.append(f"00:00:{i:02d}") - - def time_convert_int(self): - to_timedelta(self.ints, unit="s") - - def time_convert_string_days(self): - to_timedelta(self.str_days) - - def time_convert_string_seconds(self): - to_timedelta(self.str_seconds) - - -class ToTimedeltaErrors: - - params = ["coerce", "ignore"] - param_names = ["errors"] - - def setup(self, errors): - ints = np.random.randint(0, 60, size=10000) - self.arr = [f"{i} days" for i in ints] - self.arr[-1] = "apple" - - def time_convert(self, errors): - to_timedelta(self.arr, errors=errors) - - class DatetimeAccessor: def setup_cache(self): N = 100000 diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 94498e54f0f06..5b123c7127c28 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -9,7 +9,6 @@ date_range, period_range, timedelta_range, - to_datetime, ) from pandas.tseries.frequencies import infer_freq @@ -97,12 +96,12 @@ def setup(self, tz): idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) - def time_reest_datetimeindex(self, tz): + def time_reset_datetimeindex(self, tz): self.df.reset_index() class InferFreq: - + # This depends mostly on code in _libs/, tseries/, and core.algos.unique params = [None, "D", "B"] param_names = ["freq"] @@ -273,158 +272,6 @@ def time_lookup_and_cleanup(self): self.ts.index._cleanup() -class ToDatetimeFromIntsFloats: - def setup(self): - self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64") - self.ts_sec_float = self.ts_sec.astype("float64") - - self.ts_nanosec = 1_000_000 * self.ts_sec - self.ts_nanosec_float = self.ts_nanosec.astype("float64") - - # speed of int64 and float64 paths should be comparable - - def time_nanosec_int64(self): - to_datetime(self.ts_nanosec, unit="ns") - - def time_nanosec_float64(self): - to_datetime(self.ts_nanosec_float, unit="ns") - - def time_sec_int64(self): - to_datetime(self.ts_sec, unit="s") - - def time_sec_float64(self): - to_datetime(self.ts_sec_float, unit="s") - - -class ToDatetimeYYYYMMDD: - def setup(self): - rng = date_range(start="1/1/2000", periods=10000, freq="D") - self.stringsD = Series(rng.strftime("%Y%m%d")) - - def time_format_YYYYMMDD(self): - to_datetime(self.stringsD, format="%Y%m%d") - - -class ToDatetimeCacheSmallCount: - - params = ([True, False], [50, 500, 5000, 100000]) - param_names = ["cache", "count"] - - def setup(self, cache, count): - rng = date_range(start="1/1/1971", periods=count) - self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist() - - def time_unique_date_strings(self, cache, count): - to_datetime(self.unique_date_strings, cache=cache) - - -class ToDatetimeISO8601: - def setup(self): - rng = date_range(start="1/1/2000", periods=20000, freq="H") - self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() - self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() - self.strings_tz_space = [ - x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng - ] - - def time_iso8601(self): - to_datetime(self.strings) - - def time_iso8601_nosep(self): - to_datetime(self.strings_nosep) - - def time_iso8601_format(self): - to_datetime(self.strings, format="%Y-%m-%d 
%H:%M:%S") - - def time_iso8601_format_no_sep(self): - to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S") - - def time_iso8601_tz_spaceformat(self): - to_datetime(self.strings_tz_space) - - -class ToDatetimeNONISO8601: - def setup(self): - N = 10000 - half = N // 2 - ts_string_1 = "March 1, 2018 12:00:00+0400" - ts_string_2 = "March 1, 2018 12:00:00+0500" - self.same_offset = [ts_string_1] * N - self.diff_offset = [ts_string_1] * half + [ts_string_2] * half - - def time_same_offset(self): - to_datetime(self.same_offset) - - def time_different_offset(self): - to_datetime(self.diff_offset) - - -class ToDatetimeFormatQuarters: - def setup(self): - self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000) - - def time_infer_quarter(self): - to_datetime(self.s) - - -class ToDatetimeFormat: - def setup(self): - N = 100000 - self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) - self.s2 = self.s.str.replace(":\\S+$", "") - - self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N - self.diff_offset = [ - f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10) - ] * (N // 10) - - def time_exact(self): - to_datetime(self.s2, format="%d%b%y") - - def time_no_exact(self): - to_datetime(self.s, format="%d%b%y", exact=False) - - def time_same_offset(self): - to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - - def time_different_offset(self): - to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - - def time_same_offset_to_utc(self): - to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) - - def time_different_offset_to_utc(self): - to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) - - -class ToDatetimeCache: - - params = [True, False] - param_names = ["cache"] - - def setup(self, cache): - N = 10000 - self.unique_numeric_seconds = list(range(N)) - self.dup_numeric_seconds = [1000] * N - self.dup_string_dates = ["2000-02-11"] * N - self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N - - def time_unique_seconds_and_unit(self, cache): - to_datetime(self.unique_numeric_seconds, unit="s", cache=cache) - - def time_dup_seconds_and_unit(self, cache): - to_datetime(self.dup_numeric_seconds, unit="s", cache=cache) - - def time_dup_string_dates(self, cache): - to_datetime(self.dup_string_dates, cache=cache) - - def time_dup_string_dates_and_format(self, cache): - to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache) - - def time_dup_string_tzoffset_dates(self, cache): - to_datetime(self.dup_string_with_tz, cache=cache) - - class DatetimeAccessor: params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index 292f57d7f5c77..f5f7adbf63995 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -14,6 +14,7 @@ from .tslib import ( _sizes, _tzs, + tzlocal_obj, ) @@ -30,6 +31,10 @@ def setup(self, size, tz): dti = pd.date_range("2016-01-01", periods=10, tz=tz).repeat(size // 10) self.i8data = dti.asi8 + if size == 10 ** 6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + def time_normalize_i8_timestamps(self, size, tz): normalize_i8_timestamps(self.i8data, tz) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index f2efee33c6da7..15a922da7ee76 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -15,6 +15,7 @@ from 
.tslib import ( _sizes, _tzs, + tzlocal_obj, ) try: @@ -129,6 +130,10 @@ class TimeDT64ArrToPeriodArr: param_names = ["size", "freq", "tz"] def setup(self, size, freq, tz): + if size == 10 ** 6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + arr = np.arange(10, dtype="i8").repeat(size // 10) self.i8values = arr diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 0d22ff77ee308..4b52efc188bf4 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -17,40 +17,33 @@ df.loc[key] = (val.average, val.stdev) """ -from datetime import ( - timedelta, - timezone, -) - -from dateutil.tz import ( - gettz, - tzlocal, -) import numpy as np -import pytz try: from pandas._libs.tslibs import get_resolution except ImportError: from pandas._libs.tslibs.resolution import get_resolution +from .tslib import ( + _sizes, + _tzs, + tzlocal_obj, +) + class TimeResolution: params = ( ["D", "h", "m", "s", "us", "ns"], - [1, 100, 10 ** 4, 10 ** 6], - [ - None, - timezone.utc, - timezone(timedelta(minutes=60)), - pytz.timezone("US/Pacific"), - gettz("Asia/Tokyo"), - tzlocal(), - ], + _sizes, + _tzs, ) param_names = ["unit", "size", "tz"] def setup(self, unit, size, tz): + if size == 10 ** 6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + arr = np.random.randint(0, 10, size=size, dtype="i8") arr = arr.view(f"M8[{unit}]").astype("M8[ns]").view("i8") self.i8data = arr diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index 86c8d735bdb27..eda9bce89188c 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,30 +1,11 @@ -from datetime import ( - datetime, - timedelta, - timezone, -) - -from dateutil.tz import ( - gettz, - tzlocal, - tzutc, -) +from datetime import datetime + import numpy as np import pytz from pandas import Timestamp -# One case for each type of tzinfo object that has its own code path -# in tzconversion code. -_tzs = [ - None, - pytz.timezone("Europe/Amsterdam"), - gettz("US/Central"), - pytz.UTC, - tzutc(), - timezone(timedelta(minutes=60)), - tzlocal(), -] +from .tslib import _tzs class TimestampConstruction: diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index 17beada916e46..180f95e7fbda5 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -32,13 +32,14 @@ except ImportError: from pandas._libs.tslib import ints_to_pydatetime +tzlocal_obj = tzlocal() _tzs = [ None, timezone.utc, timezone(timedelta(minutes=60)), pytz.timezone("US/Pacific"), gettz("Asia/Tokyo"), - tzlocal(), + tzlocal_obj, ] _sizes = [0, 1, 100, 10 ** 4, 10 ** 6] @@ -53,12 +54,15 @@ class TimeIntsToPydatetime: # TODO: fold? freq? 
def setup(self, box, size, tz): + if box == "date" and tz is not None: + # tz is ignored, so avoid running redundant benchmarks + raise NotImplementedError # skip benchmark + if size == 10 ** 6 and tz is _tzs[-1]: + # This is cumbersomely-slow, so skip to trim runtime + raise NotImplementedError # skip benchmark + arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr def time_ints_to_pydatetime(self, box, size, tz): - if box == "date": - # ints_to_pydatetime does not allow non-None tz with date; - # this will mean doing some duplicate benchmarks - tz = None ints_to_pydatetime(self.i8data, tz, box=box) diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index 89b39c1f8919f..793f43e9bbe35 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -6,6 +6,7 @@ from .tslib import ( _sizes, _tzs, + tzlocal_obj, ) try: @@ -24,6 +25,10 @@ class TimeTZConvert: param_names = ["size", "tz"] def setup(self, size, tz): + if size == 10 ** 6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr @@ -31,9 +36,6 @@ def time_tz_convert_from_utc(self, size, tz): # effectively: # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) - if size >= 10 ** 6 and str(tz) == "tzlocal()": - # asv fill will because each call takes 8+seconds - return if old_sig: tz_convert_from_utc(self.i8data, UTC, tz) else: diff --git a/conda.recipe/bld.bat b/conda.recipe/bld.bat deleted file mode 100644 index 284926fae8c04..0000000000000 --- a/conda.recipe/bld.bat +++ /dev/null @@ -1,2 +0,0 @@ -@echo off -%PYTHON% setup.py install diff --git a/conda.recipe/build.sh b/conda.recipe/build.sh deleted file mode 100644 index f341bce6fcf96..0000000000000 --- a/conda.recipe/build.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -$PYTHON setup.py install diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml deleted file mode 100644 index 53ee212360475..0000000000000 --- a/conda.recipe/meta.yaml +++ /dev/null @@ -1,40 +0,0 @@ -package: - name: pandas - version: {{ environ.get('GIT_DESCRIBE_TAG','').replace('v', '', 1) }} - -build: - number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }} - {% if GIT_DESCRIBE_NUMBER|int == 0 %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_0 - {% else %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_{{ GIT_BUILD_STR }}{% endif %} - -source: - git_url: ../ - -requirements: - build: - - {{ compiler('c') }} - - {{ compiler('cxx') }} - host: - - python - - pip - - cython - - numpy - - setuptools >=38.6.0 - - python-dateutil >=2.7.3 - - pytz - run: - - python {{ python }} - - {{ pin_compatible('numpy') }} - - python-dateutil >=2.7.3 - - pytz - -test: - requires: - - pytest - commands: - - python -c "import pandas; pandas.test()" - - -about: - home: https://pandas.pydata.org - license: BSD diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 291799cfe521d..a9c3d637a41e3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -326,6 +326,15 @@ top-level :func:`~pandas.read_html` function: .. _lxml: https://lxml.de .. 
_tabulate: https://github.com/astanin/python-tabulate

+XML
+^^^
+
+========================= ================== =============================================================
+Dependency                Minimum Version    Notes
+========================= ================== =============================================================
+lxml                      4.3.0              XML parser for read_xml and tree builder for to_xml
+========================= ================== =============================================================
+
 SQL databases
 ^^^^^^^^^^^^^

diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index e755ce94812bb..442631de50c7a 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -68,6 +68,13 @@ HTML

    read_html

+XML
+~~~~
+.. autosummary::
+   :toctree: api/
+
+   read_xml
+
 HDFStore: PyTables (HDF5)
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autosummary::
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index d7c1ca8bca598..7e113c93baabe 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -22,6 +22,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     text;Fixed-Width Text File;:ref:`read_fwf<io.fwf_reader>`
     text;`JSON <https://www.json.org/>`__;:ref:`read_json<io.json_reader>`;:ref:`to_json<io.json_writer>`
     text;`HTML <https://en.wikipedia.org/wiki/HTML>`__;:ref:`read_html<io.read_html>`;:ref:`to_html<io.html>`
+    text;`XML <https://www.w3.org/standards/xml/core/>`__;:ref:`read_xml<io.read_xml>`;:ref:`to_xml<io.xml>`
     text; Local clipboard;:ref:`read_clipboard<io.clipboard>`;:ref:`to_clipboard<io.clipboard>`
     binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
     binary;`OpenDocument <http://opendocumentformat.org>`__;:ref:`read_excel<io.ods>`;
@@ -2831,6 +2832,461 @@ parse HTML tables in the top-level pandas io function ``read_html``.

+XML
+---
+
+.. _io.read_xml:
+
+Reading XML
+'''''''''''
+
+.. versionadded:: 1.3.0
+
+The top-level :func:`~pandas.io.xml.read_xml` function can accept an XML
+string/file/URL and will parse nodes and attributes into a pandas ``DataFrame``.
+
+.. note::
+
+   Since there is no standard XML structure, and designs can vary in many
+   ways, ``read_xml`` works best with flatter, shallower documents. If an
+   XML document is deeply nested, use the ``stylesheet`` feature to
+   transform the XML into a flatter version.
+
+Let's look at a few examples.
+
+Read an XML string:
+
+.. ipython:: python
+
+    xml = """<?xml version='1.0' encoding='utf-8'?>
+    <bookstore>
+      <book category="cooking">
+        <title lang="en">Everyday Italian</title>
+        <author>Giada De Laurentiis</author>
+        <year>2005</year>
+        <price>30.00</price>
+      </book>
+      <book category="children">
+        <title lang="en">Harry Potter</title>
+        <author>J K. Rowling</author>
+        <year>2005</year>
+        <price>29.99</price>
+      </book>
+      <book category="web">
+        <title lang="en">Learning XML</title>
+        <author>Erik T. Ray</author>
+        <year>2003</year>
+        <price>39.95</price>
+      </book>
+    </bookstore>"""
+
+    df = pd.read_xml(xml)
+    df
+
+Read a URL with no options:
+
+.. ipython:: python
+
+    df = pd.read_xml("https://www.w3schools.com/xml/books.xml")
+    df
+
+Read in the content of the "books.xml" file and pass it to ``read_xml``
+as a string:
+
+.. ipython:: python
+   :suppress:
+
+    rel_path = os.path.join("..", "pandas", "tests", "io", "data", "xml",
+                            "books.xml")
+    file_path = os.path.abspath(rel_path)
+
+.. ipython:: python
+
+    with open(file_path, "r") as f:
+        df = pd.read_xml(f.read())
+    df
+
+Read in the content of the "books.xml" file as an instance of ``StringIO`` or
+``BytesIO`` and pass it to ``read_xml``:
+
+.. ipython:: python
+
+    with open(file_path, "r") as f:
+        sio = StringIO(f.read())
+
+    df = pd.read_xml(sio)
+    df
+
+.. ipython:: python
+
+    with open(file_path, "rb") as f:
+        bio = BytesIO(f.read())
+
+    df = pd.read_xml(bio)
+    df
+
+You can even read XML from AWS S3 buckets, such as the Python Software
+Foundation's IRS 990 form:
+
+.. ipython:: python
+
+    df = pd.read_xml(
+        "s3://irs-form-990/201923199349319487_public.xml",
+        xpath=".//irs:Form990PartVIISectionAGrp",
+        namespaces={"irs": "http://www.irs.gov/efile"}
+    )
+    df
+
+With `lxml`_ as the default ``parser``, you have access to a full-featured
+XML library that extends Python's ElementTree API. One powerful tool is the
+ability to query nodes selectively or conditionally with a more expressive
+XPath:
+
+.. _lxml: https://lxml.de
+
+.. ipython:: python
+
+    df = pd.read_xml(file_path, xpath="//book[year=2005]")
+    df
+
+Specify only elements or only attributes to parse:
+
+.. ipython:: python
+
+    df = pd.read_xml(file_path, elems_only=True)
+    df
+
+.. ipython:: python
+
+    df = pd.read_xml(file_path, attrs_only=True)
+    df
+
+XML documents can have namespaces with prefixes and default namespaces without
+prefixes, both of which are denoted with a special attribute ``xmlns``. In order
+to parse by a node under a namespace context, ``xpath`` must reference a prefix.
+
+For example, the below XML contains a namespace with the prefix ``doc`` and
+URI at ``https://example.com``. In order to parse ``doc:row`` nodes,
+``namespaces`` must be used.
+
+.. ipython:: python
+
+    xml = """<?xml version='1.0' encoding='utf-8'?>
+    <doc:data xmlns:doc="https://example.com">
+      <doc:row>
+        <doc:shape>square</doc:shape>
+        <doc:degrees>360</doc:degrees>
+        <doc:sides>4.0</doc:sides>
+      </doc:row>
+      <doc:row>
+        <doc:shape>circle</doc:shape>
+        <doc:degrees>360</doc:degrees>
+        <doc:sides/>
+      </doc:row>
+      <doc:row>
+        <doc:shape>triangle</doc:shape>
+        <doc:degrees>180</doc:degrees>
+        <doc:sides>3.0</doc:sides>
+      </doc:row>
+    </doc:data>"""
+
+    df = pd.read_xml(xml,
+                     xpath="//doc:row",
+                     namespaces={"doc": "https://example.com"})
+    df
+
+Similarly, an XML document can have a default namespace without a prefix.
+Failing to assign a temporary prefix will return no nodes and raise a
+``ValueError``. But assigning *any* temporary name to the correct URI
+allows parsing by nodes.
+
+.. ipython:: python
+
+    xml = """<?xml version='1.0' encoding='utf-8'?>
+    <data xmlns="https://example.com">
+      <row>
+        <shape>square</shape>
+        <degrees>360</degrees>
+        <sides>4.0</sides>
+      </row>
+      <row>
+        <shape>circle</shape>
+        <degrees>360</degrees>
+        <sides/>
+      </row>
+      <row>
+        <shape>triangle</shape>
+        <degrees>180</degrees>
+        <sides>3.0</sides>
+      </row>
+    </data>"""
+
+    df = pd.read_xml(xml,
+                     xpath="//pandas:row",
+                     namespaces={"pandas": "https://example.com"})
+    df
+
+However, if XPath does not reference node names, such as the default ``/*``,
+then ``namespaces`` is not required.
+
+With `lxml`_ as the parser, you can flatten nested XML documents with an XSLT
+script, which can also be a string/file/URL type. As background, `XSLT`_ is
+a special-purpose language written in a special XML file that can transform
+original XML documents into other XML, HTML, or even text (CSV, JSON, etc.)
+using an XSLT processor.
+
+.. _lxml: https://lxml.de
+.. _XSLT: https://www.w3.org/TR/xslt/
+
+For example, consider this somewhat nested structure of Chicago "L" rides,
+where the station and rides elements encapsulate data in their own sections.
+With the below XSLT, ``lxml`` can transform the original nested document into
+a flatter output (shown below for demonstration) for easier parsing into a
+``DataFrame``:
+
+.. ipython:: python
+
+    xml = """<?xml version='1.0' encoding='utf-8'?>
+    <response>
+      <row>
+        <station id="40850" name="Library"/>
+        <month>2020-09-01T00:00:00</month>
+        <rides>
+          <avg_weekday_rides>864.2</avg_weekday_rides>
+          <avg_saturday_rides>534</avg_saturday_rides>
+          <avg_sunday_holiday_rides>417.2</avg_sunday_holiday_rides>
+        </rides>
+      </row>
+      <row>
+        <station id="41700" name="Washington/Wabash"/>
+        <month>2020-09-01T00:00:00</month>
+        <rides>
+          <avg_weekday_rides>2707.4</avg_weekday_rides>
+          <avg_saturday_rides>1909.8</avg_saturday_rides>
+          <avg_sunday_holiday_rides>1438.6</avg_sunday_holiday_rides>
+        </rides>
+      </row>
+      <row>
+        <station id="40380" name="Clark/Lake"/>
+        <month>2020-09-01T00:00:00</month>
+        <rides>
+          <avg_weekday_rides>2949.6</avg_weekday_rides>
+          <avg_saturday_rides>1657</avg_saturday_rides>
+          <avg_sunday_holiday_rides>1453.8</avg_sunday_holiday_rides>
+        </rides>
+      </row>
+    </response>"""
+
+    xsl = """<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+       <xsl:output method="xml" omit-xml-declaration="no" indent="yes"/>
+       <xsl:strip-space elements="*"/>
+       <xsl:template match="/response">
+          <xsl:copy>
+            <xsl:apply-templates select="row"/>
+          </xsl:copy>
+       </xsl:template>
+       <xsl:template match="row">
+          <xsl:copy>
+            <station_id><xsl:value-of select="station/@id"/></station_id>
+            <station_name><xsl:value-of select="station/@name"/></station_name>
+            <xsl:copy-of select="month|rides/*"/>
+          </xsl:copy>
+       </xsl:template>
+    </xsl:stylesheet>"""
+
+    output = """<?xml version='1.0' encoding='utf-8'?>
+    <response>
+      <row>
+        <station_id>40850</station_id>
+        <station_name>Library</station_name>
+        <month>2020-09-01T00:00:00</month>
+        <avg_weekday_rides>864.2</avg_weekday_rides>
+        <avg_saturday_rides>534</avg_saturday_rides>
+        <avg_sunday_holiday_rides>417.2</avg_sunday_holiday_rides>
+      </row>
+      <row>
+        <station_id>41700</station_id>
+        <station_name>Washington/Wabash</station_name>
+        <month>2020-09-01T00:00:00</month>
+        <avg_weekday_rides>2707.4</avg_weekday_rides>
+        <avg_saturday_rides>1909.8</avg_saturday_rides>
+        <avg_sunday_holiday_rides>1438.6</avg_sunday_holiday_rides>
+      </row>
+      <row>
+        <station_id>40380</station_id>
+        <station_name>Clark/Lake</station_name>
+        <month>2020-09-01T00:00:00</month>
+        <avg_weekday_rides>2949.6</avg_weekday_rides>
+        <avg_saturday_rides>1657</avg_saturday_rides>
+        <avg_sunday_holiday_rides>1453.8</avg_sunday_holiday_rides>
+      </row>
+    </response>"""
+
+    df = pd.read_xml(xml, stylesheet=xsl)
+    df
+
+
+.. _io.xml:
+
+Writing XML
+'''''''''''
+
+.. versionadded:: 1.3.0
+
+``DataFrame`` objects have an instance method ``to_xml`` which renders the
+contents of the ``DataFrame`` as an XML document.
+
+.. note::
+
+   This method does not support special properties of XML including DTD,
+   CData, XSD schemas, processing instructions, comments, and others.
+   Only namespaces at the root level are supported. However, ``stylesheet``
+   allows design changes after the initial output.
+
+Let's look at a few examples.
+
+Write an XML without options:
+
+.. ipython:: python
+
+    geom_df = pd.DataFrame(
+        {
+            "shape": ["square", "circle", "triangle"],
+            "degrees": [360, 360, 180],
+            "sides": [4, np.nan, 3],
+        }
+    )
+
+    print(geom_df.to_xml())
+
+
+Write an XML with a new root and row name:
+
+.. ipython:: python
+
+    print(geom_df.to_xml(root_name="geometry", row_name="objects"))
+
+Write an attribute-centric XML:
+
+.. ipython:: python
+
+    print(geom_df.to_xml(attr_cols=geom_df.columns.tolist()))
+
+Write a mix of elements and attributes:
+
+.. ipython:: python
+
+    print(
+        geom_df.to_xml(
+            index=False,
+            attr_cols=['shape'],
+            elem_cols=['degrees', 'sides'])
+    )
+
+Any ``DataFrames`` with hierarchical columns will be flattened for XML element names
+with levels delimited by underscores:
+
+.. ipython:: python
+
+    ext_geom_df = pd.DataFrame(
+        {
+            "type": ["polygon", "other", "polygon"],
+            "shape": ["square", "circle", "triangle"],
+            "degrees": [360, 360, 180],
+            "sides": [4, np.nan, 3],
+        }
+    )
+
+    pvt_df = ext_geom_df.pivot_table(index='shape',
+                                     columns='type',
+                                     values=['degrees', 'sides'],
+                                     aggfunc='sum')
+    pvt_df
+
+    print(pvt_df.to_xml())
+
+Write an XML with a default namespace:
+
+.. ipython:: python
+
+    print(geom_df.to_xml(namespaces={"": "https://example.com"}))
+
+Write an XML with a namespace prefix:
+
+.. ipython:: python
+
+    print(
+        geom_df.to_xml(namespaces={"doc": "https://example.com"},
+                       prefix="doc")
+    )
+
+Write an XML without declaration or pretty print:
+
+.. ipython:: python
+
+    print(
+        geom_df.to_xml(xml_declaration=False,
+                       pretty_print=False)
+    )
+
+Write an XML and transform with a stylesheet:
+
+.. ipython:: python
+
+    xsl = """<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+       <xsl:output method="xml" omit-xml-declaration="no" indent="yes"/>
+       <xsl:strip-space elements="*"/>
+       <xsl:template match="/data">
+         <geometry>
+           <xsl:apply-templates select="row"/>
+         </geometry>
+       </xsl:template>
+       <xsl:template match="row">
+         <object index="{index}">
+           <xsl:if test="shape!='circle'">
+               <xsl:attribute name="type">polygon</xsl:attribute>
+           </xsl:if>
+           <xsl:copy-of select="shape"/>
+           <xsl:copy-of select="degrees"/>
+           <xsl:copy-of select="sides"/>
+         </object>
+       </xsl:template>
+    </xsl:stylesheet>"""
+
+    print(geom_df.to_xml(stylesheet=xsl))
+
+
+XML Final Notes
+'''''''''''''''
+
+* All XML documents adhere to `W3C specifications`_. Both ``etree`` and ``lxml``
+  parsers will fail to parse any markup document that is not well-formed or
+  does not follow XML syntax rules. Do be aware that HTML is not an XML document
+  unless it follows XHTML specs. However, other popular markup types including
+  KML, XAML, RSS, MusicML, and MathML are compliant `XML schemas`_.
+
+* For the above reason, if your application builds XML prior to pandas
+  operations, use appropriate DOM libraries like ``etree`` and ``lxml`` to
+  build the necessary document, rather than string concatenation or regex
+  adjustments. Always remember XML is a *special* text file with markup rules.
+
+* With very large XML files (several hundred MBs to GBs), XPath and XSLT
+  can become memory-intensive operations. Be sure to have enough available
+  RAM for reading and writing to large XML files (roughly about 5 times the
+  size of the text).
+
+* Because XSLT is a programming language, use it with caution since such
+  scripts can pose a security risk in your environment and can run large or
+  infinite recursive operations. Always test scripts on small fragments before
+  a full run.
+
+* The `etree`_ parser supports all functionality of both ``read_xml`` and
+  ``to_xml`` except for complex XPath and any XSLT. Though limited in features,
+  ``etree`` is still a reliable and capable parser and tree builder.
+  Its performance may trail ``lxml`` to a certain degree for larger files,
+  but the difference is relatively unnoticeable on small to medium sized files.
+
+.. _`W3C specifications`: https://www.w3.org/TR/xml/
+.. _`XML schemas`: https://en.wikipedia.org/wiki/List_of_types_of_XML_schemas
+.. _`etree`: https://docs.python.org/3/library/xml.etree.elementtree.html
+
+
 .. _io.excel:
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 71c9b0613b3ce..8697182f5ca6f 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -24,6 +24,7 @@ Version 1.2
 .. toctree::
    :maxdepth: 2

+   v1.2.4
    v1.2.3
    v1.2.2
    v1.2.1
diff --git a/doc/source/whatsnew/v1.2.3.rst b/doc/source/whatsnew/v1.2.3.rst
index 28fc83459b69d..dec2d061504b4 100644
--- a/doc/source/whatsnew/v1.2.3.rst
+++ b/doc/source/whatsnew/v1.2.3.rst
@@ -1,6 +1,6 @@
 .. _whatsnew_123:

-What's new in 1.2.3 (March ??, 2021)
+What's new in 1.2.3 (March 02, 2021)
 ------------------------------------

 These are the changes in pandas 1.2.3. See :ref:`release` for a full changelog
@@ -19,27 +19,8 @@ Fixed regressions
 - Fixed regression in nullable integer unary ops propagating mask on assignment (:issue:`39943`)
 - Fixed regression in :meth:`DataFrame.__setitem__` not aligning :class:`DataFrame` on right-hand side for boolean indexer (:issue:`39931`)
 - Fixed regression in :meth:`~DataFrame.to_json` failing to use ``compression`` with URL-like paths that are internally opened in binary mode or with user-provided file objects that are opened in binary mode (:issue:`39985`)
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_123.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-
--
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_123.other:
-
-Other
-~~~~~
-
--
--
+- Fixed regression in :meth:`Series.sort_index` and :meth:`DataFrame.sort_index`, which raised an unhelpful error when passed the kwarg ``ascending=None``. Passing ``ascending=None`` is still considered invalid, and the improved error message suggests the proper usage (``ascending`` must be a boolean or a list-like of booleans) (:issue:`39434`)
+- Fixed regression in :meth:`DataFrame.transform` and :meth:`Series.transform` giving incorrect column labels when passed a dictionary with a mix of list and non-list values (:issue:`40018`)

 .. ---------------------------------------------------------------------------

@@ -48,4 +29,4 @@ Other
 Contributors
 ~~~~~~~~~~~~

-.. contributors:: v1.2.2..v1.2.3|HEAD
+.. contributors:: v1.2.2..v1.2.3
diff --git a/doc/source/whatsnew/v1.2.4.rst b/doc/source/whatsnew/v1.2.4.rst
new file mode 100644
index 0000000000000..790ff4c78cad6
--- /dev/null
+++ b/doc/source/whatsnew/v1.2.4.rst
@@ -0,0 +1,48 @@
+.. _whatsnew_124:
+
+What's new in 1.2.4 (April ??, 2021)
+------------------------------------
+
+These are the changes in pandas 1.2.4. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_124.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_124.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_124.other:
+
+Other
+~~~~~
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_124.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.2.3..v1.2.4|HEAD
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 8deeb3cfae1d3..9bb9f0c7a467a 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -43,6 +43,73 @@ For example:
         storage_options=headers
     )

+.. _whatsnew_130.read_to_xml:
+
+Read and write XML documents
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We added I/O support to read and render shallow versions of `XML`_ documents with
+:func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using `lxml`_ as the parser,
+both XPath 1.0 and XSLT 1.0 are available. (:issue:`27554`)
+
+.. _XML: https://www.w3.org/standards/xml/core
+.. _lxml: https://lxml.de
+
+.. code-block:: ipython
+
+   In [1]: xml = """<?xml version='1.0' encoding='utf-8'?>
+      ...: <data>
+      ...:  <row>
+      ...:     <shape>square</shape>
+      ...:     <degrees>360</degrees>
+      ...:     <sides>4.0</sides>
+      ...:  </row>
+      ...:  <row>
+      ...:     <shape>circle</shape>
+      ...:     <degrees>360</degrees>
+      ...:     <sides/>
+      ...:  </row>
+      ...:  <row>
+      ...:     <shape>triangle</shape>
+      ...:     <degrees>180</degrees>
+      ...:     <sides>3.0</sides>
+      ...:  </row>
+      ...: </data>"""
+
+   In [2]: df = pd.read_xml(xml)
+
+   In [3]: df
+   Out[3]:
+         shape  degrees  sides
+   0    square      360    4.0
+   1    circle      360    NaN
+   2  triangle      180    3.0
+
+   In [4]: df.to_xml()
+   Out[4]:
+   <?xml version='1.0' encoding='utf-8'?>
+   <data>
+     <row>
+       <index>0</index>
+       <shape>square</shape>
+       <degrees>360</degrees>
+       <sides>4.0</sides>
+     </row>
+     <row>
+       <index>1</index>
+       <shape>circle</shape>
+       <degrees>360</degrees>
+       <sides/>
+     </row>
+     <row>
+       <index>2</index>
+       <shape>triangle</shape>
+       <degrees>180</degrees>
+       <sides>3.0</sides>
+     </row>
+   </data>
+
+For more, see :ref:`io.xml` in the user guide on IO tools.
+
 .. _whatsnew_130.enhancements.other:

 Other enhancements
@@ -61,6 +128,7 @@ Other enhancements
 - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`)
 - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
 - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
+- Disallow :class:`DataFrame` indexer for ``iloc`` for :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` (:issue:`39004`)
 - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`)
 - :meth:`DataFrame.plot.scatter` can now accept a categorical column as the argument to ``c`` (:issue:`12380`, :issue:`31357`)
 - :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes (:issue:`35643`, :issue:`21266`, :issue:`39317`)
@@ -70,6 +138,7 @@ Other enhancements
 - :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`)
 - :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
 - Add support for parsing ``ISO 8601``-like timestamps with negative signs to :meth:`pandas.Timedelta` (:issue:`37172`)
+- Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)

 .. ---------------------------------------------------------------------------

@@ -250,6 +319,7 @@ Deprecations
 - Deprecated comparison of :class:`Timestamp` object with ``datetime.date`` objects. Instead of e.g.
   ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`)
 - Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`)
 - Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`)
+- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`)
 - Deprecated :meth:`core.window.ewm.ExponentialMovingWindow.vol` (:issue:`39220`)
 - Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`)
 - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
@@ -270,6 +340,8 @@ Performance improvements
 - Performance improvement in :func:`unique` for object data type (:issue:`37615`)
 - Performance improvement in :class:`core.window.rolling.ExpandingGroupby` aggregation methods (:issue:`39664`)
 - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
+- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
+- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)

 .. ---------------------------------------------------------------------------

@@ -332,7 +404,8 @@ Conversion
 ^^^^^^^^^^
 - Bug in :meth:`Series.to_dict` with ``orient='records'`` now returns python native types (:issue:`25969`)
 - Bug in :meth:`Series.view` and :meth:`Index.view` when converting between datetime-like (``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64``, ``period``) dtypes (:issue:`39788`)
--
+- Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`)
+- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
 -

 Strings
@@ -373,7 +446,7 @@ Indexing
 - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`)
 - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)
 - Bug in setting numeric values into a boolean-dtype :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`)
--
+- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`)

 Missing
 ^^^^^^^
@@ -446,7 +519,9 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.apply` where a :class:`MultiIndex` would be created instead of an :class:`Index` if a :class:`core.window.rolling.RollingGroupby` object was created (:issue:`39732`)
 - Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`)
 - Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`)
--
+- Bug in :meth:`DataFrameGroupBy.sample` where column selection was not applied to sample result (:issue:`39928`)
+- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`)
+- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)

 Reshaping
 ^^^^^^^^^
@@ -492,6 +567,7 @@ Other
 - :class:`Styler` rendered HTML output minor alterations to support w3 good code standard (:issue:`39626`)
 - Bug in :class:`Styler` where rendered HTML was missing a column class identifier for certain header cells (:issue:`39716`)
 - Bug in :meth:`Styler.background_gradient` where text-color was not determined correctly (:issue:`39888`)
+- Bug in :class:`Styler` where multiple elements in CSS-selectors were not correctly added to ``table_styles`` (:issue:`39942`)
 - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`)
 - Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`)
diff --git a/environment.yml b/environment.yml
index 113780ed0264a..f54bf41c14c75 100644
--- a/environment.yml
+++ b/environment.yml
@@ -23,7 +23,7 @@ dependencies:
   - flake8
   - flake8-comprehensions>=3.1.0  # used by flake8, linting of unnecessary comprehensions
   - isort>=5.2.1  # check that imports are in the right order
-  - mypy=0.800
+  - mypy=0.812
   - pre-commit>=2.9.2
   - pycodestyle  # used by flake8
   - pyupgrade
diff --git a/pandas/__init__.py b/pandas/__init__.py
index cc4c99efc4345..7cad3eded0585 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -167,6 +167,7 @@
     read_feather,
     read_gbq,
     read_html,
+    read_xml,
     read_json,
     read_stata,
     read_sas,
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 43bf6d9dd1fee..40e82798c0753 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -497,8 +497,9 @@ def _group_add(complexfloating_t[:, :] out,
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    sumx = np.zeros_like(out)
-    compensation = np.zeros_like(out)
+    # the below is equivalent to `np.zeros_like(out)` but faster
+    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

     N, K = (<object>values).shape
@@ -555,7 +556,7 @@ def _group_prod(floating[:, :] out,
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    prodx = np.ones_like(out)
+    prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

     N, K = (<object>values).shape
@@ -608,7 +609,7 @@ def _group_var(floating[:, :] out,
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    mean = np.zeros_like(out)
+    mean = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

     N, K = (<object>values).shape
@@ -665,8 +666,9 @@ def _group_mean(floating[:, :] out,
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    sumx = np.zeros_like(out)
-    compensation = np.zeros_like(out)
+    # the below is equivalent to `np.zeros_like(out)` but faster
+    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

     N, K = (<object>values).shape
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d2aa47f65d263..4e04425436af4 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1562,7 +1562,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
                 seen_tz_aware = 
True if seen_tz_naive and seen_tz_aware: - return 'mixed' + return "mixed" elif util.is_datetime64_object(v): # np.datetime64 seen_datetime = True @@ -2250,7 +2250,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, break elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, 'ns') + itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") seen.timedelta_ = True else: seen.object_ = True diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 605e2135edc9f..337e131f0a2c9 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -376,7 +376,8 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, - bint require_iso8601=False + bint require_iso8601=False, + bint allow_mixed=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -405,6 +406,8 @@ cpdef array_to_datetime( indicator whether the dates should be UTC require_iso8601 : bool, default False indicator whether the datetime string should be iso8601 + allow_mixed : bool, default False + Whether to allow mixed datetimes and integers. Returns ------- @@ -597,7 +600,7 @@ cpdef array_to_datetime( return ignore_errors_out_of_bounds_fallback(values), tz_out except TypeError: - return array_to_datetime_object(values, errors, dayfirst, yearfirst) + return _array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime and seen_integer: # we have mixed datetimes & integers @@ -609,10 +612,12 @@ cpdef array_to_datetime( val = values[i] if is_integer_object(val) or is_float_object(val): result[i] = NPY_NAT + elif allow_mixed: + pass elif is_raise: raise ValueError("mixed datetimes and integers in passed array") else: - return array_to_datetime_object(values, errors, dayfirst, yearfirst) + return _array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime_offset and not utc_convert: # GH#17697 @@ -623,7 +628,7 @@ cpdef array_to_datetime( # (with individual dateutil.tzoffsets) are returned is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: - return array_to_datetime_object(values, errors, dayfirst, yearfirst) + return _array_to_datetime_object(values, errors, dayfirst, yearfirst) else: tz_offset = out_tzoffset_vals.pop() tz_out = pytz.FixedOffset(tz_offset / 60.) @@ -670,7 +675,7 @@ cdef ignore_errors_out_of_bounds_fallback(ndarray[object] values): @cython.wraparound(False) @cython.boundscheck(False) -cdef array_to_datetime_object( +cdef _array_to_datetime_object( ndarray[object] values, str errors, bint dayfirst=False, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 2d4704ad3bda6..4e6e5485b2ade 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3579,7 +3579,7 @@ cpdef to_offset(freq): stride_sign = None try: - split = re.split(opattern, freq) + split = opattern.split(freq) if split[-1] != "" and not split[-1].isspace(): # the last element must be blank raise ValueError("last element must be blank") diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 76a5b6cc9de12..3cdb654642b9c 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -178,11 +178,15 @@ cpdef int64_t delta_to_nanoseconds(delta) except? 
-1: if is_integer_object(delta): return delta if PyDelta_Check(delta): - return ( - delta.days * 24 * 60 * 60 * 1_000_000 - + delta.seconds * 1_000_000 - + delta.microseconds - ) * 1000 + try: + return ( + delta.days * 24 * 60 * 60 * 1_000_000 + + delta.seconds * 1_000_000 + + delta.microseconds + ) * 1000 + except OverflowError as err: + from pandas._libs.tslibs.conversion import OutOfBoundsTimedelta + raise OutOfBoundsTimedelta(*err.args) from err raise TypeError(type(delta)) @@ -246,7 +250,7 @@ cdef object ensure_td64ns(object ts): td64_value = td64_value * mult except OverflowError as err: from pandas._libs.tslibs.conversion import OutOfBoundsTimedelta - raise OutOfBoundsTimedelta(ts) + raise OutOfBoundsTimedelta(ts) from err return np.timedelta64(td64_value, "ns") diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5a95b0ec4e08a..efacfad40ef82 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -116,9 +116,10 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, def roll_sum(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: + Py_ssize_t i, j float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0 int64_t s, e - int64_t nobs = 0, i, j, N = len(values) + int64_t nobs = 0, N = len(values) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -493,12 +494,13 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: + Py_ssize_t i, j float64_t val, prev, min_val, mean_val, sum_val = 0 float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0 float64_t compensation_xx_add = 0, compensation_xx_remove = 0 float64_t compensation_x_add = 0, compensation_x_remove = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 + int64_t nobs = 0, N = len(values), nobs_mean = 0 int64_t s, e ndarray[float64_t] output, mean_array, values_copy bint is_monotonic_increasing_bounds @@ -674,13 +676,14 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: + Py_ssize_t i, j float64_t val, prev, mean_val, min_val, sum_val = 0 float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0 float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0 float64_t compensation_xx_remove = 0, compensation_xx_add = 0 float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 + int64_t nobs = 0, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output, values_copy bint is_monotonic_increasing_bounds @@ -754,15 +757,13 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, res, prev - bint err = False - int ret = 0 - skiplist_t *sl Py_ssize_t i, j + bint err = False, is_monotonic_increasing_bounds + int midpoint, ret = 0 int64_t nobs = 0, N = len(values), s, e, win - int midpoint + float64_t val, res, prev + skiplist_t *sl ndarray[float64_t] output - bint is_monotonic_increasing_bounds is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end @@ -933,8 +934,8 @@ cdef _roll_min_max(ndarray[numeric] 
values, bint is_max): cdef: numeric ai - int64_t i, k, curr_win_size, start - Py_ssize_t nobs = 0, N = len(values) + int64_t curr_win_size, start + Py_ssize_t i, k, nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute ndarray[float64_t, ndim=1] output @@ -1017,14 +1018,14 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, O(N log(window)) implementation using skip list """ cdef: + Py_ssize_t i, j, s, e, N = len(values), idx + int ret = 0 + int64_t nobs = 0, win float64_t val, prev, midpoint, idx_with_fraction - skiplist_t *skiplist - int64_t nobs = 0, i, j, s, e, N = len(values), win - Py_ssize_t idx - ndarray[float64_t] output float64_t vlow, vhigh + skiplist_t *skiplist InterpolationType interpolation_type - int ret = 0 + ndarray[float64_t] output if quantile <= 0.0 or quantile >= 1.0: raise ValueError(f"quantile value {quantile} not in [0, 1]") @@ -1041,10 +1042,10 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, # actual skiplist ops outweigh any window computation costs output = np.empty(N, dtype=float) - if (end - start).max() == 0: + win = (end - start).max() + if win == 0: output[:] = NaN return output - win = (end - start).max() skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") @@ -1473,66 +1474,9 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, # ---------------------------------------------------------------------- # Exponentially weighted moving average -def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end, - int minp, ndarray[int64_t] times, int64_t halflife): - """ - Compute exponentially-weighted moving average using halflife and time - distances. - - Parameters - ---------- - vals : ndarray[float_64] - start: ndarray[int_64] - end: ndarray[int_64] - minp : int - times : ndarray[int64] - halflife : int64 - - Returns - ------- - ndarray - """ - cdef: - Py_ssize_t i, j, num_not_nan = 0, N = len(vals) - bint is_not_nan - float64_t last_result, weights_dot, weights_sum, weight, halflife_float - float64_t[:] times_float - float64_t[:] observations = np.zeros(N, dtype=float) - float64_t[:] times_masked = np.zeros(N, dtype=float) - ndarray[float64_t] output = np.empty(N, dtype=float) - - if N == 0: - return output - - halflife_float = halflife - times_float = times.astype(float) - last_result = vals[0] - - with nogil: - for i in range(N): - is_not_nan = vals[i] == vals[i] - num_not_nan += is_not_nan - if is_not_nan: - times_masked[num_not_nan-1] = times_float[i] - observations[num_not_nan-1] = vals[i] - - weights_sum = 0 - weights_dot = 0 - for j in range(num_not_nan): - weight = 0.5 ** ( - (times_float[i] - times_masked[j]) / halflife_float) - weights_sum += weight - weights_dot += weight * observations[j] - - last_result = weights_dot / weights_sum - - output[i] = last_result if num_not_nan >= minp else NaN - - return output - - -def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp, - float64_t com, bint adjust, bint ignore_na): +def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, + int minp, float64_t com, bint adjust, bint ignore_na, + const float64_t[:] deltas): """ Compute exponentially-weighted moving average using center-of-mass. 
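[Not part of the patch: a minimal pure-Python sketch of the recurrence the unified ``ewma`` kernel above implements. It assumes ``deltas`` holds the consecutive time gaps, pre-scaled so that evenly spaced observations have gaps of 1.0, and it omits the NaN/``ignore_na`` handling and the constant-series guard of the real Cython kernel.]

    import numpy as np

    def ewm_mean_sketch(vals, deltas, com, adjust=True):
        # Decay the accumulated weight by old_wt_factor ** deltas[i - 1]
        # before folding in the next observation -- the same recurrence
        # as the kernel above, minus missing-value handling.
        alpha = 1.0 / (1.0 + com)
        old_wt_factor = 1.0 - alpha
        new_wt = 1.0 if adjust else alpha
        out = np.empty(len(vals), dtype=float)
        out[0] = weighted_avg = vals[0]
        old_wt = 1.0
        for i in range(1, len(vals)):
            old_wt *= old_wt_factor ** deltas[i - 1]
            weighted_avg = (old_wt * weighted_avg + new_wt * vals[i]) / (old_wt + new_wt)
            old_wt = old_wt + new_wt if adjust else 1.0
            out[i] = weighted_avg
        return out

    # With unit gaps this reproduces ewm(com=0.5, adjust=True).mean()
    ewm_mean_sketch(np.array([1.0, 2.0, 3.0]), np.ones(2), com=0.5)

With ``deltas`` of all ones, the ``** deltas[i - 1]`` factor reduces to the old fixed-spacing update (``old_wt *= old_wt_factor``), which is why the separate ``ewma_time`` path removed above is no longer needed.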
@@ -1543,8 +1487,10 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,
     end: ndarray (int64 type)
     minp : int
     com : float64
-    adjust : int
+    adjust : bool
     ignore_na : bool
+    deltas : ndarray (float64 type)

     Returns
     -------
@@ -1553,7 +1499,7 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,

     cdef:
         Py_ssize_t i, j, s, e, nobs, win_size, N = len(vals), M = len(start)
-        float64_t[:] sub_vals
+        const float64_t[:] sub_vals
         ndarray[float64_t] sub_output, output = np.empty(N, dtype=float)
         float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur
         bint is_observation
@@ -1562,6 +1508,8 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,
         return output

     alpha = 1. / (1. + com)
+    old_wt_factor = 1. - alpha
+    new_wt = 1. if adjust else alpha

     for j in range(M):
         s = start[j]
@@ -1570,9 +1518,6 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,
         win_size = len(sub_vals)
         sub_output = np.empty(win_size, dtype=float)

-        old_wt_factor = 1. - alpha
-        new_wt = 1. if adjust else alpha
-
         weighted_avg = sub_vals[0]
         is_observation = weighted_avg == weighted_avg
         nobs = int(is_observation)
@@ -1587,8 +1532,7 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,

             if weighted_avg == weighted_avg:
                 if is_observation or not ignore_na:
-
-                    old_wt *= old_wt_factor
+                    old_wt *= old_wt_factor ** deltas[i - 1]

                     if is_observation:
                         # avoid numerical errors on constant series
@@ -1613,8 +1557,9 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,
 # Exponentially weighted moving covariance


-def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp,
-           float64_t[:] input_y, float64_t com, bint adjust, bint ignore_na, bint bias):
+def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:] end,
+           int minp, const float64_t[:] input_y, float64_t com, bint adjust,
+           bint ignore_na, bint bias):
     """
     Compute exponentially-weighted moving variance using center-of-mass.

@@ -1626,9 +1571,9 @@ def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp,
     minp : int
     input_y : ndarray (float64 type)
     com : float64
-    adjust : int
+    adjust : bool
     ignore_na : bool
-    bias : int
+    bias : bool

     Returns
     -------
@@ -1641,7 +1586,7 @@ def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp,
         float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov
         float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y
         float64_t numerator, denominator
-        float64_t[:] sub_x_vals, sub_y_vals
+        const float64_t[:] sub_x_vals, sub_y_vals
         ndarray[float64_t] sub_out, output = np.empty(N, dtype=float)
         bint is_observation
@@ -1652,6 +1597,8 @@ def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp,
         return output

     alpha = 1. / (1. + com)
+    old_wt_factor = 1. - alpha
+    new_wt = 1. 
if adjust else alpha - mean_x = sub_x_vals[0] mean_y = sub_y_vals[0] is_observation = (mean_x == mean_x) and (mean_y == mean_y) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 819e5a1c32d9b..74fb0e2bd54fb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -84,7 +84,7 @@ from pandas.core.array_algos.take import take_nd from pandas.core.construction import ( - array, + array as pd_array, ensure_wrapped_if_datetimelike, extract_array, ) @@ -108,9 +108,7 @@ # --------------- # # dtype access # # --------------- # -def _ensure_data( - values: ArrayLike, dtype: Optional[DtypeObj] = None -) -> Tuple[np.ndarray, DtypeObj]: +def _ensure_data(values: ArrayLike) -> Tuple[np.ndarray, DtypeObj]: """ routine to ensure that our data is of the correct input dtype for lower-level routines @@ -126,8 +124,6 @@ def _ensure_data( Parameters ---------- values : array-like - dtype : pandas_dtype, optional - coerce to this dtype Returns ------- @@ -135,34 +131,26 @@ def _ensure_data( pandas_dtype : np.dtype or ExtensionDtype """ - if dtype is not None: - # We only have non-None dtype when called from `isin`, and - # both Datetimelike and Categorical dispatch before getting here. - assert not needs_i8_conversion(dtype) - assert not is_categorical_dtype(dtype) - if not isinstance(values, ABCMultiIndex): # extract_array would raise values = extract_array(values, extract_numpy=True) # we check some simple dtypes first - if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), np.dtype("object") - elif is_object_dtype(values) and dtype is None: + if is_object_dtype(values): return ensure_object(np.asarray(values)), np.dtype("object") try: - if is_bool_dtype(values) or is_bool_dtype(dtype): + if is_bool_dtype(values): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) return np.asarray(values).astype("uint64"), np.dtype("bool") - elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): + elif is_signed_integer_dtype(values): return ensure_int64(values), np.dtype("int64") - elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): + elif is_unsigned_integer_dtype(values): return ensure_uint64(values), np.dtype("uint64") - elif is_float_dtype(values) or is_float_dtype(dtype): + elif is_float_dtype(values): return ensure_float64(values), np.dtype("float64") - elif is_complex_dtype(values) or is_complex_dtype(dtype): + elif is_complex_dtype(values): # ignore the fact that we are casting to float # which discards complex parts @@ -177,12 +165,12 @@ def _ensure_data( return ensure_object(values), np.dtype("object") # datetimelike - if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype): - if is_period_dtype(values.dtype) or is_period_dtype(dtype): + if needs_i8_conversion(values.dtype): + if is_period_dtype(values.dtype): from pandas import PeriodIndex values = PeriodIndex(values)._data - elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(values.dtype): from pandas import TimedeltaIndex values = TimedeltaIndex(values)._data @@ -202,9 +190,7 @@ def _ensure_data( dtype = values.dtype return values.asi8, dtype - elif is_categorical_dtype(values.dtype) and ( - is_categorical_dtype(dtype) or dtype is None - ): + elif is_categorical_dtype(values.dtype): values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") @@ -488,7 +474,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: elif 
needs_i8_conversion(comps.dtype): # Dispatch to DatetimeLikeArrayMixin.isin - return array(comps).isin(values) + return pd_array(comps).isin(values) elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype): # e.g. comps are integers and values are datetime64s return np.zeros(comps.shape, dtype=bool) @@ -1580,7 +1566,7 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: if is_scalar(value): value = dtype.type(value) else: - value = array(value, dtype=dtype) + value = pd_array(value, dtype=dtype) elif not ( is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) ): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index db4203e5158ef..203a0c675282d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -98,22 +98,6 @@ def frame_apply( ) -def series_apply( - obj: Series, - func: AggFuncType, - convert_dtype: bool = True, - args=None, - kwargs=None, -) -> SeriesApply: - return SeriesApply( - obj, - func, - convert_dtype, - args, - kwargs, - ) - - class Apply(metaclass=abc.ABCMeta): axis: int @@ -280,7 +264,7 @@ def transform_dict_like(self, func): if len(func) == 0: raise ValueError("No transform functions were provided") - self.validate_dictlike_arg("transform", obj, func) + func = self.normalize_dictlike_arg("transform", obj, func) results: Dict[Hashable, FrameOrSeriesUnion] = {} for name, how in func.items(): @@ -421,32 +405,17 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: ------- Result of aggregation. """ + from pandas.core.reshape.concat import concat + obj = self.obj arg = cast(AggFuncTypeDict, self.f) - is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) - if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") selected_obj = obj._selected_obj - self.validate_dictlike_arg("agg", selected_obj, arg) - - # if we have a dict of any non-scalars - # eg. {'A' : ['mean']}, normalize all to - # be list-likes - # Cannot use arg.values() because arg may be a Series - if any(is_aggregator(x) for _, x in arg.items()): - new_arg: AggFuncTypeDict = {} - for k, v in arg.items(): - if not isinstance(v, (tuple, list, dict)): - new_arg[k] = [v] - else: - new_arg[k] = v - arg = new_arg - - from pandas.core.reshape.concat import concat + arg = self.normalize_dictlike_arg("agg", selected_obj, arg) if selected_obj.ndim == 1: # key only used for output @@ -540,14 +509,15 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: return None return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) - def validate_dictlike_arg( + def normalize_dictlike_arg( self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict - ) -> None: + ) -> AggFuncTypeDict: """ - Raise if dict-like argument is invalid. + Handler for dict-like argument. Ensures that necessary columns exist if obj is a DataFrame, and - that a nested renamer is not passed. + that a nested renamer is not passed. Also normalizes to all lists + when values consists of a mix of list and non-lists. """ assert how in ("apply", "agg", "transform") @@ -567,6 +537,23 @@ def validate_dictlike_arg( cols_sorted = list(safe_sort(list(cols))) raise KeyError(f"Column(s) {cols_sorted} do not exist") + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + + # if we have a dict of any non-scalars + # eg. 
{'A' : ['mean']}, normalize all to + # be list-likes + # Cannot use func.values() because arg may be a Series + if any(is_aggregator(x) for _, x in func.items()): + new_func: AggFuncTypeDict = {} + for k, v in func.items(): + if not is_aggregator(v): + # mypy can't realize v is not a list here + new_func[k] = [v] # type:ignore[list-item] + else: + new_func[k] = v + func = new_func + return func + class FrameApply(Apply): obj: DataFrame @@ -896,9 +883,8 @@ def series_generator(self): # of it. Kids: don't do this at home. ser = self.obj._ixs(0, axis=0) mgr = ser._mgr - blk = mgr.blocks[0] - if is_extension_array_dtype(blk.dtype): + if is_extension_array_dtype(ser.dtype): # values will be incorrect for this block # TODO(EA2D): special case would be unnecessary with 2D EAs obj = self.obj @@ -909,7 +895,7 @@ def series_generator(self): for (arr, name) in zip(values, self.index): # GH#35462 re-pin mgr in case setitem changed it ser._mgr = mgr - blk.values = arr + mgr.set_values(arr) ser.name = name yield ser diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 29a172dcdd2c7..7777cb4bf674e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -48,8 +48,6 @@ from pandas.core.dtypes.cast import ( coerce_indexer_dtype, maybe_cast_to_extension_array, - maybe_infer_to_datetimelike, - sanitize_to_nanoseconds, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -99,7 +97,7 @@ ) import pandas.core.common as com from pandas.core.construction import ( - array, + array as pd_array, extract_array, sanitize_array, ) @@ -396,20 +394,27 @@ def __init__( if dtype.categories is None: dtype = CategoricalDtype(values.categories, dtype.ordered) elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)): - # sanitize_array coerces np.nan to a string under certain versions - # of numpy - values = maybe_infer_to_datetimelike(values) - if isinstance(values, np.ndarray): - values = sanitize_to_nanoseconds(values) - elif not isinstance(values, ExtensionArray): - values = com.convert_to_list_like(values) - + values = com.convert_to_list_like(values) + if isinstance(values, list) and len(values) == 0: # By convention, empty lists result in object dtype: - sanitize_dtype = np.dtype("O") if len(values) == 0 else None - null_mask = isna(values) + values = np.array([], dtype=object) + elif isinstance(values, np.ndarray): + if values.ndim > 1: + # preempt sanitize_array from raising ValueError + raise NotImplementedError( + "> 1 ndim Categorical are not supported at this time" + ) + values = sanitize_array(values, None) + else: + # i.e. must be a list + arr = sanitize_array(values, None) + null_mask = isna(arr) if null_mask.any(): - values = [values[idx] for idx in np.where(~null_mask)[0]] - values = sanitize_array(values, None, dtype=sanitize_dtype) + # We remove null values here, then below will re-insert + # them, grep "full_codes" + arr = [values[idx] for idx in np.where(~null_mask)[0]] + arr = sanitize_array(arr, None) + values = arr if dtype.categories is None: try: @@ -493,7 +498,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # TODO: consolidate with ndarray case? 
elif is_extension_array_dtype(dtype): - result = array(self, dtype=dtype, copy=copy) + result = pd_array(self, dtype=dtype, copy=copy) elif is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e476c3566c10f..633a20d6bed37 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -106,7 +106,7 @@ ) import pandas.core.common as com from pandas.core.construction import ( - array, + array as pd_array, extract_array, ) from pandas.core.indexers import ( @@ -465,15 +465,15 @@ def view(self, dtype: Optional[Dtype] = None) -> ArrayLike: dtype = pandas_dtype(dtype) if isinstance(dtype, (PeriodDtype, DatetimeTZDtype)): cls = dtype.construct_array_type() - return cls._simple_new(self.asi8, dtype=dtype) + return cls(self.asi8, dtype=dtype) elif dtype == "M8[ns]": from pandas.core.arrays import DatetimeArray - return DatetimeArray._simple_new(self.asi8, dtype=dtype) + return DatetimeArray(self.asi8, dtype=dtype) elif dtype == "m8[ns]": from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._simple_new(self.asi8.view("m8[ns]"), dtype=dtype) + return TimedeltaArray(self.asi8, dtype=dtype) return self._ndarray.view(dtype=dtype) # ------------------------------------------------------------------ @@ -719,7 +719,7 @@ def _validate_listlike(self, value, allow_object: bool = False): # Do type inference if necessary up front # e.g. we passed PeriodIndex.values and got an ndarray of Periods - value = array(value) + value = pd_array(value) value = extract_array(value, extract_numpy=True) if is_dtype_equal(value.dtype, "string"): @@ -1102,10 +1102,10 @@ def _add_timedeltalike_scalar(self, other): return type(self)(new_values, dtype=self.dtype) inc = delta_to_nanoseconds(other) - new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view( - "i8" - ) + new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan) + new_values = new_values.view("i8") new_values = self._maybe_mask_results(new_values) + new_values = new_values.view(self._ndarray.dtype) new_freq = None if isinstance(self.freq, Tick) or is_period_dtype(self.dtype): @@ -1207,7 +1207,7 @@ def _addsub_object_array(self, other: np.ndarray, op): assert self.shape == other.shape, (self.shape, other.shape) res_values = op(self.astype("O"), np.asarray(other)) - result = array(res_values.ravel()) + result = pd_array(res_values.ravel()) result = extract_array(result, extract_numpy=True).reshape(self.shape) return result @@ -1700,6 +1700,7 @@ def _round(self, freq, mode, ambiguous, nonexistent): nanos = to_offset(freq).nanos result = round_nsint64(values, mode, nanos) result = self._maybe_mask_results(result, fill_value=iNaT) + result = result.view(self._ndarray.dtype) return self._simple_new(result, dtype=self.dtype) @Appender((_round_doc + _round_example).format(op="round")) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 28e469547fe62..ce0ea7bca55cd 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -315,9 +315,7 @@ def _simple_new( cls, values, freq: Optional[BaseOffset] = None, dtype=DT64NS_DTYPE ) -> DatetimeArray: assert isinstance(values, np.ndarray) - if values.dtype != DT64NS_DTYPE: - assert values.dtype == "i8" - values = values.view(DT64NS_DTYPE) + assert values.dtype == DT64NS_DTYPE result = object.__new__(cls) result._ndarray = values @@ -439,6 +437,7 @@ def 
_generate_range( values = np.array([x.value for x in xdr], dtype=np.int64) _tz = start.tz if start is not None else end.tz + values = values.view("M8[ns]") index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz)) if tz is not None and index.tz is None: @@ -464,9 +463,8 @@ def _generate_range( + start.value ) dtype = tz_to_dtype(tz) - index = cls._simple_new( - arr.astype("M8[ns]", copy=False), freq=None, dtype=dtype - ) + arr = arr.astype("M8[ns]", copy=False) + index = cls._simple_new(arr, freq=None, dtype=dtype) if not left_closed and len(index) and index[0] == start: # TODO: overload DatetimeLikeArrayMixin.__getitem__ @@ -476,7 +474,7 @@ def _generate_range( index = cast(DatetimeArray, index[:-1]) dtype = tz_to_dtype(tz) - return cls._simple_new(index.asi8, freq=freq, dtype=dtype) + return cls._simple_new(index._ndarray, freq=freq, dtype=dtype) # ----------------------------------------------------------------- # DatetimeLike Interface @@ -710,7 +708,7 @@ def _add_offset(self, offset): values = self.tz_localize(None) else: values = self - result = offset._apply_array(values) + result = offset._apply_array(values).view("M8[ns]") result = DatetimeArray._simple_new(result) result = result.tz_localize(self.tz) @@ -833,7 +831,7 @@ def tz_convert(self, tz): # No conversion since timestamps are all UTC to begin with dtype = tz_to_dtype(tz) - return self._simple_new(self.asi8, dtype=dtype, freq=self.freq) + return self._simple_new(self._ndarray, dtype=dtype, freq=self.freq) @dtl.ravel_compat def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): @@ -1906,6 +1904,26 @@ def std( # Constructor Helpers +def sequence_to_datetimes( + data, allow_object: bool = False, require_iso8601: bool = False +) -> Union[np.ndarray, DatetimeArray]: + """ + Parse/convert the passed data to either DatetimeArray or np.ndarray[object]. + """ + result, tz, freq = sequence_to_dt64ns( + data, + allow_object=allow_object, + allow_mixed=True, + require_iso8601=require_iso8601, + ) + if result.dtype == object: + return result + + dtype = tz_to_dtype(tz) + dta = DatetimeArray._simple_new(result, freq=freq, dtype=dtype) + return dta + + def sequence_to_dt64ns( data, dtype=None, @@ -1914,6 +1932,10 @@ def sequence_to_dt64ns( dayfirst=False, yearfirst=False, ambiguous="raise", + *, + allow_object: bool = False, + allow_mixed: bool = False, + require_iso8601: bool = False, ): """ Parameters @@ -1926,6 +1948,13 @@ def sequence_to_dt64ns( yearfirst : bool, default False ambiguous : str, bool, or arraylike, default 'raise' See pandas._libs.tslibs.tzconversion.tz_localize_to_utc. + allow_object : bool, default False + Whether to return an object-dtype ndarray instead of raising if the + data contains more than one timezone. + allow_mixed : bool, default False + Interpret integers as timestamps when datetime objects are also present. + require_iso8601 : bool, default False + Only consider ISO-8601 formats when parsing strings. 
Returns ------- @@ -1989,7 +2018,12 @@ def sequence_to_dt64ns( # data comes back here as either i8 to denote UTC timestamps # or M8[ns] to denote wall times data, inferred_tz = objects_to_datetime64ns( - data, dayfirst=dayfirst, yearfirst=yearfirst + data, + dayfirst=dayfirst, + yearfirst=yearfirst, + allow_object=allow_object, + allow_mixed=allow_mixed, + require_iso8601=require_iso8601, ) if tz and inferred_tz: # two timezones: convert to intended from base UTC repr @@ -1997,6 +2031,9 @@ def sequence_to_dt64ns( data = data.view(DT64NS_DTYPE) elif inferred_tz: tz = inferred_tz + elif allow_object and data.dtype == object: + # We encountered mixed-timezones. + return data, None, None data_dtype = data.dtype @@ -2053,8 +2090,9 @@ def objects_to_datetime64ns( yearfirst, utc=False, errors="raise", - require_iso8601=False, - allow_object=False, + require_iso8601: bool = False, + allow_object: bool = False, + allow_mixed: bool = False, ): """ Convert data to array of timestamps. @@ -2071,6 +2109,8 @@ def objects_to_datetime64ns( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. + allow_mixed : bool, default False + Interpret integers as timestamps when datetime objects are also present. Returns ------- @@ -2099,6 +2139,7 @@ def objects_to_datetime64ns( dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, + allow_mixed=allow_mixed, ) result = result.reshape(data.shape, order=order) except ValueError as err: @@ -2135,7 +2176,7 @@ def objects_to_datetime64ns( raise TypeError(result) -def maybe_convert_dtype(data, copy): +def maybe_convert_dtype(data, copy: bool): """ Convert data based on dtype conventions, issuing deprecation warnings or errors where appropriate. 
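[Not part of the patch: a hypothetical usage sketch of the ``allow_mixed`` plumbing added above. ``sequence_to_datetimes`` is a private helper introduced by this patch, so the entry point and behavior may change; with mixed datetime objects and integers, the integers are kept as raw nanosecond (i8) timestamps instead of raising ``ValueError("mixed datetimes and integers in passed array")``.]

    from datetime import datetime

    import numpy as np

    from pandas.core.arrays.datetimes import sequence_to_datetimes

    # 1_609_459_200_000_000_000 ns since the epoch == 2021-01-01 00:00:00 UTC,
    # so both entries below resolve to the same timestamp.
    data = np.array([datetime(2021, 1, 1), 1_609_459_200_000_000_000], dtype=object)
    dta = sequence_to_datetimes(data)  # passes allow_mixed=True internally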
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b16b4b3ae856a..61d63d2eed6e9 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -315,15 +315,6 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): ) super().__init__(values, mask, copy=copy) - def __neg__(self): - return type(self)(-self._data, self._mask.copy()) - - def __pos__(self): - return self - - def __abs__(self): - return type(self)(np.abs(self._data), self._mask.copy()) - @classmethod def _from_sequence( cls, scalars, *, dtype: Optional[Dtype] = None, copy: bool = False diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4d165dac40397..43c3a5e8bfd4c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -76,7 +76,7 @@ from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.construction import ( - array, + array as pd_array, ensure_wrapped_if_datetimelike, extract_array, ) @@ -661,7 +661,7 @@ def _cmp_method(self, other, op): if is_list_like(other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - other = array(other) + other = pd_array(other) elif not isinstance(other, Interval): # non-interval scalar -> no matches return invalid_comparison(self, other, op) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 57017e44a66e9..0dd98c5e3d3f2 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -199,3 +199,12 @@ def reconstruct(x): return tuple(reconstruct(x) for x in result) else: return reconstruct(result) + + def __neg__(self): + return type(self)(-self._data, self._mask.copy()) + + def __pos__(self): + return self + + def __abs__(self): + return type(self)(abs(self._data), self._mask.copy()) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8441b324515f3..26fe6338118b6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,6 +6,7 @@ Any, Optional, Sequence, + Tuple, Type, Union, ) @@ -20,6 +21,7 @@ Dtype, NpDtype, ) +from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype @@ -273,9 +275,22 @@ def __len__(self) -> int: """ return len(self._data) - @classmethod - def _from_factorized(cls, values, original): - return cls._from_sequence(values) + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: + encoded = self._data.dictionary_encode() + indices = pa.chunked_array( + [c.indices for c in encoded.chunks], type=encoded.type.index_type + ).to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(np.int64, copy=False) + + if encoded.num_chunks: + uniques = type(self)(encoded.chunk(0).dictionary) + else: + uniques = type(self)(pa.array([], type=encoded.type.value_type)) + + return indices.values, uniques @classmethod def _concat_same_type(cls, to_concat) -> ArrowStringArray: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f7af1bb3da86b..c371e27eeceac 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -4,6 +4,7 @@ from typing import ( List, Optional, + Tuple, Union, ) @@ -229,13 +230,11 @@ def _simple_new( ) -> TimedeltaArray: assert dtype == TD64NS_DTYPE, dtype assert isinstance(values, 
np.ndarray), type(values) - if values.dtype != TD64NS_DTYPE: - assert values.dtype == "i8" - values = values.view(TD64NS_DTYPE) + assert values.dtype == TD64NS_DTYPE result = object.__new__(cls) result._ndarray = values - result._freq = to_offset(freq) + result._freq = freq result._dtype = TD64NS_DTYPE return result @@ -317,7 +316,7 @@ def _generate_range(cls, start, end, periods, freq, closed=None): if not right_closed: index = index[:-1] - return cls._simple_new(index, freq=freq) + return cls._simple_new(index.view("m8[ns]"), freq=freq) # ---------------------------------------------------------------- # DatetimeLike Interface @@ -907,7 +906,9 @@ def f(x): # Constructor Helpers -def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): +def sequence_to_td64ns( + data, copy=False, unit=None, errors="raise" +) -> Tuple[np.ndarray, Optional[Tick]]: """ Parameters ---------- diff --git a/pandas/core/common.py b/pandas/core/common.py index 8625c5063382f..871f5ac651cce 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3,6 +3,7 @@ Note: pandas.core.common is *not* part of the public API. """ +from __future__ import annotations from collections import ( abc, @@ -12,6 +13,7 @@ from functools import partial import inspect from typing import ( + TYPE_CHECKING, Any, Callable, Collection, @@ -51,6 +53,9 @@ from pandas.core.dtypes.inference import iterable_not_string from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from pandas import Index + class SettingWithCopyError(ValueError): pass @@ -512,3 +517,16 @@ def temp_setattr(obj, attr: str, value) -> Iterator[None]: setattr(obj, attr, value) yield obj setattr(obj, attr, old_value) + + +def require_length_match(data, index: Index): + """ + Check the length of data matches the length of the index. 
+ """ + if len(data) != len(index): + raise ValueError( + "Length of values " + f"({len(data)}) " + "does not match length of index " + f"({len(index)})" + ) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9aa1c620fe1d9..0fd685e4f53f1 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -304,6 +304,7 @@ def array( raise ValueError(msg) if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ABCExtensionArray)): + # Note: we exclude np.ndarray here, will do type inference on it dtype = data.dtype data = extract_array(data, extract_numpy=True) @@ -525,9 +526,9 @@ def sanitize_array( elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: # TODO: deque, array.array - if isinstance(data, set): + if isinstance(data, (set, frozenset)): # Raise only for unordered sets, e.g., not for dict_keys - raise TypeError("Set type is unordered") + raise TypeError(f"'{type(data).__name__}' type is unordered") data = list(data) if dtype is not None: @@ -677,8 +678,10 @@ def _try_cast( subarr = arr else: subarr = maybe_cast_to_datetime(arr, dtype) + if dtype is not None and dtype.kind == "M": + return subarr - if not isinstance(subarr, (ABCExtensionArray, ABCIndex)): + if not isinstance(subarr, ABCExtensionArray): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b30dbe32eec4b..9f111282473c2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -10,6 +10,7 @@ datetime, timedelta, ) +import inspect from typing import ( TYPE_CHECKING, Any, @@ -28,10 +29,7 @@ import numpy as np -from pandas._libs import ( - lib, - tslib, -) +from pandas._libs import lib from pandas._libs.tslibs import ( NaT, OutOfBoundsDatetime, @@ -40,9 +38,9 @@ Timedelta, Timestamp, conversion, - iNaT, ints_to_pydatetime, ) +from pandas._libs.tslibs.timedeltas import array_to_timedelta64 from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -86,6 +84,7 @@ is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, @@ -96,7 +95,6 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, - ABCIndex, ABCSeries, ) from pandas.core.dtypes.inference import is_list_like @@ -233,18 +231,17 @@ def _disallow_mismatched_datetimelike(value, dtype: DtypeObj): raise TypeError(f"Cannot cast {repr(value)} to {dtype}") -def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): +def maybe_downcast_to_dtype( + result: ArrayLike, dtype: Union[str, np.dtype] +) -> ArrayLike: """ try to cast to the specified dtype (e.g. convert back to bool/int or could be an astype of float64->float32 """ - do_round = False - - if is_scalar(result): - return result - elif isinstance(result, ABCDataFrame): - # occurs in pivot_table doctest + if isinstance(result, ABCDataFrame): + # see test_pivot_table_doctest_case return result + do_round = False if isinstance(dtype, str): if dtype == "infer": @@ -265,6 +262,7 @@ def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): do_round = True else: + # TODO: complex? what if result is already non-object? 
dtype = "object" dtype = np.dtype(dtype) @@ -289,14 +287,17 @@ def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): i8values = result.astype("i8", copy=False) cls = dtype.construct_array_type() # equiv: DatetimeArray(i8values).tz_localize("UTC").tz_convert(dtype.tz) - result = cls._simple_new(i8values, dtype=dtype) + dt64values = i8values.view("M8[ns]") + result = cls._simple_new(dt64values, dtype=dtype) else: result = result.astype(dtype) return result -def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False): +def maybe_downcast_numeric( + result: ArrayLike, dtype: DtypeObj, do_round: bool = False +) -> ArrayLike: """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. @@ -310,7 +311,7 @@ def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False): ------- ndarray or ExtensionArray """ - if not isinstance(dtype, np.dtype): + if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype): # e.g. SparseDtype has no itemsize attr return result @@ -697,7 +698,7 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): return dtype, fill_value -def _ensure_dtype_type(value, dtype: DtypeObj): +def _ensure_dtype_type(value, dtype: np.dtype): """ Ensure that the given value is an instance of the given dtype. @@ -707,21 +708,17 @@ def _ensure_dtype_type(value, dtype: DtypeObj): Parameters ---------- value : object - dtype : np.dtype or ExtensionDtype + dtype : np.dtype Returns ------- object """ # Start with exceptions in which we do _not_ cast to numpy types - if is_extension_array_dtype(dtype): - return value - elif dtype == np.object_: - return value - elif isna(value): - # e.g. keep np.nan rather than try to cast to np.float32(np.nan) + if dtype == np.object_: return value + # Note: before we get here we have already excluded isna(value) return dtype.type(value) @@ -1138,7 +1135,7 @@ def astype_nansafe( if isinstance(dtype, ExtensionDtype): return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) - elif not isinstance(dtype, np.dtype): + elif not isinstance(dtype, np.dtype): # pragma: no cover raise ValueError("dtype must be np.dtype or ExtensionDtype") if arr.dtype.kind in ["m", "M"] and ( @@ -1228,6 +1225,107 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) +def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. 
+
+    Parameters
+    ----------
+    values : ndarray or ExtensionArray
+    dtype : dtype object
+    copy : bool, default False
+        copy if indicated
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+    """
+    if (
+        values.dtype.kind in ["m", "M"]
+        and dtype.kind in ["i", "u"]
+        and isinstance(dtype, np.dtype)
+        and dtype.itemsize != 8
+    ):
+        # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced
+        msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]"
+        raise TypeError(msg)
+
+    if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype):
+        return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True)
+
+    if is_dtype_equal(values.dtype, dtype):
+        if copy:
+            return values.copy()
+        return values
+
+    if isinstance(values, ABCExtensionArray):
+        values = values.astype(dtype, copy=copy)
+
+    else:
+        values = astype_nansafe(values, dtype, copy=copy)
+
+    # in pandas we don't store numpy str dtypes, so convert to object
+    if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
+        values = np.array(values, dtype=object)
+
+    return values
+
+
+def astype_array_safe(
+    values: ArrayLike, dtype, copy: bool = False, errors: str = "raise"
+) -> ArrayLike:
+    """
+    Cast array (ndarray or ExtensionArray) to the new dtype.
+
+    This is basically the implementation for DataFrame/Series.astype and
+    includes all custom logic for pandas (NaN-safety, converting str to object,
+    not allowing ExtensionDtype classes to be passed instead of instances).
+
+    Parameters
+    ----------
+    values : ndarray or ExtensionArray
+    dtype : str, dtype convertible
+    copy : bool, default False
+        copy if indicated
+    errors : str, {'raise', 'ignore'}, default 'raise'
+        - ``raise`` : allow exceptions to be raised
+        - ``ignore`` : suppress exceptions. On error return original object
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+    """
+    errors_legal_values = ("raise", "ignore")
+
+    if errors not in errors_legal_values:
+        invalid_arg = (
+            "Expected value of kwarg 'errors' to be one of "
+            f"{list(errors_legal_values)}. Supplied value is '{errors}'"
+        )
+        raise ValueError(invalid_arg)
+
+    if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
+        msg = (
+            f"Expected an instance of {dtype.__name__}, "
+            "but got the class instead. Try instantiating 'dtype'."
+        )
+        raise TypeError(msg)
+
+    dtype = pandas_dtype(dtype)
+
+    try:
+        new_values = astype_array(values, dtype, copy=copy)
+    except (ValueError, TypeError):
+        # e.g.
astype_nansafe can fail on object-dtype of strings + # trying to convert to float + if errors == "ignore": + new_values = values + else: + raise + + return new_values + + def soft_convert_objects( values: np.ndarray, datetime: bool = True, @@ -1388,9 +1486,7 @@ def maybe_castable(dtype: np.dtype) -> bool: return dtype.name not in POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike( - value: Union[ArrayLike, Scalar], convert_dates: bool = False -): +def maybe_infer_to_datetimelike(value: Union[np.ndarray, List]): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1401,21 +1497,13 @@ def maybe_infer_to_datetimelike( Parameters ---------- - value : np.array / Series / Index / list-like - convert_dates : bool, default False - if True try really hard to convert dates (such as datetime.date), other - leave inferred dtype 'date' alone + value : np.ndarray or list """ - if isinstance(value, (ABCIndex, ABCExtensionArray)): - if not is_object_dtype(value.dtype): - raise ValueError("array-like value must be object-dtype") + if not isinstance(value, (np.ndarray, list)): + raise TypeError(type(value)) # pragma: no cover - v = value - - if not is_list_like(v): - v = [v] - v = np.array(v, copy=False) + v = np.array(value, copy=False) # we only care about object dtypes if not is_object_dtype(v.dtype): @@ -1429,50 +1517,46 @@ def maybe_infer_to_datetimelike( return value def try_datetime(v: np.ndarray) -> ArrayLike: - # safe coerce to datetime64 - try: - # GH19671 - # tznaive only - v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0] - except ValueError: - - # we might have a sequence of the same-datetimes with tz's - # if so coerce to a DatetimeIndex; if they are not the same, - # then these stay as object dtype, xref GH19671 - from pandas import DatetimeIndex - - try: + # Coerce to datetime64, datetime64tz, or in corner cases + # object[datetimes] + from pandas.core.arrays.datetimes import sequence_to_datetimes - values, tz = conversion.datetime_to_datetime64(v) - except (ValueError, TypeError): - pass - else: - dti = DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz) - return dti._data - except TypeError: + try: + # GH#19671 we pass require_iso8601 to be relatively strict + # when parsing strings. + dta = sequence_to_datetimes(v, require_iso8601=True, allow_object=True) + except (ValueError, TypeError): # e.g. is not convertible to datetime - pass - - return v.reshape(shape) + return v.reshape(shape) + else: + if dta.dtype == object or dta.tz is None: + # GH#19671 if we have mixed timezones we may have object-dtype + # here. + # This is reachable bc allow_object=True, means we cast things + # to mixed-tz datetime objects (mostly). Only 1 test + # relies on this behavior, see GH#40111 + # FIXME: conditional reshape is kludgy + return np.asarray(dta).reshape(shape) + # otherwise we have dt64tz + return dta def try_timedelta(v: np.ndarray) -> np.ndarray: # safe coerce to timedelta64 # will try first with a string & object conversion - from pandas import to_timedelta - try: - td_values = to_timedelta(v) + # bc we know v.dtype == object, this is equivalent to + # `np.asarray(to_timedelta(v))`, but using a lower-level API that + # does not require a circular import. 
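+            # Illustrative equivalence, with hypothetical input (not part
+            # of this change):
+            #   v = np.array(["1 days", "2 days", None], dtype=object)
+            #   array_to_timedelta64(v).view("m8[ns]")
+            # matches np.asarray(to_timedelta(v)), with None coerced to NaT.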
+ td_values = array_to_timedelta64(v).view("m8[ns]") except (ValueError, OverflowError): return v.reshape(shape) else: - return np.asarray(td_values).reshape(shape) + return td_values.reshape(shape) inferred_type = lib.infer_datetimelike_array(ensure_object(v)) - if inferred_type == "date" and convert_dates: - value = try_datetime(v) - elif inferred_type == "datetime": + if inferred_type == "datetime": value = try_datetime(v) elif inferred_type == "timedelta": value = try_timedelta(v) @@ -1502,8 +1586,8 @@ def maybe_cast_to_datetime( try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - from pandas.core.tools.datetimes import to_datetime - from pandas.core.tools.timedeltas import to_timedelta + from pandas.core.arrays.datetimes import sequence_to_datetimes + from pandas.core.arrays.timedeltas import sequence_to_td64ns if not is_list_like(value): raise TypeError("value must be listlike") @@ -1513,78 +1597,50 @@ def maybe_cast_to_datetime( is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) - if is_datetime64 or is_datetime64tz or is_timedelta64: + vdtype = getattr(value, "dtype", None) - # Force the dtype if needed. - msg = ( - f"The '{dtype.name}' dtype has no unit. " - f"Please pass in '{dtype.name}[ns]' instead." - ) - - if is_datetime64: - # unpack e.g. SparseDtype - dtype = getattr(dtype, "subtype", dtype) - if not is_dtype_equal(dtype, DT64NS_DTYPE): - - # pandas supports dtype whose granularity is less than [ns] - # e.g., [ps], [fs], [as] - if dtype <= np.dtype("M8[ns]"): - if dtype.name == "datetime64": - raise ValueError(msg) - dtype = DT64NS_DTYPE - else: - raise TypeError( - f"cannot convert datetimelike to dtype [{dtype}]" - ) - - elif is_timedelta64 and not is_dtype_equal(dtype, TD64NS_DTYPE): - - # pandas supports dtype whose granularity is less than [ns] - # e.g., [ps], [fs], [as] - if dtype <= np.dtype("m8[ns]"): - if dtype.name == "timedelta64": - raise ValueError(msg) - dtype = TD64NS_DTYPE - else: - raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]") + if is_datetime64 or is_datetime64tz or is_timedelta64: + dtype = ensure_nanosecond_dtype(dtype) if not is_sparse(value): value = np.array(value, copy=False) - # have a scalar array-like (e.g. NaT) - if value.ndim == 0: - value = iNaT - # we have an array of datetime or timedeltas & nulls - elif value.size or not is_dtype_equal(value.dtype, dtype): + if value.size or not is_dtype_equal(value.dtype, dtype): _disallow_mismatched_datetimelike(value, dtype) try: if is_datetime64: - dti = to_datetime(value, errors="raise") + dta = sequence_to_datetimes(value, allow_object=False) # GH 25843: Remove tz information since the dtype # didn't specify one - if dti.tz is not None: - dti = dti.tz_localize(None) - value = dti._values + if dta.tz is not None: + # equiv: dta.view(dtype) + # Note: NOT equivalent to dta.astype(dtype) + dta = dta.tz_localize(None) + value = dta elif is_datetime64tz: + dtype = cast(DatetimeTZDtype, dtype) # The string check can be removed once issue #13712 # is solved. String data that is passed with a # datetime64tz is assumed to be naive which should # be localized to the timezone. 
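+                        # Illustrative case (hypothetical input): with
+                        #   value = np.array(["2021-01-01"], dtype=object)
+                        #   dtype = DatetimeTZDtype(tz="US/Eastern")
+                        # the string is read as Eastern wall time, i.e.
+                        # 2021-01-01 00:00:00-05:00, not converted from UTC.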
                        is_dt_string = is_string_dtype(value.dtype)
-                        dta = to_datetime(value, errors="raise").array
+                        dta = sequence_to_datetimes(value, allow_object=False)
                         if dta.tz is not None:
                             value = dta.astype(dtype, copy=False)
                         elif is_dt_string:
                             # Strings here are naive, so directly localize
+                            # equiv: dta.astype(dtype)  # though deprecated
                             value = dta.tz_localize(dtype.tz)
                         else:
                             # Numeric values are UTC at this point,
                             # so localize and convert
+                            # equiv: Series(dta).astype(dtype)  # though deprecated
                             value = dta.tz_localize("UTC").tz_convert(dtype.tz)
                     elif is_timedelta64:
-                        value = to_timedelta(value, errors="raise")._values
+                        # if successful, we get a ndarray[td64ns]
+                        value, _ = sequence_to_td64ns(value)
             except OutOfBoundsDatetime:
                 raise
             except ValueError:
@@ -1593,9 +1649,7 @@ def maybe_cast_to_datetime(
                 pass

     # coerce datetimelike to object
-    elif is_datetime64_dtype(
-        getattr(value, "dtype", None)
-    ) and not is_datetime64_dtype(dtype):
+    elif is_datetime64_dtype(vdtype) and not is_datetime64_dtype(dtype):
         if is_object_dtype(dtype):
             value = cast(np.ndarray, value)

@@ -1616,7 +1670,7 @@ def maybe_cast_to_datetime(
         elif value.dtype == object:
             value = maybe_infer_to_datetimelike(value)

-    else:
+    elif not isinstance(value, ABCExtensionArray):
        # only do this if we have an array and the dtype of the array is not
        # setup already we are not an integer/object, so don't bother with this
        # conversion

@@ -1639,6 +1693,52 @@ def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray:
     return values


+def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
+    """
+    Convert dtypes with granularity less than nanosecond to nanosecond
+
+    >>> ensure_nanosecond_dtype(np.dtype("M8[s]"))
+    dtype('<M8[ns]')
+
+    >>> ensure_nanosecond_dtype(np.dtype("m8[ps]"))
+    Traceback (most recent call last):
+        ...
+    TypeError: cannot convert timedeltalike to dtype [timedelta64[ps]]
+    """
+    msg = (
+        f"The '{dtype.name}' dtype has no unit. "
+        f"Please pass in '{dtype.name}[ns]' instead."
+    )
+
+    # unpack e.g. SparseDtype
+    dtype = getattr(dtype, "subtype", dtype)
+
+    if not isinstance(dtype, np.dtype):
+        # i.e. datetime64tz
+        pass
+
+    elif dtype.kind == "M" and dtype != DT64NS_DTYPE:
+        # pandas supports dtype whose granularity is less than [ns]
+        # e.g., [ps], [fs], [as]
+        if dtype <= np.dtype("M8[ns]"):
+            if dtype.name == "datetime64":
+                raise ValueError(msg)
+            dtype = DT64NS_DTYPE
+        else:
+            raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]")
+
+    elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
+        # pandas supports dtype whose granularity is less than [ns]
+        # e.g., [ps], [fs], [as]
+        if dtype <= np.dtype("m8[ns]"):
+            if dtype.name == "timedelta64":
+                raise ValueError(msg)
+            dtype = TD64NS_DTYPE
+        else:
+            raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")
+    return dtype
+
+
 def find_common_type(types: List[DtypeObj]) -> DtypeObj:
     """
     Find a common data type among the given dtypes.
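A minimal usage sketch for the astype_array_safe helper added earlier in this
patch (illustrative only, run against this branch; the failing input is
hypothetical):

    import numpy as np

    from pandas.core.dtypes.cast import astype_array_safe

    values = np.array(["1", "2", "not-a-number"], dtype=object)

    # errors="raise" (the default) propagates the failed float cast ...
    try:
        astype_array_safe(values, np.float64)
    except ValueError:
        pass

    # ... while errors="ignore" hands back the original array unchanged.
    result = astype_array_safe(values, np.float64, errors="ignore")
    assert result is values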
@@ -1829,7 +1929,16 @@ def construct_1d_ndarray_preserving_na( else: if dtype is not None: _disallow_mismatched_datetimelike(values, dtype) - subarr = np.array(values, dtype=dtype, copy=copy) + + if ( + dtype == object + and isinstance(values, np.ndarray) + and values.dtype.kind in ["m", "M"] + ): + # TODO(numpy#12550): special-case can be removed + subarr = construct_1d_object_array_from_listlike(list(values)) + else: + subarr = np.array(values, dtype=dtype, copy=copy) return subarr diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 42ac786ff315e..1545b5b106803 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -25,7 +25,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import ( - array, + array as pd_array, ensure_wrapped_if_datetimelike, ) @@ -66,7 +66,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: if is_extension_array_dtype(dtype) and isinstance(arr, np.ndarray): # numpy's astype cannot handle ExtensionDtypes - return array(arr, dtype=dtype, copy=False) + return pd_array(arr, dtype=dtype, copy=False) return arr.astype(dtype, copy=False) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c95e65c70899..dcd6ef77238f9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -99,7 +99,6 @@ maybe_box_native, maybe_convert_platform, maybe_downcast_to_dtype, - maybe_infer_to_datetimelike, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -147,6 +146,7 @@ from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( extract_array, + sanitize_array, sanitize_masked_array, ) from pandas.core.generic import ( @@ -177,13 +177,12 @@ from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, - init_dict, - init_ndarray, - masked_rec_array_to_mgr, + dict_to_mgr, mgr_to_mgr, + ndarray_to_mgr, nested_data_to_arrays, + rec_array_to_mgr, reorder_arrays, - sanitize_index, to_arrays, treat_as_nested, ) @@ -564,41 +563,55 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr - if isinstance(data, (BlockManager, ArrayManager)): - if index is None and columns is None and dtype is None and copy is False: - # GH#33357 fastpath - NDFrame.__init__(self, data) - return + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if ( + index is None + and columns is None + and dtype is None + and copy is False + and isinstance(data, (BlockManager, ArrayManager)) + ): + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + + manager = get_option("mode.data_manager") + if isinstance(data, (BlockManager, ArrayManager)): mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) + mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager) # a masked array else: data = sanitize_masked_array(data) - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = ndarray_to_mgr( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) elif isinstance(data, 
(np.ndarray, Series, Index)): if data.dtype.names: - data_columns = list(data.dtype.names) - data = {k: data[k] for k in data_columns} - if columns is None: - columns = data_columns - mgr = init_dict(data, index, columns, dtype=dtype) + # i.e. numpy structured array + mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager) elif getattr(data, "name", None) is not None: - mgr = init_dict({data.name: data}, index, columns, dtype=dtype) + # i.e. Series/Index with non-None name + mgr = dict_to_mgr( + {data.name: data}, index, columns, dtype=dtype, typ=manager + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = ndarray_to_mgr( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) # For data is list-like, or Iterable (will consume into list) elif is_list_like(data): @@ -608,14 +621,20 @@ def __init__( if is_dataclass(data[0]): data = dataclasses_to_dicts(data) if treat_as_nested(data): + if columns is not None: + columns = ensure_index(columns) arrays, columns, index = nested_data_to_arrays( data, columns, index, dtype ) - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr( + arrays, columns, index, columns, dtype=dtype, typ=manager + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = ndarray_to_mgr( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) else: - mgr = init_dict({}, index, columns, dtype=dtype) + mgr = dict_to_mgr({}, index, columns, dtype=dtype, typ=manager) # For data is scalar else: if index is None or columns is None: @@ -632,18 +651,19 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + mgr = arrays_to_mgr( + values, columns, index, columns, dtype=None, typ=manager + ) else: values = construct_2d_arraylike_from_scalar( data, len(index), len(columns), dtype, copy ) - mgr = init_ndarray( - values, index, columns, dtype=values.dtype, copy=False + mgr = ndarray_to_mgr( + values, index, columns, dtype=values.dtype, copy=False, typ=manager ) # ensure correct Manager type according to settings - manager = get_option("mode.data_manager") mgr = mgr_to_mgr(mgr, typ=manager) NDFrame.__init__(self, mgr) @@ -1919,12 +1939,11 @@ def from_records( arr_columns_list.append(k) arrays.append(v) - arrays, arr_columns = reorder_arrays(arrays, arr_columns_list, columns) + arr_columns = Index(arr_columns_list) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) - if columns is not None: - columns = ensure_index(columns) arr_columns = columns else: arrays, arr_columns = to_arrays(data, columns) @@ -1934,9 +1953,7 @@ def from_records( arrays[i] = lib.maybe_convert_objects(arr, try_float=True) arr_columns = ensure_index(arr_columns) - if columns is not None: - columns = ensure_index(columns) - else: + if columns is None: columns = arr_columns if exclude is None: @@ -1971,7 +1988,8 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + manager = get_option("mode.data_manager") + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager) return cls(mgr) @@ -2178,6 +2196,7 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) + manager = get_option("mode.data_manager") mgr = arrays_to_mgr( arrays, 
             columns,
@@ -2185,6 +2204,7 @@
             columns,
             dtype=dtype,
             verify_integrity=verify_integrity,
+            typ=manager,
         )
         return cls(mgr)

@@ -2641,6 +2661,189 @@ def to_html(
             render_links=render_links,
         )

+    @doc(storage_options=generic._shared_docs["storage_options"])
+    def to_xml(
+        self,
+        path_or_buffer: Optional[FilePathOrBuffer] = None,
+        index: bool = True,
+        root_name: Optional[str] = "data",
+        row_name: Optional[str] = "row",
+        na_rep: Optional[str] = None,
+        attr_cols: Optional[Union[str, List[str]]] = None,
+        elem_cols: Optional[Union[str, List[str]]] = None,
+        namespaces: Optional[Dict[Optional[str], str]] = None,
+        prefix: Optional[str] = None,
+        encoding: str = "utf-8",
+        xml_declaration: Optional[bool] = True,
+        pretty_print: Optional[bool] = True,
+        parser: Optional[str] = "lxml",
+        stylesheet: Optional[FilePathOrBuffer] = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions = None,
+    ) -> Optional[str]:
+        """
+        Render a DataFrame to an XML document.
+
+        .. versionadded:: 1.3.0
+
+        Parameters
+        ----------
+        path_or_buffer : str, path object or file-like object, optional
+            File to write output to. If None, the output is returned as a
+            string.
+        index : bool, default True
+            Whether to include index in XML document.
+        root_name : str, default 'data'
+            The name of root element in XML document.
+        row_name : str, default 'row'
+            The name of row element in XML document.
+        na_rep : str, optional
+            Missing data representation.
+        attr_cols : list-like, optional
+            List of columns to write as attributes in row element.
+            Hierarchical columns will be flattened with underscore
+            delimiting the different levels.
+        elem_cols : list-like, optional
+            List of columns to write as children in row element. By default,
+            all columns output as children of row element. Hierarchical
+            columns will be flattened with underscore delimiting the
+            different levels.
+        namespaces : dict, optional
+            All namespaces to be defined in root element. Keys of dict
+            should be prefix names and values of dict corresponding URIs.
+            Default namespaces should be given empty string key. For
+            example, ::
+
+                namespaces = {{"": "https://example.com"}}
+
+        prefix : str, optional
+            Namespace prefix to be used for every element and/or attribute
+            in document. This should be one of the keys in ``namespaces``
+            dict.
+        encoding : str, default 'utf-8'
+            Encoding of the resulting document.
+        xml_declaration : bool, default True
+            Whether to include the XML declaration at start of document.
+        pretty_print : bool, default True
+            Whether output should be pretty printed with indentation and
+            line breaks.
+        parser : {{'lxml','etree'}}, default 'lxml'
+            Parser module to use for building of tree. Only 'lxml' and
+            'etree' are supported. With 'lxml', the ability to use XSLT
+            stylesheet is supported.
+        stylesheet : str, path object or file-like object, optional
+            A URL, file-like object, or a raw string containing an XSLT
+            script used to transform the raw XML output. Script should use
+            layout of elements and attributes from original output. This
+            argument requires ``lxml`` to be installed. Only XSLT 1.0
+            scripts and not later versions are currently supported.
+        compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+            For on-the-fly compression of the output data. If 'infer', then use
+            gzip, bz2, zip or xz if path_or_buffer is a string ending in
+            '.gz', '.bz2', '.zip', or 'xz', respectively, and no compression
+            otherwise.
+            If using 'zip', the resulting ZIP file will contain only one data
+            file. Set to None for no compression.
+        {storage_options}
+
+        Returns
+        -------
+        None or str
+            If ``path_or_buffer`` is None, returns the resulting XML format as a
+            string. Otherwise returns None.
+
+        See Also
+        --------
+        to_json : Convert the pandas object to a JSON string.
+        to_html : Convert DataFrame to HTML.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
+        ...                    'degrees': [360, 360, 180],
+        ...                    'sides': [4, np.nan, 3]}})
+
+        >>> df.to_xml()  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row>
+            <index>0</index>
+            <shape>square</shape>
+            <degrees>360</degrees>
+            <sides>4.0</sides>
+          </row>
+          <row>
+            <index>1</index>
+            <shape>circle</shape>
+            <degrees>360</degrees>
+            <sides/>
+          </row>
+          <row>
+            <index>2</index>
+            <shape>triangle</shape>
+            <degrees>180</degrees>
+            <sides>3.0</sides>
+          </row>
+        </data>
+
+        >>> df.to_xml(attr_cols=[
+        ...     'index', 'shape', 'degrees', 'sides'
+        ...     ])  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row index="0" shape="square" degrees="360" sides="4.0"/>
+          <row index="1" shape="circle" degrees="360"/>
+          <row index="2" shape="triangle" degrees="180" sides="3.0"/>
+        </data>
+
+        >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
+        ...           prefix="doc")  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <doc:data xmlns:doc="https://example.com">
+          <doc:row>
+            <doc:index>0</doc:index>
+            <doc:shape>square</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides>4.0</doc:sides>
+          </doc:row>
+          <doc:row>
+            <doc:index>1</doc:index>
+            <doc:shape>circle</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides/>
+          </doc:row>
+          <doc:row>
+            <doc:index>2</doc:index>
+            <doc:shape>triangle</doc:shape>
+            <doc:degrees>180</doc:degrees>
+            <doc:sides>3.0</doc:sides>
+          </doc:row>
+        </doc:data>
+        """
+
+        formatter = fmt.DataFrameFormatter(
+            self,
+            index=index,
+        )
+
+        return fmt.DataFrameRenderer(formatter).to_xml(
+            path_or_buffer=path_or_buffer,
+            index=index,
+            root_name=root_name,
+            row_name=row_name,
+            na_rep=na_rep,
+            attr_cols=attr_cols,
+            elem_cols=elem_cols,
+            namespaces=namespaces,
+            prefix=prefix,
+            encoding=encoding,
+            xml_declaration=xml_declaration,
+            pretty_print=pretty_print,
+            parser=parser,
+            stylesheet=stylesheet,
+            compression=compression,
+            storage_options=storage_options,
+        )
+
     # ----------------------------------------------------------------------
     @Substitution(
         klass="DataFrame",
@@ -4024,15 +4227,14 @@ def _sanitize_column(self, value) -> ArrayLike:
             value = _reindex_for_setitem(value, self.index)

         elif isinstance(value, ExtensionArray):
-            # Explicitly copy here, instead of in sanitize_index,
-            # as sanitize_index won't copy an EA, even with copy=True
+            # Explicitly copy here
             value = value.copy()
-            value = sanitize_index(value, self.index)
+            com.require_length_match(value, self.index)

         elif is_sequence(value):
+            com.require_length_match(value, self.index)

             # turn me into an ndarray
-            value = sanitize_index(value, self.index)
             if not isinstance(value, (np.ndarray, Index)):
                 if isinstance(value, list) and len(value) > 0:
                     value = maybe_convert_platform(value)
@@ -4045,7 +4247,7 @@ def _sanitize_column(self, value) -> ArrayLike:

             # possibly infer to datetimelike
             if is_object_dtype(value.dtype):
-                value = maybe_infer_to_datetimelike(value)
+                value = sanitize_array(value, None)

         else:
             value = construct_1d_arraylike_from_scalar(value, len(self), dtype=None)

@@ -5625,7 +5827,7 @@ def sort_index(
         self,
         axis: Axis = 0,
         level: Optional[Level] = None,
-        ascending: bool = True,
+        ascending: Union[Union[bool, int], Sequence[Union[bool, int]]] = True,
         inplace: bool = False,
         kind: str = "quicksort",
         na_position: str = "last",
@@ -5646,7 +5848,7 @@ def sort_index(
             and 1 identifies the columns.
         level : int or level name or list of ints or list of level names
             If not None, sort on values in specified index level(s).
-        ascending : bool or list of bools, default True
+        ascending : bool or list-like of bools, default True
             Sort ascending vs. descending. When the index is a MultiIndex the
             sort direction can be controlled for each level individually.
inplace : bool, default False diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb4c5c07af2c4..5bba7ab67b2bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -71,6 +71,7 @@ rewrite_axis_style_signature, ) from pandas.util._validators import ( + validate_ascending, validate_bool_kwarg, validate_fillna_kwargs, ) @@ -138,6 +139,7 @@ ArrayManager, BlockManager, ) +from pandas.core.internals.construction import mgr_to_mgr from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat @@ -4548,7 +4550,7 @@ def sort_index( self, axis=0, level=None, - ascending: bool_t = True, + ascending: Union[Union[bool_t, int], Sequence[Union[bool_t, int]]] = True, inplace: bool_t = False, kind: str = "quicksort", na_position: str = "last", @@ -4559,6 +4561,8 @@ def sort_index( inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) + ascending = validate_ascending(ascending) + target = self._get_axis(axis) indexer = get_indexer_indexer( @@ -5752,6 +5756,8 @@ def _to_dict_of_blocks(self, copy: bool_t = True): Internal ONLY - only works for BlockManager """ mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = mgr_to_mgr(mgr, "block") mgr = cast(BlockManager, mgr) return { k: self._constructor(v).__finalize__(self) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f4c69ea9d89db..aaf67fb1be532 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1815,6 +1815,8 @@ def count(self) -> DataFrame: ids, _, ngroups = self.grouper.group_info mask = ids != -1 + using_array_manager = isinstance(data, ArrayManager) + def hfunc(bvalues: ArrayLike) -> ArrayLike: # TODO(2DEA): reshape would not be necessary with 2D EAs if bvalues.ndim == 1: @@ -1824,6 +1826,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: masked = mask & ~isna(bvalues) counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + if using_array_manager: + # count_level_2d return (1, N) array for single column + # -> extract 1D array + counted = counted[0, :] return counted new_mgr = data.grouped_reduce(hfunc) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bf9fdb5d0cff7..d8135dbf3f08d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3098,11 +3098,12 @@ def sample( if random_state is not None: random_state = com.random_state(random_state) + group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) samples = [ obj.sample( n=n, frac=frac, replace=replace, weights=w, random_state=random_state ) - for (_, obj), w in zip(self, ws) + for (_, obj), w in zip(group_iterator, ws) ] return concat(samples, axis=self.axis) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5004d1fe08a5b..7fb6e98fb176e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -8,6 +8,7 @@ from __future__ import annotations import collections +import functools from typing import ( Dict, Generic, @@ -84,6 +85,7 @@ MultiIndex, ensure_index, ) +from pandas.core.internals import ArrayManager from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -94,6 +96,64 @@ get_indexer_dict, ) +_CYTHON_FUNCTIONS = { + "aggregate": { + "add": "group_add", + "prod": "group_prod", + "min": "group_min", + "max": "group_max", + "mean": "group_mean", + "median": "group_median", 
+ "var": "group_var", + "first": "group_nth", + "last": "group_last", + "ohlc": "group_ohlc", + }, + "transform": { + "cumprod": "group_cumprod", + "cumsum": "group_cumsum", + "cummin": "group_cummin", + "cummax": "group_cummax", + "rank": "group_rank", + }, +} + + +@functools.lru_cache(maxsize=None) +def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool): + + dtype_str = dtype.name + ftype = _CYTHON_FUNCTIONS[kind][how] + + # see if there is a fused-type version of function + # only valid for numeric + f = getattr(libgroupby, ftype, None) + if f is not None and is_numeric: + return f + + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, "object"]: + f2 = getattr(libgroupby, f"{ftype}_{dt}", None) + if f2 is not None: + return f2 + + if hasattr(f, "__signatures__"): + # inspect what fused types are implemented + if dtype_str == "object" and "object" not in f.__signatures__: + # disallow this function so we get a NotImplementedError below + # instead of a TypeError at runtime + f = None + + func = f + + if func is None: + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + + return func + class BaseGrouper: """ @@ -207,21 +267,25 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): group_keys = self._get_group_keys() result_values = None - sdata: FrameOrSeries = splitter._get_sorted_data() - if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): + if data.ndim == 2 and np.any(data.dtypes.apply(is_extension_array_dtype)): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray # TODO: can we have a workaround for EAs backed by ndarray? pass + elif isinstance(data._mgr, ArrayManager): + # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0 + # for now -> relies on BlockManager internals + pass elif ( com.get_callable_name(f) not in base.plotting_methods and isinstance(splitter, FrameSplitter) and axis == 0 # fast_apply/libreduction doesn't allow non-numpy backed indexes - and not sdata.index._has_complex_internals + and not data.index._has_complex_internals ): try: + sdata = splitter.sorted_data result_values, mutated = splitter.fast_apply(f, sdata, group_keys) except IndexError: @@ -380,28 +444,6 @@ def get_group_levels(self) -> List[Index]: # ------------------------------------------------------------ # Aggregation functions - _cython_functions = { - "aggregate": { - "add": "group_add", - "prod": "group_prod", - "min": "group_min", - "max": "group_max", - "mean": "group_mean", - "median": "group_median", - "var": "group_var", - "first": "group_nth", - "last": "group_last", - "ohlc": "group_ohlc", - }, - "transform": { - "cumprod": "group_cumprod", - "cumsum": "group_cumsum", - "cummin": "group_cummin", - "cummax": "group_cummax", - "rank": "group_rank", - }, - } - _cython_arity = {"ohlc": 4} # OHLC @final @@ -412,43 +454,6 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) - @final - def _get_cython_function( - self, kind: str, how: str, values: np.ndarray, is_numeric: bool - ): - - dtype_str = values.dtype.name - ftype = self._cython_functions[kind][how] - - # see if there is a fused-type version of function - # only valid for numeric - f = getattr(libgroupby, ftype, None) - if f is not None and is_numeric: - return f - - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, 
"object"]: - f2 = getattr(libgroupby, f"{ftype}_{dt}", None) - if f2 is not None: - return f2 - - if hasattr(f, "__signatures__"): - # inspect what fused types are implemented - if dtype_str == "object" and "object" not in f.__signatures__: - # disallow this function so we get a NotImplementedError below - # instead of a TypeError at runtime - f = None - - func = f - - if func is None: - raise NotImplementedError( - f"function is not implemented for this dtype: " - f"[how->{how},dtype->{dtype_str}]" - ) - - return func - @final def _get_cython_func_and_vals( self, kind: str, how: str, values: np.ndarray, is_numeric: bool @@ -469,7 +474,7 @@ def _get_cython_func_and_vals( values : np.ndarray """ try: - func = self._get_cython_function(kind, how, values, is_numeric) + func = _get_cython_function(kind, how, values.dtype, is_numeric) except NotImplementedError: if is_numeric: try: @@ -479,7 +484,7 @@ def _get_cython_func_and_vals( values = values.astype(complex) else: raise - func = self._get_cython_function(kind, how, values, is_numeric) + func = _get_cython_function(kind, how, values.dtype, is_numeric) else: raise return func, values @@ -537,7 +542,7 @@ def _ea_wrap_cython_operation( return res_values res_values = res_values.astype("i8", copy=False) - result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype) + result = type(orig_values)(res_values, dtype=orig_values.dtype) return result elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): @@ -774,7 +779,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): counts[label] = group.shape[0] result[label] = res - result = lib.maybe_convert_objects(result, try_float=0) + result = lib.maybe_convert_objects(result, try_float=False) result = maybe_cast_result(result, obj, numeric_only=True) return result, counts @@ -983,7 +988,7 @@ def sort_idx(self): return get_group_index_sorter(self.labels, self.ngroups) def __iter__(self): - sdata = self._get_sorted_data() + sdata = self.sorted_data if self.ngroups == 0: # we are inside a generator, rather than raise StopIteration @@ -995,7 +1000,8 @@ def __iter__(self): for i, (start, end) in enumerate(zip(starts, ends)): yield i, self._chop(sdata, slice(start, end)) - def _get_sorted_data(self) -> FrameOrSeries: + @cache_readonly + def sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj: slice) -> NDFrame: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e633d6b28a8c5..30190ef950af5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -141,13 +141,17 @@ PandasObject, ) import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.frozen import FrozenList from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( ensure_key_mapped, + get_group_index_sorter, nargsort, ) from pandas.core.strings import StringMethods @@ -816,7 +820,7 @@ def view(self, cls=None): arr = self._data.view("i8") idx_cls = self._dtype_to_subclass(dtype) arr_cls = idx_cls._data_cls - arr = arr_cls._simple_new(self._data.view("i8"), dtype=dtype) + arr = arr_cls(self._data.view("i8"), dtype=dtype) return idx_cls._simple_new(arr, name=self.name) result = self._data.view(cls) @@ -1152,7 +1156,7 @@ def 
_format_with_header( values = self._values if is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, safe=1) + values = lib.maybe_convert_objects(values, safe=True) result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] @@ -2912,7 +2916,7 @@ def union(self, other, sort=None): return self._wrap_setop_result(other, result) - def _union(self, other, sort): + def _union(self, other: Index, sort): """ Specific union logic should go here. In subclasses, union behavior should be overwritten here rather than in `self.union`. @@ -3041,7 +3045,7 @@ def intersection(self, other, sort=False): result = self._intersection(other, sort=sort) return self._wrap_setop_result(other, result) - def _intersection(self, other, sort=False): + def _intersection(self, other: Index, sort=False): """ intersection specialized to the case with matching dtypes. """ @@ -3055,13 +3059,14 @@ def _intersection(self, other, sort=False): except TypeError: pass else: - return algos.unique1d(result) + # TODO: algos.unique1d should preserve DTA/TDA + res = algos.unique1d(result) + return ensure_wrapped_if_datetimelike(res) try: indexer = other.get_indexer(lvals) - except (InvalidIndexError, IncompatibleFrequency): + except InvalidIndexError: # InvalidIndexError raised by get_indexer if non-unique - # IncompatibleFrequency raised by PeriodIndex.get_indexer indexer, _ = other.get_indexer_non_unique(lvals) mask = indexer != -1 @@ -4098,9 +4103,7 @@ def _get_leaf_sorter(labels): return np.empty(0, dtype="int64") if len(labels) == 1: - lab = ensure_int64(labels[0]) - sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) - return sorter + return get_group_index_sorter(labels[0]) # find indexers of beginning of each set of # same-key labels w.r.t all but last level diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 6d5992540ef49..f7e37b10ef74c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -47,7 +47,6 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCSeries from pandas.core.arrays import ( DatetimeArray, @@ -86,25 +85,22 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): @staticmethod # type: ignore[misc] def wrapper(left, right): # Note: these only get called with left.dtype == right.dtype - if isinstance( - left, (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin) - ): - left = left.view("i8") - if isinstance( - right, - (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin), - ): - right = right.view("i8") + orig_left = left + + left = left.view("i8") + right = right.view("i8") results = joinf(left, right) if with_indexers: - # dtype should be timedelta64[ns] for TimedeltaIndex - # and datetime64[ns] for DatetimeIndex - dtype = cast(np.dtype, left.dtype).base join_index, left_indexer, right_indexer = results - join_index = join_index.view(dtype) + if not isinstance(orig_left, np.ndarray): + # When called from Index._intersection/_union, we have the EA + join_index = join_index.view(orig_left._ndarray.dtype) + join_index = orig_left._from_backing_data(join_index) + return join_index, left_indexer, right_indexer + return results return wrapper @@ -618,13 +614,10 @@ def delete(self: _T, loc) -> _T: @doc(NDArrayBackedExtensionIndex.insert) def insert(self, loc: int, item): - try: - result = super().insert(loc, item) - except (ValueError, TypeError): - # i.e. 
self._data._validate_scalar raised - return self.astype(object).insert(loc, item) - - result._data._freq = self._get_insert_freq(loc, item) + result = super().insert(loc, item) + if isinstance(result, type(self)): + # i.e. parent class method did not cast + result._data._freq = self._get_insert_freq(loc, item) return result # -------------------------------------------------------------------- @@ -650,7 +643,8 @@ def _get_join_freq(self, other): def _wrap_joined_index(self, joined: np.ndarray, other): assert other.dtype == self.dtype, (other.dtype, self.dtype) - + assert joined.dtype == "i8" or joined.dtype == self.dtype, joined.dtype + joined = joined.view(self._data._ndarray.dtype) result = super()._wrap_joined_index(joined, other) result._data._freq = self._get_join_freq(other) return result @@ -708,6 +702,8 @@ def _intersection(self, other: Index, sort=False) -> Index: # We need to invalidate the freq because Index._intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. + # At this point we should have result.dtype == self.dtype + # and type(result) is type(self._data) result = self._wrap_setop_result(other, result) return result._with_freq(None)._with_freq("infer") diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index f1418869713d6..ac70200c0c404 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -16,6 +16,10 @@ doc, ) +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from, +) from pandas.core.dtypes.common import ( is_dtype_equal, is_object_dtype, @@ -370,11 +374,19 @@ def insert(self: _T, loc: int, item) -> _T: ValueError if the item is not valid for this dtype. """ arr = self._data - code = arr._validate_scalar(item) - - new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:])) - new_arr = arr._from_backing_data(new_vals) - return type(self)._simple_new(new_arr, name=self.name) + try: + code = arr._validate_scalar(item) + except (ValueError, TypeError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. + dtype, _ = infer_dtype_from(item, pandas_dtype=True) + dtype = find_common_type([self.dtype, dtype]) + return self.astype(dtype).insert(loc, item) + else: + new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:])) + new_arr = arr._from_backing_data(new_vals) + return type(self)._simple_new(new_arr, name=self.name) def putmask(self, mask, value): res_values = self._data.copy() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1889821c79756..88b92c7b304ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3719,12 +3719,7 @@ def insert(self, loc: int, item) -> MultiIndex: # must insert at end otherwise you have to recompute all the # other codes lev_loc = len(level) - try: - level = level.insert(lev_loc, k) - except TypeError: - # TODO: Should this be done inside insert? - # TODO: smarter casting rules? 
-                level = level.astype(object).insert(lev_loc, k)
+                level = level.insert(lev_loc, k)
             else:
                 lev_loc = level.get_loc(k)

diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 3615d85273f99..bd6ec38b292f6 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -539,7 +539,7 @@ def equals(self, other: object) -> bool:
     # --------------------------------------------------------------------
     # Set Operations

-    def _intersection(self, other, sort=False):
+    def _intersection(self, other: Index, sort=False):

         if not isinstance(other, RangeIndex):
             # Int64Index
@@ -614,7 +614,7 @@ def _extended_gcd(self, a, b):
             old_t, t = t, old_t - quotient * t
         return old_r, old_s, old_t

-    def _union(self, other, sort):
+    def _union(self, other: Index, sort):
         """
         Form the union of two Index objects and sorts if possible

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index cfe16627d5c64..bded503a1e6db 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1421,6 +1421,15 @@ def _has_valid_setitem_indexer(self, indexer) -> bool:
         if isinstance(indexer, dict):
             raise IndexError("iloc cannot enlarge its target object")

+        if isinstance(indexer, ABCDataFrame):
+            warnings.warn(
+                "DataFrame indexer for .iloc is deprecated and will be removed "
+                "in a future version.\n"
+                "Consider using .loc with a DataFrame indexer for automatic alignment.",
+                FutureWarning,
+                stacklevel=3,
+            )
+
         if not isinstance(indexer, tuple):
             indexer = _tuplify(self.ndim, indexer)

@@ -1508,6 +1517,12 @@ def _get_list_axis(self, key, axis: int):
             raise IndexError("positional indexers are out-of-bounds") from err

     def _getitem_axis(self, key, axis: int):
+        if isinstance(key, ABCDataFrame):
+            raise IndexError(
+                "DataFrame indexer is not allowed for .iloc\n"
+                "Consider using .loc for automatic alignment."
+            )
+
         if isinstance(key, slice):
             return self._get_slice_axis(key, axis=axis)

@@ -1641,7 +1656,17 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
                     # so the object is the same
                     index = self.obj._get_axis(i)
                     labels = index.insert(len(index), key)
-                    self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr
+
+                    # We are expanding the Series/DataFrame values to match
+                    # the length of the new index `labels`. GH#40096 ensure
+                    # this is valid even if the index has duplicates.
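+                    # Sketch with hypothetical values: for index=[0, 0] and a
+                    # new key "x", labels == Index([0, 0, "x"]) and
+                    # taker == [0, 1, -1]; _reindex_with_indexers then takes
+                    # the existing rows by position and marks the appended row
+                    # as missing (it is written below), instead of reindexing
+                    # by the duplicated labels.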
+ taker = np.arange(len(index) + 1, dtype=np.intp) + taker[-1] = -1 + reindexers = {i: (labels, taker)} + new_obj = self.obj._reindex_with_indexers( + reindexers, allow_dups=True + ) + self.obj._mgr = new_obj._mgr self.obj._maybe_update_cacher(clear=True) self.obj._is_copy = None diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index e0447378c4542..998f1ffcf02ee 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -28,11 +28,13 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( + astype_array_safe, find_common_type, infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( is_bool_dtype, + is_datetime64_ns_dtype, is_dtype_equal, is_extension_array_dtype, is_numeric_dtype, @@ -53,7 +55,11 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, +) from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -113,6 +119,7 @@ def __init__( if verify_integrity: self._axes = [ensure_index(ax) for ax in axes] + self.arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] self._verify_integrity() def make_empty(self: T, axes=None) -> T: @@ -270,15 +277,30 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: ------- ArrayManager """ - # TODO ignore_failures - result_arrays = [func(arr) for arr in self.arrays] + result_arrays: List[np.ndarray] = [] + result_indices: List[int] = [] + + for i, arr in enumerate(self.arrays): + try: + res = func(arr) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_arrays.append(res) + result_indices.append(i) if len(result_arrays) == 0: index = Index([None]) # placeholder else: index = Index(range(result_arrays[0].shape[0])) - return type(self)(result_arrays, [index, self.items]) + if ignore_failures: + columns = self.items[np.array(result_indices, dtype="int64")] + else: + columns = self.items + + return type(self)(result_arrays, [index, columns]) def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: """ @@ -452,7 +474,13 @@ def putmask(self, mask, new, align: bool = True): ) def diff(self, n: int, axis: int) -> ArrayManager: - return self.apply_with_block("diff", n=n, axis=axis) + axis = self._normalize_axis(axis) + if axis == 1: + # DataFrame only calls this for n=0, in which case performing it + # with axis=0 is equivalent + assert n == 0 + axis = 0 + return self.apply(algos.diff, n=n, axis=axis) def interpolate(self, **kwargs) -> ArrayManager: return self.apply_with_block("interpolate", **kwargs) @@ -478,7 +506,7 @@ def downcast(self) -> ArrayManager: return self.apply_with_block("downcast") def astype(self, dtype, copy: bool = False, errors: str = "raise") -> ArrayManager: - return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) + return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) def convert( self, @@ -694,20 +722,16 @@ def fast_xs(self, loc: int) -> ArrayLike: """ dtype = _interleaved_dtype(self.arrays) - if isinstance(dtype, SparseDtype): - temp_dtype = dtype.subtype - elif isinstance(dtype, PandasDtype): - temp_dtype = dtype.numpy_dtype - elif is_extension_array_dtype(dtype): - temp_dtype = "object" - elif is_dtype_equal(dtype, str): - temp_dtype = "object" - else: - temp_dtype = dtype 
- - result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype) + values = [arr[loc] for arr in self.arrays] if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + result = dtype.construct_array_type()._from_sequence(values, dtype=dtype) + # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT + elif is_datetime64_ns_dtype(dtype): + result = DatetimeArray._from_sequence(values, dtype=dtype)._data + elif is_timedelta64_ns_dtype(dtype): + result = TimedeltaArray._from_sequence(values, dtype=dtype)._data + else: + result = np.array(values, dtype=dtype) return result def iget(self, i: int) -> SingleBlockManager: @@ -816,7 +840,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = extract_array(value, extract_numpy=True) if value.ndim == 2: - value = value[0, :] + if value.shape[0] == 1: + value = value[0, :] + else: + raise ValueError( + f"Expected a 1D array, got an array with shape {value.shape}" + ) + # TODO self.arrays can be empty # assert len(value) == len(self.arrays[0]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 689a067e1c211..f2b8499a316b7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,6 +1,5 @@ from __future__ import annotations -import inspect import re from typing import ( TYPE_CHECKING, @@ -36,8 +35,7 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( - astype_dt64_to_dt64tz, - astype_nansafe, + astype_array_safe, can_hold_element, find_common_type, infer_dtype_from, @@ -49,7 +47,6 @@ ) from pandas.core.dtypes.common import ( is_categorical_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, @@ -652,33 +649,11 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ------- Block """ - errors_legal_values = ("raise", "ignore") - - if errors not in errors_legal_values: - invalid_arg = ( - "Expected value of kwarg 'errors' to be one of " - f"{list(errors_legal_values)}. Supplied value is '{errors}'" - ) - raise ValueError(invalid_arg) - - if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): - msg = ( - f"Expected an instance of {dtype.__name__}, " - "but got the class instead. Try instantiating 'dtype'." - ) - raise TypeError(msg) - - dtype = pandas_dtype(dtype) + values = self.values + if values.dtype.kind in ["m", "M"]: + values = self.array_values() - try: - new_values = self._astype(dtype, copy=copy) - except (ValueError, TypeError): - # e.g. 
astype_nansafe can fail on object-dtype of strings - # trying to convert to float - if errors == "ignore": - new_values = self.values - else: - raise + new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) newb = self.make_block(new_values) if newb.shape != self.shape: @@ -689,37 +664,6 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) return newb - def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: - values = self.values - if values.dtype.kind in ["m", "M"]: - values = self.array_values() - - if ( - values.dtype.kind in ["m", "M"] - and dtype.kind in ["i", "u"] - and isinstance(dtype, np.dtype) - and dtype.itemsize != 8 - ): - # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced - msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" - raise TypeError(msg) - - if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): - return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) - - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - if isinstance(values, ExtensionArray): - values = values.astype(dtype, copy=copy) - - else: - values = astype_nansafe(values, dtype, copy=copy) - - return values - def convert( self, copy: bool = True, @@ -796,7 +740,6 @@ def replace( It is used in ObjectBlocks. It is here for API compatibility. """ inplace = validate_bool_kwarg(inplace, "inplace") - original_to_replace = to_replace if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that @@ -814,9 +757,20 @@ def replace( return [self] if inplace else [self.copy()] if not self._can_hold_element(value): - blk = self.astype(object) + if self.ndim == 2 and self.shape[0] > 1: + # split so that we only upcast where necessary + nbs = self._split() + res_blocks = extend_blocks( + [ + blk.replace(to_replace, value, inplace=inplace, regex=regex) + for blk in nbs + ] + ) + return res_blocks + + blk = self.coerce_to_target_dtype(value) return blk.replace( - to_replace=original_to_replace, + to_replace=to_replace, value=value, inplace=True, regex=regex, @@ -824,7 +778,7 @@ def replace( blk = self if inplace else self.copy() putmask_inplace(blk.values, mask, value) - blocks = blk.convert(numeric=False, copy=not inplace) + blocks = blk.convert(numeric=False, copy=False) return blocks @final @@ -867,11 +821,7 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) - if convert: - nbs = block.convert(numeric=False) - else: - nbs = [block] - return nbs + return [block] @final def _replace_list( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 9903dab9976c4..9a7ae39b9f8eb 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -22,10 +22,9 @@ from pandas._libs import lib from pandas._typing import ( - Axis, + ArrayLike, DtypeObj, Manager, - Scalar, ) from pandas.core.dtypes.cast import ( @@ -36,7 +35,6 @@ maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast, - sanitize_to_nanoseconds, ) from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -71,7 +69,9 @@ get_objs_combined_axis, union_indexes, ) +from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.managers import ( + BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks, ) @@ -79,6 +79,7 @@ if TYPE_CHECKING: from numpy.ma.mrecords import MaskedRecords + # 
--------------------------------------------------------------------- # BlockManager Interface @@ -90,7 +91,8 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, -): + typ: Optional[str] = None, +) -> Manager: """ Segregate Series based on type and coerce into matrices. @@ -108,19 +110,29 @@ def arrays_to_mgr( # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) - columns = ensure_index(columns) else: - columns = ensure_index(columns) index = ensure_index(index) + columns = ensure_index(columns) + # from BlockManager perspective axes = [columns, index] - return create_block_manager_from_arrays(arrays, arr_names, axes) + if typ == "block": + return create_block_manager_from_arrays(arrays, arr_names, axes) + elif typ == "array": + return ArrayManager(arrays, [index, columns]) + else: + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") -def masked_rec_array_to_mgr( - data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool +def rec_array_to_mgr( + data: Union[MaskedRecords, np.recarray, np.ndarray], + index, + columns, + dtype: Optional[DtypeObj], + copy: bool, + typ: str, ): """ Extract from a masked rec array and create the manager. @@ -129,48 +141,54 @@ def masked_rec_array_to_mgr( fdata = ma.getdata(data) if index is None: index = _get_names_from_index(fdata) - if index is None: - index = ibase.default_index(len(data)) - index = ensure_index(index) + else: + index = ensure_index(index) if columns is not None: columns = ensure_index(columns) arrays, arr_columns = to_arrays(fdata, columns) # fill if needed - new_arrays = [] - for col in arr_columns: - arr = data[col] - fv = arr.fill_value - - mask = ma.getmaskarray(arr) - if mask.any(): - arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) - arr[mask] = fv - new_arrays.append(arr) + if isinstance(data, np.ma.MaskedArray): + new_arrays = fill_masked_arrays(data, arr_columns) + else: + new_arrays = arrays # create the manager arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns) if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ) if copy: mgr = mgr.copy() return mgr +def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> List[np.ndarray]: + """ + Convert numpy MaskedRecords to ensure mask is softened. + """ + new_arrays = [] + + for col in arr_columns: + arr = data[col] + fv = arr.fill_value + + mask = ma.getmaskarray(arr) + if mask.any(): + arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) + arr[mask] = fv + new_arrays.append(arr) + return new_arrays + + def mgr_to_mgr(mgr, typ: str): """ Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. 
""" - from pandas.core.internals import ( - ArrayManager, - BlockManager, - ) - new_mgr: Manager if typ == "block": @@ -178,7 +196,7 @@ def mgr_to_mgr(mgr, typ: str): new_mgr = mgr else: new_mgr = arrays_to_mgr( - mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None + mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block" ) elif typ == "array": if isinstance(mgr, ArrayManager): @@ -187,7 +205,7 @@ def mgr_to_mgr(mgr, typ: str): arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))] new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) else: - raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'") + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") return new_mgr @@ -195,13 +213,16 @@ def mgr_to_mgr(mgr, typ: str): # DataFrame Constructor Interface -def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): - # input must be a ndarray, list, Series, index +def ndarray_to_mgr( + values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str +) -> Manager: + # used in DataFrame.__init__ + # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: - columns = [values.name] + columns = Index([values.name]) if index is None: index = values.index else: @@ -224,22 +245,33 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): - try: - values = construct_1d_ndarray_preserving_na( - values.ravel(), dtype=dtype, copy=False - ).reshape(values.shape) - except Exception as orig: - # e.g. ValueError when trying to cast object dtype to float64 - raise ValueError( - f"failed to cast to '{dtype}' (Exception was: {orig})" - ) from orig + shape = values.shape + flat = values.ravel() + + if not is_integer_dtype(dtype): + # TODO: skipping integer_dtype is needed to keep the tests passing, + # not clear it is correct + # Note: we really only need _try_cast, but keeping to exposed funcs + values = sanitize_array( + flat, None, dtype=dtype, copy=copy, raise_cast_failure=True + ) + else: + try: + values = construct_1d_ndarray_preserving_na( + flat, dtype=dtype, copy=False + ) + except Exception as err: + # e.g. ValueError when trying to cast object dtype to float64 + msg = f"failed to cast to '{dtype}' (Exception was: {err})" + raise ValueError(msg) from err + values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes( @@ -277,10 +309,14 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): return create_block_manager_from_blocks(block_values, [columns, index]) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def dict_to_mgr( + data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str +) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. 
+ + Used in DataFrame.__init__ """ arrays: Union[Sequence[Any], Series] @@ -321,7 +357,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ) def nested_data_to_arrays( @@ -336,7 +372,7 @@ def nested_data_to_arrays( # By the time we get here we have already checked treat_as_nested(data) if is_named_tuple(data[0]) and columns is None: - columns = data[0]._fields + columns = ensure_index(data[0]._fields) arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) @@ -415,6 +451,11 @@ def _homogenize(data, index: Index, dtype: Optional[DtypeObj]): # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) + # TODO extract_array should be preferred, but that gives failures for + # `extension/test_numpy.py` (extract_array will convert numpy arrays + # to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021 + # val = extract_array(val, extract_numpy=True) + val = val._values else: if isinstance(val, dict): if oindex is None: @@ -492,21 +533,18 @@ def extract_index(data) -> Index: return ensure_index(index) -def reorder_arrays(arrays, arr_columns, columns): +def reorder_arrays( + arrays: List[ArrayLike], arr_columns: Index, columns: Optional[Index] +) -> Tuple[List[ArrayLike], Index]: # reorder according to the columns - if ( - columns is not None - and len(columns) - and arr_columns is not None - and len(arr_columns) - ): + if columns is not None and len(columns) and len(arr_columns): indexer = ensure_index(arr_columns).get_indexer(columns) arr_columns = ensure_index([arr_columns[i] for i in indexer]) arrays = [arrays[i] for i in indexer] return arrays, arr_columns -def _get_names_from_index(data): +def _get_names_from_index(data) -> Index: has_some_name = any(getattr(s, "name", None) is not None for s in data) if not has_some_name: return ibase.default_index(len(data)) @@ -521,7 +559,7 @@ def _get_names_from_index(data): index[i] = f"Unnamed {count}" count += 1 - return index + return Index(index) def _get_axes( @@ -574,7 +612,9 @@ def dataclasses_to_dicts(data): # Conversion of Inputs to Arrays -def to_arrays(data, columns, dtype: Optional[DtypeObj] = None): +def to_arrays( + data, columns: Optional[Index], dtype: Optional[DtypeObj] = None +) -> Tuple[List[ArrayLike], Index]: """ Return list of arrays, columns. """ @@ -595,8 +635,10 @@ def to_arrays(data, columns, dtype: Optional[DtypeObj] = None): if isinstance(data, np.ndarray): columns = data.dtype.names if columns is not None: - return [[]] * len(columns), columns - return [], [] # columns if columns is not None else [] + # i.e. numpy structured array + arrays = [data[name] for name in columns] + return arrays, ensure_index(columns) + return [], ensure_index([]) elif isinstance(data[0], Categorical): if columns is None: @@ -605,12 +647,12 @@ def to_arrays(data, columns, dtype: Optional[DtypeObj] = None): elif isinstance(data, np.ndarray) and data.dtype.names is not None: # e.g. 
recarray - columns = list(data.dtype.names) + columns = Index(list(data.dtype.names)) arrays = [data[k] for k in columns] return arrays, columns if isinstance(data[0], (list, tuple)): - content, columns = _list_to_arrays(data, columns) + content = _list_to_arrays(data) elif isinstance(data[0], abc.Mapping): content, columns = _list_of_dict_to_arrays(data, columns) elif isinstance(data[0], ABCSeries): @@ -618,35 +660,35 @@ def to_arrays(data, columns, dtype: Optional[DtypeObj] = None): else: # last ditch effort data = [tuple(x) for x in data] - content, columns = _list_to_arrays(data, columns) + content = _list_to_arrays(data) content, columns = _finalize_columns_and_data(content, columns, dtype) return content, columns -def _list_to_arrays( - data: List[Scalar], - columns: Union[Index, List], -) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: +def _list_to_arrays(data: List[Union[Tuple, List]]) -> np.ndarray: + # Returned np.ndarray has ndim = 2 # Note: we already check len(data) > 0 before getting hre if isinstance(data[0], tuple): content = lib.to_object_array_tuples(data) else: # list of lists content = lib.to_object_array(data) - return content, columns + return content def _list_of_series_to_arrays( data: List, - columns: Union[Index, List], -) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: + columns: Optional[Index], +) -> Tuple[np.ndarray, Index]: + # returned np.ndarray has ndim == 2 + if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] columns = get_objs_combined_axis(pass_data, sort=False) - indexer_cache: Dict[int, Scalar] = {} + indexer_cache: Dict[int, np.ndarray] = {} aligned_values = [] for s in data: @@ -669,8 +711,8 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( data: List[Dict], - columns: Union[Index, List], -) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: + columns: Optional[Index], +) -> Tuple[np.ndarray, Index]: """ Convert list of dicts to numpy arrays @@ -687,13 +729,14 @@ def _list_of_dict_to_arrays( Returns ------- - tuple - arrays, columns + content : np.ndarray[object, ndim=2] + columns : Index """ if columns is None: gen = (list(x.keys()) for x in data) sort = not any(isinstance(d, dict) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + columns = ensure_index(columns) # assure that they are of the base dict class and not of derived # classes @@ -704,10 +747,10 @@ def _list_of_dict_to_arrays( def _finalize_columns_and_data( - content: np.ndarray, - columns: Optional[Union[Index, List]], + content: np.ndarray, # ndim == 2 + columns: Optional[Index], dtype: Optional[DtypeObj], -) -> Tuple[List[np.ndarray], Union[Index, List[Axis]]]: +) -> Tuple[List[np.ndarray], Index]: """ Ensure we have valid columns, cast object dtypes if possible. """ @@ -725,21 +768,21 @@ def _finalize_columns_and_data( def _validate_or_indexify_columns( - content: List, columns: Optional[Union[Index, List]] -) -> Union[Index, List[Axis]]: + content: List[np.ndarray], columns: Optional[Index] +) -> Index: """ If columns is None, make numbers as column names; Otherwise, validate that columns have valid length. Parameters ---------- - content: list of data - columns: Iterable or None + content : list of np.ndarrays + columns : Index or None Returns ------- - columns: If columns is Iterable, return as is; If columns is None, assign - positional column index value as columns. 
+ Index + If columns is None, assign positional column index value as columns. Raises ------ @@ -783,19 +826,19 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: List[Scalar], dtype: Optional[DtypeObj] = None -) -> List[Scalar]: + content: List[np.ndarray], dtype: Optional[DtypeObj] +) -> List[ArrayLike]: """ Internal function to convert object array. Parameters ---------- - content: list of processed data records - dtype: np.dtype, default is None + content: List[np.ndarray] + dtype: np.dtype or ExtensionDtype Returns ------- - arrays: casted content if not object dtype, otherwise return as is in list. + List[ArrayLike] """ # provide soft conversion of object dtypes def convert(arr): @@ -807,28 +850,3 @@ def convert(arr): arrays = [convert(arr) for arr in content] return arrays - - -# --------------------------------------------------------------------- -# Series-Based - - -def sanitize_index(data, index: Index): - """ - Sanitize an index type to return an ndarray of the underlying, pass - through a non-Index. - """ - if len(data) != len(index): - raise ValueError( - "Length of values " - f"({len(data)}) " - "does not match length of index " - f"({len(index)})" - ) - - if isinstance(data, np.ndarray): - - # coerce datetimelike types to ns - data = sanitize_to_nanoseconds(data) - - return data diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e013a7f680d6f..2ad7471d6f086 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1646,6 +1646,15 @@ def fast_xs(self, loc): """ raise NotImplementedError("Use series._values[loc] instead") + def set_values(self, values: ArrayLike): + """ + Set the values of the single block in place. + + Use at your own risk! This does not check if the passed values are + valid for the current Block/SingleBlockManager (length, dtype, etc). + """ + self.blocks[0].values = values + # -------------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 0b77a6d821c6d..d1597b23cf577 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -3,7 +3,10 @@ """ from __future__ import annotations -from functools import partial +from functools import ( + partial, + wraps, +) from typing import ( TYPE_CHECKING, Any, @@ -11,6 +14,7 @@ Optional, Set, Union, + cast, ) import numpy as np @@ -22,15 +26,13 @@ from pandas._typing import ( ArrayLike, Axis, - DtypeObj, + F, ) from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( - ensure_float64, is_array_like, - is_integer_dtype, is_numeric_v_string_like, needs_i8_conversion, ) @@ -674,54 +676,53 @@ def interpolate_2d( return result -def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): - """ - Cast values to a dtype that algos.pad and algos.backfill can handle. - """ - # TODO: for int-dtypes we make a copy, but for everything else this - # alters the values in-place. Is this intentional? 
+def _fillna_prep(values, mask=None): + # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d - if needs_i8_conversion(dtype): - values = values.view(np.int64) + if mask is None: + mask = isna(values) - elif is_integer_dtype(values) and not has_mask: - # NB: this check needs to come after the datetime64 check above - # has_mask check to avoid casting i8 values that have already - # been cast from PeriodDtype - values = ensure_float64(values) + mask = mask.view(np.uint8) + return mask - return values +def _datetimelike_compat(func: F) -> F: + """ + Wrapper to handle datetime64 and timedelta64 dtypes. + """ -def _fillna_prep(values, mask=None): - # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d - dtype = values.dtype + @wraps(func) + def new_func(values, limit=None, mask=None): + if needs_i8_conversion(values.dtype): + if mask is None: + # This needs to occur before casting to int64 + mask = isna(values) - has_mask = mask is not None - if not has_mask: - # This needs to occur before datetime/timedeltas are cast to int64 - mask = isna(values) + result = func(values.view("i8"), limit=limit, mask=mask) + return result.view(values.dtype) - values = _cast_values_for_fillna(values, dtype, has_mask) + return func(values, limit=limit, mask=mask) - mask = mask.view(np.uint8) - return values, mask + return cast(F, new_func) +@_datetimelike_compat def _pad_1d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) + mask = _fillna_prep(values, mask) algos.pad_inplace(values, mask, limit=limit) return values +@_datetimelike_compat def _backfill_1d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) + mask = _fillna_prep(values, mask) algos.backfill_inplace(values, mask, limit=limit) return values +@_datetimelike_compat def _pad_2d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) + mask = _fillna_prep(values, mask) if np.all(values.shape): algos.pad_2d_inplace(values, mask, limit=limit) @@ -731,8 +732,9 @@ def _pad_2d(values, limit=None, mask=None): return values +@_datetimelike_compat def _backfill_2d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) + mask = _fillna_prep(values, mask) if np.all(values.shape): algos.backfill_2d_inplace(values, mask, limit=limit) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 24e75a2bbeff2..a0dfb1c83a70b 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1743,8 +1743,9 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: result = result.view(orig_dtype) else: # DatetimeArray + # TODO: have this case go through a DTA method? 
result = type(values)._simple_new( # type: ignore[attr-defined] - result, dtype=orig_dtype + result.view("M8[ns]"), dtype=orig_dtype ) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 543bf44e61216..271bb2ca8dd75 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -9,7 +9,6 @@ import numpy as np -import pandas._libs.algos as libalgos import pandas._libs.reshape as libreshape from pandas._libs.sparse import IntIndex from pandas._typing import Dtype @@ -42,6 +41,7 @@ decons_obs_group_ids, get_compressed_ids, get_group_index, + get_group_index_sorter, ) @@ -139,8 +139,7 @@ def _indexer_and_to_sort(self): comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) - indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0] - indexer = ensure_platform_int(indexer) + indexer = get_group_index_sorter(comp_index, ngroups) return indexer, to_sort diff --git a/pandas/core/series.py b/pandas/core/series.py index 3f43b27cd88ce..6ee6ea801d872 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -15,9 +15,11 @@ Iterable, List, Optional, + Sequence, Tuple, Type, Union, + cast, ) import warnings @@ -94,7 +96,7 @@ ops, ) from pandas.core.accessor import CachedAccessor -from pandas.core.apply import series_apply +from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor @@ -124,7 +126,6 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager -from pandas.core.internals.construction import sanitize_index from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -374,10 +375,8 @@ def __init__( "`index` argument. `copy` must be False." 
) - elif is_extension_array_dtype(data): + elif isinstance(data, ExtensionArray): pass - elif isinstance(data, (set, frozenset)): - raise TypeError(f"'{type(data).__name__}' type is unordered") else: data = com.maybe_iterable_to_list(data) @@ -386,7 +385,7 @@ def __init__( data = [data] index = ibase.default_index(len(data)) elif is_list_like(data): - sanitize_index(data, index) + com.require_length_match(data, index) # create/copy the manager if isinstance(data, SingleBlockManager): @@ -803,7 +802,7 @@ def __array__(self, dtype: Optional[NpDtype] = None) -> np.ndarray: array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - return np.asarray(self.array, dtype) + return np.asarray(self._values, dtype) # ---------------------------------------------------------------------- # Unary Methods @@ -1797,7 +1796,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(self.array).sum() + return notna(self._values).sum() elif not isinstance(self.index, MultiIndex): raise ValueError("Series.count level is only valid with a MultiIndex") @@ -2497,7 +2496,7 @@ def diff(self, periods: int = 1) -> Series: -------- {examples} """ - result = algorithms.diff(self.array, periods) + result = algorithms.diff(self._values, periods) return self._constructor(result, index=self.index).__finalize__( self, method="diff" ) @@ -3099,7 +3098,7 @@ def update(self, other) -> None: def sort_values( self, axis=0, - ascending=True, + ascending: Union[Union[bool, int], Sequence[Union[bool, int]]] = True, inplace: bool = False, kind: str = "quicksort", na_position: str = "last", @@ -3117,7 +3116,7 @@ def sort_values( axis : {0 or 'index'}, default 0 Axis to direct sorting. The value 'index' is accepted for compatibility with DataFrame.sort_values. - ascending : bool, default True + ascending : bool or list of bools, default True If True, sort values in ascending order, otherwise descending. inplace : bool, default False If True, perform operation in-place. @@ -3277,6 +3276,7 @@ def sort_values( ) if is_list_like(ascending): + ascending = cast(Sequence[Union[bool, int]], ascending) if len(ascending) != 1: raise ValueError( f"Length of ascending ({len(ascending)}) must be 1 for Series" @@ -3291,7 +3291,7 @@ def sort_values( # GH 35922. Make sorting stable by leveraging nargsort values_to_sort = ensure_key_mapped(self, key)._values if key else self._values - sorted_index = nargsort(values_to_sort, kind, ascending, na_position) + sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position) result = self._constructor( self._values[sorted_index], index=self.index[sorted_index] @@ -3309,7 +3309,7 @@ def sort_index( self, axis=0, level=None, - ascending: bool = True, + ascending: Union[Union[bool, int], Sequence[Union[bool, int]]] = True, inplace: bool = False, kind: str = "quicksort", na_position: str = "last", @@ -3329,7 +3329,7 @@ def sort_index( Axis to direct sorting. This can only be 0 for Series. level : int, optional If not None, sort on values in specified index level(s). - ascending : bool or list of bools, default True + ascending : bool or list-like of bools, default True Sort ascending vs. descending. When the index is a MultiIndex the sort direction can be controlled for each level individually. 
inplace : bool, default False @@ -3806,7 +3806,7 @@ def explode(self, ignore_index: bool = False) -> Series: if not len(self) or not is_object_dtype(self): return self.copy() - values, counts = reshape.explode(np.asarray(self.array)) + values, counts = reshape.explode(np.asarray(self._values)) if ignore_index: index = ibase.default_index(len(values)) @@ -4000,7 +4000,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if func is None: func = dict(kwargs.items()) - op = series_apply(self, func, args=args, kwargs=kwargs) + op = SeriesApply(self, func, convert_dtype=False, args=args, kwargs=kwargs) result = op.agg() return result @@ -4016,7 +4016,9 @@ def transform( ) -> FrameOrSeriesUnion: # Validate axis argument self._get_axis_number(axis) - result = series_apply(self, func=func, args=args, kwargs=kwargs).transform() + result = SeriesApply( + self, func=func, convert_dtype=True, args=args, kwargs=kwargs + ).transform() return result def apply( @@ -4128,7 +4130,7 @@ def apply( Helsinki 2.484907 dtype: float64 """ - return series_apply(self, func, convert_dtype, args, kwargs).apply() + return SeriesApply(self, func, convert_dtype, args, kwargs).apply() def _reduce( self, @@ -5009,7 +5011,7 @@ def _cmp_method(self, other, op): if isinstance(other, Series) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") - lvalues = extract_array(self, extract_numpy=True) + lvalues = self._values rvalues = extract_array(other, extract_numpy=True, range_compat=True) res_values = ops.comparison_op(lvalues, rvalues, op) @@ -5020,7 +5022,7 @@ def _logical_method(self, other, op): res_name = ops.get_op_result_name(self, other) self, other = ops.align_method_SERIES(self, other, align_asobject=True) - lvalues = extract_array(self, extract_numpy=True) + lvalues = self._values rvalues = extract_array(other, extract_numpy=True, range_compat=True) res_values = ops.logical_op(lvalues, rvalues, op) @@ -5030,7 +5032,7 @@ def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) self, other = ops.align_method_SERIES(self, other) - lvalues = extract_array(self, extract_numpy=True) + lvalues = self._values rvalues = extract_array(other, extract_numpy=True, range_compat=True) result = ops.arithmetic_op(lvalues, rvalues, op) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index c3356386ef346..55e97f738072b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -10,6 +10,7 @@ Iterable, List, Optional, + Sequence, Tuple, Union, ) @@ -48,7 +49,7 @@ def get_indexer_indexer( target: Index, level: Union[str, int, List[str], List[int]], - ascending: bool, + ascending: Union[Sequence[Union[bool, int]], Union[bool, int]], kind: str, na_position: str, sort_remaining: bool, @@ -572,7 +573,9 @@ def get_indexer_dict( # sorting levels...cleverly? -def get_group_index_sorter(group_index, ngroups: int): +def get_group_index_sorter( + group_index: np.ndarray, ngroups: int | None = None +) -> np.ndarray: """ algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where @@ -586,6 +589,8 @@ def get_group_index_sorter(group_index, ngroups: int): groupby operations. e.g. 
consider: df.groupby(key)[col].transform('first') """ + if ngroups is None: + ngroups = 1 + group_index.max() count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 18f9ece3e3812..d58b5e5ffa83d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -248,13 +248,13 @@ def _convert_and_box_cache( return _box_as_indexlike(result, utc=None, name=name) -def _return_parsed_timezone_results(result, timezones, tz, name): +def _return_parsed_timezone_results(result: np.ndarray, timezones, tz, name) -> Index: """ Return results from array_strptime if a %z or %Z directive was passed. Parameters ---------- - result : ndarray + result : ndarray[int64] int64 date representations of the dates timezones : ndarray pytz timezone objects @@ -284,10 +284,10 @@ def _convert_listlike_datetimes( tz: Optional[Timezone] = None, unit: Optional[str] = None, errors: Optional[str] = None, - infer_datetime_format: Optional[bool] = None, + infer_datetime_format: bool = False, dayfirst: Optional[bool] = None, yearfirst: Optional[bool] = None, - exact: Optional[bool] = None, + exact: bool = True, ): """ Helper function for to_datetime. Performs the conversions of 1D listlike @@ -305,13 +305,13 @@ def _convert_listlike_datetimes( None or string of the frequency of the passed data errors : string error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' - infer_datetime_format : boolean + infer_datetime_format : bool, default False inferring format behavior from to_datetime dayfirst : boolean dayfirst parsing behavior from to_datetime yearfirst : boolean yearfirst parsing behavior from to_datetime - exact : boolean + exact : bool, default True exact format matching behavior from to_datetime Returns @@ -346,38 +346,7 @@ def _convert_listlike_datetimes( elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, "_values", arg) - - # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime - # because it expects an ndarray argument - if isinstance(arg, IntegerArray): - result = arg.astype(f"datetime64[{unit}]") - tz_parsed = None - else: - - result, tz_parsed = tslib.array_with_unit_to_datetime( - arg, unit, errors=errors - ) - - if errors == "ignore": - - result = Index(result, name=name) - else: - result = DatetimeIndex(result, name=name) - # GH 23758: We may still need to localize the result with tz - # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) - # result will be naive but in UTC - try: - result = result.tz_localize("UTC").tz_convert(tz_parsed) - except AttributeError: - # Regular Index from 'ignore' path - return result - if tz is not None: - if result.tz is None: - result = result.tz_localize(tz) - else: - result = result.tz_convert(tz) - return result + return _to_datetime_with_unit(arg, unit, name, tz, errors) elif getattr(arg, "ndim", 1) > 1: raise TypeError( "arg must be a string, datetime, list, tuple, 1-d array, or Series" @@ -413,64 +382,14 @@ def _convert_listlike_datetimes( require_iso8601 = not infer_datetime_format format = None - tz_parsed = None result = None if format is not None: - try: - # shortcut formatting here - if format == "%Y%m%d": - try: - # pass orig_arg as float-dtype may have been converted to - # datetime64[ns] - orig_arg = ensure_object(orig_arg) - result = _attempt_YYYYMMDD(orig_arg, 
errors=errors) - except (ValueError, TypeError, OutOfBoundsDatetime) as err: - raise ValueError( - "cannot convert the input to '%Y%m%d' date format" - ) from err - - # fallback - if result is None: - try: - result, timezones = array_strptime( - arg, format, exact=exact, errors=errors - ) - if "%Z" in format or "%z" in format: - return _return_parsed_timezone_results( - result, timezones, tz, name - ) - except OutOfBoundsDatetime: - if errors == "raise": - raise - elif errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg - except ValueError: - # if format was inferred, try falling back - # to array_to_datetime - terminate here - # for specified formats - if not infer_datetime_format: - if errors == "raise": - raise - elif errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg - except ValueError as e: - # Fallback to try to convert datetime objects if timezone-aware - # datetime objects are found without passing `utc=True` - try: - values, tz = conversion.datetime_to_datetime64(arg) - dta = DatetimeArray(values, dtype=tz_to_dtype(tz)) - return DatetimeIndex._simple_new(dta, name=name) - except (ValueError, TypeError): - raise e + result = _to_datetime_with_format( + arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format + ) + if result is not None: + return result if result is None: assert format is None or infer_datetime_format @@ -485,16 +404,151 @@ def _convert_listlike_datetimes( allow_object=True, ) - if tz_parsed is not None: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) - return DatetimeIndex._simple_new(dta, name=name) + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) + + utc = tz == "utc" + return _box_as_indexlike(result, utc=utc, name=name) + +def _array_strptime_with_fallback( + arg, + name, + tz, + fmt: str, + exact: bool, + errors: Optional[str], + infer_datetime_format: bool, +) -> Optional[Index]: + """ + Call array_strptime, with fallback behavior depending on 'errors'. + """ utc = tz == "utc" + + try: + result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors) + if "%Z" in fmt or "%z" in fmt: + return _return_parsed_timezone_results(result, timezones, tz, name) + except OutOfBoundsDatetime: + if errors == "raise": + raise + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(iNaT) + else: + result = arg + except ValueError: + # if fmt was inferred, try falling back + # to array_to_datetime - terminate here + # for specified formats + if not infer_datetime_format: + if errors == "raise": + raise + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(iNaT) + else: + result = arg + else: + # Indicates to the caller to fallback to objects_to_datetime64ns + return None + return _box_as_indexlike(result, utc=utc, name=name) +def _to_datetime_with_format( + arg, + orig_arg, + name, + tz, + fmt: str, + exact: bool, + errors: Optional[str], + infer_datetime_format: bool, +) -> Optional[Index]: + """ + Try parsing with the given format, returning None on failure. 
+ """ + result = None + try: + # shortcut formatting here + if fmt == "%Y%m%d": + # pass orig_arg as float-dtype may have been converted to + # datetime64[ns] + orig_arg = ensure_object(orig_arg) + try: + # may return None without raising + result = _attempt_YYYYMMDD(orig_arg, errors=errors) + except (ValueError, TypeError, OutOfBoundsDatetime) as err: + raise ValueError( + "cannot convert the input to '%Y%m%d' date format" + ) from err + if result is not None: + utc = tz == "utc" + return _box_as_indexlike(result, utc=utc, name=name) + + # fallback + if result is None: + result = _array_strptime_with_fallback( + arg, name, tz, fmt, exact, errors, infer_datetime_format + ) + if result is not None: + return result + + except ValueError as e: + # Fallback to try to convert datetime objects if timezone-aware + # datetime objects are found without passing `utc=True` + try: + values, tz = conversion.datetime_to_datetime64(arg) + dta = DatetimeArray(values, dtype=tz_to_dtype(tz)) + return DatetimeIndex._simple_new(dta, name=name) + except (ValueError, TypeError): + raise e + + return result + + +def _to_datetime_with_unit(arg, unit, name, tz, errors: Optional[str]) -> Index: + """ + to_datetime specalized to the case where a 'unit' is passed. + """ + arg = getattr(arg, "_values", arg) + + # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if isinstance(arg, IntegerArray): + result = arg.astype(f"datetime64[{unit}]") + tz_parsed = None + else: + result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + + if errors == "ignore": + # Index constructor _may_ infer to DatetimeIndex + result = Index(result, name=name) + else: + result = DatetimeIndex(result, name=name) + + if not isinstance(result, DatetimeIndex): + return result + + # GH#23758: We may still need to localize the result with tz + # GH#25546: Apply tz_parsed first (from arg), then tz (from caller) + # result will be naive but in UTC + result = result.tz_localize("UTC").tz_convert(tz_parsed) + + if tz is not None: + if result.tz is None: + result = result.tz_localize(tz) + else: + result = result.tz_convert(tz) + return result + + def _adjust_to_origin(arg, origin, unit): """ Helper function for to_datetime. 
@@ -965,7 +1019,7 @@ def coerce(values): return values -def _attempt_YYYYMMDD(arg, errors): +def _attempt_YYYYMMDD(arg: np.ndarray, errors: Optional[str]) -> Optional[np.ndarray]: """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, arg is a passed in as an object dtype, but could really be ints/strings @@ -973,8 +1027,8 @@ def _attempt_YYYYMMDD(arg, errors): Parameters ---------- - arg : passed value - errors : 'raise','ignore','coerce' + arg : np.ndarray[object] + errors : {'raise','ignore','coerce'} """ def calc(carg): diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 208b5ab0023eb..5a71db82f26e4 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -215,11 +215,13 @@ class ExponentialMovingWindow(BaseWindow): _attributes = [ "com", + "span", + "halflife", + "alpha", "min_periods", "adjust", "ignore_na", "axis", - "halflife", "times", ] @@ -245,38 +247,48 @@ def __init__( method="single", axis=axis, ) + self.com = com + self.span = span + self.halflife = halflife + self.alpha = alpha self.adjust = adjust self.ignore_na = ignore_na - if times is not None: + self.times = times + if self.times is not None: if isinstance(times, str): - times = self._selected_obj[times] - if not is_datetime64_ns_dtype(times): + self.times = self._selected_obj[times] + if not is_datetime64_ns_dtype(self.times): raise ValueError("times must be datetime64[ns] dtype.") - if len(times) != len(obj): + if len(self.times) != len(obj): raise ValueError("times must be the same length as the object.") if not isinstance(halflife, (str, datetime.timedelta)): raise ValueError( "halflife must be a string or datetime.timedelta object" ) - if isna(times).any(): + if isna(self.times).any(): raise ValueError("Cannot convert NaT values to integer") - self.times = np.asarray(times.view(np.int64)) - self.halflife = Timedelta(halflife).value + _times = np.asarray(self.times.view(np.int64), dtype=np.float64) + _halflife = float(Timedelta(self.halflife).value) + self._deltas = np.diff(_times) / _halflife # Halflife is no longer applicable when calculating COM # But allow COM to still be calculated if the user passes other decay args - if common.count_not_none(com, span, alpha) > 0: - self.com = get_center_of_mass(com, span, None, alpha) + if common.count_not_none(self.com, self.span, self.alpha) > 0: + self._com = get_center_of_mass(self.com, self.span, None, self.alpha) else: - self.com = 0.0 + self._com = 1.0 else: - if halflife is not None and isinstance(halflife, (str, datetime.timedelta)): + if self.halflife is not None and isinstance( + self.halflife, (str, datetime.timedelta) + ): raise ValueError( "halflife can only be a timedelta convertible argument if " "times is not None." 
) - self.times = None - self.halflife = None - self.com = get_center_of_mass(com, span, halflife, alpha) + # Without times, points are equally spaced + self._deltas = np.ones(max(len(self.obj) - 1, 0), dtype=np.float64) + self._com = get_center_of_mass( + self.com, self.span, self.halflife, self.alpha + ) def _get_window_indexer(self) -> BaseIndexer: """ @@ -334,21 +346,14 @@ def aggregate(self, func, *args, **kwargs): ) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - if self.times is not None: - window_func = window_aggregations.ewma_time - window_func = partial( - window_func, - times=self.times, - halflife=self.halflife, - ) - else: - window_func = window_aggregations.ewma - window_func = partial( - window_func, - com=self.com, - adjust=self.adjust, - ignore_na=self.ignore_na, - ) + window_func = window_aggregations.ewma + window_func = partial( + window_func, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=self._deltas, + ) return self._apply(window_func) @doc( @@ -409,7 +414,7 @@ def var(self, bias: bool = False, *args, **kwargs): window_func = window_aggregations.ewmcov window_func = partial( window_func, - com=self.com, + com=self._com, adjust=self.adjust, ignore_na=self.ignore_na, bias=bias, @@ -478,7 +483,7 @@ def cov_func(x, y): end, self.min_periods, y_array, - self.com, + self._com, self.adjust, self.ignore_na, bias, @@ -544,7 +549,7 @@ def _cov(X, Y): end, self.min_periods, Y, - self.com, + self._com, self.adjust, self.ignore_na, 1, @@ -611,7 +616,7 @@ def mean(self, engine=None, engine_kwargs=None): if maybe_use_numba(engine): groupby_ewma_func = generate_numba_groupby_ewma_func( engine_kwargs, - self.com, + self._com, self.adjust, self.ignore_na, ) diff --git a/pandas/io/api.py b/pandas/io/api.py index 2241f491b5d48..5926f2166ee9d 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -37,3 +37,4 @@ read_sql_table, ) from pandas.io.stata import read_stata +from pandas.io.xml import read_xml diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index a1b6986079723..44428abdcd8a5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1,5 +1,5 @@ """ -Internal module for formatting output data in csv, html, +Internal module for formatting output data in csv, html, xml, and latex files. This module also applies to display formatting. 
""" from __future__ import annotations @@ -61,6 +61,8 @@ IndexLabel, StorageOptions, ) +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -96,6 +98,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import _shared_docs from pandas.io.common import stringify_path from pandas.io.formats.printing import ( @@ -941,6 +944,7 @@ class DataFrameRenderer: Called in pandas.core.frame.DataFrame: - to_html + - to_xml - to_string Parameters @@ -1033,6 +1037,135 @@ def to_html( string = html_formatter.to_string() return save_to_buffer(string, buf=buf, encoding=encoding) + @doc(storage_options=_shared_docs["storage_options"]) + def to_xml( + self, + path_or_buffer: Optional[FilePathOrBuffer] = None, + index: Optional[bool] = True, + root_name: Optional[str] = "data", + row_name: Optional[str] = "row", + na_rep: Optional[str] = None, + attr_cols: Optional[Union[str, List[str]]] = None, + elem_cols: Optional[Union[str, List[str]]] = None, + namespaces: Optional[Dict[Optional[str], str]] = None, + prefix: Optional[str] = None, + encoding: str = "utf-8", + xml_declaration: Optional[bool] = True, + pretty_print: Optional[bool] = True, + parser: Optional[str] = "lxml", + stylesheet: Optional[FilePathOrBuffer] = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, + ) -> Optional[str]: + """ + Render a DataFrame to an XML document. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object or file-like object, optional + File to write output to. If None, the output is returned as a + string. + index : bool, default True + Whether to include index in XML document. + root_name : str, default 'data' + The name of root element in XML document. + row_name : str, default 'row' + The name of row element in XML document. + na_rep : str, optional + Missing data representation. + attr_cols : list-like, optional + List of columns to write as attributes in row element. + Hierarchical columns will be flattened with underscore + delimiting the different levels. + elem_cols : list-like, optional + List of columns to write as children in row element. By default, + all columns output as children of row element. Hierarchical + columns will be flattened with underscore delimiting the + different levels. + namespaces : dict, optional + All namespaces to be defined in root element. Keys of dict + should be prefix names and values of dict corresponding URIs. + Default namespaces should be given empty string key. For + example, :: + + namespaces = {{'': 'https://example.com'}} + + prefix : str, optional + Namespace prefix to be used for every element and/or attribute + in document. This should be one of the keys in ``namespaces`` + dict. + encoding : str, default 'utf-8' + Encoding of the resulting document. + xml_declaration : str, optional + Whether to include the XML declaration at start of document. + pretty_print : bool, default True + Whether output should be pretty printed with indentation and + line breaks. + parser : {{'lxml','etree'}}, default "lxml" + Parser module to use for building of tree. Only 'lxml' and + 'etree' are supported. With 'lxml', the ability to use XSLT + stylesheet is supported. 
+ stylesheet : str, path object or file-like object, optional + A URL, file-like object, or a raw string containing an XSLT + script used to transform the raw XML output. Script should use + layout of elements and attributes from original output. This + argument requires ``lxml`` to be installed. Only XSLT 1.0 + scripts and not later versions is currently supported. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + {storage_options} + """ + + from pandas.io.formats.xml import ( + EtreeXMLFormatter, + LxmlXMLFormatter, + ) + + lxml = import_optional_dependency("lxml.etree", errors="ignore") + + TreeBuilder: Union[Type[EtreeXMLFormatter], Type[LxmlXMLFormatter]] + + if parser == "lxml": + if lxml is not None: + TreeBuilder = LxmlXMLFormatter + else: + raise ImportError( + "lxml not found, please install or use the etree parser." + ) + + elif parser == "etree": + TreeBuilder = EtreeXMLFormatter + + else: + raise ValueError("Values for parser can only be lxml or etree.") + + xml_formatter = TreeBuilder( + self.fmt, + path_or_buffer=path_or_buffer, + index=index, + root_name=root_name, + row_name=row_name, + na_rep=na_rep, + attr_cols=attr_cols, + elem_cols=elem_cols, + namespaces=namespaces, + prefix=prefix, + encoding=encoding, + xml_declaration=xml_declaration, + pretty_print=pretty_print, + stylesheet=stylesheet, + compression=compression, + storage_options=storage_options, + ) + + return xml_formatter.write_output() + def to_string( self, buf: Optional[FilePathOrBuffer[str]] = None, diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 854f41d6b4dc3..e50f5986098d3 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -19,6 +19,7 @@ Sequence, Tuple, Union, + cast, ) from uuid import uuid4 @@ -55,7 +56,10 @@ CSSPair = Tuple[str, Union[str, int, float]] CSSList = List[CSSPair] CSSProperties = Union[str, CSSList] -CSSStyles = List[Dict[str, CSSProperties]] +CSSStyles = List[Dict[str, CSSProperties]] # = List[CSSDict] +# class CSSDict(TypedDict): # available when TypedDict is valid in pandas +# selector: str +# props: CSSProperties try: from matplotlib import colors @@ -566,7 +570,7 @@ def _translate(self): "body": body, "uuid": uuid, "precision": precision, - "table_styles": table_styles, + "table_styles": _format_table_styles(table_styles), "caption": caption, "table_attributes": table_attr, } @@ -1904,25 +1908,14 @@ def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str): ------- pseudo_css : List """ + selector_id = "#T_" + uuid + "row" + str(row) + "_col" + str(col) return [ { - "selector": "#T_" - + uuid - + "row" - + str(row) - + "_col" - + str(col) - + f":hover .{name}", + "selector": selector_id + f":hover .{name}", "props": [("visibility", "visible")], }, { - "selector": "#T_" - + uuid - + "row" - + str(row) - + "_col" - + str(col) - + f" .{name}::after", + "selector": selector_id + f" .{name}::after", "props": [("content", f'"{text}"')], }, ] @@ -2077,6 +2070,26 @@ def _maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: return style +def _format_table_styles(styles: CSSStyles) -> CSSStyles: + """ + looks for multiple CSS selectors and separates them: + 
[{'selector': 'td, th', 'props': 'a:v;'}] + ---> [{'selector': 'td', 'props': 'a:v;'}, + {'selector': 'th', 'props': 'a:v;'}] + """ + return [ + item + for sublist in [ + [ # this is a CSSDict when TypedDict is available to avoid cast. + {"selector": x, "props": style["props"]} + for x in cast(str, style["selector"]).split(",") + ] + for style in styles + ] + for item in sublist + ] + + def _non_reducing_slice(slice_): """ Ensure that a slice doesn't reduce to a Series or Scalar. diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py new file mode 100644 index 0000000000000..dd68f0f78261e --- /dev/null +++ b/pandas/io/formats/xml.py @@ -0,0 +1,618 @@ +""" +:mod:`pandas.io.formats.xml` is a module for formatting data in XML. +""" + +import codecs +import io +from typing import ( + Any, + Dict, + List, + Optional, + Union, +) + +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) +from pandas.errors import AbstractMethodError + +from pandas.core.dtypes.common import is_list_like + +from pandas.io.common import get_handle +from pandas.io.formats.format import DataFrameFormatter +from pandas.io.xml import ( + get_data_from_filepath, + preprocess_data, +) + + +class BaseXMLFormatter: + """ + Subclass for formatting data in XML. + + Parameters + ---------- + path_or_buffer : str or file-like + This can be either a string of raw XML, a valid URL, + file or file-like object. + + index : bool + Whether to include index in xml document. + + row_name : str + Name for root of xml document. Default is 'data'. + + root_name : str + Name for row elements of xml document. Default is 'row'. + + na_rep : str + Missing data representation. + + attrs_cols : list + List of columns to write as attributes in row element. + + elem_cols : list + List of columns to write as children in row element. + + namespacess : dict + The namespaces to define in XML document as dicts with key + being namespace and value the URI. + + prefix : str + The prefix for each element in XML document including root. + + encoding : str + Encoding of xml object or document. + + xml_declaration : bool + Whether to include xml declaration at top line item in xml. + + pretty_print : bool + Whether to write xml document with line breaks and indentation. + + stylesheet : str or file-like + A URL, file, file-like object, or a raw string containing XSLT. + + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + Compression type for on-the-fly decompression of on-disk data. + If 'infer', then use extension for gzip, bz2, zip or xz. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. 
host, port, username, password, etc., + + See also + -------- + pandas.io.formats.xml.EtreeXMLFormatter + pandas.io.formats.xml.LxmlXMLFormatter + + """ + + def __init__( + self, + formatter: DataFrameFormatter, + path_or_buffer: Optional[FilePathOrBuffer] = None, + index: Optional[bool] = True, + root_name: Optional[str] = "data", + row_name: Optional[str] = "row", + na_rep: Optional[str] = None, + attr_cols: Optional[List[str]] = None, + elem_cols: Optional[List[str]] = None, + namespaces: Optional[Dict[Optional[str], str]] = None, + prefix: Optional[str] = None, + encoding: str = "utf-8", + xml_declaration: Optional[bool] = True, + pretty_print: Optional[bool] = True, + stylesheet: Optional[FilePathOrBuffer] = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, + ) -> None: + self.fmt = formatter + self.path_or_buffer = path_or_buffer + self.index = index + self.root_name = root_name + self.row_name = row_name + self.na_rep = na_rep + self.attr_cols = attr_cols + self.elem_cols = elem_cols + self.namespaces = namespaces + self.prefix = prefix + self.encoding = encoding + self.xml_declaration = xml_declaration + self.pretty_print = pretty_print + self.stylesheet = stylesheet + self.compression = compression + self.storage_options = storage_options + + self.frame = self.fmt.frame + self.orig_cols = self.fmt.frame.columns.tolist() + self.frame_dicts = self.process_dataframe() + + def build_tree(self) -> bytes: + """ + Build tree from data. + + This method initializes the root and builds attributes and elements + with optional namespaces. + """ + raise AbstractMethodError(self) + + def validate_columns(self) -> None: + """ + Validate elems_cols and attrs_cols. + + This method will check if columns is list-like. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.attr_cols and not is_list_like(self.attr_cols): + raise TypeError( + f"{type(self.attr_cols).__name__} is not a valid type for attr_cols" + ) + + if self.elem_cols and not is_list_like(self.elem_cols): + raise TypeError( + f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" + ) + + def validate_encoding(self) -> None: + """ + Validate encoding. + + This method will check if encoding is among listed under codecs. + + Raises + ------ + LookupError + * If encoding is not available in codecs. + """ + + codecs.lookup(self.encoding) + + def process_dataframe(self) -> Dict[Union[int, str], Dict[str, Any]]: + """ + Adjust Data Frame to fit xml output. + + This method will adjust underlying data frame for xml output, + including optionally replacing missing values and including indexes. + """ + + df = self.fmt.frame + + if self.index: + df = df.reset_index() + + if self.na_rep: + df = df.replace({None: self.na_rep, float("nan"): self.na_rep}) + + return df.to_dict(orient="index") + + def handle_indexes(self) -> None: + """ + Handle indexes. + + This method will add indexes into attr_cols or elem_cols. + """ + + indexes: List[str] = [ + x for x in self.frame_dicts[0].keys() if x not in self.orig_cols + ] + + if self.attr_cols and self.index: + self.attr_cols = indexes + self.attr_cols + + if self.elem_cols and self.index: + self.elem_cols = indexes + self.elem_cols + + def get_prefix_uri(self) -> str: + """ + Get uri of namespace prefix. + + This method retrieves corresponding URI to prefix in namespaces. + + Raises + ------ + KeyError + *If prefix is not included in namespace dict. 
+        """
+
+        raise AbstractMethodError(self)
+
+    def other_namespaces(self) -> dict:
+        """
+        Define other namespaces.
+
+        This method will build a dictionary of namespace attributes for
+        the root element, conditional on the optional namespaces and
+        prefix.
+        """
+
+        nmsp_dict: Dict[str, str] = {}
+        if self.namespaces and self.prefix is None:
+            nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p != ""}
+
+        if self.namespaces and self.prefix:
+            nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p == ""}
+
+        return nmsp_dict
+
+    def build_attribs(self) -> None:
+        """
+        Create attributes of row.
+
+        This method adds attributes using attr_cols to the row element and
+        works with tuples for MultiIndex or hierarchical columns.
+        """
+
+        raise AbstractMethodError(self)
+
+    def build_elems(self) -> None:
+        """
+        Create child elements of row.
+
+        This method adds child elements using elem_cols to the row element
+        and works with tuples for MultiIndex or hierarchical columns.
+        """
+
+        raise AbstractMethodError(self)
+
+    def write_output(self) -> Optional[str]:
+        xml_doc = self.build_tree()
+
+        out_str: Optional[str]
+
+        if self.path_or_buffer is not None:
+            with get_handle(
+                self.path_or_buffer,
+                "wb",
+                compression=self.compression,
+                storage_options=self.storage_options,
+                is_text=False,
+            ) as handles:
+                handles.handle.write(xml_doc)  # type: ignore[arg-type]
+            return None
+
+        else:
+            return xml_doc.decode(self.encoding).rstrip()
+
+
+class EtreeXMLFormatter(BaseXMLFormatter):
+    """
+    Class for formatting data in xml using Python standard library
+    modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.validate_columns()
+        self.validate_encoding()
+        self.handle_indexes()
+        self.prefix_uri = self.get_prefix_uri()
+
+    def build_tree(self) -> bytes:
+        from xml.etree.ElementTree import (
+            Element,
+            SubElement,
+            tostring,
+        )
+
+        self.root = Element(
+            f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces()
+        )
+
+        for k, d in self.frame_dicts.items():
+            self.d = d
+            self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
+
+            if not self.attr_cols and not self.elem_cols:
+                self.elem_cols = list(self.frame_dicts[0].keys())
+                self.build_elems()
+
+            else:
+                self.build_attribs()
+                self.build_elems()
+
+        self.out_xml = tostring(self.root, method="xml", encoding=self.encoding)
+
+        if self.pretty_print:
+            self.out_xml = self.prettify_tree()
+
+        if self.xml_declaration:
+            self.out_xml = self.add_declaration()
+        else:
+            self.out_xml = self.remove_declaration()
+
+        if self.stylesheet is not None:
+            raise ValueError(
+                "To use stylesheet, you need lxml installed and selected as parser."
+            )
+
+        return self.out_xml
+
+    def get_prefix_uri(self) -> str:
+        from xml.etree.ElementTree import register_namespace
+
+        uri = ""
+        if self.namespaces:
+            for p, n in self.namespaces.items():
+                if isinstance(p, str) and isinstance(n, str):
+                    register_namespace(p, n)
+            if self.prefix:
+                try:
+                    uri = f"{{{self.namespaces[self.prefix]}}}"
+                except KeyError:
+                    raise KeyError(f"{self.prefix} is not included in namespaces")
+            else:
+                uri = f'{{{self.namespaces[""]}}}'
+
+        return uri
+
+    def build_attribs(self) -> None:
+        if not self.attr_cols:
+            return
+
+        for col in self.attr_cols:
+            flat_col = col
+            if isinstance(col, tuple):
+                flat_col = (
+                    "".join(str(c) for c in col).strip()
+                    if "" in col
+                    else "_".join(str(c) for c in col).strip()
+                )
+
+            attr_name = f"{self.prefix_uri}{flat_col}"
+            try:
+                val = (
+                    None
+                    if self.d[col] is None or self.d[col] != self.d[col]
+                    else str(self.d[col])
+                )
+                if val is not None:
+                    self.elem_row.attrib[attr_name] = val
+            except KeyError:
+                raise KeyError(f"no valid column, {col}")
+
+    def build_elems(self) -> None:
+        from xml.etree.ElementTree import SubElement
+
+        if not self.elem_cols:
+            return
+
+        for col in self.elem_cols:
+            flat_col = col
+            if isinstance(col, tuple):
+                flat_col = (
+                    "".join(str(c) for c in col).strip()
+                    if "" in col
+                    else "_".join(str(c) for c in col).strip()
+                )
+
+            elem_name = f"{self.prefix_uri}{flat_col}"
+            try:
+                val = (
+                    None
+                    if self.d[col] in [None, ""] or self.d[col] != self.d[col]
+                    else str(self.d[col])
+                )
+                SubElement(self.elem_row, elem_name).text = val
+            except KeyError:
+                raise KeyError(f"no valid column, {col}")
+
+    def prettify_tree(self) -> bytes:
+        """
+        Output tree for pretty print format.
+
+        This method will pretty print xml with line breaks and indentation.
+        """
+
+        from xml.dom.minidom import parseString
+
+        dom = parseString(self.out_xml)
+
+        return dom.toprettyxml(indent="  ", encoding=self.encoding)
+
+    def add_declaration(self) -> bytes:
+        """
+        Add xml declaration.
+
+        This method will add the xml declaration to the working tree.
+        Currently, xml_declaration is supported in etree starting in
+        Python 3.8.
+        """
+        decl = f'<?xml version="1.0" encoding="{self.encoding}"?>\n'
+
+        doc = (
+            self.out_xml
+            if self.out_xml.startswith(b"<?xml")
+            else decl.encode(self.encoding) + self.out_xml
+        )
+
+        return doc
+
+    def remove_declaration(self) -> bytes:
+        """
+        Remove xml declaration.
+
+        This method will remove the xml declaration from the working tree.
+        Currently, pretty_print is not supported in etree.
+        """
+
+        return self.out_xml.split(b"?>")[-1].strip()
+
+
+class LxmlXMLFormatter(BaseXMLFormatter):
+    """
+    Class for formatting data in xml using the third-party full-featured
+    XML library `lxml`, which supports XPath 1.0 and XSLT 1.0.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.validate_columns()
+        self.validate_encoding()
+        self.prefix_uri = self.get_prefix_uri()
+
+        self.convert_empty_str_key()
+        self.handle_indexes()
+
+    def build_tree(self) -> bytes:
+        """
+        Build tree from data.
+
+        This method initializes the root and builds attributes and elements
+        with optional namespaces.
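+
+        As a rough sketch (assuming default ``root_name``/``row_name``, a
+        single column ``a``, and the index included), the resulting
+        document has the shape::
+
+            <data>
+              <row>
+                <index>0</index>
+                <a>1</a>
+              </row>
+            </data>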
+        """
+        from lxml.etree import (
+            Element,
+            SubElement,
+            tostring,
+        )
+
+        self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
+
+        for k, d in self.frame_dicts.items():
+            self.d = d
+            self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
+
+            if not self.attr_cols and not self.elem_cols:
+                self.elem_cols = list(self.frame_dicts[0].keys())
+                self.build_elems()
+
+            else:
+                self.build_attribs()
+                self.build_elems()
+
+        self.out_xml = tostring(
+            self.root,
+            pretty_print=self.pretty_print,
+            method="xml",
+            encoding=self.encoding,
+            xml_declaration=self.xml_declaration,
+        )
+
+        if self.stylesheet is not None:
+            self.out_xml = self.transform_doc()
+
+        return self.out_xml
+
+    def convert_empty_str_key(self) -> None:
+        """
+        Replace zero-length string in `namespaces`.
+
+        This method will replace '' with None to comply with the `lxml`
+        requirement that empty string prefixes are not allowed.
+        """
+
+        if self.namespaces and "" in self.namespaces.keys():
+            self.namespaces[None] = self.namespaces.pop("", "default")
+
+    def get_prefix_uri(self) -> str:
+        uri = ""
+        if self.namespaces:
+            if self.prefix:
+                try:
+                    uri = f"{{{self.namespaces[self.prefix]}}}"
+                except KeyError:
+                    raise KeyError(f"{self.prefix} is not included in namespaces")
+            else:
+                uri = f'{{{self.namespaces[""]}}}'
+
+        return uri
+
+    def build_attribs(self) -> None:
+        if not self.attr_cols:
+            return
+
+        for col in self.attr_cols:
+            flat_col = col
+            if isinstance(col, tuple):
+                flat_col = (
+                    "".join(str(c) for c in col).strip()
+                    if "" in col
+                    else "_".join(str(c) for c in col).strip()
+                )
+
+            attr_name = f"{self.prefix_uri}{flat_col}"
+            try:
+                val = (
+                    None
+                    if self.d[col] is None or self.d[col] != self.d[col]
+                    else str(self.d[col])
+                )
+                if val is not None:
+                    self.elem_row.attrib[attr_name] = val
+            except KeyError:
+                raise KeyError(f"no valid column, {col}")
+
+    def build_elems(self) -> None:
+        from lxml.etree import SubElement
+
+        if not self.elem_cols:
+            return
+
+        for col in self.elem_cols:
+            flat_col = col
+            if isinstance(col, tuple):
+                flat_col = (
+                    "".join(str(c) for c in col).strip()
+                    if "" in col
+                    else "_".join(str(c) for c in col).strip()
+                )
+
+            elem_name = f"{self.prefix_uri}{flat_col}"
+            try:
+                val = (
+                    None
+                    if self.d[col] in [None, ""] or self.d[col] != self.d[col]
+                    else str(self.d[col])
+                )
+                SubElement(self.elem_row, elem_name).text = val
+            except KeyError:
+                raise KeyError(f"no valid column, {col}")
+
+    def transform_doc(self) -> bytes:
+        """
+        Parse stylesheet from file or buffer and run it.
+
+        This method will parse the stylesheet object into a tree, handling
+        each supported object type, and then transform the original tree
+        with the XSLT script.
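+
+        For example, the standard XSLT 1.0 identity transform (shown here
+        only as an illustration of a valid stylesheet, not as part of this
+        API) copies the tree unchanged::
+
+            <xsl:stylesheet version="1.0"
+                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+              <xsl:template match="@*|node()">
+                <xsl:copy>
+                  <xsl:apply-templates select="@*|node()"/>
+                </xsl:copy>
+              </xsl:template>
+            </xsl:stylesheet>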
+        """
+
+        from lxml.etree import (
+            XSLT,
+            XMLParser,
+            fromstring,
+            parse,
+        )
+
+        style_doc = self.stylesheet
+
+        handle_data = get_data_from_filepath(
+            filepath_or_buffer=style_doc,
+            encoding=self.encoding,
+            compression=self.compression,
+            storage_options=self.storage_options,
+        )
+
+        with preprocess_data(handle_data) as xml_data:
+            curr_parser = XMLParser(encoding=self.encoding)
+
+            if isinstance(xml_data, io.StringIO):
+                xsl_doc = fromstring(
+                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
+                )
+            else:
+                xsl_doc = parse(xml_data, parser=curr_parser)
+
+        transformer = XSLT(xsl_doc)
+        new_doc = transformer(self.root)
+
+        return bytes(new_doc)
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
new file mode 100644
index 0000000000000..83eba5f17c7b3
--- /dev/null
+++ b/pandas/io/xml.py
@@ -0,0 +1,944 @@
+"""
+:mod:`pandas.io.xml` is a module for reading XML.
+"""
+
+import io
+from typing import (
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from pandas._typing import (
+    Buffer,
+    CompressionOptions,
+    FilePathOrBuffer,
+    StorageOptions,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import (
+    AbstractMethodError,
+    ParserError,
+)
+from pandas.util._decorators import doc
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas.core.frame import DataFrame
+from pandas.core.shared_docs import _shared_docs
+
+from pandas.io.common import (
+    file_exists,
+    get_handle,
+    is_fsspec_url,
+    is_url,
+    stringify_path,
+)
+from pandas.io.parsers import TextParser
+
+
+class _XMLFrameParser:
+    """
+    Internal base class to parse XML into DataFrames.
+
+    Parameters
+    ----------
+    path_or_buffer : a valid XML str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file.
+
+    xpath : str
+        The XPath expression to parse the required set of nodes for
+        migration to a DataFrame. `etree` supports limited XPath.
+
+    namespaces : dict
+        The namespaces defined in the XML document (`xmlns:namespace='URI'`)
+        as dicts with key being namespace prefix and value the URI.
+
+    elems_only : bool
+        Parse only the child elements at the specified `xpath`.
+
+    attrs_only : bool
+        Parse only the attributes at the specified `xpath`.
+
+    names : list
+        Column names for the DataFrame of parsed XML data.
+
+    encoding : str
+        Encoding of xml object or document.
+
+    stylesheet : str or file-like
+        URL, file, file-like object, or a raw string containing XSLT.
+        `etree` does not support XSLT but the argument is retained for
+        consistency.
+
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        Compression type for on-the-fly decompression of on-disk data.
+        If 'infer', then use extension for gzip, bz2, zip or xz.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc.
+
+    See also
+    --------
+    pandas.io.xml._EtreeFrameParser
+    pandas.io.xml._LxmlFrameParser
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following
+    methods:
+        * :func:`parse_data`
+        * :func:`_parse_nodes`
+        * :func:`_parse_doc`
+        * :func:`_validate_names`
+        * :func:`_validate_path`
+
+    See each method's respective documentation for details on their
+    functionality.
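+
+    As a rough sketch of the intended control flow (internal API, shown
+    only for illustration; see ``_parse`` below for the real call site)::
+
+        p = _LxmlFrameParser(
+            path_or_buffer, xpath, namespaces, elems_only, attrs_only,
+            names, encoding, stylesheet, compression, storage_options,
+        )
+        data_dicts = p.parse_data()  # list of {column: value} dicts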
+    """
+
+    def __init__(
+        self,
+        path_or_buffer,
+        xpath,
+        namespaces,
+        elems_only,
+        attrs_only,
+        names,
+        encoding,
+        stylesheet,
+        compression,
+        storage_options,
+    ):
+        self.path_or_buffer = path_or_buffer
+        self.xpath = xpath
+        self.namespaces = namespaces
+        self.elems_only = elems_only
+        self.attrs_only = attrs_only
+        self.names = names
+        self.encoding = encoding
+        self.stylesheet = stylesheet
+        self.is_style = None
+        self.compression = compression
+        self.storage_options = storage_options
+
+    def parse_data(self) -> List[Dict[str, Optional[str]]]:
+        """
+        Parse xml data.
+
+        This method will call the other internal methods to
+        validate xpath, names, parse and return specific nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _parse_nodes(self) -> List[Dict[str, Optional[str]]]:
+        """
+        Parse xml nodes.
+
+        This method will parse the children and attributes of elements
+        in xpath, conditionally for only elements, only attributes
+        or both while optionally renaming node names.
+
+        Raises
+        ------
+        ValueError
+            * If both only elements and only attributes are specified.
+
+        Notes
+        -----
+        Namespace URIs will be removed from node values on return. Also,
+        elements with missing children or attributes, compared to their
+        siblings, will have those optional keys filled with None values.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_path(self) -> None:
+        """
+        Validate xpath.
+
+        This method checks the xpath for valid syntax, that it evaluates,
+        and that it returns nodes.
+
+        Raises
+        ------
+        SyntaxError
+            * If xpath is not supported or there are issues with namespaces.
+
+        ValueError
+            * If xpath does not return any nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_names(self) -> None:
+        """
+        Validate names.
+
+        This method will check if names is list-like and aligns
+        with the length of parsed nodes.
+
+        Raises
+        ------
+        ValueError
+            * If names is list-like but shorter than the number of nodes.
+        TypeError
+            * If names is not list-like.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_doc(self):
+        """
+        Build tree from io.
+
+        This method will parse the io object into a tree for parsing,
+        handling each supported object type.
+        """
+
+        raise AbstractMethodError(self)
+
+
+class _EtreeFrameParser(_XMLFrameParser):
+    """
+    Internal class to parse XML into DataFrames with the Python
+    standard library XML module: `xml.etree.ElementTree`.
+    """
+
+    from xml.etree.ElementTree import (
+        Element,
+        ElementTree,
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def parse_data(self) -> List[Dict[str, Optional[str]]]:
+
+        if self.stylesheet is not None:
+            raise ValueError(
+                "To use stylesheet, you need lxml installed and selected as parser."
+ ) + + self.xml_doc = self._parse_doc() + + self._validate_path() + self._validate_names() + + return self._parse_nodes() + + def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + dicts: List[Dict[str, Optional[str]]] + + if self.elems_only and self.attrs_only: + raise ValueError("Either element or attributes can be parsed not both.") + elif self.elems_only: + if self.names: + dicts = [ + { + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.findall("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + ch.tag: ch.text.strip() if ch.text else None + for ch in el.findall("*") + } + for el in elems + ] + + elif self.attrs_only: + dicts = [ + {k: v.strip() if v else None for k, v in el.attrib.items()} + for el in elems + ] + + else: + if self.names: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.findall("*")) + }, + } + for el in elems + ] + + else: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + ch.tag: ch.text.strip() if ch.text else None + for ch in el.findall("*") + }, + } + for el in elems + ] + + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts + ] + + keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) + dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] + + if self.names: + dicts = [ + {nm: v for nm, (k, v) in zip(self.names, d.items())} for d in dicts + ] + + return dicts + + def _validate_path(self) -> None: + """ + Notes + ----- + `etree` supports limited XPath. If user attempts a more complex + expression syntax error will raise. + """ + + msg = ( + "xpath does not return any nodes. " + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." + ) + try: + elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + if elems is None: + raise ValueError(msg) + + if elems is not None and elems.find("*") is None and elems.attrib is None: + raise ValueError(msg) + + except (KeyError, SyntaxError): + raise SyntaxError( + "You have used an incorrect or unsupported XPath " + "expression for etree library or you used an " + "undeclared namespace prefix." + ) + + def _validate_names(self) -> None: + if self.names: + parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + children = parent.findall("*") if parent else [] + + if is_list_like(self.names): + if len(self.names) < len(children): + raise ValueError( + "names does not match length of child elements in xpath." 
+ ) + else: + raise TypeError( + f"{type(self.names).__name__} is not a valid type for names" + ) + + def _parse_doc(self) -> Union[Element, ElementTree]: + from xml.etree.ElementTree import ( + XMLParser, + parse, + ) + + handle_data = get_data_from_filepath( + filepath_or_buffer=self.path_or_buffer, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + r = parse(xml_data, parser=curr_parser) + + return r + + +class _LxmlFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into DataFrames with third-party + full-featured XML library, `lxml`, that supports + XPath 1.0 and XSLT 1.0. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def parse_data(self) -> List[Dict[str, Optional[str]]]: + """ + Parse xml data. + + This method will call the other internal methods to + validate xpath, names, optionally parse and run XSLT, + and parse original or transformed XML and return specific nodes. + """ + + self.xml_doc = self._parse_doc(self.path_or_buffer) + + if self.stylesheet is not None: + self.xsl_doc = self._parse_doc(self.stylesheet) + self.xml_doc = self._transform_doc() + + self._validate_path() + self._validate_names() + + return self._parse_nodes() + + def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + dicts: List[Dict[str, Optional[str]]] + + if self.elems_only and self.attrs_only: + raise ValueError("Either element or attributes can be parsed not both.") + + elif self.elems_only: + if self.names: + dicts = [ + { + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.xpath("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + ch.tag: ch.text.strip() if ch.text else None + for ch in el.xpath("*") + } + for el in elems + ] + + elif self.attrs_only: + dicts = [el.attrib for el in elems] + + else: + if self.names: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.xpath("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + ch.tag: ch.text.strip() if ch.text else None + for ch in el.xpath("*") + }, + } + for el in elems + ] + + if self.namespaces or "}" in list(dicts[0].keys())[0]: + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} + for d in dicts + ] + + keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) + dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] + + if self.names: + dicts = [ + {nm: v for nm, (k, v) in zip(self.names, d.items())} for d in dicts + ] + + return dicts + + def _transform_doc(self): + """ + Transform original tree using stylesheet. + + This method will transform original xml using XSLT script into + am ideally flatter xml document for easier parsing and migration + to Data Frame. + """ + from lxml.etree import XSLT + + transformer = XSLT(self.xsl_doc) + new_doc = transformer(self.xml_doc) + + return new_doc + + def _validate_path(self) -> None: + + msg = ( + "xpath does not return any nodes. " + "Be sure row level nodes are in xpath. 
" + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." + ) + + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) + attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) + + if elems == []: + raise ValueError(msg) + + if elems != [] and attrs == [] and children == []: + raise ValueError(msg) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list and aligns with + length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.names: + children = self.xml_doc.xpath( + self.xpath + "[1]/*", namespaces=self.namespaces + ) + + if is_list_like(self.names): + if len(self.names) < len(children): + raise ValueError( + "names does not match length of child elements in xpath." + ) + else: + raise TypeError( + f"{type(self.names).__name__} is not a valid type for names" + ) + + def _parse_doc(self, raw_doc): + from lxml.etree import ( + XMLParser, + fromstring, + parse, + ) + + handle_data = get_data_from_filepath( + filepath_or_buffer=raw_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + + if isinstance(xml_data, io.StringIO): + doc = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + doc = parse(xml_data, parser=curr_parser) + + return doc + + +def get_data_from_filepath( + filepath_or_buffer, + encoding, + compression, + storage_options, +) -> Union[str, bytes, Buffer]: + """ + Extract raw XML data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. XML string or bytes + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + isinstance(filepath_or_buffer, str) + and not filepath_or_buffer.startswith((" Union[io.StringIO, io.BytesIO]: + """ + Convert extracted raw data. + + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + +def _data_to_frame(data, **kwargs) -> DataFrame: + """ + Convert parsed data to Data Frame. + + This method will bind xml dictionary data of keys and values + into named columns of Data Frame using the built-in TextParser + class that build Data Frame and infers specific dtypes. + """ + + tags = next(iter(data)) + nodes = [list(d.values()) for d in data] + + try: + with TextParser(nodes, names=tags, **kwargs) as tp: + return tp.read() + except ParserError: + raise ParserError( + "XML document may be too complex for import. " + "Try to flatten document and use distinct " + "element and attribute names." + ) + + +def _parse( + path_or_buffer, + xpath, + namespaces, + elems_only, + attrs_only, + names, + encoding, + parser, + stylesheet, + compression, + storage_options, + **kwargs, +) -> DataFrame: + """ + Call internal parsers. 
+
+    This method will conditionally call the internal parsers:
+    _LxmlFrameParser and/or _EtreeFrameParser.
+
+    Raises
+    ------
+    ImportError
+        * If lxml is not installed and is selected as the parser.
+
+    ValueError
+        * If parser is not lxml or etree.
+    """
+
+    lxml = import_optional_dependency("lxml.etree", errors="ignore")
+    p: Union[_EtreeFrameParser, _LxmlFrameParser]
+
+    if parser == "lxml":
+        if lxml is not None:
+            p = _LxmlFrameParser(
+                path_or_buffer,
+                xpath,
+                namespaces,
+                elems_only,
+                attrs_only,
+                names,
+                encoding,
+                stylesheet,
+                compression,
+                storage_options,
+            )
+        else:
+            raise ImportError("lxml not found, please install or use the etree parser.")
+
+    elif parser == "etree":
+        p = _EtreeFrameParser(
+            path_or_buffer,
+            xpath,
+            namespaces,
+            elems_only,
+            attrs_only,
+            names,
+            encoding,
+            stylesheet,
+            compression,
+            storage_options,
+        )
+    else:
+        raise ValueError("Values for parser can only be lxml or etree.")
+
+    data_dicts = p.parse_data()
+
+    return _data_to_frame(data=data_dicts, **kwargs)
+
+
+@doc(storage_options=_shared_docs["storage_options"])
+def read_xml(
+    path_or_buffer: FilePathOrBuffer,
+    xpath: Optional[str] = "./*",
+    namespaces: Optional[Union[dict, List[dict]]] = None,
+    elems_only: Optional[bool] = False,
+    attrs_only: Optional[bool] = False,
+    names: Optional[List[str]] = None,
+    encoding: Optional[str] = "utf-8",
+    parser: Optional[str] = "lxml",
+    stylesheet: Optional[FilePathOrBuffer] = None,
+    compression: CompressionOptions = "infer",
+    storage_options: StorageOptions = None,
+) -> DataFrame:
+    r"""
+    Read XML document into a ``DataFrame`` object.
+
+    .. versionadded:: 1.3.0
+
+    Parameters
+    ----------
+    path_or_buffer : str, path object, or file-like object
+        Any valid XML string or path is acceptable. The string could be a URL.
+        Valid URL schemes include http, ftp, s3, and file.
+
+    xpath : str, optional, default './\*'
+        The XPath to parse the required set of nodes for migration to a
+        DataFrame. XPath should return a collection of elements and not a
+        single element. Note: The ``etree`` parser supports limited XPath
+        expressions. For more complex XPath, use ``lxml`` which requires
+        installation.
+
+    namespaces : dict, optional
+        The namespaces defined in the XML document as dicts with key being
+        the namespace prefix and value the URI. There is no need to include
+        all namespaces in XML, only the ones used in the ``xpath``
+        expression. Note: if the XML document uses a default namespace
+        denoted as `xmlns='<URI>'` without a prefix, you must assign any
+        temporary namespace prefix such as 'doc' to the URI in order to
+        parse underlying nodes and/or attributes. For example, ::
+
+            namespaces = {{"doc": "https://example.com"}}
+
+    elems_only : bool, optional, default False
+        Parse only the child elements at the specified ``xpath``. By default,
+        all child elements and non-empty text nodes are returned.
+
+    attrs_only : bool, optional, default False
+        Parse only the attributes at the specified ``xpath``.
+        By default, all attributes are returned.
+
+    names : list-like, optional
+        Column names for the DataFrame of parsed XML data. Use this
+        parameter to rename original element names and distinguish
+        identically named elements.
+
+    encoding : str, optional, default 'utf-8'
+        Encoding of XML document.
+
+    parser : {{'lxml','etree'}}, default 'lxml'
+        Parser module to use for retrieval of data. Only 'lxml' and
+        'etree' are supported. With 'lxml', more complex XPath searches
+        and the ability to use XSLT stylesheets are available.
+
+    stylesheet : str, path object or file-like object
+        A URL, file-like object, or a raw string containing an XSLT script.
+        This stylesheet should flatten complex, deeply nested XML documents
+        for easier parsing. To use this feature you must have the ``lxml``
+        module installed and specify 'lxml' as ``parser``. The ``xpath`` must
+        reference nodes of the transformed XML document generated after XSLT
+        transformation and not the original XML document. Only XSLT 1.0
+        scripts, and not later versions, are currently supported.
+
+    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buffer is a string ending in
+        '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
+        otherwise. If using 'zip', the ZIP file must contain only one data
+        file to be read in. Set to None for no decompression.
+
+    {storage_options}
+
+    Returns
+    -------
+    df
+        A DataFrame.
+
+    See Also
+    --------
+    read_json : Convert a JSON string to pandas object.
+    read_html : Read HTML tables into a list of DataFrame objects.
+
+    Notes
+    -----
+    This method is best designed to import shallow XML documents in the
+    following format, which is the ideal fit for the two dimensions of a
+    ``DataFrame`` (row by column). ::
+
+        <root>
+            <row>
+              <column1>data</column1>
+              <column2>data</column2>
+              <column3>data</column3>
+              ...
+           </row>
+           <row>
+              ...
+           </row>
+           ...
+        </root>
+
+    As a file format, XML documents can be designed any way including
+    layout of elements and attributes as long as it conforms to W3C
+    specifications. Therefore, this method is a convenience handler for
+    a specific flatter design and not all possible XML structures.
+
+    However, for more complex XML documents, ``stylesheet`` allows you to
+    temporarily redesign the original document with XSLT (a special purpose
+    language) into a flatter version for migration to a DataFrame.
+
+    This function will *always* return a single :class:`DataFrame` or raise
+    exceptions due to issues with the XML document, ``xpath``, or other
+    parameters.
+
+    Examples
+    --------
+    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
+    ... <data xmlns="http://example.com">
+    ...  <row>
+    ...    <shape>square</shape>
+    ...    <degrees>360</degrees>
+    ...    <sides>4.0</sides>
+    ...  </row>
+    ...  <row>
+    ...    <shape>circle</shape>
+    ...    <degrees>360</degrees>
+    ...    <sides/>
+    ...  </row>
+    ...  <row>
+    ...    <shape>triangle</shape>
+    ...    <degrees>180</degrees>
+    ...    <sides>3.0</sides>
+    ...  </row>
+    ... </data>'''
+
+    >>> df = pd.read_xml(xml)
+    >>> df
+          shape  degrees  sides
+    0    square      360    4.0
+    1    circle      360    NaN
+    2  triangle      180    3.0
+
+    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
+    ... <data>
+    ...   <row shape="square" degrees="360" sides="4.0"/>
+    ...   <row shape="circle" degrees="360"/>
+    ...   <row shape="triangle" degrees="180" sides="3.0"/>
+    ... </data>'''
+
+    >>> df = pd.read_xml(xml, xpath=".//row")
+    >>> df
+          shape  degrees  sides
+    0    square      360    4.0
+    1    circle      360    NaN
+    2  triangle      180    3.0
+
+    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
+    ... <doc:data xmlns:doc="https://example.com">
+    ...   <doc:row>
+    ...     <doc:shape>square</doc:shape>
+    ...     <doc:degrees>360</doc:degrees>
+    ...     <doc:sides>4.0</doc:sides>
+    ...   </doc:row>
+    ...   <doc:row>
+    ...     <doc:shape>circle</doc:shape>
+    ...     <doc:degrees>360</doc:degrees>
+    ...     <doc:sides/>
+    ...   </doc:row>
+    ...   <doc:row>
+    ...     <doc:shape>triangle</doc:shape>
+    ...     <doc:degrees>180</doc:degrees>
+    ...     <doc:sides>3.0</doc:sides>
+    ...   </doc:row>
+    ... </doc:data>'''
+
+    >>> df = pd.read_xml(xml,
+    ...                  xpath="//doc:row",
+    ...                  
namespaces={{"doc": "https://example.com"}}) + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + """ + + return _parse( + path_or_buffer=path_or_buffer, + xpath=xpath, + namespaces=namespaces, + elems_only=elems_only, + attrs_only=attrs_only, + names=names, + encoding=encoding, + parser=parser, + stylesheet=stylesheet, + compression=compression, + storage_options=storage_options, + ) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 541c2988a0636..fd1c19219c4bf 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -159,6 +159,7 @@ class TestPDApi(Base): "read_gbq", "read_hdf", "read_html", + "read_xml", "read_json", "read_pickle", "read_sas", diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py new file mode 100644 index 0000000000000..b68c6235cb0b8 --- /dev/null +++ b/pandas/tests/apply/conftest.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import DataFrame + + +@pytest.fixture +def int_frame_const_col(): + """ + Fixture for DataFrame of ints which are constant per column + + Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] + """ + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) + return df diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3532040a2fd7b..12c803cbebaf3 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -14,41 +14,28 @@ Series, Timestamp, date_range, - notna, ) import pandas._testing as tm -from pandas.core.base import SpecificationError from pandas.tests.frame.common import zip_frames -@pytest.fixture -def int_frame_const_col(): - """ - Fixture for DataFrame of ints which are constant per column - - Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] - """ - df = DataFrame( - np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, - columns=["A", "B", "C"], - ) - return df - - def test_apply(float_frame): with np.errstate(all="ignore"): # ufunc - applied = float_frame.apply(np.sqrt) - tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) + result = np.sqrt(float_frame["A"]) + expected = float_frame.apply(np.sqrt)["A"] + tm.assert_series_equal(result, expected) # aggregator - applied = float_frame.apply(np.mean) - assert applied["A"] == np.mean(float_frame["A"]) + result = float_frame.apply(np.mean)["A"] + expected = np.mean(float_frame["A"]) + assert result == expected d = float_frame.index[0] - applied = float_frame.apply(np.mean, axis=1) - assert applied[d] == np.mean(float_frame.xs(d)) - assert applied.index is float_frame.index # want this + result = float_frame.apply(np.mean, axis=1) + expected = np.mean(float_frame.xs(d)) + assert result[d] == expected + assert result.index is float_frame.index # invalid axis df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) @@ -58,42 +45,42 @@ def test_apply(float_frame): # GH 9573 df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) - df = df.apply(lambda ts: ts.astype("category")) + result = df.apply(lambda ts: ts.astype("category")) - assert df.shape == (4, 2) - assert isinstance(df["c0"].dtype, CategoricalDtype) - assert isinstance(df["c1"].dtype, CategoricalDtype) + assert result.shape == (4, 2) + assert isinstance(result["c0"].dtype, CategoricalDtype) + assert isinstance(result["c1"].dtype, CategoricalDtype) def test_apply_axis1_with_ea(): # 
GH#36785 - df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) + expected = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) + result = expected.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) def test_apply_mixed_datetimelike(): # mixed datetimelike # GH 7778 - df = DataFrame( + expected = DataFrame( { "A": date_range("20130101", periods=3), "B": pd.to_timedelta(np.arange(3), unit="s"), } ) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) + result = expected.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) def test_apply_empty(float_frame): # empty empty_frame = DataFrame() - applied = empty_frame.apply(np.sqrt) - assert applied.empty + result = empty_frame.apply(np.sqrt) + assert result.empty - applied = empty_frame.apply(np.mean) - assert applied.empty + result = empty_frame.apply(np.mean) + assert result.empty no_rows = float_frame[:0] result = no_rows.apply(lambda x: x.mean()) @@ -108,7 +95,7 @@ def test_apply_empty(float_frame): # GH 2476 expected = DataFrame(index=["a"]) result = expected.apply(lambda x: x["a"], axis=1) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(result, expected) def test_apply_with_reduce_empty(): @@ -192,17 +179,6 @@ def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] -) -def test_apply_str_axis_1_raises(how, args): - # GH 39211 - some ops don't support axis=1 - df = DataFrame({"a": [1, 2], "b": [3, 4]}) - msg = f"Operation {how} does not support axis=1" - with pytest.raises(ValueError, match=msg): - df.apply(how, axis=1, args=args) - - def test_apply_broadcast(float_frame, int_frame_const_col): # scalars @@ -256,27 +232,6 @@ def test_apply_broadcast(float_frame, int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_apply_broadcast_error(int_frame_const_col): - df = int_frame_const_col - - # > 1 ndim - msg = "too many dims to broadcast" - with pytest.raises(ValueError, match=msg): - df.apply( - lambda x: np.array([1, 2]).reshape(-1, 2), - axis=1, - result_type="broadcast", - ) - - # cannot broadcast - msg = "cannot broadcast result" - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") - - def test_apply_raw(float_frame, mixed_type_frame): def _assert_raw(x): assert isinstance(x, np.ndarray) @@ -285,14 +240,13 @@ def _assert_raw(x): float_frame.apply(_assert_raw, raw=True) float_frame.apply(_assert_raw, axis=1, raw=True) - result0 = float_frame.apply(np.mean, raw=True) - result1 = float_frame.apply(np.mean, axis=1, raw=True) - - expected0 = float_frame.apply(lambda x: x.values.mean()) - expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1) + result = float_frame.apply(np.mean, raw=True) + expected = float_frame.apply(lambda x: x.values.mean()) + tm.assert_series_equal(result, expected) - tm.assert_series_equal(result0, expected0) - tm.assert_series_equal(result1, expected1) + result = float_frame.apply(np.mean, axis=1, raw=True) + expected = float_frame.apply(lambda x: x.values.mean(), axis=1) + tm.assert_series_equal(result, expected) # no reduction result = float_frame.apply(lambda x: x * 2, raw=True) @@ -306,8 
+260,9 @@ def _assert_raw(x): def test_apply_axis1(float_frame): d = float_frame.index[0] - tapplied = float_frame.apply(np.mean, axis=1) - assert tapplied[d] == np.mean(float_frame.xs(d)) + result = float_frame.apply(np.mean, axis=1)[d] + expected = np.mean(float_frame.xs(d)) + assert result == expected def test_apply_mixed_dtype_corner(): @@ -401,92 +356,25 @@ def test_apply_reduce_to_dict(): # GH 25196 37544 data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) - result0 = data.apply(dict, axis=0) - expected0 = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) - tm.assert_series_equal(result0, expected0) + result = data.apply(dict, axis=0) + expected = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) + tm.assert_series_equal(result, expected) - result1 = data.apply(dict, axis=1) - expected1 = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) - tm.assert_series_equal(result1, expected1) + result = data.apply(dict, axis=1) + expected = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) + tm.assert_series_equal(result, expected) def test_apply_differently_indexed(): df = DataFrame(np.random.randn(20, 10)) - result0 = df.apply(Series.describe, axis=0) - expected0 = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns) - tm.assert_frame_equal(result0, expected0) - - result1 = df.apply(Series.describe, axis=1) - expected1 = DataFrame( - {i: v.describe() for i, v in df.T.items()}, columns=df.index - ).T - tm.assert_frame_equal(result1, expected1) - - -def test_apply_modify_traceback(): - data = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) - - data.loc[4, "C"] = np.nan - - def transform(row): - if row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row - - def transform2(row): - if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row + result = df.apply(Series.describe, axis=0) + expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns) + tm.assert_frame_equal(result, expected) - msg = "'float' object has no attribute 'startswith'" - with pytest.raises(AttributeError, match=msg): - data.apply(transform, axis=1) + result = df.apply(Series.describe, axis=1) + expected = DataFrame({i: v.describe() for i, v in df.T.items()}, columns=df.index).T + tm.assert_frame_equal(result, expected) def test_apply_bug(): @@ -525,7 +413,7 @@ def f(r): def test_apply_convert_objects(): - data = DataFrame( + expected = DataFrame( { "A": [ "foo", @@ -572,8 +460,8 @@ def test_apply_convert_objects(): } ) - result = data.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result._convert(datetime=True), data) + result = expected.apply(lambda x: x, axis=1)._convert(datetime=True) + tm.assert_frame_equal(result, expected) def test_apply_attach_name(float_frame): @@ -635,17 +523,17 @@ def test_applymap(float_frame): float_frame.applymap(type) # GH 465: function returning tuples - result = float_frame.applymap(lambda x: (x, x)) - assert isinstance(result["A"][0], tuple) + result = float_frame.applymap(lambda 
x: (x, x))["A"][0] + assert isinstance(result, tuple) # GH 2909: object conversion to float in constructor? df = DataFrame(data=[1, "a"]) - result = df.applymap(lambda x: x) - assert result.dtypes[0] == object + result = df.applymap(lambda x: x).dtypes[0] + assert result == object df = DataFrame(data=[1.0, "a"]) - result = df.applymap(lambda x: x) - assert result.dtypes[0] == object + result = df.applymap(lambda x: x).dtypes[0] + assert result == object # GH 2786 df = DataFrame(np.random.random((3, 4))) @@ -672,10 +560,10 @@ def test_applymap(float_frame): DataFrame(index=list("ABC")), DataFrame({"A": [], "B": [], "C": []}), ] - for frame in empty_frames: + for expected in empty_frames: for func in [round, lambda x: x]: - result = frame.applymap(func) - tm.assert_frame_equal(result, frame) + result = expected.applymap(func) + tm.assert_frame_equal(result, expected) def test_applymap_na_ignore(float_frame): @@ -743,7 +631,8 @@ def test_frame_apply_dont_convert_datetime64(): df = df.applymap(lambda x: x + BDay()) df = df.applymap(lambda x: x + BDay()) - assert df.x1.dtype == "M8[ns]" + result = df.x1.dtype + assert result == "M8[ns]" def test_apply_non_numpy_dtype(): @@ -787,11 +676,13 @@ def apply_list(row): def test_apply_noreduction_tzaware_object(): # https://github.com/pandas-dev/pandas/issues/31505 - df = DataFrame({"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]") - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) - result = df.apply(lambda x: x.copy()) - tm.assert_frame_equal(result, df) + expected = DataFrame( + {"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]" + ) + result = expected.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + result = expected.apply(lambda x: x.copy()) + tm.assert_frame_equal(result, expected) def test_apply_function_runs_once(): @@ -885,11 +776,11 @@ def test_infer_row_shape(): # GH 17437 # if row shape is changing, infer it df = DataFrame(np.random.rand(10, 2)) - result = df.apply(np.fft.fft, axis=0) - assert result.shape == (10, 2) + result = df.apply(np.fft.fft, axis=0).shape + assert result == (10, 2) - result = df.apply(np.fft.rfft, axis=0) - assert result.shape == (6, 2) + result = df.apply(np.fft.rfft, axis=0).shape + assert result == (6, 2) def test_with_dictlike_columns(): @@ -1101,19 +992,6 @@ def test_result_type(int_frame_const_col): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("result_type", ["foo", 1]) -def test_result_type_error(result_type, int_frame_const_col): - # allowed result_type - df = int_frame_const_col - - msg = ( - "invalid value for result_type, must be one of " - "{None, 'reduce', 'broadcast', 'expand'}" - ) - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) - - @pytest.mark.parametrize( "box", [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], @@ -1170,20 +1048,6 @@ def test_agg_transform(axis, float_frame): tm.assert_frame_equal(result, expected) -def test_transform_and_agg_err(axis, float_frame): - # cannot both transform and agg - msg = "cannot combine transform and aggregation operations" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.agg(["max", "sqrt"], axis=axis) - - df = DataFrame({"A": range(5), "B": 5}) - - def f(): - with np.errstate(all="ignore"): - df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - - def test_demo(): # demonstration tests df = DataFrame({"A": range(5), "B": 
5}) @@ -1254,16 +1118,6 @@ def test_agg_multiple_mixed_no_warning(): tm.assert_frame_equal(result, expected) -def test_agg_dict_nested_renaming_depr(): - - df = DataFrame({"A": range(5), "B": 5}) - - # nested renaming - msg = r"nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) - - def test_agg_reduce(axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() @@ -1516,19 +1370,6 @@ def test_agg_cython_table_transform(df, func, expected, axis): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "df, func, expected", - tm.get_cython_table_params( - DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] - ), -) -def test_agg_cython_table_raises(df, func, expected, axis): - # GH 21224 - msg = "can't multiply sequence by non-int of type 'str'" - with pytest.raises(expected, match=msg): - df.agg(func, axis=axis) - - @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "args, kwargs", diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py index 732aff24428ac..2da4a78991f5a 100644 --- a/pandas/tests/apply/test_frame_apply_relabeling.py +++ b/pandas/tests/apply/test_frame_apply_relabeling.py @@ -1,5 +1,4 @@ import numpy as np -import pytest import pandas as pd import pandas._testing as tm @@ -96,12 +95,3 @@ def test_agg_namedtuple(): index=pd.Index(["foo", "bar", "cat"]), ) tm.assert_frame_equal(result, expected) - - -def test_agg_raises(): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - msg = "Must provide" - - with pytest.raises(TypeError, match=msg): - df.agg() diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 1888ddd8ec4aa..5dc828dea9e35 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -1,5 +1,4 @@ import operator -import re import numpy as np import pytest @@ -10,7 +9,6 @@ Series, ) import pandas._testing as tm -from pandas.core.base import SpecificationError from pandas.core.groupby.base import transformation_kernels from pandas.tests.frame.common import zip_frames @@ -103,6 +101,17 @@ def test_transform_dictlike(axis, float_frame, box): tm.assert_frame_equal(result, expected) +def test_transform_dictlike_mixed(): + # GH 40018 - mix of lists and non-lists in values of a dictionary + df = DataFrame({"a": [1, 2], "b": [1, 4], "c": [1, 4]}) + result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"}) + expected = DataFrame( + [[1.0, 1, 1.0], [2.0, 4, 2.0]], + columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "ops", [ @@ -148,47 +157,6 @@ def test_transform_method_name(method): tm.assert_frame_equal(result, expected) -def test_transform_and_agg_err(axis, float_frame): - # GH 35964 - # cannot both transform and agg - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "min"], axis=axis) - - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "sqrt"], axis=axis) - - -def test_agg_dict_nested_renaming_depr(): - df = DataFrame({"A": range(5), "B": 5}) - - # nested renaming - msg = r"nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - # mypy identifies the 
argument as an invalid type - df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) - - -def test_transform_reducer_raises(all_reductions, frame_or_series): - # GH 35964 - op = all_reductions - - obj = DataFrame({"A": [1, 2, 3]}) - if frame_or_series is not DataFrame: - obj = obj["A"] - - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - obj.transform(op) - with pytest.raises(ValueError, match=msg): - obj.transform([op]) - with pytest.raises(ValueError, match=msg): - obj.transform({"A": op}) - with pytest.raises(ValueError, match=msg): - obj.transform({"A": [op]}) - - wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"] frame_kernels_raise = [x for x in frame_kernels if x not in wont_fail] @@ -256,30 +224,6 @@ def f(x, a, b, c): frame_or_series([1]).transform(f, 0, *expected_args, **expected_kwargs) -def test_transform_missing_columns(axis): - # GH#35964 - df = DataFrame({"A": [1, 2], "B": [3, 4]}) - match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(KeyError, match=match): - df.transform({"C": "cumsum"}) - - -def test_transform_none_to_type(): - # GH#34377 - df = DataFrame({"a": [None]}) - msg = "Transform function failed" - with pytest.raises(ValueError, match=msg): - df.transform({"a": int}) - - -def test_transform_mixed_column_name_dtypes(): - # GH39025 - df = DataFrame({"a": ["1"]}) - msg = r"Column\(s\) \[1, 'b'\] do not exist" - with pytest.raises(KeyError, match=msg): - df.transform({"a": int, 1: str, "b": int}) - - def test_transform_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/39636 df = DataFrame([], columns=["col1", "col2"]) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index c67259d3c8194..5ad5390ab3e16 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -1,15 +1,48 @@ # Tests specifically aimed at detecting bad arguments. +# This file is organized by reason for exception. +# 1. always invalid argument values +# 2. missing column(s) +# 3. incompatible ops/dtype/args/kwargs +# 4. invalid result shape/type +# If your test does not fit into one of these categories, add to this list. 
+ import re +import numpy as np import pytest from pandas import ( DataFrame, Series, + date_range, + notna, ) +import pandas._testing as tm from pandas.core.base import SpecificationError +@pytest.mark.parametrize("result_type", ["foo", 1]) +def test_result_type_error(result_type, int_frame_const_col): + # allowed result_type + df = int_frame_const_col + + msg = ( + "invalid value for result_type, must be one of " + "{None, 'reduce', 'broadcast', 'expand'}" + ) + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) + + +def test_agg_raises(): + # GH 26513 + df = DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() + + @pytest.mark.parametrize("box", [DataFrame, Series]) @pytest.mark.parametrize("method", ["apply", "agg", "transform"]) @pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}]) @@ -21,6 +54,45 @@ def test_nested_renamer(box, method, func): getattr(obj, method)(func) +def test_transform_nested_renamer(): + # GH 35964 + match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + Series([1]).transform({"A": {"B": ["sum"]}}) + + +def test_agg_dict_nested_renaming_depr_agg(): + + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +def test_agg_dict_nested_renaming_depr_transform(): + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + # mypy identifies the argument as an invalid type + df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +def test_apply_dict_depr(): + + tsdf = DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=date_range("1/1/2000", periods=10), + ) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + tsdf.A.agg({"foo": ["sum", "mean"]}) + + @pytest.mark.parametrize("method", ["apply", "agg", "transform"]) @pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}]) def test_missing_column(method, func): @@ -29,3 +101,215 @@ def test_missing_column(method, func): match = re.escape("Column(s) ['B'] do not exist") with pytest.raises(KeyError, match=match): getattr(obj, method)(func) + + +def test_transform_missing_columns(axis): + # GH#35964 + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(KeyError, match=match): + df.transform({"C": "cumsum"}) + + +def test_transform_mixed_column_name_dtypes(): + # GH39025 + df = DataFrame({"a": ["1"]}) + msg = r"Column\(s\) \[1, 'b'\] do not exist" + with pytest.raises(KeyError, match=msg): + df.transform({"a": int, 1: str, "b": int}) + + +@pytest.mark.parametrize( + "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] +) +def test_apply_str_axis_1_raises(how, args): + # GH 39211 - some ops don't support axis=1 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + msg = f"Operation {how} does not support axis=1" + with pytest.raises(ValueError, match=msg): + df.apply(how, axis=1, args=args) + + +def test_transform_axis_1_raises(): + # GH 35964 + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + Series([1]).transform("sum", axis=1) + + +def test_apply_modify_traceback(): + data = 
DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + data.loc[4, "C"] = np.nan + + def transform(row): + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + def transform2(row): + if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + msg = "'float' object has no attribute 'startswith'" + with pytest.raises(AttributeError, match=msg): + data.apply(transform, axis=1) + + +@pytest.mark.parametrize( + "df, func, expected", + tm.get_cython_table_params( + DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] + ), +) +def test_agg_cython_table_raises(df, func, expected, axis): + # GH 21224 + msg = "can't multiply sequence by non-int of type 'str'" + with pytest.raises(expected, match=msg): + df.agg(func, axis=axis) + + +def test_transform_none_to_type(): + # GH#34377 + df = DataFrame({"a": [None]}) + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + df.transform({"a": int}) + + +def test_apply_broadcast_error(int_frame_const_col): + df = int_frame_const_col + + # > 1 ndim + msg = "too many dims to broadcast" + with pytest.raises(ValueError, match=msg): + df.apply( + lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type="broadcast", + ) + + # cannot broadcast + msg = "cannot broadcast result" + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") + + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") + + +def test_transform_and_agg_err_agg(axis, float_frame): + # cannot both transform and agg + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) + + df = DataFrame({"A": range(5), "B": 5}) + + def f(): + with np.errstate(all="ignore"): + df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) + + +def test_transform_and_agg_error_agg(string_series): + # we are trying to transform with an aggregator + msg = "cannot combine transform and aggregation" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg(["sqrt", "max"]) + + msg = "cannot perform both aggregation and transformation" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg({"foo": np.sqrt, "bar": "sum"}) + + +def test_transform_and_agg_err_transform(axis, float_frame): + # GH 35964 + # cannot both transform and agg + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "min"], axis=axis) + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "sqrt"], axis=axis) + + +def test_transform_reducer_raises(all_reductions, frame_or_series): + # GH 35964 + op = all_reductions + + obj = DataFrame({"A": [1, 2, 3]}) + if frame_or_series is not DataFrame: + obj = obj["A"] + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): 
+ obj.transform(op) + with pytest.raises(ValueError, match=msg): + obj.transform([op]) + with pytest.raises(ValueError, match=msg): + obj.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + obj.transform({"A": [op]}) + + +def test_transform_wont_agg(string_series): + # GH 35964 + # we are trying to transform with an aggregator + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + string_series.transform(["min", "max"]) + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 19e6cda4ebd22..5d4a2e489e172 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -182,18 +182,6 @@ def f(x): tm.assert_series_equal(result, exp) -def test_apply_dict_depr(): - - tsdf = DataFrame( - np.random.randn(10, 3), - columns=["A", "B", "C"], - index=pd.date_range("1/1/2000", periods=10), - ) - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - tsdf.A.agg({"foo": ["sum", "mean"]}) - - def test_apply_categorical(): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) ser = Series(values, name="XX", index=list("abcdefg")) @@ -269,19 +257,6 @@ def test_transform(string_series): tm.assert_series_equal(result.reindex_like(expected), expected) -def test_transform_and_agg_error(string_series): - # we are trying to transform with an aggregator - msg = "cannot combine transform and aggregation" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.agg(["sqrt", "max"]) - - msg = "cannot perform both aggregation and transformation" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.agg({"foo": np.sqrt, "bar": "sum"}) - - def test_demo(): # demonstration tests s = Series(range(6), dtype="int64", name="series") diff --git a/pandas/tests/apply/test_series_transform.py b/pandas/tests/apply/test_series_transform.py index e67ea4f14e4ac..90065d20e1a59 100644 --- a/pandas/tests/apply/test_series_transform.py +++ b/pandas/tests/apply/test_series_transform.py @@ -2,11 +2,12 @@ import pytest from pandas import ( + DataFrame, + MultiIndex, Series, concat, ) import pandas._testing as tm -from pandas.core.base import SpecificationError from pandas.core.groupby.base import transformation_kernels # tshift only works on time index and is deprecated @@ -55,28 +56,12 @@ def test_transform_dictlike(string_series, box): tm.assert_frame_equal(result, expected) -def test_transform_wont_agg(string_series): - # GH 35964 - # we are trying to transform with an aggregator - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - string_series.transform(["min", "max"]) - - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.transform(["sqrt", "max"]) - - -def test_transform_axis_1_raises(): - # GH 35964 - msg = "No axis named 1 for object type Series" - with pytest.raises(ValueError, match=msg): - Series([1]).transform("sum", axis=1) - - -def test_transform_nested_renamer(): - # GH 35964 - match = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=match): - Series([1]).transform({"A": {"B": ["sum"]}}) +def test_transform_dictlike_mixed(): + # GH 40018 - mix of lists and 
non-lists in values of a dictionary + df = Series([1, 4]) + result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"}) + expected = DataFrame( + [[1.0, 1, 1.0], [2.0, 4, 2.0]], + columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 7c144c390a128..93ba16c5fda22 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -42,6 +42,12 @@ def test_categorical_scalar_deprecated(self): with tm.assert_produces_warning(FutureWarning): Categorical("A", categories=["A", "B"]) + def test_categorical_1d_only(self): + # ndim > 1 + msg = "> 1 ndim Categorical are not supported at this time" + with pytest.raises(NotImplementedError, match=msg): + Categorical(np.array([list("abcd")])) + def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'" diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 7ba4da8a5ede9..e674b49a99bd4 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -180,3 +180,24 @@ def test_cross_type_arithmetic(): result = df.A + df.B expected = pd.Series([2, np.nan, np.nan], dtype="Float64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "source, neg_target, abs_target", + [ + ([1.1, 2.2, 3.3], [-1.1, -2.2, -3.3], [1.1, 2.2, 3.3]), + ([1.1, 2.2, None], [-1.1, -2.2, None], [1.1, 2.2, None]), + ([-1.1, 0.0, 1.1], [1.1, 0.0, -1.1], [1.1, 0.0, 1.1]), + ], +) +def test_unary_float_operators(float_ea_dtype, source, neg_target, abs_target): + # GH38794 + dtype = float_ea_dtype + arr = pd.array(source, dtype=dtype) + neg_result, pos_result, abs_result = -arr, +arr, abs(arr) + neg_target = pd.array(neg_target, dtype=dtype) + abs_target = pd.array(abs_target, dtype=dtype) + + tm.assert_extension_array_equal(neg_result, neg_target) + tm.assert_extension_array_equal(pos_result, arr) + tm.assert_extension_array_equal(abs_result, abs_target) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 0c1b10f66a73b..2eb88b669bcb1 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -284,36 +284,22 @@ def test_reduce_to_float(op): @pytest.mark.parametrize( - "source, target", + "source, neg_target, abs_target", [ - ([1, 2, 3], [-1, -2, -3]), - ([1, 2, None], [-1, -2, None]), - ([-1, 0, 1], [1, 0, -1]), + ([1, 2, 3], [-1, -2, -3], [1, 2, 3]), + ([1, 2, None], [-1, -2, None], [1, 2, None]), + ([-1, 0, 1], [1, 0, -1], [1, 0, 1]), ], ) -def test_unary_minus_nullable_int(any_signed_nullable_int_dtype, source, target): +def test_unary_int_operators( + any_signed_nullable_int_dtype, source, neg_target, abs_target +): dtype = any_signed_nullable_int_dtype arr = pd.array(source, dtype=dtype) - result = -arr - expected = pd.array(target, dtype=dtype) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]]) -def test_unary_plus_nullable_int(any_signed_nullable_int_dtype, source): - dtype = any_signed_nullable_int_dtype - expected = pd.array(source, dtype=dtype) - result = +expected - tm.assert_extension_array_equal(result, expected) + neg_result, pos_result, abs_result = 
-arr, +arr, abs(arr) + neg_target = pd.array(neg_target, dtype=dtype) + abs_target = pd.array(abs_target, dtype=dtype) - -@pytest.mark.parametrize( - "source, target", - [([1, 2, 3], [1, 2, 3]), ([1, -2, None], [1, 2, None]), ([-1, 0, 1], [1, 0, 1])], -) -def test_abs_nullable_int(any_signed_nullable_int_dtype, source, target): - dtype = any_signed_nullable_int_dtype - s = pd.array(source, dtype=dtype) - result = abs(s) - expected = pd.array(target, dtype=dtype) - tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(neg_result, neg_target) + tm.assert_extension_array_equal(pos_result, arr) + tm.assert_extension_array_equal(abs_result, abs_target) diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 1fc7f824c6daa..adb52fce17f8b 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -165,12 +165,14 @@ def test_error_len_mismatch(data, all_arithmetic_operators): @pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"]) -@pytest.mark.parametrize( - "values, dtype", [([1, 2, 3], "Int64"), ([True, False, True], "boolean")] -) -def test_unary_op_does_not_propagate_mask(op, values, dtype): +def test_unary_op_does_not_propagate_mask(data, op, request): # https://github.com/pandas-dev/pandas/issues/39943 - s = pd.Series(values, dtype=dtype) + data, _ = data + if data.dtype in ["Float32", "Float64"] and op == "__invert__": + request.node.add_marker( + pytest.mark.xfail(reason="invert is not implemented for float ea dtypes") + ) + s = pd.Series(data) result = getattr(s, op)() expected = result.copy(deep=True) s[0] = None diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 070dec307f527..87a095e1003c4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -85,12 +85,10 @@ def arr1d(self): arr = self.array_cls(data, freq="D") return arr - def test_compare_len1_raises(self): + def test_compare_len1_raises(self, arr1d): # make sure we raise when comparing with different lengths, specific # to the case where one has length-1, which numpy would broadcast - data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - - arr = self.array_cls._simple_new(data, freq="D") + arr = arr1d idx = self.index_cls(arr) with pytest.raises(ValueError, match="Lengths must match"): @@ -153,7 +151,9 @@ def test_take(self): data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 np.random.shuffle(data) - arr = self.array_cls._simple_new(data, freq="D") + freq = None if self.array_cls is not PeriodArray else "D" + + arr = self.array_cls(data, freq=freq) idx = self.index_cls._simple_new(arr) takers = [1, 4, 94] @@ -172,7 +172,7 @@ def test_take(self): def test_take_fill_raises(self, fill_value): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - arr = self.array_cls._simple_new(data, freq="D") + arr = self.array_cls(data, freq="D") msg = f"value should be a '{arr._scalar_type.__name__}' or 'NaT'. 
Got" with pytest.raises(TypeError, match=msg): @@ -181,7 +181,7 @@ def test_take_fill_raises(self, fill_value): def test_take_fill(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - arr = self.array_cls._simple_new(data, freq="D") + arr = self.array_cls(data, freq="D") result = arr.take([-1, 1], allow_fill=True, fill_value=None) assert result[0] is pd.NaT @@ -202,10 +202,8 @@ def test_take_fill_str(self, arr1d): with pytest.raises(TypeError, match=msg): arr1d.take([-1, 1], allow_fill=True, fill_value="foo") - def test_concat_same_type(self): - data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - - arr = self.array_cls._simple_new(data, freq="D") + def test_concat_same_type(self, arr1d): + arr = arr1d idx = self.index_cls(arr) idx = idx.insert(0, pd.NaT) arr = self.array_cls(idx) diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index b042e29986c80..ceb882ff9c963 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -124,9 +124,7 @@ class TestConstruction: [ Series, lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"], - pytest.param( - lambda x, **kwargs: DataFrame(x, **kwargs)[0], marks=pytest.mark.xfail - ), + lambda x, **kwargs: DataFrame(x, **kwargs)[0], Index, ], ) diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index fe271392122a2..72da93a5c4de3 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -19,3 +19,13 @@ def test_construct_1d_ndarray_preserving_na(values, dtype, expected): result = construct_1d_ndarray_preserving_na(values, dtype=dtype) tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"]) +def test_construct_1d_ndarray_preserving_na_datetimelike(dtype): + arr = np.arange(5, dtype=np.int64).view(dtype) + expected = np.array(list(arr), dtype=object) + assert all(isinstance(x, type(arr[0])) for x in expected) + + result = construct_1d_ndarray_preserving_na(arr, np.dtype(object)) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index bf83085058cfc..ca311768dc2d9 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -821,7 +821,7 @@ class TestCategoricalDtypeParametrized: np.arange(1000), ["a", "b", 10, 2, 1.3, True], [True, False], - pd.date_range("2017", periods=4), + date_range("2017", periods=4), ], ) def test_basic(self, categories, ordered): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 046256535df57..78a62c832833f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -564,19 +564,35 @@ def test_maybe_convert_objects_datetime(self): [np.datetime64("2000-01-01"), np.timedelta64(1, "s")], dtype=object ) exp = arr.copy() - out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) + out = lib.maybe_convert_objects( + arr, convert_datetime=True, convert_timedelta=True + ) tm.assert_numpy_array_equal(out, exp) arr = np.array([pd.NaT, np.timedelta64(1, "s")], dtype=object) exp = np.array([np.timedelta64("NaT"), np.timedelta64(1, "s")], dtype="m8[ns]") - out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) + out = lib.maybe_convert_objects( + arr, convert_datetime=True, convert_timedelta=True + ) tm.assert_numpy_array_equal(out, 
exp) arr = np.array([np.timedelta64(1, "s"), np.nan], dtype=object) exp = arr.copy() - out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) + out = lib.maybe_convert_objects( + arr, convert_datetime=True, convert_timedelta=True + ) tm.assert_numpy_array_equal(out, exp) + def test_maybe_convert_objects_timedelta64_nat(self): + obj = np.timedelta64("NaT", "ns") + arr = np.array([obj], dtype=object) + assert arr[0] is obj + + result = lib.maybe_convert_objects(arr, convert_timedelta=True) + + expected = np.array([obj], dtype="m8[ns]") + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "exp", [ @@ -587,7 +603,7 @@ def test_maybe_convert_objects_datetime(self): def test_maybe_convert_objects_nullable_integer(self, exp): # GH27335 arr = np.array([2, np.NaN], dtype=object) - result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1) + result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) tm.assert_extension_array_equal(result, exp) @@ -601,7 +617,7 @@ def test_maybe_convert_objects_bool_nan(self): def test_mixed_dtypes_remain_object_array(self): # GH14956 array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) - result = lib.maybe_convert_objects(array, convert_datetime=1) + result = lib.maybe_convert_objects(array, convert_datetime=True) tm.assert_numpy_array_equal(result, array) @@ -792,7 +808,7 @@ def test_unicode(self): (object, None, True, "empty"), ], ) - @pytest.mark.parametrize("box", [pd.Series, np.array]) + @pytest.mark.parametrize("box", [Series, np.array]) def test_object_empty(self, box, missing, dtype, skipna, expected): # GH 23421 arr = box([missing, missing], dtype=dtype) @@ -899,7 +915,7 @@ def test_infer_dtype_period(self): arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")]) assert lib.infer_dtype(arr, skipna=True) == "period" - @pytest.mark.parametrize("klass", [pd.array, pd.Series, pd.Index]) + @pytest.mark.parametrize("klass", [pd.array, Series, Index]) @pytest.mark.parametrize("skipna", [True, False]) def test_infer_dtype_period_array(self, klass, skipna): # https://github.com/pandas-dev/pandas/issues/23553 @@ -1248,7 +1264,7 @@ def test_interval(self): inferred = lib.infer_dtype(Series(idx), skipna=False) assert inferred == "interval" - @pytest.mark.parametrize("klass", [pd.array, pd.Series]) + @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) def test_string_dtype(self, data, skipna, klass): @@ -1257,7 +1273,7 @@ def test_string_dtype(self, data, skipna, klass): inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" - @pytest.mark.parametrize("klass", [pd.array, pd.Series]) + @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]]) def test_boolean_dtype(self, data, skipna, klass): diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index ecd56b5b61244..02bae02436d8c 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -205,16 +205,16 @@ def test_isna_datetime(self): def test_isna_old_datetimelike(self): # isna_old should work for dt64tz, td64, and period, not just tznaive - dti = pd.date_range("2016-01-01", periods=3) + dti = date_range("2016-01-01", periods=3) dta = dti._data - dta[-1] = pd.NaT + 
dta[-1] = NaT expected = np.array([False, False, True], dtype=bool) objs = [dta, dta.tz_localize("US/Eastern"), dta - dta, dta.to_period("D")] for obj in objs: with cf.option_context("mode.use_inf_as_na", True): - result = pd.isna(obj) + result = isna(obj) tm.assert_numpy_array_equal(result, expected) @@ -320,38 +320,38 @@ def test_period(self): def test_decimal(self): # scalars GH#23530 a = Decimal(1.0) - assert pd.isna(a) is False - assert pd.notna(a) is True + assert isna(a) is False + assert notna(a) is True b = Decimal("NaN") - assert pd.isna(b) is True - assert pd.notna(b) is False + assert isna(b) is True + assert notna(b) is False # array arr = np.array([a, b]) expected = np.array([False, True]) - result = pd.isna(arr) + result = isna(arr) tm.assert_numpy_array_equal(result, expected) - result = pd.notna(arr) + result = notna(arr) tm.assert_numpy_array_equal(result, ~expected) # series ser = Series(arr) expected = Series(expected) - result = pd.isna(ser) + result = isna(ser) tm.assert_series_equal(result, expected) - result = pd.notna(ser) + result = notna(ser) tm.assert_series_equal(result, ~expected) # index idx = pd.Index(arr) expected = np.array([False, True]) - result = pd.isna(idx) + result = isna(idx) tm.assert_numpy_array_equal(result, expected) - result = pd.notna(idx) + result = notna(idx) tm.assert_numpy_array_equal(result, ~expected) @@ -578,7 +578,7 @@ def _check_behavior(self, arr, expected): tm.assert_numpy_array_equal(result, expected) def test_basic(self): - arr = np.array([1, None, "foo", -5.1, pd.NaT, np.nan]) + arr = np.array([1, None, "foo", -5.1, NaT, np.nan]) expected = np.array([False, True, False, False, True, True]) self._check_behavior(arr, expected) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 30b115b9dba6f..d93afef60561a 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat.numpy import is_numpy_dev + import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -73,6 +75,10 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) + # Non-strict bc these xpass on dt64tz, Period, Interval, JSON, PandasArray + @pytest.mark.xfail( + is_numpy_dev, reason="2021-03-02 #40144 expecting fix in numpy", strict=False + ) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("A").B.apply(lambda x: x.array) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 3ef3beaa9c1b1..89991a459795e 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -16,6 +16,8 @@ import numpy as np import pytest +from pandas.compat.numpy import is_numpy_dev + import pandas as pd import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype @@ -320,6 +322,7 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) + @pytest.mark.xfail(is_numpy_dev, reason="2021-03-02 #40144 expecting fix in numpy") def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("A").B.apply(lambda x: x.array) diff 
--git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d0a3ef17afdbc..49aee76e10f6a 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -26,6 +26,29 @@ from pandas.tests.extension import base +def split_array(arr): + if not isinstance(arr.dtype, ArrowStringDtype): + pytest.skip("chunked array n/a") + + def _split_array(arr): + import pyarrow as pa + + arrow_array = arr._data + split = len(arrow_array) // 2 + arrow_array = pa.chunked_array( + [*arrow_array[:split].chunks, *arrow_array[split:].chunks] + ) + assert arrow_array.num_chunks == 2 + return type(arr)(arrow_array) + + return _split_array(arr) + + +@pytest.fixture(params=[True, False]) +def chunked(request): + return request.param + + @pytest.fixture( params=[ StringDtype, @@ -39,28 +62,32 @@ def dtype(request): @pytest.fixture -def data(dtype): +def data(dtype, chunked): strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return dtype.construct_array_type()._from_sequence(strings) + arr = dtype.construct_array_type()._from_sequence(strings) + return split_array(arr) if chunked else arr @pytest.fixture -def data_missing(dtype): +def data_missing(dtype, chunked): """Length 2 array with [NA, Valid]""" - return dtype.construct_array_type()._from_sequence([pd.NA, "A"]) + arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"]) + return split_array(arr) if chunked else arr @pytest.fixture -def data_for_sorting(dtype): - return dtype.construct_array_type()._from_sequence(["B", "C", "A"]) +def data_for_sorting(dtype, chunked): + arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"]) + return split_array(arr) if chunked else arr @pytest.fixture -def data_missing_for_sorting(dtype): - return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) +def data_missing_for_sorting(dtype, chunked): + arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) + return split_array(arr) if chunked else arr @pytest.fixture @@ -69,10 +96,11 @@ def na_value(): @pytest.fixture -def data_for_grouping(dtype): - return dtype.construct_array_type()._from_sequence( +def data_for_grouping(dtype, chunked): + arr = dtype.construct_array_type()._from_sequence( ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] ) + return split_array(arr) if chunked else arr class TestDtype(base.BaseDtypeTests): diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 0d36f3bd80e26..bc1007162884a 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,11 +6,13 @@ import pytz from pandas.compat import is_platform_little_endian +import pandas.util._test_decorators as td from pandas import ( CategoricalIndex, DataFrame, Index, + Int64Index, Interval, RangeIndex, Series, @@ -118,6 +120,8 @@ def test_from_records_sequencelike(self): tm.assert_series_equal(result["C"], df["C"]) tm.assert_series_equal(result["E1"], df["E1"].astype("float64")) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records + def test_from_records_sequencelike_empty(self): # empty case result = DataFrame.from_records([], columns=["foo", "bar", "baz"]) assert len(result) == 0 @@ -184,7 +188,12 @@ def test_from_records_bad_index_column(self): tm.assert_index_equal(df1.index, Index(df.C)) # should fail - msg = r"Shape of 
passed values is \(10, 3\), indices imply \(1, 3\)" + msg = "|".join( + [ + r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)", + "Passed arrays should have the same length as the rows Index: 10 vs 1", + ] + ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(df, index=[2]) with pytest.raises(KeyError, match=r"^2$"): @@ -208,6 +217,7 @@ def __iter__(self): expected = DataFrame.from_records(tups) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_len0_with_columns(self): # GH#2633 result = DataFrame.from_records([], index="foo", columns=["foo", "bar"]) @@ -259,7 +269,12 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" + msg = "|".join( + [ + r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)", + "Passed arrays should have the same length as the rows Index: 2 vs 1", + ] + ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) @@ -386,6 +401,7 @@ def create_dict(order_id): result = DataFrame.from_records(documents, index=["order_id", "quantity"]) assert result.index.names == ("order_id", "quantity") + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_misc_brokenness(self): # GH#2179 @@ -424,6 +440,7 @@ def test_from_records_misc_brokenness(self): ) tm.assert_series_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_empty(self): # GH#3562 result = DataFrame.from_records([], columns=["a", "b", "c"]) @@ -437,11 +454,11 @@ def test_from_records_empty(self): def test_from_records_empty_with_nonempty_fields_gh3682(self): a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) df = DataFrame.from_records(a, index="id") - tm.assert_index_equal(df.index, Index([1], name="id")) - assert df.index.name == "id" - tm.assert_index_equal(df.columns, Index(["value"])) - - b = np.array([], dtype=[("id", np.int64), ("value", np.int64)]) - df = DataFrame.from_records(b, index="id") - tm.assert_index_equal(df.index, Index([], name="id")) - assert df.index.name == "id" + + ex_index = Int64Index([1], name="id") + expected = DataFrame({"value": [2]}, index=ex_index, columns=["value"]) + tm.assert_frame_equal(df, expected) + + b = a[:0] + df2 = DataFrame.from_records(b, index="id") + tm.assert_frame_equal(df2, df.iloc[:0]) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index f2edfed019bdb..9d61be5887b7e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -421,19 +421,26 @@ def test_setitem_intervals(self): # B & D end up as Categoricals - # the remainer are converted to in-line objects - # contining an IntervalIndex.values + # the remainder are converted to in-line objects + # containing an IntervalIndex.values df["B"] = ser df["C"] = np.array(ser) df["D"] = ser.values df["E"] = np.array(ser.values) + df["F"] = ser.astype(object) assert is_categorical_dtype(df["B"].dtype) assert is_interval_dtype(df["B"].cat.categories) assert is_categorical_dtype(df["D"].dtype) assert is_interval_dtype(df["D"].cat.categories) - assert is_object_dtype(df["C"]) - assert is_object_dtype(df["E"]) + # These go through the Series constructor and so get inferred back + # to IntervalDtype + assert 
is_interval_dtype(df["C"]) + assert is_interval_dtype(df["E"]) + + # But the Series constructor doesn't do inference on Series objects, + # so setting df["F"] doesn't get cast back to IntervalDtype + assert is_object_dtype(df["F"]) # they compare equal as Index # when converted to numpy objects diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 8c11f659e8454..161fe7990a327 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -92,7 +90,6 @@ def test_astype_mixed_type(self, mixed_type_frame): casted = mn.astype("O") _check_cast(casted, "object") - @td.skip_array_manager_not_yet_implemented def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) @@ -127,7 +124,6 @@ def test_astype_with_view_mixed_float(self, mixed_float_frame): casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): @@ -386,7 +382,6 @@ def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination @@ -411,7 +406,6 @@ def test_astype_to_timedelta_unit_ns(self, unit): tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float @@ -441,7 +435,6 @@ def test_astype_to_incorrect_datetimelike(self, unit): with pytest.raises(TypeError, match=msg): df.astype(dtype) - @td.skip_array_manager_not_yet_implemented def test_astype_arg_for_errors(self): # GH#14878 @@ -570,7 +563,6 @@ def test_astype_empty_dtype_dict(self): tm.assert_frame_equal(result, df) assert result is not df - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) ignore keyword @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 58016be82c405..564481d01abc8 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -265,12 +265,13 @@ def test_fillna_dtype_conversion(self): expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) - # equiv of replace + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting + @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) + def test_fillna_dtype_conversion_equiv_replace(self, val): df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]}) - for v in ["", 1, np.nan, 1.0]: - expected = df.replace(np.nan, v) - result = df.fillna(v) - tm.assert_frame_equal(result, expected) + expected = df.replace(np.nan, val) + result = df.fillna(val) + tm.assert_frame_equal(result, expected) @td.skip_array_manager_invalid_test def test_fillna_datetime_columns(self): diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 677d862dfe077..462d588aff58f 100644 --- 
a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -170,6 +170,7 @@ def test_rename_multiindex(self): renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view def test_rename_nocopy(self, float_frame): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) renamed["foo"] = 1.0 diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 9ae5bb151b685..6d1e90e2f9646 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -783,6 +783,8 @@ def test_replace_mixed(self, float_string_frame): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + def test_replace_mixed_int_block_upcasting(self): + # int block upcasting df = DataFrame( { @@ -803,6 +805,8 @@ def test_replace_mixed(self, float_string_frame): assert return_value is None tm.assert_frame_equal(df, expected) + def test_replace_mixed_int_block_splitting(self): + # int block splitting df = DataFrame( { @@ -821,6 +825,8 @@ def test_replace_mixed(self, float_string_frame): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) + def test_replace_mixed2(self): + # to object block upcasting df = DataFrame( { @@ -846,6 +852,7 @@ def test_replace_mixed(self, float_string_frame): result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) + def test_replace_mixed3(self): # test case from df = DataFrame( {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 221296bfd6d76..5fa60b55f4e21 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -761,6 +761,23 @@ def test_sort_index_with_categories(self, categories): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "ascending", + [ + None, + [True, None], + [False, "True"], + ], + ) + def test_sort_index_ascending_bad_value_raises(self, ascending): + # GH 39434 + df = DataFrame(np.arange(64)) + length = len(df.index) + df.index = [(i - length / 2) % length for i in range(length)] + match = 'For argument "ascending" expected type bool' + with pytest.raises(ValueError, match=match): + df.sort_index(axis=0, ascending=ascending, na_position="first") + class TestDataFrameSortIndexKey: def test_sort_multi_index_key(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index afc7ccb516c7f..4342f1960f178 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -18,6 +18,7 @@ import pytz from pandas.compat import np_version_under1p19 +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import ( @@ -163,7 +164,12 @@ def test_constructor_cast_failure(self): df["foo"] = np.ones((4, 2)).tolist() # this is not ok - msg = "Wrong number of items passed 2, placement implies 1" + msg = "|".join( + [ + "Wrong number of items passed 2, placement implies 1", + "Expected a 1D array, got an array with shape \\(4, 2\\)", + ] + ) with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) @@ -178,12 +184,15 @@ def test_constructor_dtype_copy(self): 
new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view(self): + def test_constructor_dtype_nocast_view_dataframe(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 assert df.values[0, 0] == 99 + @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? + def test_constructor_dtype_nocast_view_2d_array(self): + df = DataFrame([[1, 2]]) should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 @@ -279,6 +288,7 @@ def test_constructor_rec(self, float_frame): tm.assert_index_equal(df2.columns, Index(rec.dtype.names)) tm.assert_index_equal(df2.index, index) + # case with columns != the ones we would infer from the data rng = np.arange(len(rec))[::-1] df3 = DataFrame(rec, index=rng, columns=["C", "B"]) expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"]) @@ -376,15 +386,18 @@ def test_constructor_dict(self): with pytest.raises(ValueError, match=msg): DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) + def test_constructor_dict_length1(self): # Length-one dict micro-optimization frame = DataFrame({"A": {"1": 1, "2": 2}}) tm.assert_index_equal(frame.index, Index(["1", "2"])) + def test_constructor_dict_with_index(self): # empty dict plus index idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx) assert frame.index is idx + def test_constructor_dict_with_index_and_columns(self): # empty dict with index and columns idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx, columns=idx) @@ -392,10 +405,12 @@ def test_constructor_dict(self): assert frame.columns is idx assert len(frame._series) == 3 + def test_constructor_dict_of_empty_lists(self): # with dict of empty list and Series frame = DataFrame({"A": [], "B": []}, columns=["A", "B"]) tm.assert_index_equal(frame.index, RangeIndex(0), exact=True) + def test_constructor_dict_with_none(self): # GH 14381 # Dict with None value frame_none = DataFrame({"a": None}, index=[0]) @@ -404,6 +419,7 @@ def test_constructor_dict(self): assert frame_none_list._get_value(0, "a") is None tm.assert_frame_equal(frame_none, frame_none_list) + def test_constructor_dict_errors(self): # GH10856 # dict with scalar values should raise error, even if columns passed msg = "If using all scalar values, you must pass an index" @@ -559,7 +575,7 @@ def test_constructor_error_msgs(self): with pytest.raises(ValueError, match=msg): DataFrame({"a": False, "b": True}) - def test_constructor_subclass_dict(self, float_frame, dict_subclass): + def test_constructor_subclass_dict(self, dict_subclass): # Test for passing dict subclass to constructor data = { "col1": dict_subclass((x, 10.0 * x) for x in range(10)), @@ -573,6 +589,7 @@ def test_constructor_subclass_dict(self, float_frame, dict_subclass): df = DataFrame(data) tm.assert_frame_equal(refdf, df) + def test_constructor_defaultdict(self, float_frame): # try with defaultdict from collections import defaultdict @@ -607,6 +624,7 @@ def test_constructor_dict_cast(self): assert frame["B"].dtype == np.object_ assert frame["A"].dtype == np.float64 + def test_constructor_dict_cast2(self): # can't cast to float test_data = { "A": dict(zip(range(20), tm.makeStringIndex(20))), @@ -622,6 +640,7 @@ def test_constructor_dict_dont_upcast(self): df = DataFrame(d) assert isinstance(df["Col1"]["Row2"], float) + def test_constructor_dict_dont_upcast2(self): dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2]) assert isinstance(dm[1][1], 
int) @@ -1100,7 +1119,8 @@ def test_constructor_more(self, float_frame): # can't cast mat = np.array(["foo", "bar"], dtype=object).reshape(2, 1) - with pytest.raises(ValueError, match="cast"): + msg = "could not convert string to float: 'foo'" + with pytest.raises(ValueError, match=msg): DataFrame(mat, index=[0, 1], columns=[0], dtype=float) dm = DataFrame(DataFrame(float_frame._series)) @@ -1168,7 +1188,8 @@ def test_constructor_unequal_length_nested_list_column(self): # GH 32173 arrays = [list("abcd"), list("cde")] - msg = "Length of columns passed for MultiIndex columns is different" + # exception raised inside MultiIndex constructor + msg = "all arrays must be same length" with pytest.raises(ValueError, match=msg): DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) @@ -1192,6 +1213,7 @@ def __len__(self, n): expected = DataFrame([[1, "a"], [2, "b"]], columns=columns) tm.assert_frame_equal(result, expected, check_dtype=False) + def test_constructor_stdlib_array(self): # GH 4297 # support Array import array @@ -1721,12 +1743,15 @@ def test_constructor_with_datetimes(self): ) tm.assert_series_equal(result, expected) + def test_constructor_with_datetimes1(self): + # GH 2809 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) assert datetime_s.dtype == "M8[ns]" + def test_constructor_with_datetimes2(self): # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] @@ -1740,6 +1765,7 @@ def test_constructor_with_datetimes(self): ) tm.assert_series_equal(result, expected) + def test_constructor_with_datetimes3(self): # GH 7594 # don't coerce tz-aware tz = pytz.timezone("US/Eastern") @@ -1757,6 +1783,7 @@ def test_constructor_with_datetimes(self): df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) ) + def test_constructor_with_datetimes4(self): # tz-aware (UTC and other tz's) # GH 8411 dr = date_range("20130101", periods=3) @@ -1769,6 +1796,7 @@ def test_constructor_with_datetimes(self): df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" + def test_constructor_with_datetimes5(self): # GH 7822 # preserve an index with a tz on dict construction i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") @@ -1781,7 +1809,9 @@ def test_constructor_with_datetimes(self): df = DataFrame({"a": i}) tm.assert_frame_equal(df, expected) + def test_constructor_with_datetimes6(self): # multiples + i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") i_no_tz = date_range("1/1/2011", periods=5, freq="10s") df = DataFrame({"a": i, "b": i_no_tz}) expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz}) @@ -1935,6 +1965,8 @@ def test_constructor_frame_copy(self, float_frame): assert (cop["A"] == 5).all() assert not (float_frame["A"] == 5).all() + # TODO(ArrayManager) keep view on 2D array? + @td.skip_array_manager_not_yet_implemented def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) @@ -1945,6 +1977,8 @@ def test_constructor_ndarray_copy(self, float_frame): float_frame.values[6] = 6 assert not (df.values[6] == 6).all() + # TODO(ArrayManager) keep view on Series? 
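+ # as with the 2D-array case above, skipped until it is decided whether ArrayManager should keep a view on Series input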
+ @td.skip_array_manager_not_yet_implemented def test_constructor_series_copy(self, float_frame): series = float_frame._series @@ -2058,17 +2092,15 @@ def test_from_nested_listlike_mixed_types(self): def test_construct_from_listlikes_mismatched_lengths(self): # invalid (shape) - msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" + msg = "|".join( + [ + r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)", + "Passed arrays should have the same length as the rows Index", + ] + ) with pytest.raises(ValueError, match=msg): DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) - def test_categorical_1d_only(self): - # TODO: belongs in Categorical tests - # ndim > 1 - msg = "> 1 ndim Categorical are not supported at this time" - with pytest.raises(NotImplementedError, match=msg): - Categorical(np.array([list("abcd")])) - def test_constructor_categorical_series(self): items = [1, 2, 3, 1] @@ -2110,6 +2142,8 @@ def test_check_dtype_empty_numeric_column(self, dtype): assert data.b.dtype == dtype + # TODO(ArrayManager) astype to bytes dtypes does not yet give object dtype + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) @@ -2213,7 +2247,8 @@ class DatetimeSubclass(datetime): def test_with_mismatched_index_length_raises(self): # GH#33437 dti = date_range("2016-01-01", periods=3, tz="US/Pacific") - with pytest.raises(ValueError, match="Shape of passed values"): + msg = "Shape of passed values|Passed arrays should have the same length" + with pytest.raises(ValueError, match=msg): DataFrame(dti, index=range(4)) def test_frame_ctor_datetime64_column(self): @@ -2423,11 +2458,16 @@ def test_from_2d_ndarray_with_dtype(self): expected = DataFrame(array_dim2).astype("datetime64[ns, UTC]") tm.assert_frame_equal(df, expected) - def test_construction_from_set_raises(self): + @pytest.mark.parametrize("typ", [set, frozenset]) + def test_construction_from_set_raises(self, typ): # https://github.com/pandas-dev/pandas/issues/32582 - msg = "Set type is unordered" + values = typ({1, 2, 3}) + msg = f"'{typ.__name__}' type is unordered" + with pytest.raises(TypeError, match=msg): + DataFrame({"a": values}) + with pytest.raises(TypeError, match=msg): - DataFrame({"a": {1, 2, 3}}) + Series(values) def get1(obj): diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index de8335738791d..cc036bb484ff9 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -355,7 +357,8 @@ def test_groupby_function_rename(mframe): "cummax", "cummin", "cumprod", - "describe", + # TODO(ArrayManager) quantile + pytest.param("describe", marks=td.skip_array_manager_not_yet_implemented), "rank", "quantile", "diff", diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 639fe308529dc..79ec0af267234 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -7,6 +7,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -84,6 +86,7 @@ def test_apply_trivial_fail(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used def test_fast_apply(): # make sure that fast apply is correctly 
called # rather than raising any kind of error @@ -110,7 +113,7 @@ def f(g): splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) group_keys = grouper._get_group_keys() - sdata = splitter._get_sorted_data() + sdata = splitter.sorted_data values, mutated = splitter.fast_apply(f, sdata, group_keys) @@ -213,6 +216,7 @@ def test_group_apply_once_per_group2(capsys): assert result == expected +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used @pytest.mark.xfail(reason="GH-34998") def test_apply_fast_slow_identical(): # GH 31613 @@ -233,6 +237,7 @@ def fast(group): tm.assert_frame_equal(fast_df, slow_df) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used @pytest.mark.parametrize( "func", [ @@ -313,6 +318,7 @@ def test_groupby_as_index_apply(df): tm.assert_index_equal(res, ind) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_apply_concat_preserve_names(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1003,9 +1009,10 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-34998") -def test_apply_with_timezones_aware(): +def test_apply_with_timezones_aware(using_array_manager, request): # GH: 27212 + if not using_array_manager: + request.node.add_marker(pytest.mark.xfail(reason="GH-34998")) dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2 index_no_tz = pd.DatetimeIndex(dates) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f0356ad90a3ff..a7247c2c04761 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -81,6 +83,7 @@ def get_stats(group): assert result.index.names[0] == "C" +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_basic(): cats = Categorical( @@ -276,7 +279,9 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(observed): +# TODO(ArrayManager) incorrect dtype for mean() +@td.skip_array_manager_not_yet_implemented +def test_observed(observed, using_array_manager): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -535,6 +540,7 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): assert False, msg +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range("2014-01-01", periods=4) @@ -600,6 +606,7 @@ def test_categorical_index(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_describe_categorical_columns(): # GH 11558 cats = CategoricalIndex( @@ -614,6 +621,7 @@ def test_describe_categorical_columns(): tm.assert_categorical_equal(result.stack().columns.values, cats.values) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_unstack_categorical(): # GH11558 (example is taken from the original issue) df = DataFrame( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cab5417e81445..598465a951e0f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -367,6 +367,7 @@ def test_mad(self, gb, gni): result = 
gni.mad() tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_describe(self, df, gb, gni): # describe expected_index = Index([1, 3], name="A") @@ -923,11 +924,13 @@ def test_is_monotonic_decreasing(in_vals, out_vals): # -------------------------------- +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_apply_describe_bug(mframe): grouped = mframe.groupby(level="first") grouped.describe() # it works! +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_series_describe_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) @@ -937,6 +940,7 @@ def test_series_describe_multikey(): tm.assert_series_equal(result["min"], grouped.min(), check_names=False) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) @@ -951,6 +955,7 @@ def test_series_index_name(df): assert result.index.name == "A" +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -973,6 +978,7 @@ def test_frame_describe_multikey(tsframe): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_frame_describe_tupleindex(): # GH 14848 - regression from 0.19.0 to 0.19.1 @@ -992,6 +998,7 @@ def test_frame_describe_tupleindex(): df2.groupby("key").describe() +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_frame_describe_unstacked_format(): # GH 4792 prices = { @@ -1018,6 +1025,7 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile @pytest.mark.filterwarnings( "ignore:" "indexing past lexsort depth may impact performance:" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index afde1daca74c1..8cbb9d2443cb2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -7,6 +7,7 @@ from pandas.compat import IS64 from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -210,6 +211,7 @@ def f(grp): tm.assert_series_equal(result, e) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_pass_args_kwargs(ts, tsframe): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ -364,6 +366,7 @@ def f3(x): df2.groupby("a").apply(f3) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_attr_wrapper(ts): grouped = ts.groupby(lambda x: x.weekday()) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 9c9d1aa881890..2924348e98b56 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -8,6 +10,9 @@ ) import pandas._testing as tm +# TODO(ArrayManager) quantile +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] diff --git a/pandas/tests/groupby/test_sample.py 
b/pandas/tests/groupby/test_sample.py index 4b8b0173789ae..652a5fc1a3c34 100644 --- a/pandas/tests/groupby/test_sample.py +++ b/pandas/tests/groupby/test_sample.py @@ -132,3 +132,13 @@ def test_groupby_sample_with_weights(index, expected_index): result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) expected = Series(values, name="b", index=Index(expected_index)) tm.assert_series_equal(result, expected) + + +def test_groupby_sample_with_selections(): + # GH 39928 + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values, "c": values}) + + result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None) + expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 4956454ef2d4f..c4621d5fc0f8c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import ( ensure_platform_int, is_timedelta64_dtype, @@ -161,8 +163,13 @@ def test_transform_broadcast(tsframe, ts): assert_fp_equal(res.xs(idx), agged[idx]) -def test_transform_axis_1(request, transformation_func): +def test_transform_axis_1(request, transformation_func, using_array_manager): # GH 36308 + if using_array_manager and transformation_func == "pct_change": + # TODO(ArrayManager) column-wise shift + request.node.add_marker( + pytest.mark.xfail(reason="ArrayManager: shift axis=1 not yet implemented") + ) warn = None if transformation_func == "tshift": warn = FutureWarning @@ -183,6 +190,8 @@ def test_transform_axis_1(request, transformation_func): tm.assert_equal(result, expected) +# TODO(ArrayManager) groupby().transform returns DataFrame backed by BlockManager +@td.skip_array_manager_not_yet_implemented def test_transform_axis_ts(tsframe): # make sure that we are setting the axes diff --git a/pandas/tests/indexes/categorical/test_append.py b/pandas/tests/indexes/categorical/test_append.py new file mode 100644 index 0000000000000..b48c3219f5111 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_append.py @@ -0,0 +1,62 @@ +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +class TestAppend: + @pytest.fixture + def ci(self): + categories = list("cab") + return CategoricalIndex(list("aabbca"), categories=categories, ordered=False) + + def test_append(self, ci): + # append cats with the same categories + result = ci[:3].append(ci[3:]) + tm.assert_index_equal(result, ci, exact=True) + + foos = [ci[:1], ci[1:3], ci[3:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result, ci, exact=True) + + def test_append_empty(self, ci): + # empty + result = ci.append([]) + tm.assert_index_equal(result, ci, exact=True) + + def test_append_mismatched_categories(self, ci): + # appending with different categories or reordered is not ok + msg = "all inputs must be Index" + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.set_categories(list("abcd"))) + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.reorder_categories(list("abc"))) + + def test_append_category_objects(self, ci): + # with objects + result = ci.append(Index(["c", "a"])) + expected = CategoricalIndex(list("aabbcaca"), categories=ci.categories) + tm.assert_index_equal(result, expected, exact=True) + + def 
test_append_non_categories(self, ci): + # invalid objects -> cast to object via concat_compat + result = ci.append(Index(["a", "d"])) + expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"]) + tm.assert_index_equal(result, expected, exact=True) + + def test_append_object(self, ci): + # GH#14298 - if base object is not categorical -> coerce to object + result = Index(["c", "a"]).append(ci) + expected = Index(list("caaabbca")) + tm.assert_index_equal(result, expected, exact=True) + + def test_append_to_another(self): + # hits Index._concat + fst = Index(["a", "b"]) + snd = CategoricalIndex(["d", "e"]) + result = fst.append(snd) + expected = Index(["a", "b", "d", "e"]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 8c9caf2e59011..d3c9b02b3ba23 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -30,53 +30,6 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True - def test_append(self): - - ci = self.create_index() - categories = ci.categories - - # append cats with the same categories - result = ci[:3].append(ci[3:]) - tm.assert_index_equal(result, ci, exact=True) - - foos = [ci[:1], ci[1:3], ci[3:]] - result = foos[0].append(foos[1:]) - tm.assert_index_equal(result, ci, exact=True) - - # empty - result = ci.append([]) - tm.assert_index_equal(result, ci, exact=True) - - # appending with different categories or reordered is not ok - msg = "all inputs must be Index" - with pytest.raises(TypeError, match=msg): - ci.append(ci.values.set_categories(list("abcd"))) - with pytest.raises(TypeError, match=msg): - ci.append(ci.values.reorder_categories(list("abc"))) - - # with objects - result = ci.append(Index(["c", "a"])) - expected = CategoricalIndex(list("aabbcaca"), categories=categories) - tm.assert_index_equal(result, expected, exact=True) - - # invalid objects -> cast to object via concat_compat - result = ci.append(Index(["a", "d"])) - expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"]) - tm.assert_index_equal(result, expected, exact=True) - - # GH14298 - if base object is not categorical -> coerce to object - result = Index(["c", "a"]).append(ci) - expected = Index(list("caaabbca")) - tm.assert_index_equal(result, expected, exact=True) - - def test_append_to_another(self): - # hits Index._concat - fst = Index(["a", "b"]) - snd = CategoricalIndex(["d", "e"]) - result = fst.append(snd) - expected = Index(["a", "b", "d", "e"]) - tm.assert_index_equal(result, expected) - def test_insert(self): ci = self.create_index() @@ -97,10 +50,10 @@ def test_insert(self): expected = CategoricalIndex(["a"], categories=categories) tm.assert_index_equal(result, expected, exact=True) - # invalid - msg = "'fill_value=d' is not present in this Categorical's categories" - with pytest.raises(TypeError, match=msg): - ci.insert(0, "d") + # invalid -> cast to object + expected = ci.astype(object).insert(0, "d") + result = ci.insert(0, "d") + tm.assert_index_equal(result, expected, exact=True) # GH 18295 (test missing) expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) @@ -110,9 +63,9 @@ def test_insert(self): def test_insert_na_mismatched_dtype(self): ci = CategoricalIndex([0, 1, 1]) - msg = "'fill_value=NaT' is not present in this Categorical's categories" - with pytest.raises(TypeError, match=msg): - ci.insert(0, pd.NaT) + result = ci.insert(0, pd.NaT) + 
diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py
index 0f1cb55b9811c..2009d78e47c1c 100644
--- a/pandas/tests/indexes/categorical/test_formats.py
+++ b/pandas/tests/indexes/categorical/test_formats.py
@@ -7,6 +7,12 @@
 
 
 class TestCategoricalIndexRepr:
+    def test_format_different_scalar_lengths(self):
+        # GH#35439
+        idx = CategoricalIndex(["aaaaaaaaa", "b"])
+        expected = ["aaaaaaaaa", "b"]
+        assert idx.format() == expected
+
     def test_string_categorical_index_repr(self):
         # short
         idx = CategoricalIndex(["a", "bb", "ccc"])
diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py
new file mode 100644
index 0000000000000..c56fc84b540c0
--- /dev/null
+++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py
@@ -0,0 +1,80 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    PeriodIndex,
+    Series,
+    date_range,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+
+
+class DropDuplicates:
+    def test_drop_duplicates_metadata(self, idx):
+        # GH#10115
+        result = idx.drop_duplicates()
+        tm.assert_index_equal(idx, result)
+        assert idx.freq == result.freq
+
+        idx_dup = idx.append(idx)
+        result = idx_dup.drop_duplicates()
+
+        expected = idx
+        if not isinstance(idx, PeriodIndex):
+            # freq is reset except for PeriodIndex
+            assert idx_dup.freq is None
+            assert result.freq is None
+            expected = idx._with_freq(None)
+        else:
+            assert result.freq == expected.freq
+
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "keep, expected, index",
+        [
+            ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
+            ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
+            (
+                False,
+                np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
+                np.arange(5, 10),
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, keep, expected, index, idx):
+        # to check Index/Series compat
+        idx = idx.append(idx[:5])
+
+        tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
+        expected = idx[~expected]
+
+        result = idx.drop_duplicates(keep=keep)
+        tm.assert_index_equal(result, expected)
+
+        result = Series(idx).drop_duplicates(keep=keep)
+        tm.assert_series_equal(result, Series(expected, index=index))
+
+
+class TestDropDuplicatesPeriodIndex(DropDuplicates):
+    @pytest.fixture(params=["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
+    def freq(self, request):
+        return request.param
+
+    @pytest.fixture
+    def idx(self, freq):
+        return period_range("2011-01-01", periods=10, freq=freq, name="idx")
+
+
+class TestDropDuplicatesDatetimeIndex(DropDuplicates):
+    @pytest.fixture
+    def idx(self, freq_sample):
+        return date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
+
+
+class TestDropDuplicatesTimedeltaIndex(DropDuplicates):
+    @pytest.fixture
+    def idx(self, freq_sample):
+        return timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
diff --git a/pandas/tests/indexes/datetimelike_/test_nat.py b/pandas/tests/indexes/datetimelike_/test_nat.py
new file mode 100644
index 0000000000000..b4a72ec65bd91
--- /dev/null
+++ b/pandas/tests/indexes/datetimelike_/test_nat.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DatetimeIndex,
+    NaT,
+    PeriodIndex,
+    TimedeltaIndex,
+)
+import pandas._testing as tm
+
+
+class NATests:
+    def test_nat(self, index_without_na):
+        empty_index = index_without_na[:0]
+
+        index_with_na = index_without_na.copy(deep=True)
+        index_with_na._data[1] = NaT
+
+        assert type(index_without_na)._na_value is NaT
+        assert empty_index._na_value is NaT
+        assert index_with_na._na_value is NaT
+        assert index_without_na._na_value is NaT
+
+        idx = index_without_na
+        assert idx._can_hold_na
+
+        tm.assert_numpy_array_equal(idx._isnan, np.array([False, False]))
+        assert idx.hasnans is False
+
+        idx = index_with_na
+        assert idx._can_hold_na
+
+        tm.assert_numpy_array_equal(idx._isnan, np.array([False, True]))
+        assert idx.hasnans is True
+
+
+class TestDatetimeIndexNA(NATests):
+    @pytest.fixture
+    def index_without_na(self, tz_naive_fixture):
+        tz = tz_naive_fixture
+        return DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
+
+
+class TestTimedeltaIndexNA(NATests):
+    @pytest.fixture
+    def index_without_na(self):
+        return TimedeltaIndex(["1 days", "2 days"])
+
+
+class TestPeriodIndexNA(NATests):
+    @pytest.fixture
+    def index_without_na(self):
+        return PeriodIndex(["2011-01-01", "2011-01-02"], freq="D")
diff --git a/pandas/tests/indexes/datetimelike_/test_sort_values.py b/pandas/tests/indexes/datetimelike_/test_sort_values.py
new file mode 100644
index 0000000000000..ad9c5ca848615
--- /dev/null
+++ b/pandas/tests/indexes/datetimelike_/test_sort_values.py
@@ -0,0 +1,317 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DatetimeIndex,
+    Index,
+    NaT,
+    PeriodIndex,
+    TimedeltaIndex,
+    timedelta_range,
+)
+import pandas._testing as tm
+
+
+def check_freq_ascending(ordered, orig, ascending):
+    """
+    Check the expected freq on a PeriodIndex/DatetimeIndex/TimedeltaIndex
+    when the original index is generated (or generate-able) with
+    period_range/date_range/timedelta_range.
+    """
+    if isinstance(ordered, PeriodIndex):
+        assert ordered.freq == orig.freq
+    elif isinstance(ordered, (DatetimeIndex, TimedeltaIndex)):
+        if ascending:
+            assert ordered.freq.n == orig.freq.n
+        else:
+            assert ordered.freq.n == -1 * orig.freq.n
+
+
+def check_freq_nonmonotonic(ordered, orig):
+    """
+    Check the expected freq on a PeriodIndex/DatetimeIndex/TimedeltaIndex
+    when the original index is _not_ generated (or generate-able) with
+    period_range/date_range/timedelta_range.
+ """ + if isinstance(ordered, PeriodIndex): + assert ordered.freq == orig.freq + elif isinstance(ordered, (DatetimeIndex, TimedeltaIndex)): + assert ordered.freq is None + + +class TestSortValues: + @pytest.fixture(params=[DatetimeIndex, TimedeltaIndex, PeriodIndex]) + def non_monotonic_idx(self, request): + if request.param is DatetimeIndex: + return DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"]) + elif request.param is PeriodIndex: + dti = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"]) + return dti.to_period("D") + else: + return TimedeltaIndex( + ["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"] + ) + + def test_argmin_argmax(self, non_monotonic_idx): + assert non_monotonic_idx.argmin() == 1 + assert non_monotonic_idx.argmax() == 0 + + def test_sort_values(self, non_monotonic_idx): + idx = non_monotonic_idx + ordered = idx.sort_values() + assert ordered.is_monotonic + + ordered = idx.sort_values(ascending=False) + assert ordered[::-1].is_monotonic + + ordered, dexer = idx.sort_values(return_indexer=True) + assert ordered.is_monotonic + tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0], dtype=np.intp)) + + ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) + assert ordered[::-1].is_monotonic + tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1], dtype=np.intp)) + + def check_sort_values_with_freq(self, idx): + ordered = idx.sort_values() + tm.assert_index_equal(ordered, idx) + check_freq_ascending(ordered, idx, True) + + ordered = idx.sort_values(ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + check_freq_ascending(ordered, idx, False) + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, idx) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2], dtype=np.intp)) + check_freq_ascending(ordered, idx, True) + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0], dtype=np.intp)) + check_freq_ascending(ordered, idx, False) + + @pytest.mark.parametrize("freq", ["D", "H"]) + def test_sort_values_with_freq_timedeltaindex(self, freq): + # GH#10295 + idx = timedelta_range(start=f"1{freq}", periods=3, freq=freq).rename("idx") + + self.check_sort_values_with_freq(idx) + + @pytest.mark.parametrize( + "idx", + [ + DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq="D", name="idx" + ), + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="H", + name="tzidx", + tz="Asia/Tokyo", + ), + ], + ) + def test_sort_values_with_freq_datetimeindex(self, idx): + self.check_sort_values_with_freq(idx) + + @pytest.mark.parametrize("freq", ["D", "2D", "4D"]) + def test_sort_values_with_freq_periodindex(self, freq): + # here with_freq refers to being period_range-like + idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq=freq, name="idx" + ) + self.check_sort_values_with_freq(idx) + + @pytest.mark.parametrize( + "idx", + [ + PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A"), + Index([2011, 2012, 2013], name="idx"), # for compatibility check + ], + ) + def test_sort_values_with_freq_periodindex2(self, idx): + # here with_freq indicates this is period_range-like + self.check_sort_values_with_freq(idx) + + def check_sort_values_without_freq(self, idx, expected): + + ordered = idx.sort_values(na_position="first") + tm.assert_index_equal(ordered, expected) + 
check_freq_nonmonotonic(ordered, idx) + + if not idx.isna().any(): + ordered = idx.sort_values() + tm.assert_index_equal(ordered, expected) + check_freq_nonmonotonic(ordered, idx) + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + check_freq_nonmonotonic(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True, na_position="first") + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, exp) + check_freq_nonmonotonic(ordered, idx) + + if not idx.isna().any(): + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, exp) + check_freq_nonmonotonic(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 0, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, exp) + check_freq_nonmonotonic(ordered, idx) + + def test_sort_values_without_freq_timedeltaindex(self): + # GH#10295 + + idx = TimedeltaIndex( + ["1 hour", "3 hour", "5 hour", "2 hour ", "1 hour"], name="idx1" + ) + expected = TimedeltaIndex( + ["1 hour", "1 hour", "2 hour", "3 hour", "5 hour"], name="idx1" + ) + self.check_sort_values_without_freq(idx, expected) + + @pytest.mark.parametrize( + "index_dates,expected_dates", + [ + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ], + ) + def test_sort_values_without_freq_datetimeindex( + self, index_dates, expected_dates, tz_naive_fixture + ): + tz = tz_naive_fixture + + # without freq + idx = DatetimeIndex(index_dates, tz=tz, name="idx") + expected = DatetimeIndex(expected_dates, tz=tz, name="idx") + + self.check_sort_values_without_freq(idx, expected) + + @pytest.mark.parametrize( + "idx,expected", + [ + ( + PeriodIndex( + [ + "2011-01-01", + "2011-01-03", + "2011-01-05", + "2011-01-02", + "2011-01-01", + ], + freq="D", + name="idx1", + ), + PeriodIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-03", + "2011-01-05", + ], + freq="D", + name="idx1", + ), + ), + ( + PeriodIndex( + [ + "2011-01-01", + "2011-01-03", + "2011-01-05", + "2011-01-02", + "2011-01-01", + ], + freq="D", + name="idx2", + ), + PeriodIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-03", + "2011-01-05", + ], + freq="D", + name="idx2", + ), + ), + ( + PeriodIndex( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], + freq="D", + name="idx3", + ), + PeriodIndex( + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx3", + ), + ), + ( + PeriodIndex( + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ), + PeriodIndex( + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ), + ), + ( + # For compatibility check + Index([2011, 2013, 2015, 2012, 2011], name="idx"), + Index([2011, 2011, 2012, 2013, 2015], name="idx"), + ), + ], + ) + def test_sort_values_without_freq_periodindex(self, idx, expected): + # here without_freq means not generateable by period_range + 
self.check_sort_values_without_freq(idx, expected)
+
+    def test_sort_values_without_freq_periodindex_nat(self):
+        # doesn't quite fit into check_sort_values_without_freq
+        idx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")
+        expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D")
+
+        ordered = idx.sort_values(na_position="first")
+        tm.assert_index_equal(ordered, expected)
+        check_freq_nonmonotonic(ordered, idx)
+
+        ordered = idx.sort_values(ascending=False)
+        tm.assert_index_equal(ordered, expected[::-1])
+        check_freq_nonmonotonic(ordered, idx)
+
+
+def test_order_stability_compat():
+    # GH#35922. sort_values is stable both for normal and datetime-like Index
+    pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A")
+    iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
+    ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False)
+    ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False)
+    tm.assert_numpy_array_equal(indexer1, indexer2)
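The freq expectations that check_freq_ascending/check_freq_nonmonotonic encode, shown doctest-style (illustrative sketch, not part of the patch):

    >>> import pandas as pd
    >>> dti = pd.date_range("2011-01-01", periods=3, freq="D")
    >>> dti.sort_values(ascending=False).freq  # range-generated: freq flips sign
    <-1 * Day>
    >>> pd.DatetimeIndex(["2011-01-03", "2011-01-01", "2011-01-02"]).sort_values().freq is None
    True
    >>> pd.PeriodIndex(["2011-02", "2011-01"], freq="M").sort_values().freq  # PeriodIndex keeps freq
    <MonthEnd>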
diff --git a/pandas/tests/indexes/datetimelike_/test_value_counts.py b/pandas/tests/indexes/datetimelike_/test_value_counts.py
new file mode 100644
index 0000000000000..f0df6dd678ef5
--- /dev/null
+++ b/pandas/tests/indexes/datetimelike_/test_value_counts.py
@@ -0,0 +1,103 @@
+import numpy as np
+
+from pandas import (
+    DatetimeIndex,
+    NaT,
+    PeriodIndex,
+    Series,
+    TimedeltaIndex,
+    date_range,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+
+
+class TestValueCounts:
+    # GH#7735
+
+    def test_value_counts_unique_datetimeindex(self, tz_naive_fixture):
+        tz = tz_naive_fixture
+        orig = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz)
+        self._check_value_counts_with_repeats(orig)
+
+    def test_value_counts_unique_timedeltaindex(self):
+        orig = timedelta_range("1 days 09:00:00", freq="H", periods=10)
+        self._check_value_counts_with_repeats(orig)
+
+    def test_value_counts_unique_periodindex(self):
+        orig = period_range("2011-01-01 09:00", freq="H", periods=10)
+        self._check_value_counts_with_repeats(orig)
+
+    def _check_value_counts_with_repeats(self, orig):
+        # create repeated values: the 'n'th element is repeated n+1 times
+        idx = type(orig)(
+            np.repeat(orig._values, range(1, len(orig) + 1)), dtype=orig.dtype
+        )
+
+        exp_idx = orig[::-1]
+        if not isinstance(exp_idx, PeriodIndex):
+            exp_idx = exp_idx._with_freq(None)
+        expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64")
+
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(), expected)
+
+        tm.assert_index_equal(idx.unique(), orig)
+
+    def test_value_counts_unique_datetimeindex2(self, tz_naive_fixture):
+        tz = tz_naive_fixture
+        idx = DatetimeIndex(
+            [
+                "2013-01-01 09:00",
+                "2013-01-01 09:00",
+                "2013-01-01 09:00",
+                "2013-01-01 08:00",
+                "2013-01-01 08:00",
+                NaT,
+            ],
+            tz=tz,
+        )
+        self._check_value_counts_dropna(idx)
+
+    def test_value_counts_unique_timedeltaindex2(self):
+        idx = TimedeltaIndex(
+            [
+                "1 days 09:00:00",
+                "1 days 09:00:00",
+                "1 days 09:00:00",
+                "1 days 08:00:00",
+                "1 days 08:00:00",
+                NaT,
+            ]
+        )
+        self._check_value_counts_dropna(idx)
+
+    def test_value_counts_unique_periodindex2(self):
+        idx = PeriodIndex(
+            [
+                "2013-01-01 09:00",
+                "2013-01-01 09:00",
+                "2013-01-01 09:00",
+                "2013-01-01 08:00",
+                "2013-01-01 08:00",
+                NaT,
+            ],
+            freq="H",
+        )
+        self._check_value_counts_dropna(idx)
+
+    def _check_value_counts_dropna(self, idx):
+        exp_idx = idx[[2, 3]]
+        expected = Series([3, 2], index=exp_idx)
+
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(), expected)
+
+        exp_idx = idx[[2, 3, -1]]
+        expected = Series([3, 2, 1], index=exp_idx)
+
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(dropna=False), expected)
+
+        tm.assert_index_equal(idx.unique(), exp_idx)
diff --git a/pandas/tests/indexes/datetimes/methods/test_repeat.py b/pandas/tests/indexes/datetimes/methods/test_repeat.py
new file mode 100644
index 0000000000000..81768622fd3d5
--- /dev/null
+++ b/pandas/tests/indexes/datetimes/methods/test_repeat.py
@@ -0,0 +1,78 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DatetimeIndex,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+
+
+class TestRepeat:
+    def test_repeat_range(self, tz_naive_fixture):
+        tz = tz_naive_fixture
+        rng = date_range("1/1/2000", "1/1/2001")
+
+        result = rng.repeat(5)
+        assert result.freq is None
+        assert len(result) == 5 * len(rng)
+
+        index = date_range("2001-01-01", periods=2, freq="D", tz=tz)
+        exp = DatetimeIndex(
+            ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz
+        )
+        for res in [index.repeat(2), np.repeat(index, 2)]:
+            tm.assert_index_equal(res, exp)
+            assert res.freq is None
+
+        index = date_range("2001-01-01", periods=2, freq="2D", tz=tz)
+        exp = DatetimeIndex(
+            ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz
+        )
+        for res in [index.repeat(2), np.repeat(index, 2)]:
+            tm.assert_index_equal(res, exp)
+            assert res.freq is None
+
+        index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz)
+        exp = DatetimeIndex(
+            [
+                "2001-01-01",
+                "2001-01-01",
+                "2001-01-01",
+                "NaT",
+                "NaT",
+                "NaT",
+                "2003-01-01",
+                "2003-01-01",
+                "2003-01-01",
+            ],
+            tz=tz,
+        )
+        for res in [index.repeat(3), np.repeat(index, 3)]:
+            tm.assert_index_equal(res, exp)
+            assert res.freq is None
+
+    def test_repeat(self, tz_naive_fixture):
+        tz = tz_naive_fixture
+        reps = 2
+        msg = "the 'axis' parameter is not supported"
+
+        rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz)
+
+        expected_rng = DatetimeIndex(
+            [
+                Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"),
+                Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"),
+                Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"),
+                Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"),
+            ]
+        )
+
+        res = rng.repeat(reps)
+        tm.assert_index_equal(res, expected_rng)
+        assert res.freq is None
+
+        tm.assert_index_equal(np.repeat(rng, reps), expected_rng)
+        with pytest.raises(ValueError, match=msg):
+            np.repeat(rng, reps, axis=1)
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index e03de3c75704a..17b80fbc0afc2 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -147,28 +147,6 @@ def test_string_index_series_name_converted(self):
         result = df.T["1/3/2000"]
         assert result.name == df.index[2]
 
-    def test_argmin_argmax(self):
-        idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"])
-        assert idx.argmin() == 1
-        assert idx.argmax() == 0
-
-    def test_sort_values(self):
-        idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"])
-
-        ordered = idx.sort_values()
-        assert ordered.is_monotonic
-
-        ordered = idx.sort_values(ascending=False)
-        assert ordered[::-1].is_monotonic
-
-        ordered, dexer = idx.sort_values(return_indexer=True)
-        assert ordered.is_monotonic
-        tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0], dtype=np.intp))
-
-        ordered, dexer = idx.sort_values(return_indexer=True, 
ascending=False) - assert ordered[::-1].is_monotonic - tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1], dtype=np.intp)) - def test_groupby_function_tuple_1677(self): df = DataFrame(np.random.rand(100), index=date_range("1/1/2000", periods=100)) monthly_group = df.groupby(lambda x: (x.year, x.month)) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index c65d9098a86a4..d29d4647f4753 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -551,6 +551,13 @@ def test_get_loc_reasonable_key_error(self): with pytest.raises(KeyError, match="2000"): index.get_loc("1/1/2000") + def test_get_loc_year_str(self): + rng = date_range("1/1/2000", "1/1/2010") + + result = rng.get_loc("2009") + expected = slice(3288, 3653) + assert result == expected + class TestContains: def test_dti_contains_with_duplicates(self): diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index d230aa43e43d1..eff87a2b3f275 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -37,6 +37,8 @@ def test_range_edges(self): ) tm.assert_index_equal(idx, exp) + def test_range_edges2(self): + idx = date_range( start=Timestamp("1970-01-01 00:00:00.000000004"), end=Timestamp("1970-01-01 00:00:00.000000001"), @@ -45,6 +47,8 @@ def test_range_edges(self): exp = DatetimeIndex([], freq="N") tm.assert_index_equal(idx, exp) + def test_range_edges3(self): + idx = date_range( start=Timestamp("1970-01-01 00:00:00.000000001"), end=Timestamp("1970-01-01 00:00:00.000000001"), @@ -53,6 +57,8 @@ def test_range_edges(self): exp = DatetimeIndex(["1970-01-01 00:00:00.000000001"], freq="N") tm.assert_index_equal(idx, exp) + def test_range_edges4(self): + idx = date_range( start=Timestamp("1970-01-01 00:00:00.000001"), end=Timestamp("1970-01-01 00:00:00.000004"), @@ -69,6 +75,8 @@ def test_range_edges(self): ) tm.assert_index_equal(idx, exp) + def test_range_edges5(self): + idx = date_range( start=Timestamp("1970-01-01 00:00:00.001"), end=Timestamp("1970-01-01 00:00:00.004"), @@ -85,6 +93,7 @@ def test_range_edges(self): ) tm.assert_index_equal(idx, exp) + def test_range_edges6(self): idx = date_range( start=Timestamp("1970-01-01 00:00:01"), end=Timestamp("1970-01-01 00:00:04"), @@ -101,6 +110,7 @@ def test_range_edges(self): ) tm.assert_index_equal(idx, exp) + def test_range_edges7(self): idx = date_range( start=Timestamp("1970-01-01 00:01"), end=Timestamp("1970-01-01 00:04"), @@ -117,6 +127,7 @@ def test_range_edges(self): ) tm.assert_index_equal(idx, exp) + def test_range_edges8(self): idx = date_range( start=Timestamp("1970-01-01 01:00"), end=Timestamp("1970-01-01 04:00"), @@ -133,6 +144,7 @@ def test_range_edges(self): ) tm.assert_index_equal(idx, exp) + def test_range_edges9(self): idx = date_range( start=Timestamp("1970-01-01"), end=Timestamp("1970-01-04"), freq="D" ) @@ -234,6 +246,7 @@ def test_datetimeindex_accessors(self): exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name") tm.assert_index_equal(res, exp) + def test_datetimeindex_accessors2(self): dti = date_range(freq="BQ-FEB", start=datetime(1998, 1, 1), periods=4) assert sum(dti.is_quarter_start) == 0 @@ -241,6 +254,7 @@ def test_datetimeindex_accessors(self): assert sum(dti.is_year_start) == 0 assert sum(dti.is_year_end) == 1 + def test_datetimeindex_accessors3(self): # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, bday_egypt = 
offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) @@ -248,10 +262,12 @@ def test_datetimeindex_accessors(self): with pytest.raises(ValueError, match=msg): dti.is_month_start + def test_datetimeindex_accessors4(self): dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) assert dti.is_month_start[0] == 1 + def test_datetimeindex_accessors5(self): tests = [ (Timestamp("2013-06-01", freq="M").is_month_start, 1), (Timestamp("2013-06-01", freq="BM").is_month_start, 0), @@ -290,6 +306,7 @@ def test_datetimeindex_accessors(self): for ts, value in tests: assert ts == value + def test_datetimeindex_accessors6(self): # GH 6538: Check that DatetimeIndex and its TimeStamp elements # return the same weekofyear accessor close to new year w/ tz dates = ["2013/12/29", "2013/12/30", "2013/12/31"] diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 676c0ee99ef7c..7df94b5820e5d 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -1,18 +1,15 @@ from datetime import datetime from dateutil.tz import tzlocal -import numpy as np import pytest from pandas.compat import IS64 -import pandas as pd from pandas import ( DateOffset, DatetimeIndex, Index, Series, - Timestamp, bdate_range, date_range, ) @@ -46,73 +43,6 @@ def test_ops_properties_basic(self, datetime_series): with pytest.raises(AttributeError, match=msg): s.weekday - def test_repeat_range(self, tz_naive_fixture): - tz = tz_naive_fixture - rng = date_range("1/1/2000", "1/1/2001") - - result = rng.repeat(5) - assert result.freq is None - assert len(result) == 5 * len(rng) - - index = date_range("2001-01-01", periods=2, freq="D", tz=tz) - exp = DatetimeIndex( - ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz - ) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - index = date_range("2001-01-01", periods=2, freq="2D", tz=tz) - exp = DatetimeIndex( - ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz - ) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz) - exp = DatetimeIndex( - [ - "2001-01-01", - "2001-01-01", - "2001-01-01", - "NaT", - "NaT", - "NaT", - "2003-01-01", - "2003-01-01", - "2003-01-01", - ], - tz=tz, - ) - for res in [index.repeat(3), np.repeat(index, 3)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - def test_repeat(self, tz_naive_fixture): - tz = tz_naive_fixture - reps = 2 - msg = "the 'axis' parameter is not supported" - - rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz) - - expected_rng = DatetimeIndex( - [ - Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), - Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), - Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), - Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), - ] - ) - - res = rng.repeat(reps) - tm.assert_index_equal(res, expected_rng) - assert res.freq is None - - tm.assert_index_equal(np.repeat(rng, reps), expected_rng) - with pytest.raises(ValueError, match=msg): - np.repeat(rng, reps, axis=1) - @pytest.mark.parametrize( "freq,expected", [ @@ -137,174 +67,6 @@ def test_resolution(self, request, tz_naive_fixture, freq, expected): idx = date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) assert idx.resolution == 
expected - def test_value_counts_unique(self, tz_naive_fixture): - tz = tz_naive_fixture - # GH 7735 - idx = date_range("2011-01-01 09:00", freq="H", periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) - - exp_idx = date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") - expected.index = expected.index._with_freq(None) - - for obj in [idx, Series(idx)]: - - tm.assert_series_equal(obj.value_counts(), expected) - - expected = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) - expected = expected._with_freq(None) - tm.assert_index_equal(idx.unique(), expected) - - idx = DatetimeIndex( - [ - "2013-01-01 09:00", - "2013-01-01 09:00", - "2013-01-01 09:00", - "2013-01-01 08:00", - "2013-01-01 08:00", - pd.NaT, - ], - tz=tz, - ) - - exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz) - expected = Series([3, 2], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00", pd.NaT], tz=tz) - expected = Series([3, 2, 1], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - - @pytest.mark.parametrize( - "idx", - [ - DatetimeIndex( - ["2011-01-01", "2011-01-02", "2011-01-03"], freq="D", name="idx" - ), - DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", - name="tzidx", - tz="Asia/Tokyo", - ), - ], - ) - def test_order_with_freq(self, idx): - ordered = idx.sort_values() - tm.assert_index_equal(ordered, idx) - assert ordered.freq == idx.freq - - ordered = idx.sort_values(ascending=False) - expected = idx[::-1] - tm.assert_index_equal(ordered, expected) - assert ordered.freq == expected.freq - assert ordered.freq.n == -1 - - ordered, indexer = idx.sort_values(return_indexer=True) - tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) - assert ordered.freq == idx.freq - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - expected = idx[::-1] - tm.assert_index_equal(ordered, expected) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) - assert ordered.freq == expected.freq - assert ordered.freq.n == -1 - - @pytest.mark.parametrize( - "index_dates,expected_dates", - [ - ( - ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], - ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], - ), - ( - ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], - ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], - ), - ( - [pd.NaT, "2011-01-03", "2011-01-05", "2011-01-02", pd.NaT], - [pd.NaT, pd.NaT, "2011-01-02", "2011-01-03", "2011-01-05"], - ), - ], - ) - def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture): - tz = tz_naive_fixture - - # without freq - index = DatetimeIndex(index_dates, tz=tz, name="idx") - expected = DatetimeIndex(expected_dates, tz=tz, name="idx") - - ordered = index.sort_values(na_position="first") - tm.assert_index_equal(ordered, expected) - assert ordered.freq is None - - ordered = index.sort_values(ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - assert ordered.freq is None - - ordered, indexer 
= index.sort_values(return_indexer=True, na_position="first") - tm.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq is None - - ordered, indexer = index.sort_values(return_indexer=True, ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 0, 4]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq is None - - def test_drop_duplicates_metadata(self, freq_sample): - # GH 10115 - idx = date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") - result = idx.drop_duplicates() - tm.assert_index_equal(idx, result) - assert idx.freq == result.freq - - idx_dup = idx.append(idx) - assert idx_dup.freq is None # freq is reset - result = idx_dup.drop_duplicates() - expected = idx._with_freq(None) - tm.assert_index_equal(result, expected) - assert result.freq is None - - @pytest.mark.parametrize( - "keep, expected, index", - [ - ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), - ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), - ( - False, - np.concatenate(([True] * 5, [False] * 5, [True] * 5)), - np.arange(5, 10), - ), - ], - ) - def test_drop_duplicates(self, freq_sample, keep, expected, index): - # to check Index/Series compat - idx = date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") - idx = idx.append(idx[:5]) - - tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) - expected = idx[~expected] - - result = idx.drop_duplicates(keep=keep) - tm.assert_index_equal(result, expected) - - result = Series(idx).drop_duplicates(keep=keep) - tm.assert_series_equal(result, Series(expected, index=index)) - def test_infer_freq(self, freq_sample): # GH 11018 idx = date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10) @@ -312,22 +74,6 @@ def test_infer_freq(self, freq_sample): tm.assert_index_equal(idx, result) assert result.freq == freq_sample - def test_nat(self, tz_naive_fixture): - tz = tz_naive_fixture - assert DatetimeIndex._na_value is pd.NaT - assert DatetimeIndex([])._na_value is pd.NaT - - idx = DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) - assert idx._can_hold_na - - assert idx.hasnans is False - - idx = DatetimeIndex(["2011-01-01", "NaT"], tz=tz) - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - assert idx.hasnans is True - @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 05ee67eee0da5..882515799f943 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -55,12 +55,6 @@ def test_slice_year(self): expected = df[df.index.year == 2005] tm.assert_frame_equal(result, expected) - rng = date_range("1/1/2000", "1/1/2010") - - result = rng.get_loc("2009") - expected = slice(3288, 3653) - assert result == expected - @pytest.mark.parametrize( "partial_dtime", [ diff --git a/pandas/tests/indexes/period/methods/test_is_full.py b/pandas/tests/indexes/period/methods/test_is_full.py new file mode 100644 index 0000000000000..490f199a59ed7 --- /dev/null +++ b/pandas/tests/indexes/period/methods/test_is_full.py @@ -0,0 +1,23 @@ 
+import pytest + +from pandas import PeriodIndex + + +def test_is_full(): + index = PeriodIndex([2005, 2007, 2009], freq="A") + assert not index.is_full + + index = PeriodIndex([2005, 2006, 2007], freq="A") + assert index.is_full + + index = PeriodIndex([2005, 2005, 2007], freq="A") + assert not index.is_full + + index = PeriodIndex([2005, 2005, 2006], freq="A") + assert index.is_full + + index = PeriodIndex([2006, 2005, 2005], freq="A") + with pytest.raises(ValueError, match="Index is not monotonic"): + index.is_full + + assert index[:0].is_full diff --git a/pandas/tests/indexes/period/methods/test_repeat.py b/pandas/tests/indexes/period/methods/test_repeat.py new file mode 100644 index 0000000000000..fc344b06420d1 --- /dev/null +++ b/pandas/tests/indexes/period/methods/test_repeat.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest + +from pandas import ( + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestRepeat: + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize( + "index", + [ + period_range("2000-01-01", periods=3, freq="D"), + period_range("2001-01-01", periods=3, freq="2D"), + PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), + ], + ) + def test_repeat_freqstr(self, index, use_numpy): + # GH#10183 + expected = PeriodIndex([per for per in index for _ in range(3)]) + result = np.repeat(index, 3) if use_numpy else index.repeat(3) + tm.assert_index_equal(result, expected) + assert result.freqstr == index.freqstr diff --git a/pandas/tests/indexes/period/test_join.py b/pandas/tests/indexes/period/test_join.py index 2f16daa36d1fd..aa2393aceee52 100644 --- a/pandas/tests/indexes/period/test_join.py +++ b/pandas/tests/indexes/period/test_join.py @@ -16,7 +16,7 @@ def test_join_outer_indexer(self): pi = period_range("1/1/2000", "1/20/2000", freq="D") result = pi._outer_indexer(pi._values, pi._values) - tm.assert_numpy_array_equal(result[0], pi.asi8) + tm.assert_extension_array_equal(result[0], pi._values) tm.assert_numpy_array_equal(result[1], np.arange(len(pi), dtype=np.int64)) tm.assert_numpy_array_equal(result[2], np.arange(len(pi), dtype=np.int64)) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 52f8de27cb6c6..9ebe44fb16c8d 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,13 +1,6 @@ -import numpy as np import pytest import pandas as pd -from pandas import ( - Index, - NaT, - PeriodIndex, - Series, -) import pandas._testing as tm @@ -30,266 +23,6 @@ def test_resolution(self, freq, expected): idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) assert idx.resolution == expected - def test_value_counts_unique(self): - # GH 7735 - idx = pd.period_range("2011-01-01 09:00", freq="H", periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)), freq="H") - - exp_idx = PeriodIndex( - [ - "2011-01-01 18:00", - "2011-01-01 17:00", - "2011-01-01 16:00", - "2011-01-01 15:00", - "2011-01-01 14:00", - "2011-01-01 13:00", - "2011-01-01 12:00", - "2011-01-01 11:00", - "2011-01-01 10:00", - "2011-01-01 09:00", - ], - freq="H", - ) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - expected = pd.period_range("2011-01-01 09:00", freq="H", periods=10) - tm.assert_index_equal(idx.unique(), expected) - - idx = PeriodIndex( - [ - 
"2013-01-01 09:00", - "2013-01-01 09:00", - "2013-01-01 09:00", - "2013-01-01 08:00", - "2013-01-01 08:00", - NaT, - ], - freq="H", - ) - - exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00"], freq="H") - expected = Series([3, 2], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00", NaT], freq="H") - expected = Series([3, 2, 1], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - - @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) - def test_drop_duplicates_metadata(self, freq): - # GH 10115 - idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") - result = idx.drop_duplicates() - tm.assert_index_equal(idx, result) - assert idx.freq == result.freq - - idx_dup = idx.append(idx) # freq will not be reset - result = idx_dup.drop_duplicates() - tm.assert_index_equal(idx, result) - assert idx.freq == result.freq - - @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) - @pytest.mark.parametrize( - "keep, expected, index", - [ - ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), - ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), - ( - False, - np.concatenate(([True] * 5, [False] * 5, [True] * 5)), - np.arange(5, 10), - ), - ], - ) - def test_drop_duplicates(self, freq, keep, expected, index): - # to check Index/Series compat - idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") - idx = idx.append(idx[:5]) - - tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) - expected = idx[~expected] - - result = idx.drop_duplicates(keep=keep) - tm.assert_index_equal(result, expected) - - result = Series(idx).drop_duplicates(keep=keep) - tm.assert_series_equal(result, Series(expected, index=index)) - - def test_order_compat(self): - def _check_freq(index, expected_index): - if isinstance(index, PeriodIndex): - assert index.freq == expected_index.freq - - pidx = PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A") - # for compatibility check - iidx = Index([2011, 2012, 2013], name="idx") - for idx in [pidx, iidx]: - ordered = idx.sort_values() - tm.assert_index_equal(ordered, idx) - _check_freq(ordered, idx) - - ordered = idx.sort_values(ascending=False) - tm.assert_index_equal(ordered, idx[::-1]) - _check_freq(ordered, idx[::-1]) - - ordered, indexer = idx.sort_values(return_indexer=True) - tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) - _check_freq(ordered, idx) - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - tm.assert_index_equal(ordered, idx[::-1]) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) - _check_freq(ordered, idx[::-1]) - - pidx = PeriodIndex( - ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" - ) - pexpected = PeriodIndex( - ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" - ) - # for compatibility check - iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") - iexpected = Index([2011, 2011, 2012, 2013, 2015], name="idx") - for idx, expected in [(pidx, pexpected), (iidx, iexpected)]: - ordered = idx.sort_values() - tm.assert_index_equal(ordered, expected) - _check_freq(ordered, idx) - - ordered = idx.sort_values(ascending=False) - 
tm.assert_index_equal(ordered, expected[::-1]) - _check_freq(ordered, idx) - - ordered, indexer = idx.sort_values(return_indexer=True) - tm.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - _check_freq(ordered, idx) - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - _check_freq(ordered, idx) - - pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") - - result = pidx.sort_values(na_position="first") - expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") - tm.assert_index_equal(result, expected) - assert result.freq == "D" - - result = pidx.sort_values(ascending=False) - expected = PeriodIndex(["2013", "2011", "2011", "NaT"], name="pidx", freq="D") - tm.assert_index_equal(result, expected) - assert result.freq == "D" - - def test_order(self): - for freq in ["D", "2D", "4D"]: - idx = PeriodIndex( - ["2011-01-01", "2011-01-02", "2011-01-03"], freq=freq, name="idx" - ) - - ordered = idx.sort_values() - tm.assert_index_equal(ordered, idx) - assert ordered.freq == idx.freq - - ordered = idx.sort_values(ascending=False) - expected = idx[::-1] - tm.assert_index_equal(ordered, expected) - assert ordered.freq == expected.freq - assert ordered.freq == freq - - ordered, indexer = idx.sort_values(return_indexer=True) - tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) - assert ordered.freq == idx.freq - assert ordered.freq == freq - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - expected = idx[::-1] - tm.assert_index_equal(ordered, expected) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) - assert ordered.freq == expected.freq - assert ordered.freq == freq - - idx1 = PeriodIndex( - ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], - freq="D", - name="idx1", - ) - exp1 = PeriodIndex( - ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], - freq="D", - name="idx1", - ) - - idx2 = PeriodIndex( - ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], - freq="D", - name="idx2", - ) - exp2 = PeriodIndex( - ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], - freq="D", - name="idx2", - ) - - idx3 = PeriodIndex( - [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], freq="D", name="idx3" - ) - exp3 = PeriodIndex( - [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], freq="D", name="idx3" - ) - - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: - ordered = idx.sort_values(na_position="first") - tm.assert_index_equal(ordered, expected) - assert ordered.freq == "D" - - ordered = idx.sort_values(ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - assert ordered.freq == "D" - - ordered, indexer = idx.sort_values(return_indexer=True, na_position="first") - tm.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq == "D" - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 0, 4]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq == "D" - - def test_nat(self): - assert PeriodIndex._na_value is NaT - assert PeriodIndex([], 
freq="M")._na_value is NaT - - idx = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) - assert idx.hasnans is False - - idx = PeriodIndex(["2011-01-01", "NaT"], freq="D") - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - assert idx.hasnans is True - def test_freq_setter_deprecated(self): # GH 20678 idx = pd.period_range("2018Q1", periods=4, freq="Q") @@ -301,12 +34,3 @@ def test_freq_setter_deprecated(self): # warning for setter with pytest.raises(AttributeError, match="can't set attribute"): idx.freq = pd.offsets.Day() - - -def test_order_stability_compat(): - # GH 35922. sort_values is stable both for normal and datetime-like Index - pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") - iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") - ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) - ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) - tm.assert_numpy_array_equal(indexer1, indexer2) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index aabc837e25b4b..032b376f6d6a9 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -3,9 +3,7 @@ from pandas._libs.tslibs.period import IncompatibleFrequency -import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, NaT, @@ -49,22 +47,6 @@ def test_where(self): # This is handled in test_indexing pass - @pytest.mark.parametrize("use_numpy", [True, False]) - @pytest.mark.parametrize( - "index", - [ - period_range("2000-01-01", periods=3, freq="D"), - period_range("2001-01-01", periods=3, freq="2D"), - PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), - ], - ) - def test_repeat_freqstr(self, index, use_numpy): - # GH10183 - expected = PeriodIndex([p for p in index for _ in range(3)]) - result = np.repeat(index, 3) if use_numpy else index.repeat(3) - tm.assert_index_equal(result, expected) - assert result.freqstr == index.freqstr - def test_no_millisecond_field(self): msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" with pytest.raises(AttributeError, match=msg): @@ -271,14 +253,6 @@ def test_is_(self): assert not index.is_(index - 2) assert not index.is_(index - 0) - def test_periods_number_check(self): - msg = ( - "Of the three parameters: start, end, and periods, exactly two " - "must be specified" - ) - with pytest.raises(ValueError, match=msg): - period_range("2011-1-1", "2012-1-1", "B") - def test_index_duplicate_periods(self): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") @@ -355,25 +329,6 @@ def test_iteration(self): assert isinstance(result[0], Period) assert result[0].freq == index.freq - def test_is_full(self): - index = PeriodIndex([2005, 2007, 2009], freq="A") - assert not index.is_full - - index = PeriodIndex([2005, 2006, 2007], freq="A") - assert index.is_full - - index = PeriodIndex([2005, 2005, 2007], freq="A") - assert not index.is_full - - index = PeriodIndex([2005, 2005, 2006], freq="A") - assert index.is_full - - index = PeriodIndex([2006, 2005, 2005], freq="A") - with pytest.raises(ValueError, match="Index is not monotonic"): - index.is_full - - assert index[:0].is_full - def test_with_multi_index(self): # #1705 index = date_range("1/1/2012", periods=4, freq="12H") @@ -385,29 +340,6 @@ def test_with_multi_index(self): assert 
isinstance(s.index.values[0][0], Period) - def test_convert_array_of_periods(self): - rng = period_range("1/1/2000", periods=20, freq="D") - periods = list(rng) - - result = Index(periods) - assert isinstance(result, PeriodIndex) - - def test_append_concat(self): # TODO: pd.concat test - # #1815 - d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") - d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC") - - s1 = Series(np.random.randn(10), d1) - s2 = Series(np.random.randn(10), d2) - - s1 = s1.to_period() - s2 = s2.to_period() - - # drops index - result = pd.concat([s1, s2]) - assert isinstance(result.index, PeriodIndex) - assert result.index[0] == s1.index[0] - def test_pickle_freq(self): # GH2891 prng = period_range("1/1/2011", "1/1/2012", freq="M") @@ -423,44 +355,6 @@ def test_map(self): exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) - @pytest.mark.parametrize( - "msg, key", - [ - (r"Period\('2019', 'A-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), - (r"Period\('2019', 'A-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), - (r"Period\('2019', 'A-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), - ( - r"Period\('2018', 'A-DEC'\), Period\('2016', 'A-DEC'\), 'bar'", - (Period(2018), Period(2016), "bar"), - ), - (r"Period\('2018', 'A-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), - ( - r"Period\('2017', 'A-DEC'\), 'foo', Period\('2015', 'A-DEC'\)", - (Period(2017), "foo", Period(2015)), - ), - (r"Period\('2017', 'A-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), - ], - ) - def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): - # issue 20684 - """ - parse_time_string return parameter if type not matched. - PeriodIndex.get_loc takes returned value from parse_time_string as a tuple. - If first argument is Period and a tuple has 3 items, - process go on not raise exception - """ - df = DataFrame( - { - "A": [Period(2019), "x1", "x2"], - "B": [Period(2018), Period(2016), "y1"], - "C": [Period(2017), "z1", Period(2015)], - "V1": [1, 2, 3], - "V2": [10, 20, 30], - } - ).set_index(["A", "B", "C"]) - with pytest.raises(KeyError, match=msg): - df.loc[key] - def test_format_empty(self): # GH35712 empty_idx = self._holder([], freq="A") diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index a5be19731b54a..c94ddf57c0ee1 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -12,6 +12,14 @@ class TestPeriodRange: + def test_required_arguments(self): + msg = ( + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range("2011-1-1", "2012-1-1", "B") + @pytest.mark.parametrize("freq", ["D", "W", "M", "Q", "A"]) def test_construction_from_string(self, freq): # non-empty diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 97fe35bb7f2c9..5cf0134795b74 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -175,7 +175,7 @@ def test_get_unique_index(self, index_flat): vals = index[[0] * 5]._data vals[0] = pd.NaT elif needs_i8_conversion(index.dtype): - vals = index.asi8[[0] * 5] + vals = index._data._ndarray[[0] * 5] vals[0] = iNaT else: vals = index.values[[0] * 5] @@ -184,7 +184,7 @@ def test_get_unique_index(self, index_flat): vals_unique = vals[:2] if index.dtype.kind in ["m", "M"]: # i.e. 
needs_i8_conversion but not period_dtype, as above - vals = type(index._data)._simple_new(vals, dtype=index.dtype) + vals = type(index._data)(vals, dtype=index.dtype) vals_unique = type(index._data)._simple_new(vals_unique, dtype=index.dtype) idx_nan = index._shallow_copy(vals) idx_unique_nan = index._shallow_copy(vals_unique) diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 4fba4b13835b3..5937f43102190 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -80,6 +80,13 @@ def test_constructor_infer_periodindex(self): tm.assert_index_equal(rs, xp) assert isinstance(rs, PeriodIndex) + def test_from_list_of_periods(self): + rng = period_range("1/1/2000", periods=20, freq="D") + periods = list(rng) + + result = Index(periods) + assert isinstance(result, PeriodIndex) + @pytest.mark.parametrize("pos", [0, 1]) @pytest.mark.parametrize( "klass,dtype,ctor", diff --git a/pandas/tests/indexes/timedeltas/methods/test_repeat.py b/pandas/tests/indexes/timedeltas/methods/test_repeat.py new file mode 100644 index 0000000000000..2a9b58d1bf322 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/methods/test_repeat.py @@ -0,0 +1,34 @@ +import numpy as np + +from pandas import ( + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestRepeat: + def test_repeat(self): + index = timedelta_range("1 days", periods=2, freq="D") + exp = TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = TimedeltaIndex(["1 days", "NaT", "3 days"]) + exp = TimedeltaIndex( + [ + "1 days", + "1 days", + "1 days", + "NaT", + "NaT", + "NaT", + "3 days", + "3 days", + "3 days", + ] + ) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 7acfb50fe944b..5f0101eb4478c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -7,13 +7,15 @@ import numpy as np import pytest -import pandas as pd from pandas import ( Index, + NaT, Timedelta, TimedeltaIndex, + Timestamp, notna, timedelta_range, + to_timedelta, ) import pandas._testing as tm @@ -64,10 +66,10 @@ def test_getitem(self): @pytest.mark.parametrize( "key", [ - pd.Timestamp("1970-01-01"), - pd.Timestamp("1970-01-02"), + Timestamp("1970-01-01"), + Timestamp("1970-01-02"), datetime(1970, 1, 1), - pd.Timestamp("1970-01-03").to_datetime64(), + Timestamp("1970-01-03").to_datetime64(), # non-matching NA values np.datetime64("NaT"), ], @@ -81,7 +83,7 @@ def test_timestamp_invalid_key(self, key): class TestGetLoc: def test_get_loc(self): - idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + idx = to_timedelta(["0 days", "1 days", "2 days"]) for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 @@ -117,7 +119,7 @@ def test_get_loc(self): def test_get_loc_nat(self): tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) - assert tidx.get_loc(pd.NaT) == 1 + assert tidx.get_loc(NaT) == 1 assert tidx.get_loc(None) == 1 assert tidx.get_loc(float("nan")) == 1 assert tidx.get_loc(np.nan) == 1 @@ -125,12 +127,12 @@ def test_get_loc_nat(self): class TestGetIndexer: def test_get_indexer(self): - idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + idx = 
to_timedelta(["0 days", "1 days", "2 days"]) tm.assert_numpy_array_equal( idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) ) - target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + target = to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) tm.assert_numpy_array_equal( idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) ) @@ -158,25 +160,25 @@ def test_where_invalid_dtypes(self): tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") tail = tdi[2:].tolist() - i2 = Index([pd.NaT, pd.NaT] + tail) + i2 = Index([NaT, NaT] + tail) mask = notna(i2) - expected = Index([pd.NaT.value, pd.NaT.value] + tail, dtype=object, name="idx") + expected = Index([NaT.value, NaT.value] + tail, dtype=object, name="idx") assert isinstance(expected[0], int) result = tdi.where(mask, i2.asi8) tm.assert_index_equal(result, expected) - ts = i2 + pd.Timestamp.now() + ts = i2 + Timestamp.now() expected = Index([ts[0], ts[1]] + tail, dtype=object, name="idx") result = tdi.where(mask, ts) tm.assert_index_equal(result, expected) - per = (i2 + pd.Timestamp.now()).to_period("D") + per = (i2 + Timestamp.now()).to_period("D") expected = Index([per[0], per[1]] + tail, dtype=object, name="idx") result = tdi.where(mask, per) tm.assert_index_equal(result, expected) - ts = pd.Timestamp.now() + ts = Timestamp.now() expected = Index([ts, ts] + tail, dtype=object, name="idx") result = tdi.where(mask, ts) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 4e6d69913900d..2a5051b2982bb 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( Series, TimedeltaIndex, @@ -17,50 +16,6 @@ class TestTimedeltaIndexOps: - def test_value_counts_unique(self): - # GH 7735 - idx = timedelta_range("1 days 09:00:00", freq="H", periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) - - exp_idx = timedelta_range("1 days 18:00:00", freq="-1H", periods=10) - exp_idx = exp_idx._with_freq(None) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") - - obj = idx - tm.assert_series_equal(obj.value_counts(), expected) - - obj = Series(idx) - tm.assert_series_equal(obj.value_counts(), expected) - - expected = timedelta_range("1 days 09:00:00", freq="H", periods=10) - tm.assert_index_equal(idx.unique(), expected) - - idx = TimedeltaIndex( - [ - "1 days 09:00:00", - "1 days 09:00:00", - "1 days 09:00:00", - "1 days 08:00:00", - "1 days 08:00:00", - pd.NaT, - ] - ) - - exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00"]) - expected = Series([3, 2], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) - - exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00", pd.NaT]) - expected = Series([3, 2, 1], index=exp_idx) - - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - def test_nonunique_contains(self): # GH 9512 for idx in map( @@ -84,106 +39,6 @@ def test_unknown_attribute(self): with pytest.raises(AttributeError, match=msg): ts.foo - def test_order(self): - # GH 10295 - idx1 = TimedeltaIndex(["1 day", "2 day", "3 day"], freq="D", name="idx") - idx2 = TimedeltaIndex(["1 hour", "2 hour", "3 hour"], freq="H", 
name="idx") - - for idx in [idx1, idx2]: - ordered = idx.sort_values() - tm.assert_index_equal(ordered, idx) - assert ordered.freq == idx.freq - - ordered = idx.sort_values(ascending=False) - expected = idx[::-1] - tm.assert_index_equal(ordered, expected) - assert ordered.freq == expected.freq - assert ordered.freq.n == -1 - - ordered, indexer = idx.sort_values(return_indexer=True) - tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) - assert ordered.freq == idx.freq - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - tm.assert_index_equal(ordered, idx[::-1]) - assert ordered.freq == expected.freq - assert ordered.freq.n == -1 - - idx1 = TimedeltaIndex( - ["1 hour", "3 hour", "5 hour", "2 hour ", "1 hour"], name="idx1" - ) - exp1 = TimedeltaIndex( - ["1 hour", "1 hour", "2 hour", "3 hour", "5 hour"], name="idx1" - ) - - idx2 = TimedeltaIndex( - ["1 day", "3 day", "5 day", "2 day", "1 day"], name="idx2" - ) - - for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: - ordered = idx.sort_values() - tm.assert_index_equal(ordered, expected) - assert ordered.freq is None - - ordered = idx.sort_values(ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - assert ordered.freq is None - - ordered, indexer = idx.sort_values(return_indexer=True) - tm.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq is None - - ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) - tm.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 0, 4]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq is None - - def test_drop_duplicates_metadata(self, freq_sample): - # GH 10115 - idx = timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") - result = idx.drop_duplicates() - tm.assert_index_equal(idx, result) - assert idx.freq == result.freq - - idx_dup = idx.append(idx) - assert idx_dup.freq is None # freq is reset - result = idx_dup.drop_duplicates() - expected = idx._with_freq(None) - tm.assert_index_equal(expected, result) - assert result.freq is None - - @pytest.mark.parametrize( - "keep, expected, index", - [ - ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), - ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), - ( - False, - np.concatenate(([True] * 5, [False] * 5, [True] * 5)), - np.arange(5, 10), - ), - ], - ) - def test_drop_duplicates(self, freq_sample, keep, expected, index): - # to check Index/Series compat - idx = timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") - idx = idx.append(idx[:5]) - - tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) - expected = idx[~expected] - - result = idx.drop_duplicates(keep=keep) - tm.assert_index_equal(result, expected) - - result = Series(idx).drop_duplicates(keep=keep) - tm.assert_series_equal(result, Series(expected, index=index)) - def test_infer_freq(self, freq_sample): # GH#11018 idx = timedelta_range("1", freq=freq_sample, periods=10) @@ -191,47 +46,6 @@ def test_infer_freq(self, freq_sample): tm.assert_index_equal(idx, result) assert result.freq == freq_sample - def test_repeat(self): - index = timedelta_range("1 days", periods=2, freq="D") - exp = TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) 
- assert res.freq is None - - index = TimedeltaIndex(["1 days", "NaT", "3 days"]) - exp = TimedeltaIndex( - [ - "1 days", - "1 days", - "1 days", - "NaT", - "NaT", - "NaT", - "3 days", - "3 days", - "3 days", - ] - ) - for res in [index.repeat(3), np.repeat(index, 3)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - def test_nat(self): - assert TimedeltaIndex._na_value is pd.NaT - assert TimedeltaIndex([])._na_value is pd.NaT - - idx = TimedeltaIndex(["1 days", "2 days"]) - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) - assert idx.hasnans is False - - idx = TimedeltaIndex(["1 days", "NaT"]) - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - assert idx.hasnans is True - @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) def test_freq_setter(self, values, freq): diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py deleted file mode 100644 index cca211c1eb155..0000000000000 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ /dev/null @@ -1,42 +0,0 @@ -import numpy as np - -from pandas import ( - Series, - timedelta_range, -) -import pandas._testing as tm - - -class TestSlicing: - def test_partial_slice(self): - rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) - s = Series(np.arange(len(rng)), index=rng) - - result = s["5 day":"6 day"] - expected = s.iloc[86:134] - tm.assert_series_equal(result, expected) - - result = s["5 day":] - expected = s.iloc[86:] - tm.assert_series_equal(result, expected) - - result = s[:"6 day"] - expected = s.iloc[:134] - tm.assert_series_equal(result, expected) - - def test_partial_slice_high_reso(self): - - # higher reso - rng = timedelta_range("1 day 10:11:12", freq="us", periods=2000) - s = Series(np.arange(len(rng)), index=rng) - - result = s["1 day 10:11:12":] - expected = s.iloc[0:] - tm.assert_series_equal(result, expected) - - result = s["1 day 10:11:12.001":] - expected = s.iloc[1000:] - tm.assert_series_equal(result, expected) - - result = s["1 days, 10:11:12.001001"] - assert result == s.iloc[1001] diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index d16a32247b917..d0f4828e8c7bd 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -64,31 +64,6 @@ def test_isin(self): index.isin([index[2], 5]), np.array([False, False, True, False]) ) - def test_sort_values(self): - - idx = TimedeltaIndex(["4d", "1d", "2d"]) - - ordered = idx.sort_values() - assert ordered.is_monotonic - - ordered = idx.sort_values(ascending=False) - assert ordered[::-1].is_monotonic - - ordered, dexer = idx.sort_values(return_indexer=True) - assert ordered.is_monotonic - - tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]), check_dtype=False) - - ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) - assert ordered[::-1].is_monotonic - - tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) - - def test_argmin_argmax(self): - idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) - assert idx.argmin() == 1 - assert idx.argmax() == 0 - def test_misc_coverage(self): rng = timedelta_range("1 day", periods=5) diff --git a/pandas/tests/indexing/test_categorical.py 
b/pandas/tests/indexing/test_categorical.py
index 68ae1a0dd6f3d..f104587ebbded 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -37,20 +37,24 @@ def setup_method(self, method):
         )
 
     def test_loc_scalar(self):
+        dtype = CDT(list("cab"))
         result = self.df.loc["a"]
-        expected = DataFrame(
-            {"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))}
-        ).set_index("B")
+        bidx = Series(list("aaa"), name="B").astype(dtype)
+        assert bidx.dtype == dtype
+
+        expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
         tm.assert_frame_equal(result, expected)
 
         df = self.df.copy()
         df.loc["a"] = 20
+        bidx2 = Series(list("aabbca"), name="B").astype(dtype)
+        assert bidx2.dtype == dtype
         expected = DataFrame(
             {
                 "A": [20, 20, 2, 3, 4, 20],
-                "B": (Series(list("aabbca")).astype(CDT(list("cab")))),
-            }
-        ).set_index("B")
+            },
+            index=Index(bidx2),
+        )
         tm.assert_frame_equal(df, expected)
 
         # value not in the categories
@@ -64,14 +68,38 @@ def test_loc_scalar(self):
         df2.loc["d"] = 10
         tm.assert_frame_equal(df2, expected)
 
-        msg = "'fill_value=d' is not present in this Categorical's categories"
-        with pytest.raises(TypeError, match=msg):
-            df.loc["d", "A"] = 10
-        with pytest.raises(TypeError, match=msg):
-            df.loc["d", "C"] = 10
+    def test_loc_setitem_with_expansion_non_category(self):
+        # Setting-with-expansion with a new key "d" that is not among the categories
+        df = self.df
+        df.loc["a"] = 20
+
+        # Setting a new row on an existing column
+        df3 = df.copy()
+        df3.loc["d", "A"] = 10
+        bidx3 = Index(list("aabbcad"), name="B")
+        expected3 = DataFrame(
+            {
+                "A": [20, 20, 2, 3, 4, 20, 10.0],
+            },
+            index=Index(bidx3),
+        )
+        tm.assert_frame_equal(df3, expected3)
+
+        # Setting a new row _and_ a new column
+        df4 = df.copy()
+        df4.loc["d", "C"] = 10
+        expected4 = DataFrame(
+            {
+                "A": [20, 20, 2, 3, 4, 20, np.nan],
+                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
+            },
+            index=Index(bidx3),
+        )
+        tm.assert_frame_equal(df4, expected4)
+
+    def test_loc_getitem_scalar_non_category(self):
         with pytest.raises(KeyError, match="^1$"):
-            df.loc[1]
+            self.df.loc[1]
 
     def test_slicing(self):
         cat = Series(Categorical([1, 2, 3, 4]))
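The rewritten categorical tests above pin down what happens when a setting-with-expansion key is not among the index's categories: rather than raising TypeError, the CategoricalIndex falls back to a plain object-dtype Index. A minimal sketch of that behavior, assuming the post-patch semantics these tests assert; the frame and labels below are made up for illustration and are not part of the patch:

    import pandas as pd

    cats = pd.CategoricalDtype(list("cab"))
    idx = pd.Index(pd.Series(list("aabbca"), name="B").astype(cats))
    df = pd.DataFrame({"A": [0, 1, 2, 3, 4, 5]}, index=idx)

    # scalar .loc lookups resolve through the categories
    print(df.loc["a"])  # the rows at positions 0, 1 and 5

    # expanding with a key outside the categories no longer raises;
    # the index falls back to a plain (object-dtype) Index
    df2 = df.copy()
    df2.loc["d", "A"] = 10
    print(type(df2.index).__name__)  # Index, not CategoricalIndex
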
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 43ffc9e8eaedd..d0fdf81121c71 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -1090,6 +1090,20 @@ def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame):
             # GH#32257 we let numpy do validation, get their exception
             float_frame.iloc[:, :, :] = 1
 
+    def test_iloc_frame_indexer(self):
+        # GH#39004
+        df = DataFrame({"a": [1, 2, 3]})
+        indexer = DataFrame({"a": [True, False, True]})
+        with tm.assert_produces_warning(FutureWarning):
+            df.iloc[indexer] = 1
+
+        msg = (
+            "DataFrame indexer is not allowed for .iloc\n"
+            "Consider using .loc for automatic alignment."
+        )
+        with pytest.raises(IndexError, match=msg):
+            df.iloc[indexer]
+
 
 class TestILocSetItemDuplicateColumns:
     def test_iloc_setitem_scalar_duplicate_columns(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 466e60e84b318..5b6c042a11332 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -23,7 +23,9 @@
     DatetimeIndex,
     Index,
     IndexSlice,
+    IntervalIndex,
     MultiIndex,
+    Period,
     Series,
     SparseDtype,
     Timedelta,
@@ -145,6 +147,43 @@ def test_setitem_from_duplicate_axis(self):
 
 class TestLoc2:
     # TODO: better name, just separating out things that rely on base class
+    @pytest.mark.parametrize(
+        "msg, key",
+        [
+            (r"Period\('2019', 'A-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")),
+            (r"Period\('2019', 'A-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")),
+            (r"Period\('2019', 'A-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")),
+            (
+                r"Period\('2018', 'A-DEC'\), Period\('2016', 'A-DEC'\), 'bar'",
+                (Period(2018), Period(2016), "bar"),
+            ),
+            (r"Period\('2018', 'A-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")),
+            (
+                r"Period\('2017', 'A-DEC'\), 'foo', Period\('2015', 'A-DEC'\)",
+                (Period(2017), "foo", Period(2015)),
+            ),
+            (r"Period\('2017', 'A-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")),
+        ],
+    )
+    def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key):
+        # GH#20684
+        """
+        parse_time_string returns its argument unchanged if the type does not match.
+        PeriodIndex.get_loc treats the value returned by parse_time_string as a tuple.
+        If the first element is a Period and the tuple has 3 items,
+        processing continues instead of raising an exception.
+        """
+        df = DataFrame(
+            {
+                "A": [Period(2019), "x1", "x2"],
+                "B": [Period(2018), Period(2016), "y1"],
+                "C": [Period(2017), "z1", Period(2015)],
+                "V1": [1, 2, 3],
+                "V2": [10, 20, 30],
+            }
+        ).set_index(["A", "B", "C"])
+        with pytest.raises(KeyError, match=msg):
+            df.loc[key]
 
     def test_loc_getitem_missing_unicode_key(self):
         df = DataFrame({"a": [1]})
@@ -1619,6 +1658,55 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self):
         expected = pd.Float64Index([0, 1, np.inf])
         tm.assert_index_equal(result, expected)
 
+    @pytest.mark.filterwarnings("ignore:indexing past lexsort depth")
+    def test_loc_setitem_with_expansion_nonunique_index(self, index, request):
+        # GH#40096
+        if not len(index):
+            return
+        if isinstance(index, IntervalIndex):
+            mark = pytest.mark.xfail(reason="IntervalIndex raises")
+            request.node.add_marker(mark)
+
+        index = index.repeat(2)  # ensure non-unique
+        N = len(index)
+        arr = np.arange(N).astype(np.int64)
+
+        orig = DataFrame(arr, index=index, columns=[0])
+
+        # key that will require object-dtype casting in the index
+        key = "kapow"
+        assert key not in index  # otherwise test is invalid
+        # TODO: using a tuple key breaks here in many cases
+
+        exp_index = index.insert(len(index), key)
+        if isinstance(index, MultiIndex):
+            assert exp_index[-1][0] == key
+        else:
+            assert exp_index[-1] == key
+        exp_data = np.arange(N + 1).astype(np.float64)
+        expected = DataFrame(exp_data, index=exp_index, columns=[0])
+
+        # Add new row, but no new columns
+        df = orig.copy()
+        df.loc[key, 0] = N
+        tm.assert_frame_equal(df, expected)
+
+        # add new row on a Series
+        ser = orig.copy()[0]
+        ser.loc[key] = N
+        # the series machinery lets us preserve int dtype instead of float
+        expected = expected[0].astype(np.int64)
+        tm.assert_series_equal(ser, expected)
+
+        # add new row and new column
+        df = orig.copy()
+        df.loc[key, 1] = N
+        expected = DataFrame(
+            {0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]},
+            index=exp_index,
+        )
+        tm.assert_frame_equal(df, expected)
+
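test_loc_setitem_with_expansion_nonunique_index encodes the dtype consequences of enlargement on a non-unique index. A short sketch of the behavior it asserts; the values and the "kapow" key mirror the test, but the snippet itself is illustrative only:

    import numpy as np
    import pandas as pd

    idx = pd.Index(["a", "a", "b", "b"])  # deliberately non-unique
    df = pd.DataFrame(np.arange(4, dtype=np.int64), index=idx, columns=[0])

    # enlarging a DataFrame through .loc appends the row; the int64
    # column is upcast to float64 because the new cell starts out missing
    df.loc["kapow", 0] = 4
    print(df[0].dtype)  # float64

    # Series enlargement can keep the integer dtype, as the test notes
    ser = pd.Series(np.arange(4, dtype=np.int64), index=idx)
    ser.loc["kapow"] = 4
    print(ser.dtype)  # int64
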
 
 class TestLocCallable:
     def test_frame_loc_getitem_callable(self):
diff --git a/pandas/tests/io/data/xml/baby_names.xml b/pandas/tests/io/data/xml/baby_names.xml
new file mode 100644
index 0000000000000..b4797b79d7112
--- /dev/null
+++ b/pandas/tests/io/data/xml/baby_names.xml
@@ -0,0 +1,53 @@
+
+
+
+ 1
+ José
+ Sofía
+
+
+ 2
+ Luis
+ Valentina
+
+
+ 3
+ Carlos
+ Isabella
+
+
+ 4
+ Juan
+ Camila
+
+
+ 5
+ Jorge
+ Valeria
+
+
+ 6
+ Pedro
+ Mariana
+
+
+ 7
+ Jesús
+ Gabriela
+
+
+ 8
+ Manuel
+ Sara
+
+
+ 9
+ Santiago
+ Daniella
+
+
+ 10
+ Sebastián
+ María José
+
+
diff --git a/pandas/tests/io/data/xml/books.xml b/pandas/tests/io/data/xml/books.xml
new file mode 100644
index 0000000000000..666ce60e9a2be
--- /dev/null
+++ b/pandas/tests/io/data/xml/books.xml
@@ -0,0 +1,21 @@
+
+
+
+ Everyday Italian
+ Giada De Laurentiis
+ 2005
+ 30.00
+
+
+ Harry Potter
+ J K. Rowling
+ 2005
+ 29.99
+
+
+ Learning XML
+ Erik T. Ray
+ 2003
+ 39.95
+
+
diff --git a/pandas/tests/io/data/xml/cta_rail_lines.kml b/pandas/tests/io/data/xml/cta_rail_lines.kml
new file mode 100644
index 0000000000000..c031137ee7b20
--- /dev/null
+++ b/pandas/tests/io/data/xml/cta_rail_lines.kml
@@ -0,0 +1,92 @@
+
+ CTA_RailLines
+
+
+ CTA_RailLines
+
+
+ Blue Line (Forest Park)
+
Blue Line (Forest Park)
OBJECTID_1 1
ASSET_ID 21100001
LINES Blue Line (Forest Park)
DESCRIPTIO Oak Park to Austin
TYPE Elevated or at Grade
LEGEND BL
ALT_LEGEND BL
BRANCH Blue Line Forest Park
SHAPE.LEN 4060.368778
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.77678526964958,41.8708863930319,0 -87.77826234150609,41.87097820122218,0 -87.78251583439344,41.87130129991005,0 -87.78418294588424,41.87145055520308,0 -87.7872369165933,41.8717239119163,0 -87.79160214925886,41.87210797280065,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 2
ASSET_ID 21100002
LINES Red, Purple Line
DESCRIPTIO Lawrence to Wilson
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 1800.132896
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65758750947528,41.96427269188822,0 -87.65802133507393,41.96581929055245,0 -87.65819033925305,41.96621846093642,0 -87.6583189819129,41.96650362897086,0 -87.65835858701473,41.96669002089185,0 -87.65838428411853,41.96688150295095,0 -87.65842208882658,41.96745896091846,0 -87.65846556843937,41.9683761425439,0 -87.65849296214573,41.96913893870342,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 3
ASSET_ID 21100003
LINES Red, Purple Line
DESCRIPTIO Wilson to Sheridan
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 4256.243677
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65492939166126,41.95377494531437,0 -87.65557043199591,41.95376544118533,0 -87.65606302030132,41.95376391658746,0 -87.65623502146268,41.95377379126367,0 -87.65634748981634,41.95380103566435,0 -87.65646537904269,41.95387703994676,0 -87.65656532461145,41.95396622645799,0 -87.65664760856414,41.95404201996044,0 -87.65671750555913,41.95416647054043,0 -87.65673983607117,41.95429949810849,0 -87.65673866475777,41.95441024240925,0 -87.6567690255541,41.95490657227902,0 -87.65683672482363,41.95692259283837,0 -87.6568900886376,41.95861070983142,0 -87.65699865558875,41.96181418669004,0 -87.65756347177603,41.96397045777844,0 -87.65758750947528,41.96427269188822,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 4
ASSET_ID 21100004
LINES Red, Purple Line
DESCRIPTIO Sheridan to Addison
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 2581.713736
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65362593118043,41.94742799535678,0 -87.65363554415794,41.94819886386848,0 -87.6536456393239,41.95059994675451,0 -87.65365831235026,41.95108288489359,0 -87.6536604873874,41.9519954657554,0 -87.65362592053201,41.95245597302328,0 -87.65367158496069,41.95311153649393,0 -87.65368468595476,41.9533202828916,0 -87.65369271253692,41.95343095587119,0 -87.65373335834569,41.95351536301472,0 -87.65378605844126,41.95358212680591,0 -87.65385067928185,41.95364452823767,0 -87.6539390793817,41.95370263886964,0 -87.6540786298351,41.95373403675265,0 -87.65430648647626,41.9537535411832,0 -87.65492939166126,41.95377494531437,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 5
ASSET_ID 21100005
LINES Red, Purple Line
DESCRIPTIO Addison to Clark Junction
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 1918.716686
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65345391792157,41.94217681262115,0 -87.65342448305786,41.94237224420864,0 -87.65339745703922,41.94268217746244,0 -87.65337753982941,41.94288140770284,0 -87.65336256753105,41.94317369618263,0 -87.65338799707138,41.94357253961736,0 -87.65340240886648,41.94389158188269,0 -87.65341837392448,41.94406444407721,0 -87.65342275247338,41.94421065714904,0 -87.65347469646018,41.94434829382345,0 -87.65351486483024,41.94447699917548,0 -87.65353483605053,41.9453896864472,0 -87.65361975532807,41.94689193720703,0 -87.65362593118043,41.94742799535678,0 + + +
+
+ +
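The XML, KML and XSL fixtures added above support tests for the XML reader. As a hedged sketch of how such fixtures are typically consumed — pd.read_xml and its XSLT stylesheet support ship with pandas 1.3 and require lxml, and the paths and KML namespace below are assumptions inferred from the file names rather than something this patch defines:

    import pandas as pd

    # flat, row-oriented XML parses directly
    books = pd.read_xml("pandas/tests/io/data/xml/books.xml")

    # nested KML is flattened through an XSLT stylesheet first
    rail = pd.read_xml(
        "pandas/tests/io/data/xml/cta_rail_lines.kml",
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet="pandas/tests/io/data/xml/flatten_doc.xsl",
    )
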
+
diff --git a/pandas/tests/io/data/xml/flatten_doc.xsl b/pandas/tests/io/data/xml/flatten_doc.xsl new file mode 100644 index 0000000000000..a9d62d180beaf --- /dev/null +++ b/pandas/tests/io/data/xml/flatten_doc.xsl @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + diff --git a/pandas/tests/io/data/xml/row_field_output.xsl b/pandas/tests/io/data/xml/row_field_output.xsl new file mode 100644 index 0000000000000..5a0f0e655a78e --- /dev/null +++ b/pandas/tests/io/data/xml/row_field_output.xsl @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index c650f59a7da95..d8448736c7cc8 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -275,7 +275,7 @@ def test_read_excel_parse_dates(self, ext): def test_multiindex_interval_datetimes(self, ext): # GH 30986 - midx = pd.MultiIndex.from_arrays( + midx = MultiIndex.from_arrays( [ range(4), pd.interval_range( @@ -289,7 +289,7 @@ def test_multiindex_interval_datetimes(self, ext): result = pd.read_excel(pth, index_col=[0, 1]) expected = DataFrame( range(4), - pd.MultiIndex.from_arrays( + MultiIndex.from_arrays( [ range(4), [ diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 01ed234f6e248..f0d1090899043 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -626,6 +626,19 @@ def test_table_styles(self): result = " ".join(styler.render().split()) assert "th { foo: bar; }" in result + def test_table_styles_multiple(self): + ctx = self.df.style.set_table_styles( + [ + {"selector": "th,td", "props": "color:red;"}, + {"selector": "tr", "props": "color:green;"}, + ] + )._translate()["table_styles"] + assert ctx == [ + {"selector": "th", "props": [("color", "red")]}, + {"selector": "td", "props": [("color", "red")]}, + {"selector": "tr", "props": [("color", "green")]}, + ] + def test_maybe_convert_css_to_tuples(self): expected = [("a", "b"), ("c", "d e")] assert _maybe_convert_css_to_tuples("a:b;c:d e;") == expected diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 41efb594fd8e4..06e0eadb84c59 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -249,7 +249,7 @@ def test_repr_deprecation_negative_int(self): def test_repr_chop_threshold(self): df = DataFrame([[0.1, 0.5], [0.5, -0.1]]) - pd.reset_option("display.chop_threshold") # default None + reset_option("display.chop_threshold") # default None assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" with option_context("display.chop_threshold", 0.2): @@ -382,7 +382,7 @@ def test_repr_truncates_terminal_size(self, monkeypatch): ) index = range(5) - columns = pd.MultiIndex.from_tuples( + columns = MultiIndex.from_tuples( [ ("This is a long title with > 37 chars.", "cat"), ("This is a loooooonger title with > 43 chars.", "dog"), @@ -689,7 +689,7 @@ def test_east_asian_unicode_false(self): assert repr(df) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples( + idx = MultiIndex.from_tuples( [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] ) df = DataFrame( @@ -833,7 +833,7 @@ def test_east_asian_unicode_true(self): assert repr(df) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples( + idx = MultiIndex.from_tuples( [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] ) df = DataFrame( @@ -1002,14 +1002,14 @@ def test_truncate_with_different_dtypes(self): + 
[datetime.datetime(2012, 1, 3)] * 10 ) - with pd.option_context("display.max_rows", 8): + with option_context("display.max_rows", 8): result = str(s) assert "object" in result # 12045 df = DataFrame({"text": ["some words"] + [None] * 9}) - with pd.option_context("display.max_rows", 8, "display.max_columns", 3): + with option_context("display.max_rows", 8, "display.max_columns", 3): result = str(df) assert "None" in result assert "NaN" not in result @@ -1026,9 +1026,7 @@ def test_truncate_with_different_dtypes_multiindex(self): def test_datetimelike_frame(self): # GH 12211 - df = DataFrame( - {"date": [Timestamp("20130101").tz_localize("UTC")] + [pd.NaT] * 5} - ) + df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC")] + [NaT] * 5}) with option_context("display.max_rows", 5): result = str(df) @@ -1037,7 +1035,7 @@ def test_datetimelike_frame(self): assert "..." in result assert "[6 rows x 1 columns]" in result - dts = [Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [pd.NaT] * 5 + dts = [Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [NaT] * 5 df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) with option_context("display.max_rows", 5): expected = ( @@ -1051,7 +1049,7 @@ def test_datetimelike_frame(self): ) assert repr(df) == expected - dts = [pd.NaT] * 5 + [Timestamp("2011-01-01", tz="US/Eastern")] * 5 + dts = [NaT] * 5 + [Timestamp("2011-01-01", tz="US/Eastern")] * 5 df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) with option_context("display.max_rows", 5): expected = ( @@ -1117,7 +1115,7 @@ def test_unicode_problem_decoding_as_ascii(self): def test_string_repr_encoding(self, datapath): filepath = datapath("io", "parser", "data", "unicode_series.csv") - df = pd.read_csv(filepath, header=None, encoding="latin1") + df = read_csv(filepath, header=None, encoding="latin1") repr(df) repr(df[1]) @@ -1548,7 +1546,7 @@ def test_to_string_float_index(self): def test_to_string_complex_float_formatting(self): # GH #25514, 25745 - with pd.option_context("display.precision", 5): + with option_context("display.precision", 5): df = DataFrame( { "x": [ @@ -1785,7 +1783,7 @@ def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) assert "tex2jax_ignore" not in df._repr_html_() - with pd.option_context("display.html.use_mathjax", False): + with option_context("display.html.use_mathjax", False): assert "tex2jax_ignore" in df._repr_html_() def test_repr_html_wide(self): @@ -2229,7 +2227,7 @@ def test_east_asian_unicode_series(self): assert repr(s) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples( + idx = MultiIndex.from_tuples( [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] ) s = Series([1, 22, 3333, 44444], index=idx) @@ -2324,7 +2322,7 @@ def test_east_asian_unicode_series(self): assert repr(s) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples( + idx = MultiIndex.from_tuples( [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] ) s = Series([1, 22, 3333, 44444], index=idx) @@ -2853,7 +2851,7 @@ def test_output_display_precision_trailing_zeroes(self): # Issue #20359: trimming zeros while there is no decimal point # Happens when display precision is set to zero - with pd.option_context("display.precision", 0): + with option_context("display.precision", 0): s = Series([840.0, 4200.0]) expected_output = "0 840\n1 4200\ndtype: float64" assert str(s) == expected_output @@ -2862,7 +2860,7 @@ def test_output_significant_digits(self): # Issue #9764 # In case default display precision changes: - with 
pd.option_context("display.precision", 6): + with option_context("display.precision", 6): # DataFrame example from issue #9764 d = DataFrame( { @@ -2933,7 +2931,7 @@ def test_output_significant_digits(self): def test_too_long(self): # GH 10451 - with pd.option_context("display.precision", 4): + with option_context("display.precision", 4): # need both a number > 1e6 and something that normally formats to # having length > display.precision + 6 df = DataFrame({"x": [12345.6789]}) @@ -3011,7 +3009,7 @@ def test_all(self): class TestTimedelta64Formatter: def test_days(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'1 days'" @@ -3027,25 +3025,25 @@ def test_days(self): assert result[0].strip() == "1 days" def test_days_neg(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") result = fmt.Timedelta64Formatter(-x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'-1 days'" def test_subdays(self): - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") result = fmt.Timedelta64Formatter(y, box=True).get_result() assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'0 days 00:00:01'" def test_subdays_neg(self): - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") result = fmt.Timedelta64Formatter(-y, box=True).get_result() assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): - x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit="D") + x = pd.to_timedelta(list(range(1)) + [NaT], unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" @@ -3056,13 +3054,13 @@ def test_zero(self): class TestDatetime64Formatter: def test_mixed(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT]) + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT]) result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" @@ -3137,20 +3135,20 @@ def format_func(x): class TestNaTFormatting: def test_repr(self): - assert repr(pd.NaT) == "NaT" + assert repr(NaT) == "NaT" def test_str(self): - assert str(pd.NaT) == "NaT" + assert str(NaT) == "NaT" class TestDatetimeIndexFormat: def test_datetime(self): - formatted = pd.to_datetime([datetime(2003, 1, 1, 12), pd.NaT]).format() + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() assert formatted[0] == "2003-01-01 12:00:00" assert formatted[1] == "NaT" def test_date(self): - formatted = pd.to_datetime([datetime(2003, 1, 1), pd.NaT]).format() + formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() assert formatted[0] == "2003-01-01" assert formatted[1] == "NaT" @@ -3158,11 +3156,11 @@ def test_date_tz(self): formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() assert 
formatted[0] == "2013-01-01 00:00:00+00:00" - formatted = pd.to_datetime([datetime(2013, 1, 1), pd.NaT], utc=True).format() + formatted = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" def test_date_explicit_date_format(self): - formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format( + formatted = pd.to_datetime([datetime(2003, 2, 1), NaT]).format( date_format="%m-%d-%Y", na_rep="UT" ) assert formatted[0] == "02-01-2003" @@ -3226,7 +3224,7 @@ def test_tz_dateutil(self): def test_nat_representations(self): for f in (str, repr, methodcaller("isoformat")): - assert f(pd.NaT) == "NaT" + assert f(NaT) == "NaT" def test_format_percentiles(): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 8c634509bdc84..5e599818308b8 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -326,7 +326,7 @@ def test_to_csv_multi_index(self): ), ], ) - @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series]) + @pytest.mark.parametrize("klass", [DataFrame, pd.Series]) def test_to_csv_single_level_multi_index(self, ind, expected, klass): # see gh-19589 result = klass(pd.Series([1], ind, name="data")).to_csv( diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 347e1fda3c79d..1c89c4e392a7f 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -763,7 +763,7 @@ def test_to_html_render_links(render_links, expected, datapath): def test_ignore_display_max_colwidth(method, expected, max_colwidth): # see gh-17004 df = DataFrame([lorem_ipsum]) - with pd.option_context("display.max_colwidth", max_colwidth): + with option_context("display.max_colwidth", max_colwidth): result = getattr(df, method)() expected = expected(max_colwidth) assert expected in result @@ -782,7 +782,7 @@ def test_to_html_invalid_classes_type(classes): def test_to_html_round_column_headers(): # GH 17280 df = DataFrame([1], columns=[0.55555]) - with pd.option_context("display.precision", 3): + with option_context("display.precision", 3): html = df.to_html(notebook=False) notebook = df.to_html(notebook=True) assert "0.55555" in html diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 89248447c98d3..9a793e274ce48 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -149,7 +149,7 @@ def test_frame_default_orient(self, float_frame): @pytest.mark.parametrize("numpy", [True, False]) def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype, float_frame): data = float_frame.to_json(orient=orient) - result = pd.read_json( + result = read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) @@ -162,7 +162,7 @@ def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype, float_frame) @pytest.mark.parametrize("numpy", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype, int_frame): data = int_frame.to_json(orient=orient) - result = pd.read_json( + result = read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) expected = int_frame @@ -195,7 +195,7 @@ def test_roundtrip_str_axes(self, request, orient, convert_axes, numpy, dtype): ) data = df.to_json(orient=orient) - result = pd.read_json( + result = read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) @@ -235,9 +235,7 @@ def 
test_roundtrip_categorical(self, request, orient, convert_axes, numpy): pytest.mark.xfail(reason=f"Orient {orient} is broken with numpy=True") ) - result = pd.read_json( - data, orient=orient, convert_axes=convert_axes, numpy=numpy - ) + result = read_json(data, orient=orient, convert_axes=convert_axes, numpy=numpy) expected = self.categorical.copy() expected.index = expected.index.astype(str) # Categorical not preserved @@ -252,9 +250,7 @@ def test_roundtrip_categorical(self, request, orient, convert_axes, numpy): @pytest.mark.parametrize("numpy", [True, False]) def test_roundtrip_empty(self, orient, convert_axes, numpy, empty_frame): data = empty_frame.to_json(orient=orient) - result = pd.read_json( - data, orient=orient, convert_axes=convert_axes, numpy=numpy - ) + result = read_json(data, orient=orient, convert_axes=convert_axes, numpy=numpy) expected = empty_frame.copy() # TODO: both conditions below are probably bugs @@ -271,9 +267,7 @@ def test_roundtrip_empty(self, orient, convert_axes, numpy, empty_frame): def test_roundtrip_timestamp(self, orient, convert_axes, numpy, datetime_frame): # TODO: improve coverage with date_format parameter data = datetime_frame.to_json(orient=orient) - result = pd.read_json( - data, orient=orient, convert_axes=convert_axes, numpy=numpy - ) + result = read_json(data, orient=orient, convert_axes=convert_axes, numpy=numpy) expected = datetime_frame.copy() if not convert_axes: # one off for ts handling @@ -305,9 +299,7 @@ def test_roundtrip_mixed(self, request, orient, convert_axes, numpy): df = DataFrame(data=values, index=index) data = df.to_json(orient=orient) - result = pd.read_json( - data, orient=orient, convert_axes=convert_axes, numpy=numpy - ) + result = read_json(data, orient=orient, convert_axes=convert_axes, numpy=numpy) expected = df.copy() expected = expected.assign(**expected.select_dtypes("number").astype(np.int64)) @@ -487,12 +479,12 @@ def test_v12_compat(self, datapath): dirpath = datapath("io", "json", "data") v12_json = os.path.join(dirpath, "tsframe_v012.json") - df_unser = pd.read_json(v12_json) + df_unser = read_json(v12_json) tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") - df_unser_iso = pd.read_json(v12_iso_json) + df_unser_iso = read_json(v12_iso_json) tm.assert_frame_equal(df_iso, df_unser_iso) def test_blocks_compat_GH9037(self): @@ -581,7 +573,7 @@ def test_blocks_compat_GH9037(self): # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype("unicode") - df_roundtrip = pd.read_json(df_mixed.to_json(orient="split"), orient="split") + df_roundtrip = read_json(df_mixed.to_json(orient="split"), orient="split") tm.assert_frame_equal( df_mixed, df_roundtrip, @@ -654,7 +646,7 @@ def test_series_default_orient(self, string_series): @pytest.mark.parametrize("numpy", [True, False]) def test_series_roundtrip_simple(self, orient, numpy, string_series): data = string_series.to_json(orient=orient) - result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + result = read_json(data, typ="series", orient=orient, numpy=numpy) expected = string_series if orient in ("values", "records"): @@ -668,9 +660,7 @@ def test_series_roundtrip_simple(self, orient, numpy, string_series): @pytest.mark.parametrize("numpy", [True, False]) def test_series_roundtrip_object(self, orient, numpy, dtype, object_series): data = object_series.to_json(orient=orient) - result = pd.read_json( - data, typ="series", orient=orient, 
numpy=numpy, dtype=dtype - ) + result = read_json(data, typ="series", orient=orient, numpy=numpy, dtype=dtype) expected = object_series if orient in ("values", "records"): @@ -683,7 +673,7 @@ def test_series_roundtrip_object(self, orient, numpy, dtype, object_series): @pytest.mark.parametrize("numpy", [True, False]) def test_series_roundtrip_empty(self, orient, numpy, empty_series): data = empty_series.to_json(orient=orient) - result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + result = read_json(data, typ="series", orient=orient, numpy=numpy) expected = empty_series if orient in ("values", "records"): @@ -696,7 +686,7 @@ def test_series_roundtrip_empty(self, orient, numpy, empty_series): @pytest.mark.parametrize("numpy", [True, False]) def test_series_roundtrip_timeseries(self, orient, numpy, datetime_series): data = datetime_series.to_json(orient=orient) - result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + result = read_json(data, typ="series", orient=orient, numpy=numpy) expected = datetime_series if orient in ("values", "records"): @@ -711,7 +701,7 @@ def test_series_roundtrip_timeseries(self, orient, numpy, datetime_series): def test_series_roundtrip_numeric(self, orient, numpy, dtype): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"]) data = s.to_json(orient=orient) - result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + result = read_json(data, typ="series", orient=orient, numpy=numpy) expected = s.copy() if orient in ("values", "records"): @@ -747,7 +737,7 @@ def test_series_with_dtype(self): def test_series_with_dtype_datetime(self, dtype, expected): s = Series(["2000-01-01"], dtype="datetime64[ns]") data = s.to_json() - result = pd.read_json(data, typ="series", dtype=dtype) + result = read_json(data, typ="series", dtype=dtype) tm.assert_series_equal(result, expected) def test_frame_from_json_precise_float(self): @@ -1001,7 +991,7 @@ def test_round_trip_exception_(self): csv = "https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv" df = pd.read_csv(csv) s = df.to_json() - result = pd.read_json(s) + result = read_json(s) tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df) @tm.network @@ -1025,17 +1015,17 @@ def test_timedelta(self): s = Series([timedelta(23), timedelta(seconds=5)]) assert s.dtype == "timedelta64[ns]" - result = pd.read_json(s.to_json(), typ="series").apply(converter) + result = read_json(s.to_json(), typ="series").apply(converter) tm.assert_series_equal(result, s) s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1])) assert s.dtype == "timedelta64[ns]" - result = pd.read_json(s.to_json(), typ="series").apply(converter) + result = read_json(s.to_json(), typ="series").apply(converter) tm.assert_series_equal(result, s) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) assert frame[0].dtype == "timedelta64[ns]" - tm.assert_frame_equal(frame, pd.read_json(frame.to_json()).apply(converter)) + tm.assert_frame_equal(frame, read_json(frame.to_json()).apply(converter)) frame = DataFrame( { @@ -1045,7 +1035,7 @@ def test_timedelta(self): } ) - result = pd.read_json(frame.to_json(date_unit="ns")) + result = read_json(frame.to_json(date_unit="ns")) result["a"] = pd.to_timedelta(result.a, unit="ns") result["c"] = pd.to_datetime(result.c) tm.assert_frame_equal(frame, result) @@ -1056,7 +1046,7 @@ def test_mixed_timedelta_datetime(self): expected = DataFrame( {"a": [pd.Timedelta(frame.a[0]).value, Timestamp(frame.a[1]).value]} ) - result = 
pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) + result = read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) @pytest.mark.parametrize("as_object", [True, False]) @@ -1086,7 +1076,7 @@ def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) expected = DataFrame({"a": [7, str(value)]}) - result = pd.read_json(frame.to_json(default_handler=str)) + result = read_json(frame.to_json(default_handler=str)) tm.assert_frame_equal(expected, result, check_index_type=False) def test_default_handler_indirect(self): @@ -1319,14 +1309,14 @@ def test_to_jsonl(self): result = df.to_json(orient="records", lines=True) expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' assert result == expected - tm.assert_frame_equal(pd.read_json(result, lines=True), df) + tm.assert_frame_equal(read_json(result, lines=True), df) # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n' assert result == expected - tm.assert_frame_equal(pd.read_json(result, lines=True), df) + tm.assert_frame_equal(read_json(result, lines=True), df) # TODO: there is a near-identical test for pytables; can we share? def test_latin_encoding(self): @@ -1382,14 +1372,14 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): # GH25433 GH25435 expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns) dfjson = expected.to_json(orient="table") - result = pd.read_json(dfjson, orient="table") + result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = expected.to_json(orient="table") - result = pd.read_json(dfjson, orient="table") + result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) @@ -1399,7 +1389,7 @@ def test_read_json_table_dtype_raises(self, dtype): dfjson = df.to_json(orient="table") msg = "cannot pass both dtype and orient='table'" with pytest.raises(ValueError, match=msg): - pd.read_json(dfjson, orient="table", dtype=dtype) + read_json(dfjson, orient="table", dtype=dtype) def test_read_json_table_convert_axes_raises(self): # GH25433 GH25435 @@ -1407,7 +1397,7 @@ def test_read_json_table_convert_axes_raises(self): dfjson = df.to_json(orient="table") msg = "cannot pass both convert_axes and orient='table'" with pytest.raises(ValueError, match=msg): - pd.read_json(dfjson, orient="table", convert_axes=True) + read_json(dfjson, orient="table", convert_axes=True) @pytest.mark.parametrize( "data, expected", @@ -1681,7 +1671,7 @@ def test_json_negative_indent_raises(self): def test_emca_262_nan_inf_support(self): # GH 12213 data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' - result = pd.read_json(data) + result = read_json(data) expected = DataFrame( ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] ) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index a8cf94421dbde..711addb1ac237 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -93,7 +93,7 @@ def test_readjson_chunks(lines_json_df, chunksize): def 
test_readjson_chunksize_requires_lines(lines_json_df): msg = "chunksize can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - with pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _: + with read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _: pass @@ -102,10 +102,10 @@ def test_readjson_chunks_series(): s = pd.Series({"A": 1, "B": 2}) strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = pd.read_json(strio, lines=True, typ="Series") + unchunked = read_json(strio, lines=True, typ="Series") strio = StringIO(s.to_json(lines=True, orient="records")) - with pd.read_json(strio, lines=True, typ="Series", chunksize=1) as reader: + with read_json(strio, lines=True, typ="Series", chunksize=1) as reader: chunked = pd.concat(reader) tm.assert_series_equal(chunked, unchunked) @@ -114,7 +114,7 @@ def test_readjson_chunks_series(): def test_readjson_each_chunk(lines_json_df): # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. - with pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader: + with read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader: chunks = list(reader) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -124,9 +124,9 @@ def test_readjson_chunks_from_file(): with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - with pd.read_json(path, lines=True, chunksize=1) as reader: + with read_json(path, lines=True, chunksize=1) as reader: chunked = pd.concat(reader) - unchunked = pd.read_json(path, lines=True) + unchunked = read_json(path, lines=True) tm.assert_frame_equal(unchunked, chunked) @@ -164,9 +164,7 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize): msg = r"'chunksize' must be an integer >=1" with pytest.raises(ValueError, match=msg): - with pd.read_json( - StringIO(lines_json_df), lines=True, chunksize=chunksize - ) as _: + with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as _: pass @@ -189,7 +187,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): {"A":3,"B":6} """ orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - test = pd.read_json(j, lines=True, chunksize=chunksize) + test = read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: with test: test = pd.concat(test) @@ -215,7 +213,7 @@ def test_readjson_nrows(nrows): {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - result = pd.read_json(jsonl, lines=True, nrows=nrows) + result = read_json(jsonl, lines=True, nrows=nrows) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(result, expected) @@ -243,7 +241,7 @@ def test_readjson_nrows_requires_lines(): {"a": 7, "b": 8}""" msg = "nrows can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - pd.read_json(jsonl, lines=False, nrows=2) + read_json(jsonl, lines=False, nrows=2) def test_readjson_lines_chunks_fileurl(datapath): @@ -256,7 +254,7 @@ def test_readjson_lines_chunks_fileurl(datapath): ] os_path = datapath("io", "json", "data", "line_delimited.json") file_url = Path(os_path).as_uri() - with pd.read_json(file_url, lines=True, chunksize=1) as url_reader: + with read_json(file_url, lines=True, chunksize=1) as url_reader: for index, chuck in enumerate(url_reader): tm.assert_frame_equal(chuck, df_list_expected[index]) @@ -285,5 +283,5 @@ def __iter__(self): return 
iter(self.stringio)
 
     reader = MyReader(jsonl)
-    assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1
+    assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
     assert reader.read_count > 10
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
index 8e1e9fb6e458f..4bc3f3c38f506 100644
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ b/pandas/tests/io/parser/common/test_chunksize.py
@@ -193,7 +193,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers, request):
         # 2021-02-21 this occasionally fails on the CI with an unexpected
         # ResourceWarning that we have been unable to track down,
         # see GH#38630
-        if "ResourceError" not in str(err) or parser.engine != "python":
+        if "ResourceWarning" not in str(err) or parser.engine != "python":
             raise
 
     # Check the main assertion of the test before re-raising
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 9f94f3f8f8a8b..72644693f652b 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1526,7 +1526,7 @@ def test_parse_timezone(all_parsers):
 
     dti = DatetimeIndex(
         list(
-            pd.date_range(
+            date_range(
                 start="2018-01-04 09:01:00",
                 end="2018-01-04 09:05:00",
                 freq="1min",
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index 5586b4915b6ea..9739a2a75886a 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -16,7 +16,6 @@
 
 from pandas.errors import EmptyDataError
 
-import pandas as pd
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -687,7 +686,7 @@ def test_binary_mode():
     with tm.ensure_clean() as path:
         Path(path).write_text(data)
         with open(path, "rb") as file:
-            df = pd.read_fwf(file)
+            df = read_fwf(file)
             file.seek(0)
             tm.assert_frame_equal(df, df_reference)
 
@@ -701,7 +700,7 @@ def test_encoding_mmap(memory_map):
     """
     encoding = "iso8859_1"
     data = BytesIO(" 1 A Ä 2\n".encode(encoding))
-    df = pd.read_fwf(
+    df = read_fwf(
         data,
         header=None,
         widths=[2, 2, 2, 2],
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 3eebeee9788c6..8c324d73a7e54 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -415,12 +415,12 @@ def check_col(key, name, size):
         # just make sure there is a longer string:
         df2 = df.copy().reset_index().assign(C="longer").set_index("C")
         store.append("ss3", df2)
-        tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2]))
+        tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))
 
         # same as above, with a Series
         store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
         store.append("ss4", df2["B"])
-        tm.assert_series_equal(store.select("ss4"), pd.concat([df["B"], df2["B"]]))
+        tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]]))
 
         # with nans
         _maybe_remove(store, "df")
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
index 11ee5e3564634..2ae330e5139be 100644
--- a/pandas/tests/io/pytables/test_errors.py
+++ b/pandas/tests/io/pytables/test_errors.py
@@ -6,7 +6,6 @@
 import numpy as np
 import pytest
 
-import pandas as pd
 from pandas import (
     CategoricalIndex,
     DataFrame,
@@ -207,7 +206,7 @@ def test_unsuppored_hdf_file_error(datapath):
     )
 
     with pytest.raises(ValueError, match=message):
-        pd.read_hdf(data_path)
+        read_hdf(data_path)
 
 
 def test_read_hdf_errors(setup_path):
diff
--git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 6340311b234f1..88e2b5f080282 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -5,7 +5,6 @@ from pandas.compat import is_platform_little_endian -import pandas as pd from pandas import ( DataFrame, HDFStore, @@ -188,7 +187,7 @@ def test_complibs_default_settings(setup_path): # default value with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df", complevel=9) - result = pd.read_hdf(tmpfile, "df") + result = read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) with tables.open_file(tmpfile, mode="r") as h5file: @@ -199,7 +198,7 @@ def test_complibs_default_settings(setup_path): # Set complib and check to see if compression is disabled with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df", complib="zlib") - result = pd.read_hdf(tmpfile, "df") + result = read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) with tables.open_file(tmpfile, mode="r") as h5file: @@ -210,7 +209,7 @@ def test_complibs_default_settings(setup_path): # Check if not setting complib or complevel results in no compression with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df") - result = pd.read_hdf(tmpfile, "df") + result = read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) with tables.open_file(tmpfile, mode="r") as h5file: @@ -256,7 +255,7 @@ def test_complibs(setup_path): # Write and read file to see if data is consistent df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) - result = pd.read_hdf(tmpfile, gname) + result = read_hdf(tmpfile, gname) tm.assert_frame_equal(result, df) # Open file and check metadata diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index f8d302a0190f8..1c9e63c66aadb 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -35,7 +35,7 @@ def test_read_missing_key_close_store(setup_path): df.to_hdf(path, "k1") with pytest.raises(KeyError, match="'No object named k2 in the file'"): - pd.read_hdf(path, "k2") + read_hdf(path, "k2") # smoke test to test that file is properly closed after # read with KeyError before another write @@ -51,11 +51,11 @@ def test_read_missing_key_opened_store(setup_path): with HDFStore(path, "r") as store: with pytest.raises(KeyError, match="'No object named k2 in the file'"): - pd.read_hdf(store, "k2") + read_hdf(store, "k2") # Test that the file is still open after a KeyError and that we can # still read from it. 
- pd.read_hdf(store, "k1") + read_hdf(store, "k1") def test_read_column(setup_path): @@ -315,7 +315,7 @@ def test_read_hdf_series_mode_r(format, setup_path): series = tm.makeFloatSeries() with ensure_clean_path(setup_path) as path: series.to_hdf(path, key="data", format=format) - result = pd.read_hdf(path, key="data", mode="r") + result = read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index a8f63bdc5fb2f..8ad5dbc049380 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -978,5 +978,5 @@ def test_select_empty_where(where): with ensure_clean_path("empty_where.h5") as path: with HDFStore(path) as store: store.put("df", df, "t") - result = pd.read_hdf(store, "df", where=where) + result = read_hdf(store, "df", where=where) tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index ef75c86190a25..b0a11b5e7690e 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -335,12 +335,12 @@ def test_to_hdf_with_min_itemsize(setup_path): # just make sure there is a longer string: df2 = df.copy().reset_index().assign(C="longer").set_index("C") df2.to_hdf(path, "ss3", append=True, format="table") - tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) + tm.assert_frame_equal(read_hdf(path, "ss3"), concat([df, df2])) # same as above, with a Series df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) df2["B"].to_hdf(path, "ss4", append=True, format="table") - tm.assert_series_equal(pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]])) + tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) @pytest.mark.parametrize("format", ["fixed", "table"]) @@ -352,7 +352,7 @@ def test_to_hdf_errors(format, setup_path): # GH 20835 ser.to_hdf(path, "table", format=format, errors="surrogatepass") - result = pd.read_hdf(path, "table", errors="surrogatepass") + result = read_hdf(path, "table", errors="surrogatepass") tm.assert_series_equal(result, ser) @@ -532,11 +532,7 @@ def test_same_name_scoping(setup_path): with ensure_clean_store(setup_path) as store: - import pandas as pd - - df = DataFrame( - np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) - ) + df = DataFrame(np.random.randn(20, 2), index=date_range("20130101", periods=20)) store.put("df", df, format="table") expected = df[df.index > Timestamp("20130105")] @@ -762,7 +758,7 @@ def test_start_stop_fixed(setup_path): # fixed, GH 8287 df = DataFrame( {"A": np.random.rand(20), "B": np.random.rand(20)}, - index=pd.date_range("20130101", periods=20), + index=date_range("20130101", periods=20), ) store.put("df", df) @@ -818,7 +814,7 @@ def test_path_pathlib(setup_path): df = tm.makeDataFrame() result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df") ) tm.assert_frame_equal(df, result) @@ -849,7 +845,7 @@ def writer(path): def reader(path): with HDFStore(path) as store: - return pd.read_hdf(store, "df") + return read_hdf(store, "df") result = tm.round_trip_pathlib(writer, reader) tm.assert_frame_equal(df, result) @@ -858,7 +854,7 @@ def reader(path): def test_pickle_path_localpath(setup_path): df = tm.makeDataFrame() result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + 
lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df") ) tm.assert_frame_equal(df, result) @@ -872,7 +868,7 @@ def writer(path): def reader(path): with HDFStore(path) as store: - return pd.read_hdf(store, "df") + return read_hdf(store, "df") result = tm.round_trip_localpath(writer, reader) tm.assert_frame_equal(df, result) @@ -1013,5 +1009,5 @@ def test_to_hdf_with_object_column_names(setup_path): with ensure_clean_path(setup_path) as path: with catch_warnings(record=True): df.to_hdf(path, "df", format="table", data_columns=True) - result = pd.read_hdf(path, "df", where=f"index = [{df.index[0]}]") + result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") assert len(result) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index f67efb4cc60be..0532ddd17cd19 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -327,7 +327,7 @@ def test_legacy_datetimetz_object(datapath, setup_path): def test_dst_transitions(setup_path): # make sure we are not failing on transitions with ensure_clean_store(setup_path) as store: - times = pd.date_range( + times = date_range( "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", @@ -347,7 +347,7 @@ def test_dst_transitions(setup_path): def test_read_with_where_tz_aware_index(setup_path): # GH 11926 periods = 10 - dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") + dts = date_range("20151201", periods=periods, freq="D", tz="UTC") mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) expected = DataFrame({"MYCOL": 0}, index=mi) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index e60807db55f97..45d9ad430aa43 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( DataFrame, get_option, @@ -216,7 +215,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): """.strip() ) mock_clipboard[request.node.name] = text - df = pd.read_clipboard(**clip_kwargs) + df = read_clipboard(**clip_kwargs) # excel data is parsed correctly assert df.iloc[1][1] == "Harry Carney" @@ -230,7 +229,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): """.strip() ) mock_clipboard[request.node.name] = text - res = pd.read_clipboard(**clip_kwargs) + res = read_clipboard(**clip_kwargs) text = dedent( """ @@ -240,7 +239,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): """.strip() ) mock_clipboard[request.node.name] = text - exp = pd.read_clipboard(**clip_kwargs) + exp = read_clipboard(**clip_kwargs) tm.assert_frame_equal(res, exp) @@ -250,7 +249,7 @@ def test_invalid_encoding(self, df): with pytest.raises(ValueError, match=msg): df.to_clipboard(encoding="ascii") with pytest.raises(NotImplementedError, match=msg): - pd.read_clipboard(encoding="ascii") + read_clipboard(encoding="ascii") @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) def test_round_trip_valid_encodings(self, enc, df): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index db742fb69dd10..e1dcec56913f9 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -9,6 +9,7 @@ import mmap import os from pathlib import Path +import tempfile import pytest @@ -119,10 +120,11 @@ def test_infer_compression_from_path(self, extension, expected, path_type): @pytest.mark.parametrize("path_type", [str, 
CustomFSPath, Path]) def test_get_handle_with_path(self, path_type): # ignore LocalPath: it creates strange paths: /absolute/~/sometest - filename = path_type("~/sometest") - with icom.get_handle(filename, "w") as handles: - assert os.path.isabs(handles.handle.name) - assert os.path.expanduser(filename) == handles.handle.name + with tempfile.TemporaryDirectory(dir=Path.home()) as tmp: + filename = path_type("~/" + Path(tmp).name + "/sometest") + with icom.get_handle(filename, "w") as handles: + assert Path(handles.handle.name).is_absolute() + assert os.path.expanduser(filename) == handles.handle.name def test_get_handle_with_buffer(self): input_buffer = StringIO() diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index fc83026f67930..ab0b3b08a11e8 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -177,12 +177,12 @@ def test_write_with_index(self): def test_path_pathlib(self): df = tm.makeDataFrame().reset_index() - result = tm.round_trip_pathlib(df.to_feather, pd.read_feather) + result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) def test_path_localpath(self): df = tm.makeDataFrame().reset_index() - result = tm.round_trip_localpath(df.to_feather, pd.read_feather) + result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) @td.skip_if_no("pyarrow", min_version="0.16.1.dev") @@ -198,6 +198,6 @@ def test_http_path(self, feather_file): "https://raw.githubusercontent.com/pandas-dev/pandas/master/" "pandas/tests/io/data/feather/feather-0_3_1.feather" ) - expected = pd.read_feather(feather_file) - res = pd.read_feather(url) + expected = read_feather(feather_file) + res = read_feather(url) tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d5567f1208c8c..edb20c7aa9254 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -359,7 +359,7 @@ def test_parquet_read_from_url(self, df_compat, engine): "https://raw.githubusercontent.com/pandas-dev/pandas/" "master/pandas/tests/io/data/parquet/simple.parquet" ) - df = pd.read_parquet(url) + df = read_parquet(url) tm.assert_frame_equal(df, df_compat) @@ -605,7 +605,7 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): assert isinstance(buf_bytes, bytes) buf_stream = BytesIO(buf_bytes) - res = pd.read_parquet(buf_stream) + res = read_parquet(buf_stream) tm.assert_frame_equal(df_full, res) @@ -740,7 +740,7 @@ def test_s3_roundtrip_for_dir( def test_read_file_like_obj_support(self, df_compat): buffer = BytesIO() df_compat.to_parquet(buffer) - df_from_buf = pd.read_parquet(buffer) + df_from_buf = read_parquet(buffer) tm.assert_frame_equal(df_compat, df_from_buf) @td.skip_if_no("pyarrow") @@ -748,7 +748,7 @@ def test_expand_user(self, df_compat, monkeypatch): monkeypatch.setenv("HOME", "TestingUser") monkeypatch.setenv("USERPROFILE", "TestingUser") with pytest.raises(OSError, match=r".*TestingUser.*"): - pd.read_parquet("~/file.parquet") + read_parquet("~/file.parquet") with pytest.raises(OSError, match=r".*TestingUser.*"): df_compat.to_parquet("~/file.parquet") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 63dfbd59acd94..8f5a7673fa45f 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -428,7 +428,7 @@ def test_read(self, protocol, get_random_path): @pytest.mark.parametrize( ["pickle_file", "excols"], [ - ("test_py27.pkl", 
pd.Index(["a", "b", "c"])), + ("test_py27.pkl", Index(["a", "b", "c"])), ( "test_mi_py27.pkl", pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0be26ab285079..e57030a4bf125 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -675,7 +675,7 @@ def test_read_sql_with_chunksize_no_result(self): query = "SELECT * FROM iris_view WHERE SepalLength < 0.0" with_batch = sql.read_sql_query(query, self.conn, chunksize=5) without_batch = sql.read_sql_query(query, self.conn) - tm.assert_frame_equal(pd.concat(with_batch), without_batch) + tm.assert_frame_equal(concat(with_batch), without_batch) def test_to_sql(self): sql.to_sql(self.test_frame1, "test_frame1", self.conn) @@ -1592,7 +1592,7 @@ def check(col): ) # GH11216 - df = pd.read_sql_query("select * from types_test_data", self.conn) + df = read_sql_query("select * from types_test_data", self.conn) if not hasattr(df, "DateColWithTz"): pytest.skip("no column with datetime with time zone") @@ -1602,7 +1602,7 @@ def check(col): col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - df = pd.read_sql_query( + df = read_sql_query( "select * from types_test_data", self.conn, parse_dates=["DateColWithTz"] ) if not hasattr(df, "DateColWithTz"): @@ -1612,11 +1612,9 @@ def check(col): assert str(col.dt.tz) == "UTC" check(df.DateColWithTz) - df = pd.concat( + df = concat( list( - pd.read_sql_query( - "select * from types_test_data", self.conn, chunksize=1 - ) + read_sql_query("select * from types_test_data", self.conn, chunksize=1) ), ignore_index=True, ) @@ -2851,7 +2849,7 @@ def test_chunksize_read_type(self): sql.to_sql(frame, name="test", con=self.conn) query = "select * from test" chunksize = 5 - chunk_gen = pd.read_sql_query( + chunk_gen = read_sql_query( sql=query, con=self.conn, chunksize=chunksize, index_col="index" ) chunk_df = next(chunk_gen) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index de1f3cf1e6338..05a6b3c360c61 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -31,6 +31,7 @@ StataMissingValue, StataReader, StataWriterUTF8, + ValueLabelTypeMismatch, read_stata, ) @@ -435,7 +436,7 @@ def test_read_write_dta11(self): formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: - with tm.assert_produces_warning(pd.io.stata.InvalidColumnName): + with tm.assert_produces_warning(InvalidColumnName): original.to_stata(path, None) written_and_read_again = self.read_dta(path) @@ -643,7 +644,7 @@ def test_105(self): # Data obtained from: # http://go.worldbank.org/ZXY29PVJ21 dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") - df = pd.read_stata(dpath) + df = read_stata(dpath) df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]] df0 = DataFrame(df0) df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"] @@ -1022,7 +1023,7 @@ def test_categorical_warnings_and_errors(self): [original[col].astype("category") for col in original], axis=1 ) - with tm.assert_produces_warning(pd.io.stata.ValueLabelTypeMismatch): + with tm.assert_produces_warning(ValueLabelTypeMismatch): original.to_stata(path) # should get a warning for mixed content @@ -1541,7 +1542,7 @@ def test_value_labels_iterator(self, write_index): with tm.ensure_clean() as path: df.to_stata(path, write_index=write_index) - with pd.read_stata(path, iterator=True) as dta_iter: + with read_stata(path, iterator=True) as dta_iter: value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 
1: "B", 2: "C", 3: "E"}} @@ -1551,7 +1552,7 @@ def test_set_index(self): df.index.name = "index" with tm.ensure_clean() as path: df.to_stata(path) - reread = pd.read_stata(path, index_col="index") + reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) @pytest.mark.parametrize( @@ -1652,7 +1653,7 @@ def test_convert_strl_name_swap(self): ) original.index.name = "index" - with tm.assert_produces_warning(pd.io.stata.InvalidColumnName): + with tm.assert_produces_warning(InvalidColumnName): with tm.ensure_clean() as path: original.to_stata(path, convert_strl=["long", 1], version=117) reread = self.read_dta(path) @@ -1691,7 +1692,7 @@ def test_nonfile_writing(self, version): bio.seek(0) with open(path, "wb") as dta: dta.write(bio.read()) - reread = pd.read_stata(path, index_col="index") + reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) def test_gzip_writing(self): @@ -1702,7 +1703,7 @@ def test_gzip_writing(self): with gzip.GzipFile(path, "wb") as gz: df.to_stata(gz, version=114) with gzip.GzipFile(path, "rb") as gz: - reread = pd.read_stata(gz, index_col="index") + reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) def test_unicode_dta_118(self): @@ -1873,8 +1874,8 @@ def test_backward_compat(version, datapath): data_base = datapath("io", "data", "stata") ref = os.path.join(data_base, "stata-compat-118.dta") old = os.path.join(data_base, f"stata-compat-{version}.dta") - expected = pd.read_stata(ref) - old_dta = pd.read_stata(old) + expected = read_stata(ref) + old_dta = read_stata(old) tm.assert_frame_equal(old_dta, expected, check_dtype=False) @@ -1984,7 +1985,7 @@ def test_iterator_value_labels(): with tm.ensure_clean() as path: df.to_stata(path, write_index=False) expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") - with pd.read_stata(path, chunksize=100) as reader: + with read_stata(path, chunksize=100) as reader: for j, chunk in enumerate(reader): for i in range(2): tm.assert_index_equal(chunk.dtypes[i].categories, expected) @@ -2025,7 +2026,7 @@ def test_compression_roundtrip(compression): # explicitly ensure file was compressed. with tm.decompress_file(path, compression) as fh: contents = io.BytesIO(fh.read()) - reread = pd.read_stata(contents, index_col="index") + reread = read_stata(contents, index_col="index") tm.assert_frame_equal(df, reread) @@ -2049,5 +2050,5 @@ def test_stata_compression(compression_only, read_infer, to_infer): with tm.ensure_clean(filename) as path: df.to_stata(path, compression=to_compression) - result = pd.read_stata(path, compression=read_compression, index_col="index") + result = read_stata(path, compression=read_compression, index_col="index") tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py new file mode 100644 index 0000000000000..97793ce8f65b8 --- /dev/null +++ b/pandas/tests/io/xml/test_to_xml.py @@ -0,0 +1,1301 @@ +from io import ( + BytesIO, + StringIO, +) +import os +import sys +from typing import Union + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.common import get_handle +from pandas.io.xml import read_xml + +""" +CHECKLIST + +[x] - ValueError: "Values for parser can only be lxml or etree." + +etree +[x] - ImportError: "lxml not found, please install or use the etree parser." 
+[X] - TypeError: "...is not a valid type for attr_cols" +[X] - TypeError: "...is not a valid type for elem_cols" +[X] - LookupError: "unknown encoding" +[X] - KeyError: "...is not included in namespaces" +[X] - KeyError: "no valid column" +[X] - ValueError: "To use stylesheet, you need lxml installed..." +[] - OSError: (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) +[X] - FileNotFoundError: "No such file or directory" +[X] - PermissionError: "Forbidden" + +lxml +[X] - TypeError: "...is not a valid type for attr_cols" +[X] - TypeError: "...is not a valid type for elem_cols" +[X] - LookupError: "unknown encoding" +[] - OSError: (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) +[X] - FileNotFoundError: "No such file or directory" +[X] - KeyError: "...is not included in namespaces" +[X] - KeyError: "no valid column" +[X] - ValueError: "stylesheet is not a url, file, or xml string." +[] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT) +[] - URLError: (USUALLY DUE TO NETWORKING) +[] - HTTPError: (NEED AN ONLINE STYLESHEET) +[X] - OSError: "failed to load external entity" +[X] - XMLSyntaxError: "Opening and ending tag mismatch" +[X] - XSLTApplyError: "Cannot resolve URI" +[X] - XSLTParseError: "failed to compile" +[X] - PermissionError: "Forbidden" +""" + +geom_df = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, np.nan, 3], + } +) + +planet_df = DataFrame( + { + "planet": [ + "Mercury", + "Venus", + "Earth", + "Mars", + "Jupiter", + "Saturn", + "Uranus", + "Neptune", + ], + "type": [ + "terrestrial", + "terrestrial", + "terrestrial", + "terrestrial", + "gas giant", + "gas giant", + "ice giant", + "ice giant", + ], + "location": [ + "inner", + "inner", + "inner", + "inner", + "outer", + "outer", + "outer", + "outer", + ], + "mass": [ + 0.330114, + 4.86747, + 5.97237, + 0.641712, + 1898.187, + 568.3174, + 86.8127, + 102.4126, + ], + } +) + +from_file_expected = """\ + + + + 0 + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + 1 + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + 2 + web + Learning XML + Erik T. Ray + 2003 + 39.95 + +""" + + +def equalize_decl(doc): + # etree and lxml differ on quotes and case in xml declaration + if doc is not None: + doc = doc.replace( + ' + + + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + web + Learning XML + Erik T. Ray + 2003 + 39.95 + +""" + + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml(path, index=False, parser=parser) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + output = equalize_decl(output) + + assert output == expected + + +def test_index_false_rename_row_root(datapath, parser): + expected = """\ + + + + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + web + Learning XML + Erik T. 
Ray + 2003 + 39.95 + +""" + + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml( + path, index=False, root_name="books", row_name="book", parser=parser + ) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + output = equalize_decl(output) + + assert output == expected + + +# NA_REP + +na_expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +def test_na_elem_output(datapath, parser): + output = geom_df.to_xml(parser=parser) + output = equalize_decl(output) + + assert output == na_expected + + +def test_na_empty_str_elem_option(datapath, parser): + output = geom_df.to_xml(na_rep="", parser=parser) + output = equalize_decl(output) + + assert output == na_expected + + +def test_na_empty_elem_option(datapath, parser): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + 0.0 + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(na_rep="0.0", parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# ATTR_COLS + + +@pytest.mark.skipif( + sys.version_info < (3, 8), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_attrs_cols_nan_output(datapath, parser): + expected = """\ + + + + + +""" + + output = geom_df.to_xml(attr_cols=["shape", "degrees", "sides"], parser=parser) + output = equalize_decl(output) + + assert output == expected + + +@pytest.mark.skipif( + sys.version_info < (3, 8), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_attrs_cols_prefix(datapath, parser): + expected = """\ + + + + + +""" + + output = geom_df.to_xml( + attr_cols=["index", "shape", "degrees", "sides"], + namespaces={"doc": "http://example.xom"}, + prefix="doc", + parser=parser, + ) + output = equalize_decl(output) + + assert output == expected + + +def test_attrs_unknown_column(parser): + with pytest.raises(KeyError, match=("no valid column")): + geom_df.to_xml(attr_cols=["shape", "degreees", "sides"], parser=parser) + + +def test_attrs_wrong_type(parser): + with pytest.raises(TypeError, match=("is not a valid type for attr_cols")): + geom_df.to_xml(attr_cols='"shape", "degreees", "sides"', parser=parser) + + +# ELEM_COLS + + +def test_elems_cols_nan_output(datapath, parser): + elems_cols_expected = """\ + + + + 360 + 4.0 + square + + + 360 + + circle + + + 180 + 3.0 + triangle + +""" + + output = geom_df.to_xml( + index=False, elem_cols=["degrees", "sides", "shape"], parser=parser + ) + output = equalize_decl(output) + + assert output == elems_cols_expected + + +def test_elems_unknown_column(parser): + with pytest.raises(KeyError, match=("no valid column")): + geom_df.to_xml(elem_cols=["shape", "degreees", "sides"], parser=parser) + + +def test_elems_wrong_type(parser): + with pytest.raises(TypeError, match=("is not a valid type for elem_cols")): + geom_df.to_xml(elem_cols='"shape", "degreees", "sides"', parser=parser) + + +def test_elems_and_attrs_cols(datapath, parser): + elems_cols_expected = """\ + + + + 360 + 4.0 + + + 360 + + + + 180 + 3.0 + +""" + + output = geom_df.to_xml( + index=False, + elem_cols=["degrees", "sides"], + attr_cols=["shape"], + parser=parser, + ) + output = equalize_decl(output) + + assert output == elems_cols_expected + + +# HIERARCHICAL COLUMNS + + +def test_hierarchical_columns(datapath, parser): + expected = """\ + + + + inner + terrestrial + 4 + 11.81 + 2.95 + + + outer + gas giant + 2 + 2466.5 + 
1233.25 + + + outer + ice giant + 2 + 189.23 + 94.61 + + + All + + 8 + 2667.54 + 333.44 + +""" + + pvt = planet_df.pivot_table( + index=["location", "type"], + values="mass", + aggfunc=["count", "sum", "mean"], + margins=True, + ).round(2) + + output = pvt.to_xml(parser=parser) + output = equalize_decl(output) + + assert output == expected + + +@pytest.mark.skipif( + sys.version_info < (3, 8), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_hierarchical_attrs_columns(datapath, parser): + expected = """\ + + + + + + +""" + + pvt = planet_df.pivot_table( + index=["location", "type"], + values="mass", + aggfunc=["count", "sum", "mean"], + margins=True, + ).round(2) + + output = pvt.to_xml(attr_cols=list(pvt.reset_index().columns.values), parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# MULTIINDEX + + +def test_multi_index(datapath, parser): + expected = """\ + + + + inner + terrestrial + 4 + 11.81 + 2.95 + + + outer + gas giant + 2 + 2466.5 + 1233.25 + + + outer + ice giant + 2 + 189.23 + 94.61 + +""" + + agg = ( + planet_df.groupby(["location", "type"])["mass"] + .agg(["count", "sum", "mean"]) + .round(2) + ) + + output = agg.to_xml(parser=parser) + output = equalize_decl(output) + + assert output == expected + + +@pytest.mark.skipif( + sys.version_info < (3, 8), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_multi_index_attrs_cols(datapath, parser): + expected = """\ + + + + + +""" + + agg = ( + planet_df.groupby(["location", "type"])["mass"] + .agg(["count", "sum", "mean"]) + .round(2) + ) + output = agg.to_xml(attr_cols=list(agg.reset_index().columns.values), parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# NAMESPACE + + +def test_default_namespace(parser): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(namespaces={"": "http://example.com"}, parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# PREFIX + + +def test_namespace_prefix(parser): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"doc": "http://example.com"}, prefix="doc", parser=parser + ) + output = equalize_decl(output) + + assert output == expected + + +def test_missing_prefix_in_nmsp(parser): + with pytest.raises(KeyError, match=("doc is not included in namespaces")): + + geom_df.to_xml( + namespaces={"": "http://example.com"}, prefix="doc", parser=parser + ) + + +def test_namespace_prefix_and_default(parser): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"": "http://example.com", "doc": "http://other.org"}, + prefix="doc", + parser=parser, + ) + output = equalize_decl(output) + + if output is not None: + # etree and lxml differs on order of namespace prefixes + output = output.replace( + 'xmlns:doc="http://other.org" xmlns="http://example.com"', + 'xmlns="http://example.com" xmlns:doc="http://other.org"', + ) + + assert output == expected + + +# ENCODING + +encoding_expected = """\ + + + + 0 + 1 + José + Sofía + + + 1 + 2 + Luis + Valentina + + + 2 + 3 + Carlos + Isabella + + + 3 + 4 + Juan + Camila + + + 4 + 5 + Jorge + Valeria + +""" + + +def test_encoding_option_str(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + df_file = read_xml(filename, 
parser=parser, encoding="ISO-8859-1").head(5) + + output = df_file.to_xml(encoding="ISO-8859-1", parser=parser) + + if output is not None: + # etree and lxml differ on quotes and case in xml declaration + output = output.replace( + ' + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(xml_declaration=False) + + assert output == expected + + +def test_no_pretty_print_with_decl(parser): + expected = ( + "\n" + "0square" + "3604.0" + "1circle360" + "2" + "triangle1803.0" + "" + ) + + output = geom_df.to_xml(pretty_print=False, parser=parser) + output = equalize_decl(output) + + # etree adds space for closed tags + if output is not None: + output = output.replace(" />", "/>") + + assert output == expected + + +def test_no_pretty_print_no_decl(parser): + expected = ( + "0square" + "3604.0" + "1circle360" + "2" + "triangle1803.0" + "" + ) + + output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser) + + # etree adds space for closed tags + if output is not None: + output = output.replace(" />", "/>") + + assert output == expected + + +# PARSER + + +@td.skip_if_installed("lxml") +def test_default_parser_no_lxml(): + with pytest.raises( + ImportError, match=("lxml not found, please install or use the etree parser.") + ): + geom_df.to_xml() + + +def test_unknown_parser(): + with pytest.raises( + ValueError, match=("Values for parser can only be lxml or etree.") + ): + geom_df.to_xml(parser="bs4") + + +# STYLESHEET + +xsl_expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +@td.skip_if_no("lxml") +def test_stylesheet_file_like(datapath, mode): + xsl = datapath("io", "data", "xml", "row_field_output.xsl") + + with open(xsl, mode) as f: + assert geom_df.to_xml(stylesheet=f) == xsl_expected + + +@td.skip_if_no("lxml") +def test_stylesheet_io(datapath, mode): + xsl_path = datapath("io", "data", "xml", "row_field_output.xsl") + + xsl_obj: Union[BytesIO, StringIO] + + with open(xsl_path, mode) as f: + if mode == "rb": + xsl_obj = BytesIO(f.read()) + else: + xsl_obj = StringIO(f.read()) + + output = geom_df.to_xml(stylesheet=xsl_obj) + + assert output == xsl_expected + + +@td.skip_if_no("lxml") +def test_stylesheet_buffered_reader(datapath, mode): + xsl = datapath("io", "data", "xml", "row_field_output.xsl") + + with open(xsl, mode) as f: + xsl_obj = f.read() + + output = geom_df.to_xml(stylesheet=xsl_obj) + + assert output == xsl_expected + + +@td.skip_if_no("lxml") +def test_stylesheet_wrong_path(datapath): + from lxml.etree import XMLSyntaxError + + xsl = os.path.join("data", "xml", "row_field_output.xslt") + + with pytest.raises( + XMLSyntaxError, + match=("Start tag expected, '<' not found"), + ): + geom_df.to_xml(stylesheet=xsl) + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_stylesheet(val): + from lxml.etree import XMLSyntaxError + + with pytest.raises( + XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") + ): + geom_df.to_xml(stylesheet=val) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_syntax(): + from lxml.etree import XMLSyntaxError + + xsl = """\ + + + + + + + + + + + + + + + + + + +""" + + with pytest.raises(XMLSyntaxError, match=("Opening and ending tag mismatch")): + geom_df.to_xml(stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_eval(): + from lxml.etree import XSLTParseError + + xsl = """\ + + + + + + + + + + + + + + + + + + +""" + + with 
pytest.raises(XSLTParseError, match=("failed to compile")): + geom_df.to_xml(stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_apply(parser): + from lxml.etree import XSLTApplyError + + xsl = """\ + + + + + + + + + +""" + + with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")): + with tm.ensure_clean("test.xml") as path: + geom_df.to_xml(path, stylesheet=xsl) + + +def test_stylesheet_with_etree(datapath): + xsl = """\ + + + + + + + + + """ + + with pytest.raises( + ValueError, match=("To use stylesheet, you need lxml installed") + ): + geom_df.to_xml(parser="etree", stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_style_to_csv(): + xsl = """\ + + + + + , + + ,shape,degrees,sides + + + + + + + +""" + + out_csv = geom_df.to_csv(line_terminator="\n") + + if out_csv is not None: + out_csv = out_csv.strip() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_csv == out_xml + + +@td.skip_if_no("lxml") +def test_style_to_string(): + xsl = """\ + + + + + + + shape degrees sides + + + + + + + +""" + + out_str = geom_df.to_string() + out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) + + assert out_xml == out_str + + +@td.skip_if_no("lxml") +def test_style_to_json(): + xsl = """\ + + + + + " + + + {"shape":{ + + },"degrees":{ + + },"sides":{ + + }} + + + + + + + + + + + + + + + + + , + + +""" + + out_json = geom_df.to_json() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_json == out_xml + + +# COMPRESSION + + +geom_xml = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) +def test_compression_output(parser, comp): + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression=comp) + + with get_handle( + path, + "r", + compression=comp, + ) as handle_obj: + output = handle_obj.handle.read() + + output = equalize_decl(output) + + assert geom_xml == output.strip() + + +@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) +@pytest.mark.parametrize("compfile", ["xml.bz2", "xml.gz", "xml.xz", "xml.zip"]) +def test_filename_and_suffix_comp(parser, comp, compfile): + with tm.ensure_clean(filename=compfile) as path: + geom_df.to_xml(path, parser=parser, compression=comp) + + with get_handle( + path, + "r", + compression=comp, + ) as handle_obj: + output = handle_obj.handle.read() + + output = equalize_decl(output) + + assert geom_xml == output.strip() + + +def test_unsuported_compression(datapath, parser): + with pytest.raises(ValueError, match="Unrecognized compression type"): + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression="7z") + + +# STORAGE OPTIONS + + +@tm.network +@td.skip_if_no("s3fs") +@td.skip_if_no("lxml") +def test_s3_permission_output(parser): + import s3fs + + with pytest.raises(PermissionError, match="Access Denied"): + fs = s3fs.S3FileSystem(anon=True) + fs.ls("pandas-test") + + geom_df.to_xml("s3://pandas-test/geom.xml", compression="zip", parser=parser) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py new file mode 100644 index 0000000000000..6902b4e93443f --- /dev/null +++ b/pandas/tests/io/xml/test_xml.py @@ -0,0 +1,1097 @@ +from io import ( + BytesIO, + StringIO, +) +import os +from typing import Union +from urllib.error import HTTPError + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.xml import read_xml + +""" 
+CHECKLIST
+
+[X] - ValueError: "Values for parser can only be lxml or etree."
+
+etree
+[X] - ImportError: "lxml not found, please install or use the etree parser."
+[X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType"
+[X] - ValueError: "Either element or attributes can be parsed not both."
+[X] - ValueError: "xpath does not return any nodes..."
+[X] - SyntaxError: "You have used an incorrect or unsupported XPath"
+[X] - ValueError: "names does not match length of child elements in xpath."
+[X] - TypeError: "...is not a valid type for names"
+[X] - ValueError: "To use stylesheet, you need lxml installed..."
+[] - URLError: (GENERAL ERROR WITH HTTPError AS SUBCLASS)
+[X] - HTTPError: "HTTP Error 404: Not Found"
+[] - OSError: (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS)
+[X] - FileNotFoundError: "No such file or directory"
+[] - ParseError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
+[X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
+[X] - UnicodeError: "UTF-16 stream does not start with BOM"
+[X] - BadZipFile: "File is not a zip file"
+[X] - OSError: "Invalid data stream"
+[X] - LZMAError: "Input format not supported by decoder"
+[X] - ValueError: "Unrecognized compression type"
+[X] - PermissionError: "Forbidden"
+
+lxml
+[X] - ValueError: "Either element or attributes can be parsed not both."
+[X] - AttributeError: "__enter__"
+[X] - XSLTApplyError: "Cannot resolve URI"
+[X] - XSLTParseError: "document is not a stylesheet"
+[X] - ValueError: "xpath does not return any nodes."
+[X] - XPathEvalError: "Invalid expression"
+[] - XPathSyntaxError: (OLD VERSION IN lxml FOR XPATH ERRORS)
+[X] - TypeError: "empty namespace prefix is not supported in XPath"
+[X] - ValueError: "names does not match length of child elements in xpath."
+[X] - TypeError: "...is not a valid type for names"
+[X] - LookupError: "unknown encoding"
+[] - URLError: (USUALLY DUE TO NETWORKING)
+[X] - HTTPError: "HTTP Error 404: Not Found"
+[X] - OSError: "failed to load external entity"
+[X] - XMLSyntaxError: "Start tag expected, '<' not found"
+[] - ParserError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
+[X] - ValueError: "Values for parser can only be lxml or etree."
+[X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
+[X] - UnicodeError: "UTF-16 stream does not start with BOM" +[X] - BadZipFile: "File is not a zip file" +[X] - OSError: "Invalid data stream" +[X] - LZMAError: "Input format not supported by decoder" +[X] - ValueError: "Unrecognized compression type" +[X] - PermissionError: "Forbidden" +""" + +geom_df = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, np.nan, 3], + } +) + +xml_default_nmsp = """\ + + + + square + 360 + 4 + + + circle + 360 + + + + triangle + 180 + 3 + +""" + +xml_prefix_nmsp = """\ + + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + +""" + + +df_kml = DataFrame( + { + "id": { + 0: "ID_00001", + 1: "ID_00002", + 2: "ID_00003", + 3: "ID_00004", + 4: "ID_00005", + }, + "name": { + 0: "Blue Line (Forest Park)", + 1: "Red, Purple Line", + 2: "Red, Purple Line", + 3: "Red, Purple Line", + 4: "Red, Purple Line", + }, + "styleUrl": { + 0: "#LineStyle01", + 1: "#LineStyle01", + 2: "#LineStyle01", + 3: "#LineStyle01", + 4: "#LineStyle01", + }, + "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}, + "altitudeMode": { + 0: "clampedToGround", + 1: "clampedToGround", + 2: "clampedToGround", + 3: "clampedToGround", + 4: "clampedToGround", + }, + "coordinates": { + 0: ( + "-87.77678526964958,41.8708863930319,0 " + "-87.77826234150609,41.87097820122218,0 " + "-87.78251583439344,41.87130129991005,0 " + "-87.78418294588424,41.87145055520308,0 " + "-87.7872369165933,41.8717239119163,0 " + "-87.79160214925886,41.87210797280065,0" + ), + 1: ( + "-87.65758750947528,41.96427269188822,0 " + "-87.65802133507393,41.96581929055245,0 " + "-87.65819033925305,41.96621846093642,0 " + "-87.6583189819129,41.96650362897086,0 " + "-87.65835858701473,41.96669002089185,0 " + "-87.65838428411853,41.96688150295095,0 " + "-87.65842208882658,41.96745896091846,0 " + "-87.65846556843937,41.9683761425439,0 " + "-87.65849296214573,41.96913893870342,0" + ), + 2: ( + "-87.65492939166126,41.95377494531437,0 " + "-87.65557043199591,41.95376544118533,0 " + "-87.65606302030132,41.95376391658746,0 " + "-87.65623502146268,41.95377379126367,0 " + "-87.65634748981634,41.95380103566435,0 " + "-87.65646537904269,41.95387703994676,0 " + "-87.65656532461145,41.95396622645799,0 " + "-87.65664760856414,41.95404201996044,0 " + "-87.65671750555913,41.95416647054043,0 " + "-87.65673983607117,41.95429949810849,0 " + "-87.65673866475777,41.95441024240925,0 " + "-87.6567690255541,41.95490657227902,0 " + "-87.65683672482363,41.95692259283837,0 " + "-87.6568900886376,41.95861070983142,0 " + "-87.65699865558875,41.96181418669004,0 " + "-87.65756347177603,41.96397045777844,0 " + "-87.65758750947528,41.96427269188822,0" + ), + 3: ( + "-87.65362593118043,41.94742799535678,0 " + "-87.65363554415794,41.94819886386848,0 " + "-87.6536456393239,41.95059994675451,0 " + "-87.65365831235026,41.95108288489359,0 " + "-87.6536604873874,41.9519954657554,0 " + "-87.65362592053201,41.95245597302328,0 " + "-87.65367158496069,41.95311153649393,0 " + "-87.65368468595476,41.9533202828916,0 " + "-87.65369271253692,41.95343095587119,0 " + "-87.65373335834569,41.95351536301472,0 " + "-87.65378605844126,41.95358212680591,0 " + "-87.65385067928185,41.95364452823767,0 " + "-87.6539390793817,41.95370263886964,0 " + "-87.6540786298351,41.95373403675265,0 " + "-87.65430648647626,41.9537535411832,0 " + "-87.65492939166126,41.95377494531437,0" + ), + 4: ( + "-87.65345391792157,41.94217681262115,0 " + "-87.65342448305786,41.94237224420864,0 " + "-87.65339745703922,41.94268217746244,0 " + 
"-87.65337753982941,41.94288140770284,0 " + "-87.65336256753105,41.94317369618263,0 " + "-87.65338799707138,41.94357253961736,0 " + "-87.65340240886648,41.94389158188269,0 " + "-87.65341837392448,41.94406444407721,0 " + "-87.65342275247338,41.94421065714904,0 " + "-87.65347469646018,41.94434829382345,0 " + "-87.65351486483024,41.94447699917548,0 " + "-87.65353483605053,41.9453896864472,0 " + "-87.65361975532807,41.94689193720703,0 " + "-87.65362593118043,41.94742799535678,0" + ), + }, + } +) + + +@pytest.fixture(params=["rb", "r"]) +def mode(request): + return request.param + + +@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) +def parser(request): + return request.param + + +# FILE / URL + + +@td.skip_if_no("lxml") +def test_parser_consistency_file(datapath): + filename = datapath("io", "data", "xml", "books.xml") + df_file_lxml = read_xml(filename, parser="lxml") + df_file_etree = read_xml(filename, parser="etree") + + tm.assert_frame_equal(df_file_lxml, df_file_etree) + + +@tm.network +@pytest.mark.slow +@td.skip_if_no("lxml") +def test_parser_consistency_url(datapath): + url = ( + "https://data.cityofchicago.org/api/views/" + "8pix-ypme/rows.xml?accessType=DOWNLOAD" + ) + df_url_lxml = read_xml(url, xpath=".//row/row", parser="lxml") + df_url_etree = read_xml(url, xpath=".//row/row", parser="etree") + + tm.assert_frame_equal(df_url_lxml, df_url_etree) + + +def test_file_like(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + df_file = read_xml(f, parser=parser) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +def test_file_io(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + xml_obj = f.read() + + df_io = read_xml( + (BytesIO(xml_obj) if isinstance(xml_obj, bytes) else StringIO(xml_obj)), + parser=parser, + ) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_io, df_expected) + + +def test_file_buffered_reader_string(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + xml_obj = f.read() + + df_str = read_xml(xml_obj, parser=parser) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_str, df_expected) + + +def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + next(f) + xml_obj = f.read() + + df_str = read_xml(xml_obj, parser=parser) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_str, df_expected) + + +def test_file_handle_close(datapath, parser): + xml_file = datapath("io", "data", "xml", "books.xml") + + with open(xml_file, "rb") as f: + read_xml(BytesIO(f.read()), parser=parser) + + assert not f.closed + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_lxml(val): + from lxml.etree import XMLSyntaxError + + with pytest.raises(XMLSyntaxError, match="Document is empty"): + read_xml(val, parser="lxml") + + +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_etree(val): + from xml.etree.ElementTree import ParseError + + with pytest.raises(ParseError, match="no element found"): + read_xml(val, parser="etree") + + +@td.skip_if_no("lxml") +def test_wrong_file_path_lxml(): + from lxml.etree import XMLSyntaxError + + filename = os.path.join("data", "html", "books.xml") + + with pytest.raises( + XMLSyntaxError, + match=("Start tag expected, '<' not found"), + ): + read_xml(filename, parser="lxml") + + +def test_wrong_file_path_etree(): + from xml.etree.ElementTree import ParseError + + filename = os.path.join("data", "html", "books.xml") + + with pytest.raises( + ParseError, + match=("not well-formed"), + ): + read_xml(filename, parser="etree") + + +@tm.network +@td.skip_if_no("lxml") +def test_url(): + url = "https://www.w3schools.com/xml/books.xml" + df_url = read_xml(url, xpath=".//book[count(*)=4]") + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + "cover": [None, None, "paperback"], + } + ) + + tm.assert_frame_equal(df_url, df_expected) + + +def test_wrong_url(parser): + with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): + url = "https://www.w3schools.com/xml/python.xml" + read_xml(url, xpath=".//book[count(*)=4]", parser=parser) + + +# XPATH + + +@td.skip_if_no("lxml") +def test_empty_xpath_lxml(datapath): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(ValueError, match=("xpath does not return any nodes")): + read_xml(filename, xpath=".//python", parser="lxml") + + +def test_bad_xpath_etree(datapath): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + SyntaxError, match=("You have used an incorrect or unsupported XPath") + ): + read_xml(filename, xpath=".//[book]", parser="etree") + + +@td.skip_if_no("lxml") +def test_bad_xpath_lxml(datapath): + from lxml.etree import XPathEvalError + + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(XPathEvalError, match=("Invalid expression")): + read_xml(filename, xpath=".//[book]", parser="lxml") + + +# NAMESPACE + + +def test_default_namespace(parser): + df_nmsp = read_xml( + xml_default_nmsp, + xpath=".//ns:row", + namespaces={"ns": "http://example.com"}, + parser=parser, + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_nmsp, df_expected) + + +def test_prefix_namespace(parser): + df_nmsp = read_xml( + xml_prefix_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser=parser, + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 
180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_nmsp, df_expected) + + +@td.skip_if_no("lxml") +def test_consistency_default_namespace(): + df_lxml = read_xml( + xml_default_nmsp, + xpath=".//ns:row", + namespaces={"ns": "http://example.com"}, + parser="lxml", + ) + + df_etree = read_xml( + xml_default_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser="etree", + ) + + tm.assert_frame_equal(df_lxml, df_etree) + + +@td.skip_if_no("lxml") +def test_consistency_prefix_namespace(): + df_lxml = read_xml( + xml_prefix_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser="lxml", + ) + + df_etree = read_xml( + xml_prefix_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser="etree", + ) + + tm.assert_frame_equal(df_lxml, df_etree) + + +# PREFIX + + +def test_missing_prefix_with_default_namespace(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(ValueError, match=("xpath does not return any nodes")): + read_xml(filename, xpath=".//Placemark", parser=parser) + + +def test_missing_prefix_definition_etree(datapath): + filename = datapath("io", "data", "xml", "cta_rail_lines.kml") + with pytest.raises(SyntaxError, match=("you used an undeclared namespace prefix")): + read_xml(filename, xpath=".//kml:Placemark", parser="etree") + + +@td.skip_if_no("lxml") +def test_missing_prefix_definition_lxml(datapath): + from lxml.etree import XPathEvalError + + filename = datapath("io", "data", "xml", "cta_rail_lines.kml") + with pytest.raises(XPathEvalError, match=("Undefined namespace prefix")): + read_xml(filename, xpath=".//kml:Placemark", parser="lxml") + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("key", ["", None]) +def test_none_namespace_prefix(key): + with pytest.raises( + TypeError, match=("empty namespace prefix is not supported in XPath") + ): + read_xml( + xml_default_nmsp, + xpath=".//kml:Placemark", + namespaces={key: "http://www.opengis.net/kml/2.2"}, + parser="lxml", + ) + + +# ELEMS AND ATTRS + + +def test_file_elems_and_attrs(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +def test_file_only_attrs(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, attrs_only=True, parser=parser) + df_expected = DataFrame({"category": ["cooking", "children", "web"]}) + + tm.assert_frame_equal(df_file, df_expected) + + +def test_file_only_elems(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, elems_only=True, parser=parser) + df_expected = DataFrame( + { + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +def test_elem_and_attrs_only(datapath, parser): + filename = datapath("io", "data", "xml", "cta_rail_lines.kml") + with pytest.raises( + ValueError, + match=("Either element or attributes can be parsed not both"), + ): + read_xml(filename, elems_only=True, attrs_only=True, parser=parser) + + +@td.skip_if_no("lxml") +def test_attribute_centric_xml(): + xml = """\ + + + + + + + + + + + + + + + + + +""" + + df_lxml = read_xml(xml, xpath=".//station") + df_etree = read_xml(xml, xpath=".//station", parser="etree") + + tm.assert_frame_equal(df_lxml, df_etree) + + +# NAMES + + +def test_names_option_output(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml( + filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser + ) + + df_expected = DataFrame( + { + "Col1": ["cooking", "children", "web"], + "Col2": ["Everyday Italian", "Harry Potter", "Learning XML"], + "Col3": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "Col4": [2005, 2005, 2003], + "Col5": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +def test_names_option_wrong_length(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises(ValueError, match=("names does not match length")): + read_xml(filename, names=["Col1", "Col2", "Col3"], parser=parser) + + +def test_names_option_wrong_type(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises(TypeError, match=("is not a valid type for names")): + read_xml( + filename, names="Col1, Col2, Col3", parser=parser # type: ignore[arg-type] + ) + + +# ENCODING + + +def test_wrong_encoding(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode")): + read_xml(filename, parser=parser) + + +def test_utf16_encoding(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises( + UnicodeError, + match=( + "UTF-16 stream does not start with BOM|" + "'utf-16-le' codec can't decode byte" + ), + ): + read_xml(filename, encoding="UTF-16", parser=parser) + + +def test_unknown_encoding(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises(LookupError, match=("unknown encoding: uft-8")): + read_xml(filename, encoding="UFT-8", parser=parser) + + +def test_ascii_encoding(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises(UnicodeDecodeError, match=("'ascii' codec can't decode byte")): + read_xml(filename, encoding="ascii", parser=parser) + + +@td.skip_if_no("lxml") +def test_parser_consistency_with_encoding(datapath): + filename = datapath("io", "data", "xml", "baby_names.xml") + df_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") + df_etree = read_xml(filename, parser="etree", encoding="iso-8859-1") + + tm.assert_frame_equal(df_lxml, df_etree) + + +# PARSER + + +@td.skip_if_installed("lxml") +def test_default_parser_no_lxml(datapath): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises( + ImportError, match=("lxml not found, please install or use the etree parser.") + ): + read_xml(filename) + + +def test_wrong_parser(datapath): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises( + ValueError, 
match=("Values for parser can only be lxml or etree.") + ): + read_xml(filename, parser="bs4") + + +# STYLESHEET + + +@td.skip_if_no("lxml") +def test_stylesheet_file(datapath): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + df_style = read_xml( + kml, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=xsl, + ) + + tm.assert_frame_equal(df_kml, df_style) + + +@td.skip_if_no("lxml") +def test_stylesheet_file_like(datapath, mode): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + with open(xsl, mode) as f: + df_style = read_xml( + kml, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=f, + ) + + tm.assert_frame_equal(df_kml, df_style) + + +@td.skip_if_no("lxml") +def test_stylesheet_io(datapath, mode): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + xsl_obj: Union[BytesIO, StringIO] + + with open(xsl, mode) as f: + if mode == "rb": + xsl_obj = BytesIO(f.read()) + else: + xsl_obj = StringIO(f.read()) + + df_style = read_xml( + kml, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=xsl_obj, + ) + + tm.assert_frame_equal(df_kml, df_style) + + +@td.skip_if_no("lxml") +def test_stylesheet_buffered_reader(datapath, mode): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + with open(xsl, mode) as f: + xsl_obj = f.read() + + df_style = read_xml( + kml, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=xsl_obj, + ) + + tm.assert_frame_equal(df_kml, df_style) + + +@td.skip_if_no("lxml") +def test_not_stylesheet(datapath): + from lxml.etree import XSLTParseError + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "books.xml") + + with pytest.raises(XSLTParseError, match=("document is not a stylesheet")): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_syntax(datapath): + from lxml.etree import XMLSyntaxError + + xsl = """\ + + + + + + + + + + + + + + + +""" + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with pytest.raises( + XMLSyntaxError, match=("Extra content at the end of the document") + ): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_eval(datapath): + from lxml.etree import XSLTParseError + + xsl = """\ + + + + + + + + + + + + + + + +""" + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with pytest.raises(XSLTParseError, match=("failed to compile")): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_apply(datapath): + from lxml.etree import XSLTApplyError + + xsl = """\ + + + + + + + + + +""" + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_wrong_stylesheet(): + from lxml.etree import XMLSyntaxError + + kml = os.path.join("data", "xml", "cta_rail_lines.kml") + xsl = os.path.join("data", "xml", "flatten.xsl") + + with pytest.raises( + XMLSyntaxError, + match=("Start tag expected, '<' not found"), + ): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_stylesheet_file_close(datapath, mode): 
+ kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + xsl_obj: Union[BytesIO, StringIO] + + with open(xsl, mode) as f: + if mode == "rb": + xsl_obj = BytesIO(f.read()) + else: + xsl_obj = StringIO(f.read()) + + read_xml(kml, stylesheet=xsl_obj) + + assert not f.closed + + +@td.skip_if_no("lxml") +def test_stylesheet_with_etree(datapath): + kml = os.path.join("data", "xml", "cta_rail_lines.kml") + xsl = os.path.join("data", "xml", "flatten_doc.xsl") + + with pytest.raises( + ValueError, match=("To use stylesheet, you need lxml installed") + ): + read_xml(kml, parser="etree", stylesheet=xsl) + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_stylesheet(val): + from lxml.etree import XMLSyntaxError + + kml = os.path.join("data", "xml", "cta_rail_lines.kml") + + with pytest.raises( + XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") + ): + read_xml(kml, stylesheet=val) + + +@tm.network +@td.skip_if_no("lxml") +def test_online_stylesheet(): + xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml" + xsl = "https://www.w3schools.com/xml/cdcatalog.xsl" + + df_xsl = read_xml( + xml, + xpath=".//tr[td and position() <= 6]", + names=["title", "artist"], + stylesheet=xsl, + ) + + df_expected = DataFrame( + { + "title": { + 0: "Empire Burlesque", + 1: "Hide your heart", + 2: "Greatest Hits", + 3: "Still got the blues", + 4: "Eros", + }, + "artist": { + 0: "Bob Dylan", + 1: "Bonnie Tyler", + 2: "Dolly Parton", + 3: "Gary Moore", + 4: "Eros Ramazzotti", + }, + } + ) + + tm.assert_frame_equal(df_expected, df_xsl) + + +# COMPRESSION + + +@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) +def test_compression_read(parser, comp): + with tm.ensure_clean() as path: + geom_df.to_xml(path, index=False, parser=parser, compression=comp) + + xml_df = read_xml(path, parser=parser, compression=comp) + + tm.assert_frame_equal(xml_df, geom_df) + + +@pytest.mark.parametrize("comp", ["gzip", "xz", "zip"]) +def test_wrong_compression_bz2(parser, comp): + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression=comp) + + with pytest.raises(OSError, match="Invalid data stream"): + read_xml(path, parser=parser, compression="bz2") + + +@pytest.mark.parametrize("comp", ["bz2", "xz", "zip"]) +def test_wrong_compression_gz(parser, comp): + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression=comp) + + with pytest.raises(OSError, match="Not a gzipped file"): + read_xml(path, parser=parser, compression="gzip") + + +@pytest.mark.parametrize("comp", ["bz2", "gzip", "zip"]) +def test_wrong_compression_xz(parser, comp): + from lzma import LZMAError + + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression=comp) + + with pytest.raises(LZMAError, match="Input format not supported by decoder"): + read_xml(path, parser=parser, compression="xz") + + +@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz"]) +def test_wrong_compression_zip(parser, comp): + from zipfile import BadZipFile + + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression=comp) + + with pytest.raises(BadZipFile, match="File is not a zip file"): + read_xml(path, parser=parser, compression="zip") + + +def test_unsuported_compression(datapath, parser): + with pytest.raises(ValueError, match="Unrecognized compression type"): + with tm.ensure_clean() as path: + read_xml(path, parser=parser, compression="7z") + + +# 
STORAGE OPTIONS
+
+
+@tm.network
+@td.skip_if_no("s3fs")
+@td.skip_if_no("lxml")
+def test_s3_parser_consistency():
+    # Python Software Foundation (2019 IRS-990 RETURN)
+    s3 = "s3://irs-form-990/201923199349319487_public.xml"
+
+    df_lxml = read_xml(
+        s3,
+        xpath=".//irs:Form990PartVIISectionAGrp",
+        namespaces={"irs": "http://www.irs.gov/efile"},
+        parser="lxml",
+        storage_options={"anon": True},
+    )
+
+    df_etree = read_xml(
+        s3,
+        xpath=".//irs:Form990PartVIISectionAGrp",
+        namespaces={"irs": "http://www.irs.gov/efile"},
+        parser="etree",
+        storage_options={"anon": True},
+    )
+
+    tm.assert_frame_equal(df_lxml, df_etree)
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
index bf3e6d822ab19..733a8c0aa58ec 100644
--- a/pandas/tests/resample/test_base.py
+++ b/pandas/tests/resample/test_base.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     DataFrame,
     NaT,
@@ -245,6 +247,7 @@ def test_resampler_is_iterable(series):
         tm.assert_series_equal(rv, gv)
 
 
+@td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) quantile
 @all_ts
 def test_resample_quantile(series):
     # GH 15023
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index a125f85efc8d3..7b520171379c3 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -13,6 +13,7 @@
     DataFrame,
     Index,
     MultiIndex,
+    PeriodIndex,
     Series,
     concat,
     date_range,
@@ -24,6 +25,22 @@
 
 
 class TestConcatenate:
+    def test_append_concat(self):
+        # GH#1815
+        d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC")
+        d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC")
+
+        s1 = Series(np.random.randn(10), d1)
+        s2 = Series(np.random.randn(10), d2)
+
+        s1 = s1.to_period()
+        s2 = s2.to_period()
+
+        # drops index
+        result = concat([s1, s2])
+        assert isinstance(result.index, PeriodIndex)
+        assert result.index[0] == s1.index[0]
+
     def test_concat_copy(self):
         df = DataFrame(np.random.randn(4, 3))
         df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
@@ -420,21 +437,21 @@ def __getitem__(self, index):
                 except KeyError as err:
                     raise IndexError from err
 
-        tm.assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected)
+        tm.assert_frame_equal(concat(CustomIterator1(), ignore_index=True), expected)
 
         class CustomIterator2(abc.Iterable):
             def __iter__(self):
                 yield df1
                 yield df2
 
-        tm.assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected)
+        tm.assert_frame_equal(concat(CustomIterator2(), ignore_index=True), expected)
 
     def test_concat_order(self):
         # GH 17344
         dfs = [DataFrame(index=range(3), columns=["a", 1, None])]
         dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100)]
-        result = pd.concat(dfs, sort=True).columns
+        result = concat(dfs, sort=True).columns
         expected = dfs[0].columns
         tm.assert_index_equal(result, expected)
 
@@ -442,20 +459,20 @@ def test_concat_different_extension_dtypes_upcasts(self):
         a = Series(pd.array([1, 2], dtype="Int64"))
         b = Series(to_decimal([1, 2]))
 
-        result = pd.concat([a, b], ignore_index=True)
+        result = concat([a, b], ignore_index=True)
         expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object)
         tm.assert_series_equal(result, expected)
 
     def test_concat_ordered_dict(self):
         # GH 21510
-        expected = pd.concat(
+        expected = concat(
             [Series(range(3)), Series(range(4))], keys=["First", "Another"]
         )
-        result = pd.concat({"First": Series(range(3)), "Another": Series(range(4))})
+        result = concat({"First": Series(range(3)), "Another": Series(range(4))})
         tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
+@pytest.mark.parametrize("pdt", [Series, DataFrame])
 @pytest.mark.parametrize("dt", np.sctypes["float"])
 def test_concat_no_unnecessary_upcast(dt, pdt):
     # GH 13247
@@ -466,11 +483,11 @@ def test_concat_no_unnecessary_upcast(dt, pdt):
         pdt(np.array([np.nan], dtype=dt, ndmin=dims)),
         pdt(np.array([5], dtype=dt, ndmin=dims)),
     ]
-    x = pd.concat(dfs)
+    x = concat(dfs)
     assert x.values.dtype == dt
 
 
-@pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, pd.DataFrame])
+@pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, DataFrame])
 @pytest.mark.parametrize("dt", np.sctypes["int"])
 def test_concat_will_upcast(dt, pdt):
     with catch_warnings(record=True):
@@ -480,7 +497,7 @@ def test_concat_will_upcast(dt, pdt):
             pdt(np.array([np.nan], ndmin=dims)),
             pdt(np.array([5], dtype=dt, ndmin=dims)),
         ]
-        x = pd.concat(dfs)
+        x = concat(dfs)
         assert x.values.dtype == "float64"
 
 
@@ -489,7 +506,7 @@ def test_concat_empty_and_non_empty_frame_regression():
     df1 = DataFrame({"foo": [1]})
     df2 = DataFrame({"foo": []})
     expected = DataFrame({"foo": [1.0]})
-    result = pd.concat([df1, df2])
+    result = concat([df1, df2])
     tm.assert_frame_equal(result, expected)
 
 
@@ -499,7 +516,7 @@ def test_concat_sparse():
     expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
         pd.SparseDtype(np.int64, 0)
     )
-    result = pd.concat([a, a], axis=1)
+    result = concat([a, a], axis=1)
     tm.assert_frame_equal(result, expected)
 
 
@@ -510,7 +527,7 @@ def test_concat_dense_sparse():
    expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(
         pd.SparseDtype(np.float64, None)
     )
-    result = pd.concat([a, b], axis=0)
+    result = concat([a, b], axis=0)
     tm.assert_series_equal(result, expected)
 
 
@@ -548,11 +565,11 @@ def test_concat_frame_axis0_extension_dtypes():
     df1 = DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
     df2 = DataFrame({"a": np.array([4, 5, 6])})
 
-    result = pd.concat([df1, df2], ignore_index=True)
+    result = concat([df1, df2], ignore_index=True)
     expected = DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
 
-    result = pd.concat([df2, df1], ignore_index=True)
+    result = concat([df2, df1], ignore_index=True)
     expected = DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
 
@@ -561,7 +578,7 @@ def test_concat_preserves_extension_int64_dtype():
     # GH 24768
     df_a = DataFrame({"a": [-1]}, dtype="Int64")
     df_b = DataFrame({"b": [1]}, dtype="Int64")
-    result = pd.concat([df_a, df_b], ignore_index=True)
+    result = concat([df_a, df_b], ignore_index=True)
     expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
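For reviewers, a minimal standalone sketch (not part of the patch; assumes pandas >= 1.0) of the dict-vs-keys behavior that test_concat_ordered_dict above asserts:

    import pandas as pd
    from pandas import Series, concat

    # Dict keys become the outer index level, in insertion order (Python >= 3.7),
    # which is why concat(dict) matches concat(list, keys=...).
    by_keys = concat([Series(range(3)), Series(range(4))], keys=["First", "Another"])
    by_dict = concat({"First": Series(range(3)), "Another": Series(range(4))})
    pd.testing.assert_series_equal(by_keys, by_dict)
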
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
index f5eb0ab8c9a17..3636139c19eef 100644
--- a/pandas/tests/reshape/concat/test_dataframe.py
+++ b/pandas/tests/reshape/concat/test_dataframe.py
@@ -17,7 +17,7 @@ def test_concat_multiple_frames_dtypes(self):
         # GH#2759
         A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
         B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
-        results = pd.concat((A, B), axis=1).dtypes
+        results = concat((A, B), axis=1).dtypes
         expected = Series(
             [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
             index=["foo", "bar", 0, 1],
@@ -28,7 +28,7 @@ def test_concat_tuple_keys(self):
         # GH#14438
         df1 = DataFrame(np.ones((2, 2)), columns=list("AB"))
         df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
-        results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
+        results = concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
         expected = DataFrame(
             {
                 "A": {
@@ -53,7 +53,7 @@ def test_concat_named_keys(self):
         # GH#14252
         df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
         index = Index(["a", "b"], name="baz")
-        concatted_named_from_keys = pd.concat([df, df], keys=index)
+        concatted_named_from_keys = concat([df, df], keys=index)
         expected_named = DataFrame(
             {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
             index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
@@ -61,12 +61,10 @@ def test_concat_named_keys(self):
         tm.assert_frame_equal(concatted_named_from_keys, expected_named)
 
         index_no_name = Index(["a", "b"], name=None)
-        concatted_named_from_names = pd.concat(
-            [df, df], keys=index_no_name, names=["baz"]
-        )
+        concatted_named_from_names = concat([df, df], keys=index_no_name, names=["baz"])
         tm.assert_frame_equal(concatted_named_from_names, expected_named)
 
-        concatted_unnamed = pd.concat([df, df], keys=index_no_name)
+        concatted_unnamed = concat([df, df], keys=index_no_name)
         expected_unnamed = DataFrame(
             {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
             index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
@@ -81,13 +79,13 @@ def test_concat_axis_parameter(self):
 
         # Index/row/0 DataFrame
         expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
 
-        concatted_index = pd.concat([df1, df2], axis="index")
+        concatted_index = concat([df1, df2], axis="index")
         tm.assert_frame_equal(concatted_index, expected_index)
 
-        concatted_row = pd.concat([df1, df2], axis="rows")
+        concatted_row = concat([df1, df2], axis="rows")
         tm.assert_frame_equal(concatted_row, expected_index)
 
-        concatted_0 = pd.concat([df1, df2], axis=0)
+        concatted_0 = concat([df1, df2], axis=0)
         tm.assert_frame_equal(concatted_0, expected_index)
 
         # Columns/1 DataFrame
@@ -95,10 +93,10 @@ def test_concat_axis_parameter(self):
             [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
         )
 
-        concatted_columns = pd.concat([df1, df2], axis="columns")
+        concatted_columns = concat([df1, df2], axis="columns")
         tm.assert_frame_equal(concatted_columns, expected_columns)
 
-        concatted_1 = pd.concat([df1, df2], axis=1)
+        concatted_1 = concat([df1, df2], axis=1)
         tm.assert_frame_equal(concatted_1, expected_columns)
 
         series1 = Series([0.1, 0.2])
@@ -107,13 +105,13 @@ def test_concat_axis_parameter(self):
 
         # Index/row/0 Series
         expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
 
-        concatted_index_series = pd.concat([series1, series2], axis="index")
+        concatted_index_series = concat([series1, series2], axis="index")
         tm.assert_series_equal(concatted_index_series, expected_index_series)
 
-        concatted_row_series = pd.concat([series1, series2], axis="rows")
+        concatted_row_series = concat([series1, series2], axis="rows")
         tm.assert_series_equal(concatted_row_series, expected_index_series)
 
-        concatted_0_series = pd.concat([series1, series2], axis=0)
+        concatted_0_series = concat([series1, series2], axis=0)
         tm.assert_series_equal(concatted_0_series, expected_index_series)
 
         # Columns/1 Series
@@ -121,15 +119,15 @@ def test_concat_axis_parameter(self):
             [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
         )
 
-        concatted_columns_series = pd.concat([series1, series2], axis="columns")
+        concatted_columns_series = concat([series1, series2], axis="columns")
         tm.assert_frame_equal(concatted_columns_series, expected_columns_series)
 
-        concatted_1_series = pd.concat([series1, series2], axis=1)
+        concatted_1_series = concat([series1, series2], axis=1)
         tm.assert_frame_equal(concatted_1_series, expected_columns_series)
 
         # Testing ValueError
         with pytest.raises(ValueError, match="No axis named"):
-            pd.concat([series1, series2], axis="something")
+            concat([series1, series2], axis="something")
 
     def test_concat_numerical_names(self):
         # GH#15262, GH#12223
@@ -142,7 +140,7 @@ def test_concat_numerical_names(self):
                 )
             ),
         )
-        result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
+        result = concat((df.iloc[:2, :], df.iloc[-2:, :]))
         expected = DataFrame(
             {"col": [0, 1, 7, 8]},
             dtype="int32",
@@ -155,7 +153,7 @@ def test_concat_numerical_names(self):
     def test_concat_astype_dup_col(self):
         # GH#23049
         df = DataFrame([{"a": "b"}])
-        df = pd.concat([df, df], axis=1)
+        df = concat([df, df], axis=1)
 
         result = df.astype("category")
         expected = DataFrame(
diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py
index 92181e7dffc50..332c3c8f30562 100644
--- a/pandas/tests/reshape/concat/test_datetimes.py
+++ b/pandas/tests/reshape/concat/test_datetimes.py
@@ -44,15 +44,15 @@ def test_concat_datetime_datetime64_frame(self):
         df1 = DataFrame({"date": ind, "test": range(10)})
 
         # it works!
-        pd.concat([df1, df2_obj])
+        concat([df1, df2_obj])
 
     def test_concat_datetime_timezone(self):
         # GH 18523
-        idx1 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris")
-        idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H")
+        idx1 = date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris")
+        idx2 = date_range(start=idx1[0], end=idx1[-1], freq="H")
         df1 = DataFrame({"a": [1, 2, 3]}, index=idx1)
         df2 = DataFrame({"b": [1, 2, 3]}, index=idx2)
-        result = pd.concat([df1, df2], axis=1)
+        result = concat([df1, df2], axis=1)
 
         exp_idx = (
             DatetimeIndex(
@@ -73,9 +73,9 @@ def test_concat_datetime_timezone(self):
 
         tm.assert_frame_equal(result, expected)
 
-        idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo")
+        idx3 = date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo")
         df3 = DataFrame({"b": [1, 2, 3]}, index=idx3)
-        result = pd.concat([df1, df3], axis=1)
+        result = concat([df1, df3], axis=1)
 
         exp_idx = DatetimeIndex(
             [
@@ -104,9 +104,7 @@ def test_concat_datetime_timezone(self):
         tm.assert_frame_equal(result, expected)
 
         # GH 13783: Concat after resample
-        result = pd.concat(
-            [df1.resample("H").mean(), df2.resample("H").mean()], sort=True
-        )
+        result = concat([df1.resample("H").mean(), df2.resample("H").mean()], sort=True)
         expected = DataFrame(
             {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]},
             index=idx1.append(idx1),
@@ -116,14 +114,14 @@ def test_concat_datetime_timezone(self):
     def test_concat_datetimeindex_freq(self):
         # GH 3232
         # Monotonic index result
-        dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC")
+        dr = date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC")
         data = list(range(100))
         expected = DataFrame(data, index=dr)
-        result = pd.concat([expected[:50], expected[50:]])
+        result = concat([expected[:50], expected[50:]])
         tm.assert_frame_equal(result, expected)
 
         # Non-monotonic index result
-        result = pd.concat([expected[50:], expected[:50]])
+        result = concat([expected[50:], expected[:50]])
         expected = DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
         expected.index._data.freq = None
         tm.assert_frame_equal(result, expected)
@@ -179,21 +177,21 @@ def test_concat_NaT_series(self):
         # all NaT with tz
         expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]")
-        result = pd.concat([y, y], ignore_index=True)
+        result = concat([y, y], ignore_index=True)
         tm.assert_series_equal(result, expected)
 
         # without tz
-        x = Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h"))
-        y = Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h"))
+        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h"))
+        y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h"))
         y[:] = pd.NaT
         expected = Series([x[0], x[1], pd.NaT, pd.NaT])
-        result = pd.concat([x, y], ignore_index=True)
+        result = concat([x, y], ignore_index=True)
         tm.assert_series_equal(result, expected)
 
         # all NaT without tz
         x[:] = pd.NaT
         expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]")
-        result = pd.concat([x, y], ignore_index=True)
+        result = concat([x, y], ignore_index=True)
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("tz", [None, "UTC"])
@@ -215,7 +213,7 @@ def test_concat_NaT_dataframes(self, tz):
             ]
         )
 
-        result = pd.concat([first, second], axis=0)
+        result = concat([first, second], axis=0)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("tz1", [None, "UTC"])
@@ -228,7 +226,7 @@ def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
         first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1))
         second = DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2))
 
-        result = pd.concat([first, second], axis=0)
+        result = concat([first, second], axis=0)
         expected = DataFrame(Series([pd.NaT, pd.NaT, s], index=[0, 1, 0]))
         expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
         if tz1 != tz2:
@@ -249,7 +247,7 @@ def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
                 1: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2),
             }
         )
-        result = pd.concat([first, second], axis=1)
+        result = concat([first, second], axis=1)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("tz1", [None, "UTC"])
@@ -278,7 +276,7 @@ def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
         if tz1 != tz2:
             expected = expected.astype(object)
 
-        result = pd.concat([first, second])
+        result = concat([first, second])
         tm.assert_frame_equal(result, expected)
 
 
@@ -306,7 +304,7 @@ def test_concat_tz_series(self):
         second = DataFrame([[datetime(2016, 1, 2)]])
         second[0] = second[0].dt.tz_localize("UTC")
 
-        result = pd.concat([first, second])
+        result = concat([first, second])
         assert result[0].dtype == "datetime64[ns, UTC]"
 
         # Concatenating two London times
@@ -316,7 +314,7 @@ def test_concat_tz_series(self):
         second = DataFrame([[datetime(2016, 1, 2)]])
         second[0] = second[0].dt.tz_localize("Europe/London")
 
-        result = pd.concat([first, second])
+        result = concat([first, second])
         assert result[0].dtype == "datetime64[ns, Europe/London]"
 
         # Concatenating 2+1 London times
@@ -326,7 +324,7 @@ def test_concat_tz_series(self):
         second = DataFrame([[datetime(2016, 1, 3)]])
         second[0] = second[0].dt.tz_localize("Europe/London")
 
-        result = pd.concat([first, second])
+        result = concat([first, second])
         assert result[0].dtype == "datetime64[ns, Europe/London]"
 
         # Concat'ing 1+2 London times
@@ -336,7 +334,7 @@ def test_concat_tz_series(self):
         second = DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]])
         second[0] = second[0].dt.tz_localize("Europe/London")
 
-        result = pd.concat([first, second])
+        result = concat([first, second])
         assert result[0].dtype == "datetime64[ns, Europe/London]"
 
     def test_concat_tz_series_tzlocal(self):
@@ -379,7 +377,7 @@ def test_concat_tz_frame(self):
         )
 
         # concat
-        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
+        df3 = concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
         tm.assert_frame_equal(df2, df3)
 
     def test_concat_multiple_tzs(self):
@@ -393,15 +391,15 @@ def test_concat_multiple_tzs(self):
         df2 = DataFrame({"time": [ts2]})
         df3 = DataFrame({"time": [ts3]})
 
-        results = pd.concat([df1, df2]).reset_index(drop=True)
+        results = concat([df1, df2]).reset_index(drop=True)
         expected = DataFrame({"time": [ts1, ts2]}, dtype=object)
         tm.assert_frame_equal(results, expected)
 
-        results = pd.concat([df1, df3]).reset_index(drop=True)
+        results = concat([df1, df3]).reset_index(drop=True)
         expected = DataFrame({"time": [ts1, ts3]}, dtype=object)
         tm.assert_frame_equal(results, expected)
 
-        results = pd.concat([df2, df3]).reset_index(drop=True)
+        results = concat([df2, df3]).reset_index(drop=True)
         expected = DataFrame({"time": [ts2, ts3]})
         tm.assert_frame_equal(results, expected)
 
@@ -439,7 +437,7 @@ def test_concat_tz_not_aligned(self):
         ts = pd.to_datetime([1, 2]).tz_localize("UTC")
         a = DataFrame({"A": ts})
         b = DataFrame({"A": ts, "B": ts})
-        result = pd.concat([a, b], sort=True, ignore_index=True)
+        result = concat([a, b], sort=True, ignore_index=True)
         expected = DataFrame(
             {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
         )
@@ -467,7 +465,7 @@ def test_concat_tz_NaT(self, t1):
         df1 = DataFrame([[ts1, ts2]])
         df2 = DataFrame([[ts3]])
 
-        result = pd.concat([df1, df2])
+        result = concat([df1, df2])
         expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
 
         tm.assert_frame_equal(result, expected)
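A standalone sketch (not part of the patch; assumes pandas >= 1.0) of the tz-mismatch upcast the NaT tests above rely on: values that cannot share a single datetime64[ns, tz] dtype fall back to object.

    import pandas as pd
    from pandas import DataFrame, Timestamp, concat

    aware = DataFrame({"a": [Timestamp("2015-11-24", tz="UTC")]})
    naive = DataFrame({"a": [Timestamp("2015-11-24")]})
    # tz-aware mixed with tz-naive cannot keep a datetime64[ns, tz] dtype,
    # so the concatenated column is upcast to object.
    out = concat([aware, naive], ignore_index=True)
    assert out["a"].dtype == object
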
diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py
index 0e86cb0ae48c0..ab419e0481973 100644
--- a/pandas/tests/reshape/concat/test_empty.py
+++ b/pandas/tests/reshape/concat/test_empty.py
@@ -49,7 +49,7 @@ def test_concat_empty_series(self):
         # GH 11082
         s1 = Series([1, 2, 3], name="x")
         s2 = Series(name="y", dtype="float64")
-        res = pd.concat([s1, s2], axis=1)
+        res = concat([s1, s2], axis=1)
         exp = DataFrame(
             {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
             index=Index([0, 1, 2], dtype="O"),
@@ -58,7 +58,7 @@ def test_concat_empty_series(self):
 
         s1 = Series([1, 2, 3], name="x")
         s2 = Series(name="y", dtype="float64")
-        res = pd.concat([s1, s2], axis=0)
+        res = concat([s1, s2], axis=0)
         # name will be reset
         exp = Series([1, 2, 3])
         tm.assert_series_equal(res, exp)
@@ -66,7 +66,7 @@ def test_concat_empty_series(self):
         # empty Series with no name
         s1 = Series([1, 2, 3], name="x")
         s2 = Series(name=None, dtype="float64")
-        res = pd.concat([s1, s2], axis=1)
+        res = concat([s1, s2], axis=1)
         exp = DataFrame(
             {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
             columns=["x", 0],
@@ -109,7 +109,7 @@ def test_concat_empty_series_timelike(self, tz, values):
         ],
     )
     def test_concat_empty_series_dtypes(self, left, right, expected):
-        result = pd.concat([Series(dtype=left), Series(dtype=right)])
+        result = concat([Series(dtype=left), Series(dtype=right)])
         assert result.dtype == expected
 
     @pytest.mark.parametrize(
@@ -118,10 +118,10 @@ def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
         dtype = np.dtype(dtype)
 
-        result = pd.concat([Series(dtype=dtype)])
+        result = concat([Series(dtype=dtype)])
         assert result.dtype == dtype
 
-        result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)])
+        result = concat([Series(dtype=dtype), Series(dtype=dtype)])
         assert result.dtype == dtype
 
     def test_concat_empty_series_dtypes_roundtrips(self):
@@ -164,13 +164,13 @@ def get_result_type(dtype, dtype2):
                 continue
 
             expected = get_result_type(dtype, dtype2)
-            result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
+            result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
             assert result.kind == expected
 
     def test_concat_empty_series_dtypes_triple(self):
         assert (
-            pd.concat(
+            concat(
                 [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
             ).dtype
             == np.object_
@@ -179,14 +179,14 @@ def test_concat_empty_series_dtypes_triple(self):
     def test_concat_empty_series_dtype_category_with_array(self):
         # GH#18515
         assert (
-            pd.concat(
+            concat(
                 [Series(np.array([]), dtype="category"), Series(dtype="float64")]
             ).dtype
             == "float64"
         )
 
     def test_concat_empty_series_dtypes_sparse(self):
-        result = pd.concat(
+        result = concat(
             [
                 Series(dtype="float64").astype("Sparse"),
                 Series(dtype="float64").astype("Sparse"),
@@ -194,14 +194,14 @@ def test_concat_empty_series_dtypes_sparse(self):
         )
         assert result.dtype == "Sparse[float64]"
 
-        result = pd.concat(
+        result = concat(
             [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
         )
         # TODO: release-note: concat sparse dtype
         expected = pd.SparseDtype(np.float64)
         assert result.dtype == expected
 
-        result = pd.concat(
+        result = concat(
             [Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
         )
         # TODO: release-note: concat sparse dtype
@@ -212,7 +212,7 @@ def test_concat_empty_df_object_dtype(self):
         # GH 9149
         df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
         df_2 = DataFrame(columns=df_1.columns)
-        result = pd.concat([df_1, df_2], axis=0)
+        result = concat([df_1, df_2], axis=0)
         expected = df_1.astype(object)
         tm.assert_frame_equal(result, expected)
 
@@ -222,12 +222,12 @@ def test_concat_empty_dataframe_dtypes(self):
         df["b"] = df["b"].astype(np.int32)
         df["c"] = df["c"].astype(np.float64)
 
-        result = pd.concat([df, df])
+        result = concat([df, df])
         assert result["a"].dtype == np.bool_
         assert result["b"].dtype == np.int32
         assert result["c"].dtype == np.float64
 
-        result = pd.concat([df, df.astype(np.float64)])
+        result = concat([df, df.astype(np.float64)])
         assert result["a"].dtype == np.object_
         assert result["b"].dtype == np.float64
         assert result["c"].dtype == np.float64
@@ -239,7 +239,7 @@ def test_concat_inner_join_empty(self):
         df_expected = DataFrame({"a": []}, index=[], dtype="int64")
 
         for how, expected in [("inner", df_expected), ("outer", df_a)]:
-            result = pd.concat([df_a, df_empty], axis=1, join=how)
+            result = concat([df_a, df_empty], axis=1, join=how)
             tm.assert_frame_equal(result, expected)
 
     def test_empty_dtype_coerce(self):
diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py
index c822dab9b8cfc..bd845f73c7c69 100644
--- a/pandas/tests/reshape/concat/test_index.py
+++ b/pandas/tests/reshape/concat/test_index.py
@@ -60,7 +60,7 @@ def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
         frames = [
             DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
         ]
-        result = pd.concat(frames, axis=1)
+        result = concat(frames, axis=1)
 
         exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
         expected = DataFrame(
@@ -113,7 +113,7 @@ def test_default_index(self):
         # is_series and ignore_index
         s1 = Series([1, 2, 3], name="x")
         s2 = Series([4, 5, 6], name="y")
-        res = pd.concat([s1, s2], axis=1, ignore_index=True)
+        res = concat([s1, s2], axis=1, ignore_index=True)
         assert isinstance(res.columns, pd.RangeIndex)
         exp = DataFrame([[1, 4], [2, 5], [3, 6]])
         # use check_index_type=True to check the result have
@@ -123,7 +123,7 @@ def test_default_index(self):
         # is_series and all inputs have no names
         s1 = Series([1, 2, 3])
         s2 = Series([4, 5, 6])
-        res = pd.concat([s1, s2], axis=1, ignore_index=False)
+        res = concat([s1, s2], axis=1, ignore_index=False)
         assert isinstance(res.columns, pd.RangeIndex)
         exp = DataFrame([[1, 4], [2, 5], [3, 6]])
         exp.columns = pd.RangeIndex(2)
@@ -133,11 +133,11 @@ def test_default_index(self):
         df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
         df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
 
-        res = pd.concat([df1, df2], axis=0, ignore_index=True)
+        res = concat([df1, df2], axis=0, ignore_index=True)
         exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
         tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
 
-        res = pd.concat([df1, df2], axis=1, ignore_index=True)
+        res = concat([df1, df2], axis=1, ignore_index=True)
         exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
         tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
 
@@ -261,7 +261,7 @@ def test_concat_multiindex_dfs_with_deepcopy(self):
             names=["testname", None, None],
         )
         expected = DataFrame([[0], [1]], index=expected_index)
-        result_copy = pd.concat(deepcopy(example_dict), names=["testname"])
+        result_copy = concat(deepcopy(example_dict), names=["testname"])
         tm.assert_frame_equal(result_copy, expected)
-        result_no_copy = pd.concat(example_dict, names=["testname"])
+        result_no_copy = concat(example_dict, names=["testname"])
         tm.assert_frame_equal(result_no_copy, expected)
diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py
index 44e29f08f282e..34bba581b31c7 100644
--- a/pandas/tests/reshape/concat/test_series.py
+++ b/pandas/tests/reshape/concat/test_series.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pytest
 
-import pandas as pd
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -48,7 +47,7 @@ def test_concat_empty_and_non_empty_series_regression(self):
         s2 = Series([], dtype=object)
 
         expected = s1
-        result = pd.concat([s1, s2])
+        result = concat([s1, s2])
         tm.assert_series_equal(result, expected)
 
     def test_concat_series_axis1(self, sort=sort):
@@ -117,7 +116,7 @@ def test_concat_series_name_npscalar_tuple(self, s1name, s2name):
         # GH21015
         s1 = Series({"a": 1, "b": 2}, name=s1name)
         s2 = Series({"c": 5, "d": 6}, name=s2name)
-        result = pd.concat([s1, s2])
+        result = concat([s1, s2])
         expected = Series({"a": 1, "b": 2, "c": 5, "d": 6})
         tm.assert_series_equal(result, expected)
 
@@ -147,5 +146,5 @@ def test_concat_series_partial_columns_names(self):
     def test_concat_series_length_one_reversed(self, frame_or_series):
         # GH39401
         obj = frame_or_series([100])
-        result = pd.concat([obj.iloc[::-1]])
+        result = concat([obj.iloc[::-1]])
         tm.assert_equal(result, obj)
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
index 2ec94d4cebf5a..d31930aa233cd 100644
--- a/pandas/tests/reshape/merge/test_join.py
+++ b/pandas/tests/reshape/merge/test_join.py
@@ -836,15 +836,13 @@ def test_join_cross(input_col, output_cols):
 def test_join_multiindex_one_level(join_type):
     # GH#36909
     left = DataFrame(
-        data={"c": 3}, index=pd.MultiIndex.from_tuples([(1, 2)], names=("a", "b"))
-    )
-    right = DataFrame(
-        data={"d": 4}, index=pd.MultiIndex.from_tuples([(2,)], names=("b",))
+        data={"c": 3}, index=MultiIndex.from_tuples([(1, 2)], names=("a", "b"))
     )
+    right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
     result = left.join(right, how=join_type)
     expected = DataFrame(
         {"c": [3], "d": [4]},
-        index=pd.MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
+        index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
     )
     tm.assert_frame_equal(result, expected)
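To make the GH#36909 scenario above concrete, a minimal sketch (assumes pandas >= 1.2) of joining on the single shared index level "b":

    from pandas import DataFrame, MultiIndex

    left = DataFrame({"c": 3}, index=MultiIndex.from_tuples([(1, 2)], names=("a", "b")))
    right = DataFrame({"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
    # join aligns on the common "b" level; the result index carries levels ["b", "a"]
    print(left.join(right, how="inner"))
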
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index e1b1e80a29a43..4fa2865a9e320 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -134,7 +134,7 @@ def test_merge_inner_join_empty(self):
         # GH 15328
         df_empty = DataFrame()
         df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
-        result = pd.merge(df_empty, df_a, left_index=True, right_index=True)
+        result = merge(df_empty, df_a, left_index=True, right_index=True)
         expected = DataFrame({"a": []}, index=[], dtype="int64")
         tm.assert_frame_equal(result, expected)
 
@@ -152,7 +152,7 @@ def test_merge_non_string_columns(self):
         right = left.astype(float)
 
         expected = left
-        result = pd.merge(left, right)
+        result = merge(left, right)
         tm.assert_frame_equal(expected, result)
 
     def test_merge_index_as_on_arg(self):
@@ -459,7 +459,7 @@ def test_merge_left_empty_right_empty(self, join_type, kwarg):
             dtype=object,
         )
 
-        result = pd.merge(left, right, how=join_type, **kwarg)
+        result = merge(left, right, how=join_type, **kwarg)
         tm.assert_frame_equal(result, exp_in)
 
     def test_merge_left_empty_right_notempty(self):
@@ -483,15 +483,15 @@ def test_merge_left_empty_right_notempty(self):
             exp_in.index = exp_in.index.astype(object)
 
         def check1(exp, kwarg):
-            result = pd.merge(left, right, how="inner", **kwarg)
+            result = merge(left, right, how="inner", **kwarg)
             tm.assert_frame_equal(result, exp)
-            result = pd.merge(left, right, how="left", **kwarg)
+            result = merge(left, right, how="left", **kwarg)
             tm.assert_frame_equal(result, exp)
 
         def check2(exp, kwarg):
-            result = pd.merge(left, right, how="right", **kwarg)
+            result = merge(left, right, how="right", **kwarg)
             tm.assert_frame_equal(result, exp)
-            result = pd.merge(left, right, how="outer", **kwarg)
+            result = merge(left, right, how="outer", **kwarg)
             tm.assert_frame_equal(result, exp)
 
         for kwarg in [
@@ -532,15 +532,15 @@ def test_merge_left_notempty_right_empty(self):
             exp_in.index = exp_in.index.astype(object)
 
         def check1(exp, kwarg):
-            result = pd.merge(left, right, how="inner", **kwarg)
+            result = merge(left, right, how="inner", **kwarg)
             tm.assert_frame_equal(result, exp)
-            result = pd.merge(left, right, how="right", **kwarg)
+            result = merge(left, right, how="right", **kwarg)
             tm.assert_frame_equal(result, exp)
 
         def check2(exp, kwarg):
-            result = pd.merge(left, right, how="left", **kwarg)
+            result = merge(left, right, how="left", **kwarg)
             tm.assert_frame_equal(result, exp)
-            result = pd.merge(left, right, how="outer", **kwarg)
+            result = merge(left, right, how="outer", **kwarg)
             tm.assert_frame_equal(result, exp)
 
         for kwarg in [
@@ -800,7 +800,7 @@ def test_merge_on_datetime64tz(self):
                 "value_y": [np.nan, 1, 2, 3],
             }
         )
-        result = pd.merge(left, right, on="key", how="outer")
+        result = merge(left, right, on="key", how="outer")
         tm.assert_frame_equal(result, expected)
 
         left = DataFrame(
@@ -824,7 +824,7 @@ def test_merge_on_datetime64tz(self):
                 + list(pd.date_range("20151011", periods=2, tz="US/Eastern")),
             }
         )
-        result = pd.merge(left, right, on="key", how="outer")
+        result = merge(left, right, on="key", how="outer")
         tm.assert_frame_equal(result, expected)
         assert result["value_x"].dtype == "datetime64[ns, US/Eastern]"
         assert result["value_y"].dtype == "datetime64[ns, US/Eastern]"
@@ -874,7 +874,7 @@ def test_merge_datetime64tz_with_dst_transition(self):
             }
         )
         df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid")
-        result = pd.merge(df1, df2, how="outer", on="date")
+        result = merge(df1, df2, how="outer", on="date")
         expected = DataFrame(
             {
                 "date": pd.date_range(
@@ -917,7 +917,7 @@ def test_merge_on_periods(self):
                 "value_y": [np.nan, 1, 2, 3],
             }
         )
-        result = pd.merge(left, right, on="key", how="outer")
+        result = merge(left, right, on="key", how="outer")
         tm.assert_frame_equal(result, expected)
 
         left = DataFrame(
@@ -936,7 +936,7 @@ def test_merge_on_periods(self):
                 "value_y": [pd.NaT] + list(exp_y),
             }
         )
-        result = pd.merge(left, right, on="key", how="outer")
+        result = merge(left, right, on="key", how="outer")
         tm.assert_frame_equal(result, expected)
         assert result["value_x"].dtype == "Period[D]"
         assert result["value_y"].dtype == "Period[D]"
@@ -1430,7 +1430,7 @@ def test_different(self, right_vals):
         # GH 9780
         # We allow merging on object and categorical cols and cast
         # categorical cols to object
-        result = pd.merge(left, right, on="A")
+        result = merge(left, right, on="A")
         assert is_object_dtype(result.A.dtype)
 
     @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8])
@@ -1530,9 +1530,9 @@ def test_merge_incompat_infer_boolean_object(self):
         df2 = DataFrame({"key": [True, False]})
 
         expected = DataFrame({"key": [True, False]}, dtype=object)
-        result = pd.merge(df1, df2, on="key")
+        result = merge(df1, df2, on="key")
         tm.assert_frame_equal(result, expected)
-        result = pd.merge(df2, df1, on="key")
+        result = merge(df2, df1, on="key")
         tm.assert_frame_equal(result, expected)
 
         # with missing value
@@ -1540,9 +1540,9 @@ def test_merge_incompat_infer_boolean_object(self):
         df2 = DataFrame({"key": [True, False]})
 
         expected = DataFrame({"key": [True, False]}, dtype=object)
-        result = pd.merge(df1, df2, on="key")
+        result = merge(df1, df2, on="key")
         tm.assert_frame_equal(result, expected)
-        result = pd.merge(df2, df1, on="key")
+        result = merge(df2, df1, on="key")
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -1564,9 +1564,9 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
         df1 = DataFrame({"A": df1_vals})
         df2 = DataFrame({"A": df2_vals})
 
-        result = pd.merge(df1, df2, on=["A"])
+        result = merge(df1, df2, on=["A"])
         assert is_object_dtype(result.A.dtype)
-        result = pd.merge(df2, df1, on=["A"])
+        result = merge(df2, df1, on=["A"])
         assert is_object_dtype(result.A.dtype)
 
     @pytest.mark.parametrize(
@@ -1605,7 +1605,7 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
         )
         msg = re.escape(msg)
         with pytest.raises(ValueError, match=msg):
-            pd.merge(df1, df2, on=["A"])
+            merge(df1, df2, on=["A"])
 
         # Check that error still raised when swapping order of dataframes
         msg = (
@@ -1615,7 +1615,7 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
         )
         msg = re.escape(msg)
         with pytest.raises(ValueError, match=msg):
-            pd.merge(df2, df1, on=["A"])
+            merge(df2, df1, on=["A"])
 
 
 @pytest.fixture
@@ -1642,7 +1642,7 @@ def right():
 class TestMergeCategorical:
     def test_identical(self, left):
         # merging on the same, should preserve dtypes
-        merged = pd.merge(left, left, on="X")
+        merged = merge(left, left, on="X")
         result = merged.dtypes.sort_index()
         expected = Series(
             [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")],
@@ -1653,7 +1653,7 @@ def test_basic(self, left, right):
         # we have matching Categorical dtypes in X
         # so should preserve the merged column
-        merged = pd.merge(left, right, on="X")
+        merged = merge(left, right, on="X")
         result = merged.dtypes.sort_index()
         expected = Series(
             [
@@ -1680,7 +1680,7 @@ def test_merge_categorical(self):
                 "b": {0: "g", 1: "g", 2: "g", 3: "g", 4: "g"},
             }
         )
-        df = pd.merge(left, right, how="left", left_on="b", right_on="c")
+        df = merge(left, right, how="left", left_on="b", right_on="c")
 
         # object-object
         expected = df.copy()
@@ -1690,14 +1690,14 @@ def test_merge_categorical(self):
         # because we don't have any matching rows
         cright = right.copy()
         cright["d"] = cright["d"].astype("category")
-        result = pd.merge(left, cright, how="left", left_on="b", right_on="c")
+        result = merge(left, cright, how="left", left_on="b", right_on="c")
         expected["d"] = expected["d"].astype(CategoricalDtype(["null"]))
         tm.assert_frame_equal(result, expected)
 
         # cat-object
         cleft = left.copy()
         cleft["b"] = cleft["b"].astype("category")
-        result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c")
+        result = merge(cleft, cright, how="left", left_on="b", right_on="c")
         tm.assert_frame_equal(result, expected)
 
         # cat-cat
@@ -1705,7 +1705,7 @@ def test_merge_categorical(self):
         cright["d"] = cright["d"].astype("category")
         cleft = left.copy()
         cleft["b"] = cleft["b"].astype("category")
-        result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c")
+        result = merge(cleft, cright, how="left", left_on="b", right_on="c")
         tm.assert_frame_equal(result, expected)
 
     def tests_merge_categorical_unordered_equal(self):
@@ -1723,7 +1723,7 @@ def tests_merge_categorical_unordered_equal(self):
                 "Right": ["C1", "B1", "A1"],
             }
         )
-        result = pd.merge(df1, df2, on=["Foo"])
+        result = merge(df1, df2, on=["Foo"])
         expected = DataFrame(
             {
                 "Foo": Categorical(["A", "B", "C"]),
@@ -1737,7 +1737,7 @@ def test_other_columns(self, left, right):
         # non-merge columns should preserve if possible
         right = right.assign(Z=right.Z.astype("category"))
 
-        merged = pd.merge(left, right, on="X")
+        merged = merge(left, right, on="X")
         result = merged.dtypes.sort_index()
         expected = Series(
             [
@@ -1770,7 +1770,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
         assert is_categorical_dtype(left.X.values.dtype)
         # assert not left.X.values._categories_match_up_to_permutation(right.X.values)
 
-        merged = pd.merge(left, right, on="X", how=join_type)
+        merged = merge(left, right, on="X", how=join_type)
 
         result = merged.dtypes.sort_index()
         expected = Series(
@@ -1814,7 +1814,7 @@ def test_self_join_multiple_categories(self):
         df = df.apply(lambda x: x.astype("category"))
 
         # self-join should equal ourselves
-        result = pd.merge(df, df, on=list(df.columns))
+        result = merge(df, df, on=list(df.columns))
 
         tm.assert_frame_equal(result, df)
 
@@ -1840,14 +1840,14 @@ def test_dtype_on_categorical_dates(self):
             ],
             columns=["date", "num2", "num4"],
         )
-        result_outer = pd.merge(df, df2, how="outer", on=["date"])
+        result_outer = merge(df, df2, how="outer", on=["date"])
         tm.assert_frame_equal(result_outer, expected_outer)
 
         expected_inner = DataFrame(
             [[pd.Timestamp("2001-01-01").date(), 1.1, 1.3]],
             columns=["date", "num2", "num4"],
        )
-        result_inner = pd.merge(df, df2, how="inner", on=["date"])
+        result_inner = merge(df, df2, how="inner", on=["date"])
         tm.assert_frame_equal(result_inner, expected_inner)
 
     @pytest.mark.parametrize("ordered", [True, False])
@@ -1875,7 +1875,7 @@ def test_merging_with_bool_or_int_cateorical_column(
 
     def test_merge_on_int_array(self):
         # GH 23020
         df = DataFrame({"A": Series([1, 2, np.nan], dtype="Int64"), "B": 1})
-        result = pd.merge(df, df, on="A")
+        result = merge(df, df, on="A")
         expected = DataFrame(
             {"A": Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1}
         )
@@ -1941,7 +1941,7 @@ class TestMergeOnIndexes:
         ],
     )
     def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
-        result = pd.merge(
+        result = merge(
             left_df, right_df, left_index=True, right_index=True, how=how, sort=sort
         )
         tm.assert_frame_equal(result, expected)
@@ -1988,23 +1988,19 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
     # GH 21220
     a = DataFrame(
         {"A": [1, 2, 3, 4]},
-        index=pd.MultiIndex.from_product(
-            [["a", "b"], [0, 1]], names=["outer", "inner"]
-        ),
+        index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
     )
     b = Series(
         [1, 2, 3, 4],
-        index=pd.MultiIndex.from_product(
-            [["a", "b"], [1, 2]], names=["outer", "inner"]
-        ),
+        index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
         name=nm,
     )
     expected = DataFrame(
         {"A": [2, 4], "B": [1, 3]},
-        index=pd.MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
+        index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
     )
     if nm is not None:
-        result = pd.merge(
+        result = merge(
             a,
             b,
             on=on,
@@ -2017,7 +2013,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
     else:
         msg = "Cannot merge a Series without a name"
         with pytest.raises(ValueError, match=msg):
-            result = pd.merge(
+            result = merge(
                 a,
                 b,
                 on=on,
@@ -2056,7 +2052,7 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols):
     result = a.merge(b, left_index=True, right_index=True, **kwargs)
     tm.assert_frame_equal(result, expected)
 
-    result = pd.merge(a, b, left_index=True, right_index=True, **kwargs)
+    result = merge(a, b, left_index=True, right_index=True, **kwargs)
     tm.assert_frame_equal(result, expected)
 
 
@@ -2102,7 +2098,7 @@ def test_merge_suffix_error(col1, col2, suffixes):
     # TODO: might reconsider current raise behaviour, see issue 24782
     msg = "columns overlap but no suffix specified"
     with pytest.raises(ValueError, match=msg):
-        pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
+        merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
 
 
 @pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
@@ -2111,7 +2107,7 @@ def test_merge_suffix_warns(suffixes):
     b = DataFrame({"b": [3, 4, 5]})
 
     with tm.assert_produces_warning(FutureWarning):
-        pd.merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"})
+        merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"})
 
 
 @pytest.mark.parametrize(
@@ -2126,7 +2122,7 @@ def test_merge_suffix_length_error(col1, col2, suffixes, msg):
     b = DataFrame({col2: [3, 4, 5]})
 
     with pytest.raises(ValueError, match=msg):
-        pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
+        merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
 
 
 @pytest.mark.parametrize("cat_dtype", ["one", "two"])
@@ -2196,7 +2192,7 @@ def test_merge_on_cat_and_ext_array():
     left = right.copy()
     left["a"] = left["a"].astype("category")
 
-    result = pd.merge(left, right, how="inner", on="a")
+    result = merge(left, right, how="inner", on="a")
     expected = right.copy()
 
     tm.assert_frame_equal(result, expected)
@@ -2210,7 +2206,7 @@ def test_merge_multiindex_columns():
     letters = ["a", "b", "c", "d"]
     numbers = ["1", "2", "3"]
-    index = pd.MultiIndex.from_product((letters, numbers), names=["outer", "inner"])
+    index = MultiIndex.from_product((letters, numbers), names=["outer", "inner"])
 
     frame_x = DataFrame(columns=index)
     frame_x["id"] = ""
@@ -2225,7 +2221,7 @@ def test_merge_multiindex_columns():
     expected_labels = [letter + l_suf for letter in letters] + [
         letter + r_suf for letter in letters
     ]
-    expected_index = pd.MultiIndex.from_product(
+    expected_index = MultiIndex.from_product(
         [expected_labels, numbers], names=["outer", "inner"]
     )
     expected = DataFrame(columns=expected_index)
@@ -2240,7 +2236,7 @@ def test_merge_datetime_upcast_dtype():
     df2 = DataFrame(
         {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])}
     )
-    result = pd.merge(df1, df2, how="left", on="y")
+    result = merge(df1, df2, how="left", on="y")
     expected = DataFrame(
         {
             "x": ["a", "b", "c"],
@@ -2387,7 +2383,7 @@ def test_merge_right_left_index():
     # GH#38616
    left = DataFrame({"x": [1, 1], "z": ["foo", "foo"]})
     right = DataFrame({"x": [1, 1], "z": ["foo", "foo"]})
-    result = pd.merge(left, right, how="right", left_index=True, right_on="x")
+    result = merge(left, right, how="right", left_index=True, right_on="x")
     expected = DataFrame(
         {
             "x": [1, 1],
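As a quick reference for the suffix tests above, a sketch (toy data, not from the patch) of how overlapping non-key columns are disambiguated:

    from pandas import DataFrame, merge

    a = DataFrame({"x": [1, 2, 3]})
    b = DataFrame({"x": [4, 5, 6]})
    # Overlapping columns receive the suffixes; passing suffixes=(None, None)
    # instead raises ValueError("columns overlap but no suffix specified").
    result = merge(a, b, left_index=True, right_index=True, suffixes=("_l", "_r"))
    assert list(result.columns) == ["x_l", "x_r"]
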
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 5fa08904e3fcf..3f5bb9b84372c 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -47,14 +47,14 @@ def test_examples1(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 3, 7]}
         )
 
-        result = pd.merge_asof(left, right, on="a")
+        result = merge_asof(left, right, on="a")
         tm.assert_frame_equal(result, expected)
 
     def test_examples2(self):
         """ doc-string examples """
         trades = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.038",
@@ -72,7 +72,7 @@ def test_examples2(self):
 
         quotes = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -100,15 +100,13 @@ def test_examples2(self):
             columns=["time", "ticker", "bid", "ask"],
         )
 
-        pd.merge_asof(trades, quotes, on="time", by="ticker")
+        merge_asof(trades, quotes, on="time", by="ticker")
 
-        pd.merge_asof(
-            trades, quotes, on="time", by="ticker", tolerance=Timedelta("2ms")
-        )
+        merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("2ms"))
 
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.038",
@@ -126,7 +124,7 @@ def test_examples2(self):
             columns=["time", "ticker", "price", "quantity", "bid", "ask"],
         )
 
-        result = pd.merge_asof(
+        result = merge_asof(
             trades,
             quotes,
             on="time",
@@ -147,7 +145,7 @@ def test_examples3(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, np.nan]}
         )
 
-        result = pd.merge_asof(left, right, on="a", direction="forward")
+        result = merge_asof(left, right, on="a", direction="forward")
         tm.assert_frame_equal(result, expected)
 
     def test_examples4(self):
@@ -161,7 +159,7 @@ def test_examples4(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, 7]}
         )
 
-        result = pd.merge_asof(left, right, on="a", direction="nearest")
+        result = merge_asof(left, right, on="a", direction="nearest")
         tm.assert_frame_equal(result, expected)
 
     def test_basic(self):
@@ -282,7 +280,7 @@ def test_multiby(self):
         # GH13936
         trades = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -301,7 +299,7 @@ def test_multiby(self):
 
         quotes = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -321,7 +319,7 @@ def test_multiby(self):
 
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -340,14 +338,14 @@ def test_multiby(self):
             columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
         )
 
-        result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
+        result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
         tm.assert_frame_equal(result, expected)
 
     def test_multiby_heterogeneous_types(self):
         # GH13936
         trades = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -366,7 +364,7 @@ def test_multiby_heterogeneous_types(self):
 
         quotes = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -386,7 +384,7 @@ def test_multiby_heterogeneous_types(self):
 
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.023",
                         "20160525 13:30:00.023",
@@ -405,42 +403,42 @@ def test_multiby_heterogeneous_types(self):
             columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
         )
 
-        result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
+        result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
         tm.assert_frame_equal(result, expected)
 
     def test_multiby_indexed(self):
         # GH15676
         left = pd.DataFrame(
             [
-                [pd.to_datetime("20160602"), 1, "a"],
-                [pd.to_datetime("20160602"), 2, "a"],
-                [pd.to_datetime("20160603"), 1, "b"],
-                [pd.to_datetime("20160603"), 2, "b"],
+                [to_datetime("20160602"), 1, "a"],
+                [to_datetime("20160602"), 2, "a"],
+                [to_datetime("20160603"), 1, "b"],
+                [to_datetime("20160603"), 2, "b"],
             ],
             columns=["time", "k1", "k2"],
         ).set_index("time")
 
         right = pd.DataFrame(
             [
-                [pd.to_datetime("20160502"), 1, "a", 1.0],
-                [pd.to_datetime("20160502"), 2, "a", 2.0],
-                [pd.to_datetime("20160503"), 1, "b", 3.0],
-                [pd.to_datetime("20160503"), 2, "b", 4.0],
+                [to_datetime("20160502"), 1, "a", 1.0],
+                [to_datetime("20160502"), 2, "a", 2.0],
+                [to_datetime("20160503"), 1, "b", 3.0],
+                [to_datetime("20160503"), 2, "b", 4.0],
             ],
             columns=["time", "k1", "k2", "value"],
         ).set_index("time")
 
         expected = pd.DataFrame(
             [
-                [pd.to_datetime("20160602"), 1, "a", 1.0],
-                [pd.to_datetime("20160602"), 2, "a", 2.0],
-                [pd.to_datetime("20160603"), 1, "b", 3.0],
-                [pd.to_datetime("20160603"), 2, "b", 4.0],
+                [to_datetime("20160602"), 1, "a", 1.0],
+                [to_datetime("20160602"), 2, "a", 2.0],
+                [to_datetime("20160603"), 1, "b", 3.0],
+                [to_datetime("20160603"), 2, "b", 4.0],
            ],
             columns=["time", "k1", "k2", "value"],
         ).set_index("time")
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left, right, left_index=True, right_index=True, by=["k1", "k2"]
         )
 
@@ -449,7 +447,7 @@ def test_multiby_indexed(self):
         with pytest.raises(
             MergeError, match="left_by and right_by must be same length"
         ):
-            pd.merge_asof(
+            merge_asof(
                 left,
                 right,
                 left_index=True,
@@ -629,7 +627,7 @@ def test_tolerance_forward(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]}
         )
 
-        result = pd.merge_asof(left, right, on="a", direction="forward", tolerance=1)
+        result = merge_asof(left, right, on="a", direction="forward", tolerance=1)
         tm.assert_frame_equal(result, expected)
 
     def test_tolerance_nearest(self):
@@ -642,7 +640,7 @@ def test_tolerance_nearest(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]}
         )
 
-        result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=1)
+        result = merge_asof(left, right, on="a", direction="nearest", tolerance=1)
         tm.assert_frame_equal(result, expected)
 
     def test_tolerance_tz(self):
@@ -650,7 +648,7 @@ def test_tolerance_tz(self):
         left = pd.DataFrame(
             {
                 "date": pd.date_range(
-                    start=pd.to_datetime("2016-01-02"),
+                    start=to_datetime("2016-01-02"),
                     freq="D",
                     periods=5,
                     tz=pytz.timezone("UTC"),
@@ -661,7 +659,7 @@ def test_tolerance_tz(self):
         right = pd.DataFrame(
             {
                 "date": pd.date_range(
-                    start=pd.to_datetime("2016-01-01"),
+                    start=to_datetime("2016-01-01"),
                     freq="D",
                     periods=5,
                     tz=pytz.timezone("UTC"),
@@ -669,12 +667,12 @@ def test_tolerance_tz(self):
                 "value2": list("ABCDE"),
             }
         )
-        result = pd.merge_asof(left, right, on="date", tolerance=Timedelta("1 day"))
+        result = merge_asof(left, right, on="date", tolerance=Timedelta("1 day"))
 
         expected = pd.DataFrame(
             {
                 "date": pd.date_range(
-                    start=pd.to_datetime("2016-01-02"),
+                    start=to_datetime("2016-01-02"),
                     freq="D",
                     periods=5,
                     tz=pytz.timezone("UTC"),
@@ -700,7 +698,7 @@ def test_tolerance_float(self):
             }
         )
 
-        result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=0.5)
+        result = merge_asof(left, right, on="a", direction="nearest", tolerance=0.5)
         tm.assert_frame_equal(result, expected)
 
     def test_index_tolerance(self):
@@ -709,7 +707,7 @@ def test_index_tolerance(self):
         trades = self.trades.set_index("time")
         quotes = self.quotes.set_index("time")
 
-        result = pd.merge_asof(
+        result = merge_asof(
             trades,
             quotes,
             left_index=True,
@@ -737,7 +735,7 @@ def test_allow_exact_matches_forward(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 7, 11]}
         )
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left, right, on="a", direction="forward", allow_exact_matches=False
         )
         tm.assert_frame_equal(result, expected)
@@ -752,7 +750,7 @@ def test_allow_exact_matches_nearest(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 3, 11]}
         )
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left, right, on="a", direction="nearest", allow_exact_matches=False
         )
         tm.assert_frame_equal(result, expected)
@@ -773,38 +771,38 @@ def test_allow_exact_matches_and_tolerance(self):
     def test_allow_exact_matches_and_tolerance2(self):
         # GH 13695
         df1 = pd.DataFrame(
-            {"time": pd.to_datetime(["2016-07-15 13:30:00.030"]), "username": ["bob"]}
+            {"time": to_datetime(["2016-07-15 13:30:00.030"]), "username": ["bob"]}
         )
         df2 = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"]
                 ),
                 "version": [1, 2],
             }
         )
 
-        result = pd.merge_asof(df1, df2, on="time")
+        result = merge_asof(df1, df2, on="time")
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(["2016-07-15 13:30:00.030"]),
+                "time": to_datetime(["2016-07-15 13:30:00.030"]),
                 "username": ["bob"],
                 "version": [2],
             }
         )
         tm.assert_frame_equal(result, expected)
 
-        result = pd.merge_asof(df1, df2, on="time", allow_exact_matches=False)
+        result = merge_asof(df1, df2, on="time", allow_exact_matches=False)
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(["2016-07-15 13:30:00.030"]),
+                "time": to_datetime(["2016-07-15 13:30:00.030"]),
                 "username": ["bob"],
                 "version": [1],
             }
         )
         tm.assert_frame_equal(result, expected)
 
-        result = pd.merge_asof(
+        result = merge_asof(
             df1,
             df2,
             on="time",
@@ -813,7 +811,7 @@ def test_allow_exact_matches_and_tolerance2(self):
         )
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(["2016-07-15 13:30:00.030"]),
+                "time": to_datetime(["2016-07-15 13:30:00.030"]),
                 "username": ["bob"],
                 "version": [np.nan],
             }
@@ -824,7 +822,7 @@ def test_allow_exact_matches_and_tolerance3(self):
         # GH 13709
         df1 = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"]
                 ),
                 "username": ["bob", "charlie"],
@@ -832,14 +830,14 @@ def test_allow_exact_matches_and_tolerance3(self):
         )
         df2 = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"]
                 ),
                 "version": [1, 2],
             }
         )
 
-        result = pd.merge_asof(
+        result = merge_asof(
             df1,
             df2,
             on="time",
@@ -848,7 +846,7 @@ def test_allow_exact_matches_and_tolerance3(self):
         )
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"]
                 ),
                 "username": ["bob", "charlie"],
@@ -867,7 +865,7 @@ def test_allow_exact_matches_and_tolerance_forward(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 6, 11]}
         )
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left,
             right,
             on="a",
@@ -887,7 +885,7 @@ def test_allow_exact_matches_and_tolerance_nearest(self):
             {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 4, 11]}
         )
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left,
             right,
             on="a",
@@ -924,7 +922,7 @@ def test_forward_by(self):
             }
         )
 
-        result = pd.merge_asof(left, right, on="a", by="b", direction="forward")
+        result = merge_asof(left, right, on="a", by="b", direction="forward")
         tm.assert_frame_equal(result, expected)
 
     def test_nearest_by(self):
@@ -954,14 +952,14 @@ def test_nearest_by(self):
             }
         )
 
-        result = pd.merge_asof(left, right, on="a", by="b", direction="nearest")
+        result = merge_asof(left, right, on="a", by="b", direction="nearest")
         tm.assert_frame_equal(result, expected)
 
     def test_by_int(self):
         # we specialize by type, so test that this is correct
         df1 = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.020",
                         "20160525 13:30:00.030",
@@ -978,7 +976,7 @@ def test_by_int(self):
 
         df2 = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.015",
                         "20160525 13:30:00.020",
@@ -996,11 +994,11 @@ def test_by_int(self):
             columns=["time", "key", "value2"],
         )
 
-        result = pd.merge_asof(df1, df2, on="time", by="key")
+        result = merge_asof(df1, df2, on="time", by="key")
 
         expected = pd.DataFrame(
             {
-                "time": pd.to_datetime(
+                "time": to_datetime(
                     [
                         "20160525 13:30:00.020",
                         "20160525 13:30:00.030",
@@ -1035,7 +1033,7 @@ def test_on_float(self):
 
         df1 = df1.sort_values("price").reset_index(drop=True)
 
-        result = pd.merge_asof(df1, df2, on="price")
+        result = merge_asof(df1, df2, on="price")
 
         expected = pd.DataFrame(
             {
@@ -1065,7 +1063,7 @@ def test_on_specialized_type(self, any_real_dtype):
         df2.value = dtype(df2.value)
 
         df1 = df1.sort_values("value").reset_index(drop=True)
-        result = pd.merge_asof(df1, df2, on="value")
+        result = merge_asof(df1, df2, on="value")
 
         expected = pd.DataFrame(
             {
@@ -1100,7 +1098,7 @@ def test_on_specialized_type_by_int(self, any_real_dtype):
         df2.value = dtype(df2.value)
 
         df1 = df1.sort_values("value").reset_index(drop=True)
-        result = pd.merge_asof(df1, df2, on="value", by="key")
+        result = merge_asof(df1, df2, on="value", by="key")
 
         expected = pd.DataFrame(
             {
@@ -1148,7 +1146,7 @@ def test_on_float_by_int(self):
         df1 = df1.sort_values("price").reset_index(drop=True)
         df2 = df2.sort_values("price").reset_index(drop=True)
 
-        result = pd.merge_asof(df1, df2, on="price", by="exch")
+        result = merge_asof(df1, df2, on="price", by="exch")
 
         expected = pd.DataFrame(
             {
@@ -1241,7 +1239,7 @@ def test_merge_by_col_tz_aware(self):
                 "values": ["b"],
             }
         )
-        result = pd.merge_asof(left, right, by="by_col", on="on_col")
+        result = merge_asof(left, right, by="by_col", on="on_col")
         expected = pd.DataFrame(
             [[pd.Timestamp("2018-01-01", tz="UTC"), 2, "a", "b"]],
             columns=["by_col", "on_col", "values_x", "values_y"],
@@ -1266,7 +1264,7 @@ def test_by_mixed_tz_aware(self):
                 "value": ["b"],
             }
         )
-        result = pd.merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
+        result = merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
         expected = pd.DataFrame(
             [[pd.Timestamp("2018-01-01", tz="UTC"), "HELLO", 2, "a"]],
             columns=["by_col1", "by_col2", "on_col", "value_x"],
@@ -1304,7 +1302,7 @@ def test_timedelta_tolerance_nearest(self):
 
         expected["time"] = pd.to_timedelta(expected["time"], "ms")
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left, right, on="time", tolerance=Timedelta("1ms"), direction="nearest"
         )
 
@@ -1323,7 +1321,7 @@ def test_int_type_tolerance(self, any_int_dtype):
         )
         expected["a"] = expected["a"].astype(any_int_dtype)
 
-        result = pd.merge_asof(left, right, on="a", tolerance=10)
+        result = merge_asof(left, right, on="a", tolerance=10)
         tm.assert_frame_equal(result, expected)
 
     def test_merge_index_column_tz(self):
@@ -1331,7 +1329,7 @@ def test_merge_index_column_tz(self):
         index = pd.date_range("2019-10-01", freq="30min", periods=5, tz="UTC")
         left = pd.DataFrame([0.9, 0.8, 0.7, 0.6], columns=["xyz"], index=index[1:])
         right = pd.DataFrame({"from_date": index, "abc": [2.46] * 4 + [2.19]})
-        result = pd.merge_asof(
+        result = merge_asof(
             left=left, right=right, left_index=True, right_on=["from_date"]
         )
         expected = pd.DataFrame(
@@ -1344,7 +1342,7 @@ def test_merge_index_column_tz(self):
         )
         tm.assert_frame_equal(result, expected)
 
-        result = pd.merge_asof(
+        result = merge_asof(
             left=right, right=left, right_index=True, left_on=["from_date"]
         )
         expected = pd.DataFrame(
@@ -1370,7 +1368,7 @@ def test_left_index_right_index_tolerance(self):
         expected = pd.DataFrame(
             {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1)
         )
-        result = pd.merge_asof(
+        result = merge_asof(
             df1,
             df2,
             left_index=True,
@@ -1395,7 +1393,7 @@ def test_merge_asof_non_numerical_dtype(kwargs, data):
         MergeError,
         match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
     ):
-        pd.merge_asof(left, right, **kwargs)
+        merge_asof(left, right, **kwargs)
 
 
 def test_merge_asof_non_numerical_dtype_object():
@@ -1406,7 +1404,7 @@ def test_merge_asof_non_numerical_dtype_object():
         MergeError,
         match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
     ):
-        pd.merge_asof(
+        merge_asof(
             left,
             right,
             left_on="left_val1",
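For orientation, a small sketch of the direction and tolerance keywords the merge_asof tests above exercise (data made up):

    from pandas import DataFrame, merge_asof

    left = DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    right = DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    # backward (default): match the last right row with right.a <= left.a
    backward = merge_asof(left, right, on="a")
    # forward: match the first right row with right.a >= left.a, within tolerance=1
    forward = merge_asof(left, right, on="a", direction="forward", tolerance=1)
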
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
index 56ea3c9718a41..d9143549e127d 100644
--- a/pandas/tests/reshape/merge/test_multi.py
+++ b/pandas/tests/reshape/merge/test_multi.py
@@ -112,7 +112,7 @@ def test_merge_on_multikey(self, left, right, join_type):
         on_cols = ["key1", "key2"]
         result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
 
-        expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type)
+        expected = merge(left, right.reset_index(), on=on_cols, how=join_type)
 
         tm.assert_frame_equal(result, expected)
 
@@ -120,7 +120,7 @@ def test_merge_on_multikey(self, left, right, join_type):
             drop=True
         )
 
-        expected = pd.merge(
+        expected = merge(
             left, right.reset_index(), on=on_cols, how=join_type, sort=True
         )
 
@@ -200,13 +200,13 @@ def test_merge_right_vs_left(self, left, right, sort):
 
     def test_merge_multiple_cols_with_mixed_cols_index(self):
         # GH29522
-        s = pd.Series(
+        s = Series(
             range(6),
             MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
             name="Amount",
         )
         df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
-        result = pd.merge(df, s.reset_index(), on=["lev1", "lev2"])
+        result = merge(df, s.reset_index(), on=["lev1", "lev2"])
         expected = DataFrame(
             {
                 "lev1": list("AAABBB"),
@@ -840,7 +840,7 @@ def test_join_multi_multi(
     ):
         # Multi-index join tests
         expected = (
-            pd.merge(
+            merge(
                 left_multi.reset_index(),
                 right_multi.reset_index(),
                 how=join_type,
@@ -861,7 +861,7 @@ def test_join_multi_empty_frames(
         right_multi = right_multi.drop(columns=right_multi.columns)
 
         expected = (
-            pd.merge(
+            merge(
                 left_multi.reset_index(),
                 right_multi.reset_index(),
                 how=join_type,
@@ -917,7 +917,7 @@ def test_single_common_level(self):
         )
 
         result = left.join(right)
-        expected = pd.merge(
+        expected = merge(
             left.reset_index(), right.reset_index(), on=["key"], how="inner"
         ).set_index(["key", "X", "Y"])
 
diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py
index 1ecb408d49813..e467dbb7d49b6 100644
--- a/pandas/tests/reshape/test_crosstab.py
+++ b/pandas/tests/reshape/test_crosstab.py
@@ -259,6 +259,8 @@ def test_margin_dropna(self):
         expected.columns = Index([3, 4, "All"], name="b")
         tm.assert_frame_equal(actual, expected)
 
+    def test_margin_dropna2(self):
+
         df = DataFrame(
             {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
         )
@@ -268,6 +270,8 @@ def test_margin_dropna(self):
         expected.columns = Index([3.0, 4.0, "All"], name="b")
         tm.assert_frame_equal(actual, expected)
 
+    def test_margin_dropna3(self):
+
         df = DataFrame(
             {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
         )
@@ -277,6 +281,7 @@ def test_margin_dropna(self):
         expected.columns = Index([3, 4, "All"], name="b")
         tm.assert_frame_equal(actual, expected)
 
+    def test_margin_dropna4(self):
         # GH 12642
         # _add_margins raises KeyError: Level None not found
         # when margins=True and dropna=False
@@ -287,6 +292,7 @@ def test_margin_dropna(self):
         expected.columns = Index([3, 4, "All"], name="b")
         tm.assert_frame_equal(actual, expected)
 
+    def test_margin_dropna5(self):
         df = DataFrame(
             {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
         )
@@ -296,6 +302,7 @@ def test_margin_dropna(self):
         expected.columns = Index([3.0, 4.0, "All"], name="b")
         tm.assert_frame_equal(actual, expected)
 
+    def test_margin_dropna6(self):
         a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
         b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
         c = np.array(
@@ -395,6 +402,12 @@ def test_crosstab_normalize(self):
             crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
         )
 
+    def test_crosstab_normalize_arrays(self):
+        # GH#12578
+        df = DataFrame(
+            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
+        )
+
         # Test arrays
         crosstab(
             [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
@@ -798,7 +811,7 @@ def test_categoricals(a_dtype, b_dtype):
     if not a_is_cat:
         expected = expected.loc[[0, 2, "All"]]
         expected["All"] = expected["All"].astype("int64")
-    print(result)
-    print(expected)
-    print(expected.loc[[0, 2, "All"]])
+    repr(result)
+    repr(expected)
+    repr(expected.loc[[0, 2, "All"]])
     tm.assert_frame_equal(result, expected)
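A compact sketch of the normalize option the crosstab tests above cover (toy data, not from the patch):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4]})
    pd.crosstab(df.a, df.b, normalize=True)       # each cell / grand total
    pd.crosstab(df.a, df.b, normalize="index")    # each row sums to 1.0
    pd.crosstab(df.a, df.b, normalize="columns")  # each column sums to 1.0
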
56326dd15bd9b..06159cf70b1ab 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -681,7 +681,7 @@ def test_cut_unordered_with_series_labels(): s = Series([1, 2, 3, 4, 5]) bins = Series([0, 2, 4, 6]) labels = Series(["a", "b", "c"]) - result = pd.cut(s, bins=bins, labels=labels, ordered=False) + result = cut(s, bins=bins, labels=labels, ordered=False) expected = Series(["a", "a", "b", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) @@ -690,4 +690,4 @@ def test_cut_no_warnings(): df = DataFrame({"value": np.random.randint(0, 100, 20)}) labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] with tm.assert_produces_warning(False): - df["group"] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) + df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 53244569d0432..a950c648838ff 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -302,7 +302,7 @@ def test_pandas_dtypes(self, col): def test_preserve_category(self): # GH 15853 data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])}) - result = pd.melt(data, ["B"], ["A"]) + result = melt(data, ["B"], ["A"]) expected = DataFrame( {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]} ) @@ -668,7 +668,7 @@ def test_stubs(self): stubs = ["inc", "edu"] # TODO: unused? - df_long = pd.wide_to_long(df, stubs, i="id", j="age") # noqa + df_long = wide_to_long(df, stubs, i="id", j="age") # noqa assert stubs == ["inc", "edu"] @@ -1055,10 +1055,8 @@ def test_col_substring_of_stubname(self): "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}, } wide_df = DataFrame.from_dict(wide_data) - expected = pd.wide_to_long( - wide_df, stubnames=["PA"], i=["node_id", "A"], j="time" - ) - result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") + expected = wide_to_long(wide_df, stubnames=["PA"], i=["node_id", "A"], j="time") + result = wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") tm.assert_frame_equal(result, expected) def test_warn_of_column_name_value(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8d2b4f2b325c2..e345f4f4b5f7f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -197,7 +197,7 @@ def test_pivot_table_categorical(self): ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True) + result = pivot_table(df, values="values", index=["A", "B"], dropna=True) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) @@ -302,7 +302,7 @@ def test_pivot_with_interval_index_margins(self): } ) - pivot_tab = pd.pivot_table( + pivot_tab = pivot_table( df, index="C", columns="B", values="A", aggfunc="sum", margins=True ) @@ -409,7 +409,7 @@ def test_pivot_no_values(self): df = DataFrame( { "A": [1, 2, 3, 4, 5], - "dt": pd.date_range("2011-01-01", freq="D", periods=5), + "dt": date_range("2011-01-01", freq="D", periods=5), }, index=idx, ) @@ -492,7 +492,7 @@ def test_pivot_index_with_nan(self, method): # GH9491 df = DataFrame( { - "a": pd.date_range("2014-02-01", periods=6, freq="D"), + "a": date_range("2014-02-01", periods=6, freq="D"), "c": 100 + np.arange(6), } ) @@ -605,7 
+605,7 @@ def test_pivot_tz_in_values(self): df = df.set_index("ts").reset_index() mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0)) - result = pd.pivot_table( + result = pivot_table( df.set_index("ts").reset_index(), values="ts", index=["uid"], @@ -1101,7 +1101,7 @@ def test_pivot_columns_lexsorted(self): iproduct = np.random.randint(0, len(products), n) items["Index"] = products["Index"][iproduct] items["Symbol"] = products["Symbol"][iproduct] - dr = pd.date_range(date(2000, 1, 1), date(2010, 12, 31)) + dr = date_range(date(2000, 1, 1), date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items["Year"] = dates.year items["Month"] = dates.month @@ -1664,17 +1664,17 @@ def test_pivot_table_with_iterator_values(self): # GH 12017 aggs = {"D": "sum", "E": "mean"} - pivot_values_list = pd.pivot_table( + pivot_values_list = pivot_table( self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs ) - pivot_values_keys = pd.pivot_table( + pivot_values_keys = pivot_table( self.data, index=["A"], values=aggs.keys(), aggfunc=aggs ) tm.assert_frame_equal(pivot_values_keys, pivot_values_list) agg_values_gen = (value for value in aggs.keys()) - pivot_values_gen = pd.pivot_table( + pivot_values_gen = pivot_table( self.data, index=["A"], values=agg_values_gen, aggfunc=aggs ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) @@ -1749,7 +1749,7 @@ def test_margins_casted_to_float(self, observed): } ) - result = pd.pivot_table(df, index="D", margins=True) + result = pivot_table(df, index="D", margins=True) expected = DataFrame( {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, index=Index(["X", "Y", "All"], name="D"), @@ -1887,7 +1887,7 @@ def test_pivot_margins_name_unicode(self): # issue #13292 greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" frame = DataFrame({"foo": [1, 2, 3]}) - table = pd.pivot_table( + table = pivot_table( frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) index = Index([1, 2, 3, greek], dtype="object", name="foo") @@ -2006,7 +2006,7 @@ def ret_sum(x): def ret_none(x): return np.nan - result = pd.pivot_table( + result = pivot_table( df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna ) @@ -2028,7 +2028,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} ) - result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) + result = pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) data = [[2.5, np.nan], [1, np.nan]] col = Index(["one", "two"], name="A") @@ -2063,6 +2063,55 @@ def agg(arr): with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + def test_pivot_table_doctest_case(self): + # TODO: better name. 
the relevant characteristic is that + # the call to maybe_downcast_to_dtype(agged[v], data[v].dtype) in + # __internal_pivot_table has `agged[v]` a DataFrame instead of Series, + # i.e agged.columns is not unique + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + + table = pivot_table( + df, + values=["D", "E"], + index=["A", "C"], + aggfunc={"D": np.mean, "E": [min, max, np.mean]}, + ) + cols = MultiIndex.from_tuples( + [("D", "mean"), ("E", "max"), ("E", "mean"), ("E", "min")] + ) + index = MultiIndex.from_tuples( + [("bar", "large"), ("bar", "small"), ("foo", "large"), ("foo", "small")], + names=["A", "C"], + ) + vals = np.array( + [ + [5.5, 9.0, 7.5, 6.0], + [5.5, 9.0, 8.5, 8.0], + [2.0, 5.0, 4.5, 4.0], + [2.33333333, 6.0, 4.33333333, 2.0], + ] + ) + expected = DataFrame(vals, columns=cols, index=index) + tm.assert_frame_equal(table, expected) + class TestPivot: def test_pivot(self): diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index b9594a9c876c6..9f6cdbb81bd89 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -11,6 +11,7 @@ import pytest from pandas.compat import is_numpy_dev +from pandas.errors import OutOfBoundsTimedelta import pandas as pd from pandas import ( @@ -104,7 +105,7 @@ def test_td_add_timestamp_overflow(self): with pytest.raises(OverflowError, match=msg): Timestamp("1700-01-01") + Timedelta(13 * 19999, unit="D") - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=msg): Timestamp("1700-01-01") + timedelta(days=13 * 19999) @pytest.mark.parametrize("op", [operator.add, ops.radd]) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 47b09280854de..ea4a56be6da48 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -200,7 +200,7 @@ def test_overflow_on_construction(): with pytest.raises(OverflowError, match=msg): Timedelta(7 * 19999, unit="D") - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=msg): Timedelta(timedelta(days=13 * 19999)) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 6199e77e10166..5db159e1abb80 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -74,7 +74,7 @@ def get_expected(s, name): if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype("int64") - elif not is_list_like(result) or isinstance(result, pd.DataFrame): + elif not is_list_like(result) or isinstance(result, DataFrame): return result return Series(result, index=s.index, name=s.name) @@ -83,7 +83,7 @@ def compare(s, name): b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): assert a == b - elif isinstance(a, pd.DataFrame): + elif isinstance(a, DataFrame): tm.assert_frame_equal(a, b) else: tm.assert_series_equal(a, b) @@ -180,7 +180,7 @@ def compare(s, name): assert result.dtype == object result = s.dt.total_seconds() - assert 
isinstance(result, pd.Series) + assert isinstance(result, Series) assert result.dtype == "float64" freq_result = s.dt.freq @@ -236,11 +236,11 @@ def get_dir(s): # 11295 # ambiguous time error on the conversions - s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + s = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(s) tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) - exp_values = pd.date_range( + exp_values = date_range( "2015-01-01", "2016-01-01", freq="T", tz="UTC" ).tz_convert("America/Chicago") # freq not preserved by tz_localize above @@ -297,7 +297,7 @@ def test_dt_round_tz(self): @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) def test_dt_round_tz_ambiguous(self, method): # GH 18946 round near "fall back" DST - df1 = pd.DataFrame( + df1 = DataFrame( [ pd.to_datetime("2017-10-29 02:00:00+02:00", utc=True), pd.to_datetime("2017-10-29 02:00:00+01:00", utc=True), @@ -634,7 +634,7 @@ def test_dt_accessor_invalid(self, ser): assert not hasattr(ser, "dt") def test_dt_accessor_updates_on_inplace(self): - s = Series(pd.date_range("2018-01-01", periods=10)) + s = Series(date_range("2018-01-01", periods=10)) s[2] = None return_value = s.fillna(pd.Timestamp("2018-01-01"), inplace=True) assert return_value is None @@ -680,7 +680,7 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): ) def test_isocalendar(self, input_series, expected_output): result = pd.to_datetime(Series(input_series)).dt.isocalendar() - expected_frame = pd.DataFrame( + expected_frame = DataFrame( expected_output, columns=["year", "week", "day"], dtype="UInt32" ) tm.assert_frame_equal(result, expected_frame) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 1de6540217655..e4ba530d0741c 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -352,7 +352,7 @@ def test_indexing_over_size_cutoff_period_index(monkeypatch): monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 1000) n = 1100 - idx = pd.period_range("1/1/2000", freq="T", periods=n) + idx = period_range("1/1/2000", freq="T", periods=n) assert idx._engine.over_size_threshold s = Series(np.random.randn(len(idx)), index=idx) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index e6dfafabbfec2..7642ccff31c6a 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -202,6 +202,38 @@ def test_getitem_slice_strings_with_datetimeindex(self): expected = ts[1:4] tm.assert_series_equal(result, expected) + def test_getitem_partial_str_slice_with_timedeltaindex(self): + rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) + ser = Series(np.arange(len(rng)), index=rng) + + result = ser["5 day":"6 day"] + expected = ser.iloc[86:134] + tm.assert_series_equal(result, expected) + + result = ser["5 day":] + expected = ser.iloc[86:] + tm.assert_series_equal(result, expected) + + result = ser[:"6 day"] + expected = ser.iloc[:134] + tm.assert_series_equal(result, expected) + + def test_getitem_partial_str_slice_high_reso_with_timedeltaindex(self): + # higher reso + rng = timedelta_range("1 day 10:11:12", freq="us", periods=2000) + ser = Series(np.arange(len(rng)), index=rng) + + result = ser["1 day 10:11:12":] + expected = ser.iloc[0:] + tm.assert_series_equal(result, expected) + + result = 
ser["1 day 10:11:12.001":] + expected = ser.iloc[1000:] + tm.assert_series_equal(result, expected) + + result = ser["1 days, 10:11:12.001001"] + assert result == ser.iloc[1001] + def test_getitem_slice_2d(self, datetime_series): # GH#30588 multi-dimensional indexing deprecated @@ -277,7 +309,7 @@ def test_getitem_slice_integers(self): class TestSeriesGetitemListLike: - @pytest.mark.parametrize("box", [list, np.array, Index, pd.Series]) + @pytest.mark.parametrize("box", [list, np.array, Index, Series]) def test_getitem_no_matches(self, box): # GH#33462 we expect the same behavior for list/ndarray/Index/Series ser = Series(["A", "B"]) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index cd5a7af1d5ec0..30c37113f6b8f 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,7 +5,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( DataFrame, IndexSlice, @@ -58,7 +57,7 @@ def test_basic_getitem_dt64tz_values(): # GH12089 # with tz for values ser = Series( - pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] ) expected = Timestamp("2011-01-01", tz="US/Eastern") result = ser.loc["a"] @@ -114,7 +113,7 @@ def test_getitem_setitem_integers(): def test_series_box_timestamp(): - rng = pd.date_range("20090415", "20090519", freq="B") + rng = date_range("20090415", "20090519", freq="B") ser = Series(rng) assert isinstance(ser[0], Timestamp) assert isinstance(ser.at[1], Timestamp) @@ -131,7 +130,7 @@ def test_series_box_timestamp(): def test_series_box_timedelta(): - rng = pd.timedelta_range("1 day 1 s", periods=5, freq="h") + rng = timedelta_range("1 day 1 s", periods=5, freq="h") ser = Series(rng) assert isinstance(ser[0], Timedelta) assert isinstance(ser.at[1], Timedelta) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 1e50fef55b4ec..799f3d257434d 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -475,7 +475,7 @@ def test_where_datetimelike_categorical(tz_naive_fixture): # GH#37682 tz = tz_naive_fixture - dr = pd.date_range("2001-01-01", periods=3, tz=tz)._with_freq(None) + dr = date_range("2001-01-01", periods=3, tz=tz)._with_freq(None) lvals = pd.DatetimeIndex([dr[0], dr[1], pd.NaT]) rvals = pd.Categorical([dr[0], pd.NaT, dr[2]]) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index cad5476d4861c..5686e6478772d 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -642,7 +642,7 @@ def test_interp_datetime64(self, method, tz_naive_fixture): def test_interp_pad_datetime64tz_values(self): # GH#27628 missing.interpolate_2d should handle datetimetz values - dti = pd.date_range("2015-04-05", periods=3, tz="US/Central") + dti = date_range("2015-04-05", periods=3, tz="US/Central") ser = Series(dti) ser[1] = pd.NaT result = ser.interpolate(method="pad") @@ -735,13 +735,13 @@ def test_series_interpolate_method_values(self): def test_series_interpolate_intraday(self): # #1698 - index = pd.date_range("1/1/2012", periods=4, freq="12D") + index = date_range("1/1/2012", periods=4, freq="12D") ts = Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(days=1)).sort_values() exp = 
ts.reindex(new_index).interpolate(method="time") - index = pd.date_range("1/1/2012", periods=4, freq="12H") + index = date_range("1/1/2012", periods=4, freq="12H") ts = Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() result = ts.reindex(new_index).interpolate(method="time") diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 60ec0a90e906f..73684e300ed77 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -353,14 +353,14 @@ def test_shift_preserve_freqstr(self, periods): # GH#21275 ser = Series( range(periods), - index=pd.date_range("2016-1-1 00:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 00:00:00", periods=periods, freq="H"), ) result = ser.shift(1, "2H") expected = Series( range(periods), - index=pd.date_range("2016-1-1 02:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 02:00:00", periods=periods, freq="H"), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index d70abe2311acd..4df6f52e0fff4 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -203,6 +203,20 @@ def test_sort_index_ascending_list(self): expected = ser.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "ascending", + [ + None, + (True, None), + (False, "True"), + ], + ) + def test_sort_index_ascending_bad_value_raises(self, ascending): + ser = Series(range(10), index=[0, 3, 2, 1, 4, 5, 7, 6, 8, 9]) + match = 'For argument "ascending" expected type bool' + with pytest.raises(ValueError, match=match): + ser.sort_index(ascending=ascending) + class TestSeriesSortIndexKey: def test_sort_index_multiindex_key(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c2d0bf5975059..a69a693bb6203 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -69,6 +69,7 @@ class TestSeriesConstructors: ], ) def test_empty_constructor(self, constructor, check_index_type): + # TODO: share with frame test of the same name with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): expected = Series() result = constructor() @@ -310,6 +311,7 @@ def test_constructor_generator(self): exp = Series(range(10)) tm.assert_series_equal(result, exp) + # same but with non-default index gen = (i for i in range(10)) result = Series(gen, index=range(10, 20)) exp.index = range(10, 20) @@ -323,6 +325,7 @@ def test_constructor_map(self): exp = Series(range(10)) tm.assert_series_equal(result, exp) + # same but with non-default index m = map(lambda x: x, range(10)) result = Series(m, index=range(10, 20)) exp.index = range(10, 20) @@ -386,6 +389,7 @@ def test_constructor_categorical_with_coercion(self): str(df.values) str(df) + def test_constructor_categorical_with_coercion2(self): # GH8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. 
Doe"]],
@@ -689,16 +693,16 @@ def test_constructor_pass_nan_nat(self):
         tm.assert_series_equal(Series([np.nan, np.nan]), exp)
         tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp)
 
-        exp = Series([pd.NaT, pd.NaT])
+        exp = Series([NaT, NaT])
         assert exp.dtype == "datetime64[ns]"
-        tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp)
-        tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp)
+        tm.assert_series_equal(Series([NaT, NaT]), exp)
+        tm.assert_series_equal(Series(np.array([NaT, NaT])), exp)
 
-        tm.assert_series_equal(Series([pd.NaT, np.nan]), exp)
-        tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp)
+        tm.assert_series_equal(Series([NaT, np.nan]), exp)
+        tm.assert_series_equal(Series(np.array([NaT, np.nan])), exp)
 
-        tm.assert_series_equal(Series([np.nan, pd.NaT]), exp)
-        tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
+        tm.assert_series_equal(Series([np.nan, NaT]), exp)
+        tm.assert_series_equal(Series(np.array([np.nan, NaT])), exp)
 
     def test_constructor_cast(self):
         msg = "could not convert string to float"
@@ -747,6 +751,7 @@ def test_constructor_datelike_coercion(self):
         assert s.iloc[1] == "NOV"
         assert s.dtype == object
 
+    def test_constructor_datelike_coercion2(self):
         # the dtype was being reset on the slicing and re-inferred to datetime
         # even though the blocks are mixed
         belly = "216 3T19".split()
@@ -760,6 +765,14 @@ def test_constructor_datelike_coercion(self):
         result = df.loc["216"]
         assert result.dtype == object
 
+    def test_constructor_mixed_int_and_timestamp(self, frame_or_series):
+        # specifically Timestamp with nanos, not datetimes
+        objs = [Timestamp(9), 10, NaT.value]
+        result = frame_or_series(objs, dtype="M8[ns]")
+
+        expected = frame_or_series([Timestamp(9), Timestamp(10), NaT])
+        tm.assert_equal(result, expected)
+
     def test_constructor_datetimes_with_nulls(self):
         # gh-15869
         for arr in [
@@ -790,6 +803,7 @@ def test_constructor_dtype_datetime64(self):
         assert isna(s[1])
         assert s.dtype == "M8[ns]"
 
+    def test_constructor_dtype_datetime64_10(self):
         # GH3416
         dates = [
             np.datetime64(datetime(2013, 1, 1)),
@@ -816,7 +830,7 @@ def test_constructor_dtype_datetime64(self):
         tm.assert_series_equal(result, expected)
 
         expected = Series(
-            [pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]"
+            [NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]"
         )
         result = Series([np.nan] + dates[1:], dtype="datetime64[ns]")
         tm.assert_series_equal(result, expected)
@@ -842,6 +856,7 @@ def test_constructor_dtype_datetime64(self):
         expected = Series(dts.astype(np.int64))
         tm.assert_series_equal(result, expected)
 
+    def test_constructor_dtype_datetime64_9(self):
         # invalid dates can be held as object
         result = Series([datetime(2, 1, 1)])
         assert result[0] == datetime(2, 1, 1, 0, 0)
@@ -849,11 +864,13 @@ def test_constructor_dtype_datetime64(self):
         result = Series([datetime(3000, 1, 1)])
         assert result[0] == datetime(3000, 1, 1, 0, 0)
 
+    def test_constructor_dtype_datetime64_8(self):
         # don't mix types
         result = Series([Timestamp("20130101"), 1], index=["a", "b"])
         assert result["a"] == Timestamp("20130101")
         assert result["b"] == 1
 
+    def test_constructor_dtype_datetime64_7(self):
         # GH6529
         # coerce datetime64 non-ns properly
         dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M")
@@ -879,16 +896,18 @@ def test_constructor_dtype_datetime64(self):
         tm.assert_numpy_array_equal(series1.values, dates2)
         assert series1.dtype == object
 
+    def test_constructor_dtype_datetime64_6(self):
         # these will correctly infer a datetime
-        s =
Series([None, pd.NaT, "2013-08-05 15:30:00.000001"]) + s = Series([None, NaT, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" - s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"]) + s = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" - s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"]) + s = Series([NaT, None, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" - s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"]) + s = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" + def test_constructor_dtype_datetime64_5(self): # tz-aware (UTC and other tz's) # GH 8411 dr = date_range("20130101", periods=3) @@ -898,18 +917,21 @@ def test_constructor_dtype_datetime64(self): dr = date_range("20130101", periods=3, tz="US/Eastern") assert str(Series(dr).iloc[0].tz) == "US/Eastern" + def test_constructor_dtype_datetime64_4(self): # non-convertible - s = Series([1479596223000, -1479590, pd.NaT]) + s = Series([1479596223000, -1479590, NaT]) assert s.dtype == "object" - assert s[2] is pd.NaT + assert s[2] is NaT assert "NaT" in str(s) + def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains - s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) + s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) assert s.dtype == "object" - assert s[2] is pd.NaT + assert s[2] is NaT assert "NaT" in str(s) + def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) assert s.dtype == "object" @@ -933,7 +955,7 @@ def test_constructor_with_datetime_tz(self): assert isinstance(result, np.ndarray) assert result.dtype == "datetime64[ns]" - exp = pd.DatetimeIndex(result) + exp = DatetimeIndex(result) exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) @@ -969,9 +991,10 @@ def test_constructor_with_datetime_tz(self): t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) assert "datetime64[ns, US/Eastern]" in str(t) - result = pd.DatetimeIndex(s, freq="infer") + result = DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) + def test_constructor_with_datetime_tz4(self): # inference s = Series( [ @@ -982,6 +1005,7 @@ def test_constructor_with_datetime_tz(self): assert s.dtype == "datetime64[ns, US/Pacific]" assert lib.infer_dtype(s, skipna=True) == "datetime64" + def test_constructor_with_datetime_tz3(self): s = Series( [ Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), @@ -991,9 +1015,10 @@ def test_constructor_with_datetime_tz(self): assert s.dtype == "object" assert lib.infer_dtype(s, skipna=True) == "datetime" + def test_constructor_with_datetime_tz2(self): # with all NaT - s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") - expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) + s = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + expected = Series(DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) tm.assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @@ -1010,7 +1035,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", pd.NaT, np.nan, None]) + @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With 
naive string result = Series([arg], dtype="datetime64[ns, CET]") @@ -1223,14 +1248,6 @@ def test_constructor_dict_of_tuples(self): expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected) - def test_constructor_set(self): - values = {1, 2, 3, 4, 5} - with pytest.raises(TypeError, match="'set' type is unordered"): - Series(values) - values = frozenset(values) - with pytest.raises(TypeError, match="'frozenset' type is unordered"): - Series(values) - # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") def test_fromDict(self): @@ -1294,7 +1311,7 @@ def test_constructor_dtype_timedelta64(self): td = Series([timedelta(days=1), np.nan], dtype="m8[ns]") assert td.dtype == "timedelta64[ns]" - td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]") + td = Series([np.timedelta64(300000000), NaT], dtype="m8[ns]") assert td.dtype == "timedelta64[ns]" # improved inference @@ -1309,7 +1326,7 @@ def test_constructor_dtype_timedelta64(self): td = Series([np.timedelta64(300000000), np.nan]) assert td.dtype == "timedelta64[ns]" - td = Series([pd.NaT, np.timedelta64(300000000)]) + td = Series([NaT, np.timedelta64(300000000)]) assert td.dtype == "timedelta64[ns]" td = Series([np.timedelta64(1, "s")]) @@ -1341,13 +1358,13 @@ def test_constructor_dtype_timedelta64(self): assert td.dtype == "object" # these will correctly infer a timedelta - s = Series([None, pd.NaT, "1 Day"]) + s = Series([None, NaT, "1 Day"]) assert s.dtype == "timedelta64[ns]" - s = Series([np.nan, pd.NaT, "1 Day"]) + s = Series([np.nan, NaT, "1 Day"]) assert s.dtype == "timedelta64[ns]" - s = Series([pd.NaT, None, "1 Day"]) + s = Series([NaT, None, "1 Day"]) assert s.dtype == "timedelta64[ns]" - s = Series([pd.NaT, np.nan, "1 Day"]) + s = Series([NaT, np.nan, "1 Day"]) assert s.dtype == "timedelta64[ns]" # GH 16406 @@ -1598,7 +1615,7 @@ def test_constructor_dict_multiindex(self): _d = sorted(d.items()) result = Series(d) expected = Series( - [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) + [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index a91908f7fba52..96a69476ccbef 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -169,7 +169,7 @@ def test_repr_should_return_str(self): def test_repr_max_rows(self): # GH 6863 - with pd.option_context("max_rows", None): + with option_context("max_rows", None): str(Series(range(1001))) # should not raise exception def test_unicode_string_with_unicode(self): diff --git a/pandas/tests/series/test_unary.py b/pandas/tests/series/test_unary.py index 40d5e56203c6c..67bb89b42a56d 100644 --- a/pandas/tests/series/test_unary.py +++ b/pandas/tests/series/test_unary.py @@ -18,40 +18,35 @@ def test_invert(self): tm.assert_series_equal(-(ser < 0), ~(ser < 0)) @pytest.mark.parametrize( - "source, target", + "source, neg_target, abs_target", [ - ([1, 2, 3], [-1, -2, -3]), - ([1, 2, None], [-1, -2, None]), - ([-1, 0, 1], [1, 0, -1]), + ([1, 2, 3], [-1, -2, -3], [1, 2, 3]), + ([1, 2, None], [-1, -2, None], [1, 2, None]), ], ) - def test_unary_minus_nullable_int( - self, any_signed_nullable_int_dtype, source, target + def test_all_numeric_unary_operators( + self, any_nullable_numeric_dtype, source, neg_target, abs_target ): - dtype = any_signed_nullable_int_dtype + # 
GH38794 + dtype = any_nullable_numeric_dtype ser = Series(source, dtype=dtype) - result = -ser - expected = Series(target, dtype=dtype) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]]) - def test_unary_plus_nullable_int(self, any_signed_nullable_int_dtype, source): - dtype = any_signed_nullable_int_dtype - expected = Series(source, dtype=dtype) - result = +expected - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "source, target", - [ - ([1, 2, 3], [1, 2, 3]), - ([1, -2, None], [1, 2, None]), - ([-1, 0, 1], [1, 0, 1]), - ], - ) - def test_abs_nullable_int(self, any_signed_nullable_int_dtype, source, target): - dtype = any_signed_nullable_int_dtype - ser = Series(source, dtype=dtype) - result = abs(ser) - expected = Series(target, dtype=dtype) - tm.assert_series_equal(result, expected) + neg_result, pos_result, abs_result = -ser, +ser, abs(ser) + if dtype.startswith("U"): + neg_target = -Series(source, dtype=dtype) + else: + neg_target = Series(neg_target, dtype=dtype) + + abs_target = Series(abs_target, dtype=dtype) + + tm.assert_series_equal(neg_result, neg_target) + tm.assert_series_equal(pos_result, ser) + tm.assert_series_equal(abs_result, abs_target) + + @pytest.mark.parametrize("op", ["__neg__", "__abs__"]) + def test_unary_float_op_mask(self, float_ea_dtype, op): + dtype = float_ea_dtype + ser = Series([1.1, 2.2, 3.3], dtype=dtype) + result = getattr(ser, op)() + target = result.copy(deep=True) + ser[0] = None + tm.assert_series_equal(result, target) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 2b65655e7bdad..1a47b5b37e3d2 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -20,7 +20,10 @@ iNaT, parsing, ) -from pandas.errors import OutOfBoundsDatetime +from pandas.errors import ( + OutOfBoundsDatetime, + OutOfBoundsTimedelta, +) import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_datetime64_ns_dtype @@ -1675,12 +1678,14 @@ def test_to_datetime_overflow(self): # gh-17637 # we are overflowing Timedelta range here - msg = ( - "(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)" + msg = "|".join( + [ + "Python int too large to convert to C long", + "long too big to convert", + "int too big to convert", + ] ) - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=msg): date_range(start="1/1/1700", freq="B", periods=100000) @pytest.mark.parametrize("cache", [True, False]) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index fbd7a36a75bf0..3e823844c7f56 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -142,6 +142,29 @@ def test_ewm_with_nat_raises(halflife_with_times): ser.ewm(com=0.1, halflife=halflife_with_times, times=times) +def test_ewm_with_times_getitem(halflife_with_times): + # GH 40164 + halflife = halflife_with_times + data = np.arange(10.0) + data[::2] = np.nan + times = date_range("2000", freq="D", periods=10) + df = DataFrame({"A": data, "B": data}) + result = df.ewm(halflife=halflife, times=times)["A"].mean() + expected = df.ewm(halflife=1.0)["A"].mean() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("arg", ["com", "halflife", "span", "alpha"]) +def test_ewm_getitem_attributes_retained(arg, adjust, ignore_na): + # GH 40164 + kwargs = {arg: 1, "adjust": adjust, 
"ignore_na": ignore_na} + ewm = DataFrame({"A": range(1), "B": range(1)}).ewm(**kwargs) + expected = {attr: getattr(ewm, attr) for attr in ewm._attributes} + ewm_slice = ewm["A"] + result = {attr: getattr(ewm, attr) for attr in ewm_slice._attributes} + assert result == expected + + def test_ewm_vol_deprecated(): ser = Series(range(1)) with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 5ca96a1f9989f..c31c421ee1445 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -31,7 +31,7 @@ def find_stack_level() -> int: if stack[n].function == "astype": break - while stack[n].function in ["astype", "apply", "_astype"]: + while stack[n].function in ["astype", "apply", "astype_array_safe", "astype_array"]: # e.g. # bump up Block.astype -> BlockManager.astype -> NDFrame.astype # bump up Datetime.Array.astype -> DatetimeIndex.astype diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 60a81ed63b005..087dccfadcce1 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -4,6 +4,7 @@ """ from typing import ( Iterable, + Sequence, Union, ) import warnings @@ -208,9 +209,39 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar validate_kwargs(fname, kwargs, compat_args) -def validate_bool_kwarg(value, arg_name): - """ Ensures that argument passed in arg_name is of type bool. """ - if not (is_bool(value) or value is None): +def validate_bool_kwarg(value, arg_name, none_allowed=True, int_allowed=False): + """ + Ensure that argument passed in arg_name can be interpreted as boolean. + + Parameters + ---------- + value : bool + Value to be validated. + arg_name : str + Name of the argument. To be reflected in the error message. + none_allowed : bool, default True + Whether to consider None to be a valid boolean. + int_allowed : bool, default False + Whether to consider integer value to be a valid boolean. + + Returns + ------- + value + The same value as input. + + Raises + ------ + ValueError + If the value is not a valid boolean. + """ + good_value = is_bool(value) + if none_allowed: + good_value = good_value or value is None + + if int_allowed: + good_value = good_value or isinstance(value, int) + + if not good_value: raise ValueError( f'For argument "{arg_name}" expected type bool, received ' f"type {type(value).__name__}." 
@@ -384,3 +415,14 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: if not all(0 <= qs <= 1 for qs in q_arr): raise ValueError(msg.format(q_arr / 100.0)) return q_arr + + +def validate_ascending( + ascending: Union[Union[bool, int], Sequence[Union[bool, int]]] = True, +): + """Validate ``ascending`` kwargs for ``sort_index`` method.""" + kwargs = {"none_allowed": False, "int_allowed": True} + if not isinstance(ascending, (list, tuple)): + return validate_bool_kwarg(ascending, "ascending", **kwargs) + + return [validate_bool_kwarg(item, "ascending", **kwargs) for item in ascending] diff --git a/requirements-dev.txt b/requirements-dev.txt index be60c90aef8aa..37adbbb8e671f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,7 +11,7 @@ cpplint flake8 flake8-comprehensions>=3.1.0 isort>=5.2.1 -mypy==0.800 +mypy==0.812 pre-commit>=2.9.2 pycodestyle pyupgrade diff --git a/setup.cfg b/setup.cfg index ce055f550a868..ca0673bd5fc34 100644 --- a/setup.cfg +++ b/setup.cfg @@ -125,7 +125,7 @@ ignore-words-list = ba,blocs,coo,hist,nd,ser ignore-regex = https://(\w+\.)+ [coverage:run] -branch = False +branch = True omit = */tests/* pandas/_typing.py From 6bf4a8fa93eb48b8248a426c20eed35a2caa8a16 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Mar 2021 18:12:20 -0800 Subject: [PATCH 04/10] restore RangeIndex.array --- pandas/core/construction.py | 14 +++++++++----- pandas/core/indexes/range.py | 9 +-------- pandas/core/reshape/merge.py | 4 ++-- pandas/core/series.py | 6 +++--- pandas/tests/construction/__init__.py | 0 .../tests/construction/test_extract_array.py | 18 ++++++++++++++++++ 6 files changed, 33 insertions(+), 18 deletions(-) create mode 100644 pandas/tests/construction/__init__.py create mode 100644 pandas/tests/construction/test_extract_array.py diff --git a/pandas/core/construction.py b/pandas/core/construction.py index bcfa238f6d0dd..7573b520b7141 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -371,7 +371,7 @@ def array( def extract_array( - obj: object, extract_numpy: bool = False, range_compat: bool = False + obj: object, extract_numpy: bool = False, extract_range: bool = False ) -> Union[Any, ArrayLike]: """ Extract the ndarray or ExtensionArray from a Series or Index. @@ -387,8 +387,9 @@ def extract_array( extract_numpy : bool, default False Whether to extract the ndarray from a PandasArray - range_compat : bool, default False - If we have a RangeIndex, return range._values if True, otherwise raise. + extract_range : bool, default False + If we have a RangeIndex, return range._values if True, otherwise + return unchanged. Returns ------- @@ -418,8 +419,11 @@ def extract_array( array([1, 2, 3]) """ if isinstance(obj, (ABCIndex, ABCSeries)): - if range_compat and isinstance(obj, ABCRangeIndex): - return obj._values + if isinstance(obj, ABCRangeIndex): + if extract_range: + return obj._values + return obj + obj = obj.array if extract_numpy and isinstance(obj, ABCPandasArray): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b0697eee9e1a0..e501c4cb5348e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -185,13 +185,6 @@ def _data(self): """ return np.arange(self.start, self.stop, self.step, dtype=np.int64) - @property - def array(self): - raise ValueError( - f"{type(self).__name__} has no single backing array. Use " - f"'{type(self).__name__}.to_numpy()' to get a NumPy array." 
- ) - @cache_readonly def _cached_int64index(self) -> Int64Index: return Int64Index._simple_new(self._data, name=self.name) @@ -897,7 +890,7 @@ def _arith_method(self, other, op): step = op # TODO: if other is a RangeIndex we may have more efficient options - other = extract_array(other, extract_numpy=True, range_compat=True) + other = extract_array(other, extract_numpy=True, extract_range=True) attrs = self._get_attributes_dict() left, right = self, other diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ce06fc55ee8e6..8ce9195e70080 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2058,8 +2058,8 @@ def _factorize_keys( (array([0, 1, 2]), array([0, 1]), 3) """ # Some pre-processing for non-ndarray lk / rk - lk = extract_array(lk, extract_numpy=True, range_compat=True) - rk = extract_array(rk, extract_numpy=True, range_compat=True) + lk = extract_array(lk, extract_numpy=True, extract_range=True) + rk = extract_array(rk, extract_numpy=True, extract_range=True) # TODO: if either is a RangeIndex, we can likely factorize more efficiently? if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): diff --git a/pandas/core/series.py b/pandas/core/series.py index 8ed94edd395ca..e56773f239901 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5027,7 +5027,7 @@ def _cmp_method(self, other, op): raise ValueError("Can only compare identically-labeled Series objects") lvalues = self._values - rvalues = extract_array(other, extract_numpy=True, range_compat=True) + rvalues = extract_array(other, extract_numpy=True, extract_range=True) res_values = ops.comparison_op(lvalues, rvalues, op) @@ -5038,7 +5038,7 @@ def _logical_method(self, other, op): self, other = ops.align_method_SERIES(self, other, align_asobject=True) lvalues = self._values - rvalues = extract_array(other, extract_numpy=True, range_compat=True) + rvalues = extract_array(other, extract_numpy=True, extract_range=True) res_values = ops.logical_op(lvalues, rvalues, op) return self._construct_result(res_values, name=res_name) @@ -5048,7 +5048,7 @@ def _arith_method(self, other, op): self, other = ops.align_method_SERIES(self, other) lvalues = self._values - rvalues = extract_array(other, extract_numpy=True, range_compat=True) + rvalues = extract_array(other, extract_numpy=True, extract_range=True) result = ops.arithmetic_op(lvalues, rvalues, op) diff --git a/pandas/tests/construction/__init__.py b/pandas/tests/construction/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/construction/test_extract_array.py b/pandas/tests/construction/test_extract_array.py new file mode 100644 index 0000000000000..4dd3eda8c995c --- /dev/null +++ b/pandas/tests/construction/test_extract_array.py @@ -0,0 +1,18 @@ +from pandas import Index +import pandas._testing as tm +from pandas.core.construction import extract_array + + +def test_extract_array_rangeindex(): + ri = Index(range(5)) + + expected = ri._values + res = extract_array(ri, extract_numpy=True, extract_range=True) + tm.assert_numpy_array_equal(res, expected) + res = extract_array(ri, extract_numpy=False, extract_range=True) + tm.assert_numpy_array_equal(res, expected) + + res = extract_array(ri, extract_numpy=True, extract_range=False) + tm.assert_index_equal(res, ri) + res = extract_array(ri, extract_numpy=False, extract_range=False) + tm.assert_index_equal(res, ri) From bc7c6281ef8e52d22304a03c9438be175348a9e2 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 
11:01:06 -0800 Subject: [PATCH 05/10] troubleshoot ci --- pandas/tests/extension/test_boolean.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 89991a459795e..3ef3beaa9c1b1 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -16,8 +16,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev - import pandas as pd import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype @@ -322,7 +320,6 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) - @pytest.mark.xfail(is_numpy_dev, reason="2021-03-02 #40144 expecting fix in numpy") def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("A").B.apply(lambda x: x.array) From c5977d65748c3a7f6d0c26a10840e2bfb36043bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Mar 2021 12:25:44 -0800 Subject: [PATCH 06/10] Update pandas/core/construction.py Co-authored-by: Joris Van den Bossche --- pandas/core/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 7573b520b7141..c96b139b70fd8 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -388,7 +388,7 @@ def extract_array( Whether to extract the ndarray from a PandasArray extract_range : bool, default False - If we have a RangeIndex, return range._values if True, otherwise + If we have a RangeIndex, return range._values if True (which is a materialized integer ndarray), otherwise return unchanged. 
Returns From 217dd68edf8842e6e00ace9febe36ae38f9f7e4e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 12:30:41 -0800 Subject: [PATCH 07/10] no longer need to skip array manager test --- pandas/tests/groupby/transform/test_transform.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c4621d5fc0f8c..9350a3fcd3036 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -4,8 +4,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import ( ensure_platform_int, is_timedelta64_dtype, @@ -190,8 +188,6 @@ def test_transform_axis_1(request, transformation_func, using_array_manager): tm.assert_equal(result, expected) -# TODO(ArrayManager) groupby().transform returns DataFrame backed by BlockManager -@td.skip_array_manager_not_yet_implemented def test_transform_axis_ts(tsframe): # make sure that we are setting the axes From 5c24b7534d808c0b66196cef0e6539553305bcb3 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 12:53:40 -0800 Subject: [PATCH 08/10] flake8 fixup --- pandas/core/construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c96b139b70fd8..18bd8ac97d756 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -388,8 +388,8 @@ def extract_array( Whether to extract the ndarray from a PandasArray extract_range : bool, default False - If we have a RangeIndex, return range._values if True (which is a materialized integer ndarray), otherwise - return unchanged. + If we have a RangeIndex, return range._values if True + (which is a materialized integer ndarray), otherwise return unchanged. 
Returns ------- From 20bb6fc01b53dd6277e80bdc96ee144831cde15e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Mar 2021 19:01:45 -0700 Subject: [PATCH 09/10] mypy fixup --- pandas/_testing/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 679c9a2b44b53..fdcefc622f373 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -209,7 +209,7 @@ def box_expected(expected, box_cls, transpose=True): if box_cls is pd.array: if isinstance(expected, pd.RangeIndex): # pd.array would return an IntegerArray - expected = PandasArray(expected._values) + expected = PandasArray(np.asarray(expected._values)) else: expected = pd.array(expected) elif box_cls is pd.Index: From 9dcf5f483cc199395c9cbfdf39576e380f7bf00a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Mar 2021 11:31:46 -0700 Subject: [PATCH 10/10] revert xfail --- pandas/tests/extension/base/groupby.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index d93afef60561a..30b115b9dba6f 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,7 +1,5 @@ import pytest -from pandas.compat.numpy import is_numpy_dev - import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -75,10 +73,6 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) - # Non-strict bc these xpass on dt64tz, Period, Interval, JSON, PandasArray - @pytest.mark.xfail( - is_numpy_dev, reason="2021-03-02 #40144 expecting fix in numpy", strict=False - ) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("A").B.apply(lambda x: x.array)
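
A closing sketch (not part of the patch series) of the extract_array semantics the final commits settle on: a RangeIndex now passes through unchanged unless the caller opts into materialization with extract_range=True. This assumes pandas built from this branch, with the signatures as patched above.

import pandas as pd
from pandas.core.construction import extract_array

ri = pd.RangeIndex(5)

# By default a RangeIndex is returned unchanged, so no int64 ndarray is
# materialized as a side effect of the ops plumbing.
assert extract_array(ri, extract_numpy=True) is ri

# Callers that genuinely need an ndarray (e.g. _factorize_keys or the
# Series arithmetic/comparison methods patched above) opt in explicitly.
vals = extract_array(ri, extract_numpy=True, extract_range=True)
print(vals)  # [0 1 2 3 4]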
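
Likewise, a hedged sketch of the user-facing sort_index validation exercised by test_sort_index_ascending_bad_value_raises above, assuming the validate_ascending path from pandas/util/_validators.py; the example values mirror that test.

import pandas as pd

ser = pd.Series(range(10), index=[0, 3, 2, 1, 4, 5, 7, 6, 8, 9])

ser.sort_index(ascending=False)  # plain bools (and, internally, ints) still validate

try:
    ser.sort_index(ascending=None)  # None is now rejected up front
except ValueError as err:
    print(err)  # For argument "ascending" expected type bool, received type NoneType.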