diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5401cc81785ab..ce1ab29ef0746 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -237,6 +237,11 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * RET=$(($RET + $?)) ; echo $MSG "DONE" unset INVGREP_APPEND + + # Check if pandas is referenced as pandas, not *pandas*,Pandas or PANDAS + MSG='Checking if the pandas word reference is always used lowercase (pandas,not Pandas or PANDAS' ; echo $MSG + invgrep -R '*pandas*|Pandas|PANDAS' web/* doc/ + RET=$(($RET + $?)) ; echo $MSG "DONE" fi ### CODE ### @@ -279,8 +284,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/groupby/groupby.py -k"-cumcount -describe -pipe" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests datetimes.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/tools/datetimes.py + MSG='Doctests tools' ; echo $MSG + pytest -q --doctest-modules pandas/core/tools/ RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests reshaping functions' ; echo $MSG @@ -323,6 +328,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests tseries' ; echo $MSG pytest -q --doctest-modules pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests computation' ; echo $MSG + pytest -q --doctest-modules pandas/core/computation/ + RET=$(($RET + $?)) ; echo $MSG "DONE" fi ### DOCSTRINGS ### diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index efa165e0a2d0c..0c780ad5f5847 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -160,7 +160,7 @@ backticks. The following are considered inline code: .. _docstring.short_summary: -Section 1: Short summary +Section 1: short summary ~~~~~~~~~~~~~~~~~~~~~~~~ The short summary is a single sentence that expresses what the function does in @@ -228,7 +228,7 @@ infinitive verb. .. _docstring.extended_summary: -Section 2: Extended summary +Section 2: extended summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The extended summary provides details on what the function does. It should not @@ -259,7 +259,7 @@ their use cases, if it is not too generic. .. _docstring.parameters: -Section 3: Parameters +Section 3: parameters ~~~~~~~~~~~~~~~~~~~~~ The details of the parameters will be added in this section. This section has @@ -424,7 +424,7 @@ For axis, the convention is to use something like: .. _docstring.returns: -Section 4: Returns or Yields +Section 4: returns or yields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If the method returns a value, it will be documented in this section. Also @@ -505,7 +505,7 @@ If the method yields its value: .. _docstring.see_also: -Section 5: See Also +Section 5: see also ~~~~~~~~~~~~~~~~~~~ This section is used to let users know about pandas functionality @@ -583,7 +583,7 @@ For example: .. _docstring.notes: -Section 6: Notes +Section 6: notes ~~~~~~~~~~~~~~~~ This is an optional section used for notes about the implementation of the @@ -597,7 +597,7 @@ This section follows the same format as the extended summary section. .. _docstring.examples: -Section 7: Examples +Section 7: examples ~~~~~~~~~~~~~~~~~~~ This is one of the most important sections of a docstring, despite being @@ -998,4 +998,4 @@ mapping function names to docstrings. Wherever possible, we prefer using See ``pandas.core.generic.NDFrame.fillna`` for an example template, and ``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` -for the filled versions. +for the filled versions. \ No newline at end of file diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index 33646e5d74757..fbd83af3de82e 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -62,7 +62,7 @@ for each column, *including the index columns*. This has JSON form: See below for the detailed specification for these. -Index Metadata Descriptors +Index metadata descriptors ~~~~~~~~~~~~~~~~~~~~~~~~~~ ``RangeIndex`` can be stored as metadata only, not requiring serialization. The @@ -89,7 +89,7 @@ with other column names) a disambiguating name with pattern matching columns, ``name`` attribute is always stored in the column descriptors as above. -Column Metadata +Column metadata ~~~~~~~~~~~~~~~ ``pandas_type`` is the logical type of the column, and is one of: @@ -182,4 +182,4 @@ As an example of fully-formed metadata: 'creator': { 'library': 'pyarrow', 'version': '0.13.0' - }} + }} \ No newline at end of file diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 98e3ffcf74ad1..c0b20e2d2843b 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -210,7 +210,7 @@ will .. _extending.extension.ufunc: -NumPy Universal Functions +NumPy universal functions ^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`Series` implements ``__array_ufunc__``. As part of the implementation, @@ -501,4 +501,4 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at -https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. +https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. \ No newline at end of file diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 9ae9d47b89341..9f9e9dc2631f3 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -1,7 +1,7 @@ .. _maintaining: ****************** -Pandas Maintenance +pandas maintenance ****************** This guide is for pandas' maintainers. It may also be interesting to contributors @@ -41,7 +41,7 @@ reading. .. _maintaining.triage: -Issue Triage +Issue triage ------------ @@ -123,7 +123,7 @@ Here's a typical workflow for triaging a newly opened issue. .. _maintaining.closing: -Closing Issues +Closing issues -------------- Be delicate here: many people interpret closing an issue as us saying that the @@ -132,7 +132,7 @@ respond or self-close their issue if it's determined that the behavior is not a or the feature is out of scope. Sometimes reporters just go away though, and we'll close the issue after the conversation has died. -Reviewing Pull Requests +Reviewing pull requests ----------------------- Anybody can review a pull request: regular contributors, triagers, or core-team @@ -144,7 +144,7 @@ members. Here are some guidelines to check. * User-facing changes should have a whatsnew in the appropriate file. * Regression tests should reference the original GitHub issue number like ``# GH-1234``. -Cleaning up old Issues +Cleaning up old issues ---------------------- Every open issue in pandas has a cost. Open issues make finding duplicates harder, @@ -164,7 +164,7 @@ If an older issue lacks a reproducible example, label it as "Needs Info" and ask them to provide one (or write one yourself if possible). If one isn't provide reasonably soon, close it according to the policies in :ref:`maintaining.closing`. -Cleaning up old Pull Requests +Cleaning up old pull requests ----------------------------- Occasionally, contributors are unable to finish off a pull request. diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst index 803f1b7002de0..35826af5912c2 100644 --- a/doc/source/development/meeting.rst +++ b/doc/source/development/meeting.rst @@ -1,7 +1,7 @@ .. _meeting: ================== -Developer Meetings +Developer meetings ================== We hold regular developer meetings on the second Wednesday @@ -29,4 +29,3 @@ You can subscribe to this calendar with the following links: Additionally, we'll sometimes have one-off meetings on specific topics. These will be published on the same calendar. - diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 0d9e0b0f4c668..575b82b4b06de 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -108,3 +108,11 @@ Scalar introspection api.types.is_re api.types.is_re_compilable api.types.is_scalar + +Bug report function +------------------- +.. autosummary:: + :toctree: api/ + + show_versions + diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 5817efb31814e..398336960e769 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -140,7 +140,7 @@ More information can be found in the `ipython documentation .. _options.frequently_used: -Frequently Used Options +Frequently used options ----------------------- The following is a walk-through of the more frequently used display options. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 58733b852e3a1..7e890962d8da1 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -272,7 +272,7 @@ the right thing: .. _reshaping.melt: -Reshaping by Melt +Reshaping by melt ----------------- .. image:: ../_static/reshaping_melt.png diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 234c12ce79822..bea0f42f6849c 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -8,7 +8,7 @@ Working with text data .. _text.types: -Text Data Types +Text data types --------------- .. versionadded:: 1.0.0 @@ -113,7 +113,7 @@ Everything else that follows in the rest of this document applies equally to .. _text.string_methods: -String Methods +String methods -------------- Series and Index are equipped with a set of string processing methods @@ -633,7 +633,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0). pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups) -Testing for Strings that match or contain a pattern +Testing for strings that match or contain a pattern --------------------------------------------------- You can check whether elements contain a pattern: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index f208c8d576131..0d49a2d8db77c 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -122,7 +122,7 @@ as ``np.nan`` does for float data. .. _timeseries.representation: -Timestamps vs. Time Spans +Timestamps vs. time spans ------------------------- Timestamped data is the most basic type of time series data that associates @@ -1434,7 +1434,7 @@ or calendars with additional rules. .. _timeseries.advanced_datetime: -Time Series-Related Instance Methods +Time series-related instance methods ------------------------------------ Shifting / lagging diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 756dd06aced7f..451ddf046416e 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -796,7 +796,7 @@ before plotting. .. _visualization.tools: -Plotting Tools +Plotting tools -------------- These functions can be imported from ``pandas.plotting`` @@ -1045,7 +1045,7 @@ for more information. .. _visualization.formatting: -Plot Formatting +Plot formatting --------------- Setting the plot style diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8db837a38170b..25f847c698278 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -313,7 +313,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g. ``UTC``) would not be parsed correctly (:issue:`33133`) - @@ -471,6 +471,7 @@ Other - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) +- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a318bea14b52b..6fa9159c469c2 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -8,8 +8,7 @@ cnp.import_array() import pytz # stdlib datetime imports -from datetime import time as datetime_time -from cpython.datetime cimport (datetime, tzinfo, +from cpython.datetime cimport (datetime, time, tzinfo, PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT) PyDateTime_IMPORT @@ -284,7 +283,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit, return convert_datetime_to_tsobject(ts, tz, nanos) elif PyDate_Check(ts): # Keep the converter same as PyDateTime's - ts = datetime.combine(ts, datetime_time()) + ts = datetime.combine(ts, time()) return convert_datetime_to_tsobject(ts, tz) elif getattr(ts, '_typ', None) == 'period': raise ValueError("Cannot convert Period to Timestamp " diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a66c9cd86d00c..306636278bcbe 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -462,7 +462,7 @@ class _BaseOffset: def _validate_n(self, n): """ - Require that `n` be a nonzero integer. + Require that `n` be an integer. Parameters ---------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 74b95a2f3076f..5272a0a042d0e 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -805,6 +805,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, (('second',), '%S', 2), (('microsecond',), '%f', 6), (('second', 'microsecond'), '%S.%f', 0), + (('tzinfo',), '%Z', 0), ] if dayfirst: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c3a47902cff0f..dd745f840d0ab 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,5 +1,3 @@ -from datetime import datetime - from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE from numpy cimport int64_t, import_array, ndarray @@ -13,6 +11,7 @@ from libc.string cimport strlen, memset import cython from cpython.datetime cimport ( + datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 64b79200028b6..7858072407a35 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -5,8 +5,7 @@ cimport numpy as cnp from numpy cimport int64_t cnp.import_array() -from datetime import time as datetime_time, timedelta -from cpython.datetime cimport (datetime, PyDateTime_Check, +from cpython.datetime cimport (datetime, time, PyDateTime_Check, PyDelta_Check, PyTZInfo_Check, PyDateTime_IMPORT) PyDateTime_IMPORT @@ -33,7 +32,7 @@ from pandas._libs.tslibs.tzconversion import ( # ---------------------------------------------------------------------- # Constants -_zero_time = datetime_time(0, 0) +_zero_time = time(0, 0) _no_input = object() # ---------------------------------------------------------------------- @@ -879,8 +878,7 @@ default 'raise' raise ValueError('Cannot infer offset with only one time.') nonexistent_options = ('raise', 'NaT', 'shift_forward', 'shift_backward') - if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta): + if nonexistent not in nonexistent_options and not PyDelta_Check(nonexistent): raise ValueError( "The nonexistent argument must be one of 'raise', " "'NaT', 'shift_forward', 'shift_backward' or a timedelta object" diff --git a/pandas/_testing.py b/pandas/_testing.py index 1f6b645c821c8..8522ccf1b6bab 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -821,7 +821,7 @@ def assert_categorical_equal( if check_category_order: assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") assert_numpy_array_equal( - left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes" ) else: try: @@ -830,9 +830,7 @@ def assert_categorical_equal( except TypeError: # e.g. '<' not supported between instances of 'int' and 'str' lc, rc = left.categories, right.categories - assert_index_equal( - lc, rc, obj=f"{obj}.categories", - ) + assert_index_equal(lc, rc, obj=f"{obj}.categories") assert_index_equal( left.categories.take(left.codes), right.categories.take(right.codes), @@ -974,7 +972,7 @@ def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape, + obj, f"{obj} shapes are different", left.shape, right.shape ) diff = 0 @@ -1328,7 +1326,7 @@ def assert_frame_equal( # shape comparison if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}" ) if check_like: diff --git a/pandas/conftest.py b/pandas/conftest.py index 2ee64403c7cf4..0b14c12f2356f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -526,6 +526,64 @@ def empty_frame(): return DataFrame() +@pytest.fixture +def int_frame(): + """ + Fixture for DataFrame of ints with index of unique strings + + Columns are ['A', 'B', 'C', 'D'] + + A B C D + vpBeWjM651 1 0 1 0 + 5JyxmrP1En -1 0 0 0 + qEDaoD49U2 -1 1 0 0 + m66TkTfsFe 0 0 0 0 + EHPaNzEUFm -1 0 -1 0 + fpRJCevQhi 2 0 0 0 + OlQvnmfi3Q 0 0 -2 0 + ... .. .. .. .. + uB1FPlz4uP 0 0 0 1 + EcSe6yNzCU 0 0 -1 0 + L50VudaiI8 -1 1 -2 0 + y3bpw4nwIp 0 -1 0 0 + H0RdLLwrCT 1 1 0 0 + rY82K0vMwm 0 0 0 0 + 1OPIUjnkjk 2 0 0 0 + + [30 rows x 4 columns] + """ + return DataFrame(tm.getSeriesData()).astype("int64") + + +@pytest.fixture +def datetime_frame(): + """ + Fixture for DataFrame of floats with DatetimeIndex + + Columns are ['A', 'B', 'C', 'D'] + + A B C D + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 + ... ... ... ... ... + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 + + [30 rows x 4 columns] + """ + return DataFrame(tm.getTimeSeriesData()) + + @pytest.fixture def float_frame(): """ diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index fc40f1db1918a..b6ca19bde8009 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -211,7 +211,9 @@ def _register_accessor(name, cls): See Also -------- - {others} + register_dataframe_accessor : Register a custom accessor on DataFrame objects. + register_series_accessor : Register a custom accessor on Series objects. + register_index_accessor : Register a custom accessor on Index objects. Notes ----- @@ -279,33 +281,21 @@ def decorator(accessor): return decorator -@doc( - _register_accessor, - klass="DataFrame", - others="register_series_accessor, register_index_accessor", -) +@doc(_register_accessor, klass="DataFrame") def register_dataframe_accessor(name): from pandas import DataFrame return _register_accessor(name, DataFrame) -@doc( - _register_accessor, - klass="Series", - others="register_dataframe_accessor, register_index_accessor", -) +@doc(_register_accessor, klass="Series") def register_series_accessor(name): from pandas import Series return _register_accessor(name, Series) -@doc( - _register_accessor, - klass="Index", - others="register_dataframe_accessor, register_series_accessor", -) +@doc(_register_accessor, klass="Index") def register_index_accessor(name): from pandas import Index diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c11d879840fb9..edc138574830d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -416,12 +416,12 @@ def categories(self): See Also -------- - rename_categories - reorder_categories - add_categories - remove_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ return self.dtype.categories @@ -830,11 +830,11 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal See Also -------- - rename_categories - reorder_categories - add_categories - remove_categories - remove_unused_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. """ inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: @@ -901,11 +901,11 @@ def rename_categories(self, new_categories, inplace=False): See Also -------- - reorder_categories - add_categories - remove_categories - remove_unused_categories - set_categories + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. Examples -------- @@ -969,11 +969,11 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): See Also -------- - rename_categories - add_categories - remove_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") if set(self.dtype.categories) != set(new_categories): @@ -1009,11 +1009,11 @@ def add_categories(self, new_categories, inplace=False): See Also -------- - rename_categories - reorder_categories - remove_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(new_categories): @@ -1058,11 +1058,11 @@ def remove_categories(self, removals, inplace=False): See Also -------- - rename_categories - reorder_categories - add_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(removals): @@ -1100,11 +1100,11 @@ def remove_unused_categories(self, inplace=False): See Also -------- - rename_categories - reorder_categories - add_categories - remove_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 4f3c68aa03b16..d79f2145a91f3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -124,7 +124,7 @@ def __from_arrow__( return IntegerArray._concat_same_type(results) -def integer_array(values, dtype=None, copy: bool = False,) -> "IntegerArray": +def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray": """ Infer and return an integer array of the values. @@ -168,7 +168,7 @@ def safe_cast(values, dtype, copy: bool): def coerce_to_array( - values, dtype, mask=None, copy: bool = False, + values, dtype, mask=None, copy: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index cf6c16d4cad5d..627135bba5f36 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -61,7 +61,7 @@ def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: return type(self)(~self._data, self._mask) def to_numpy( - self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default, + self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default ) -> np.ndarray: """ Convert to a NumPy Array. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 3058e1d6073f3..6699511fdb885 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -281,7 +281,7 @@ def isna(self) -> np.ndarray: return isna(self._ndarray) def fillna( - self, value=None, method: Optional[str] = None, limit: Optional[int] = None, + self, value=None, method: Optional[str] = None, limit: Optional[int] = None ) -> "PandasArray": # TODO(_values_for_fillna): remove this value, method = validate_fillna_kwargs(value, method) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index be9cc53d33d6f..426b3710a13f9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -606,7 +606,7 @@ def _sub_period(self, other): return new_data def _addsub_int_array( - self, other: np.ndarray, op: Callable[[Any, Any], Any], + self, other: np.ndarray, op: Callable[[Any, Any], Any] ) -> "PeriodArray": """ Add or subtract array of integers; equivalent to applying diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 8021e0babe4e0..8c09aa9176f31 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -232,7 +232,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'integer', 'block'}, default 'integer' + kind : {'int', 'block'}, default 'int' The type of storage for sparse locations. * 'block': Stores a `block` and `block_length` for each diff --git a/pandas/core/base.py b/pandas/core/base.py index 34a3276d03c37..a28a2c9594341 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -13,7 +13,6 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc -from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( @@ -1505,18 +1504,14 @@ def factorize(self, sort=False, na_sentinel=-1): def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) - def drop_duplicates(self, keep="first", inplace=False): - inplace = validate_bool_kwarg(inplace, "inplace") + def drop_duplicates(self, keep="first"): if isinstance(self, ABCIndexClass): if self.is_unique: return self._shallow_copy() duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] - if inplace: - return self._update_inplace(result) - else: - return result + return result def duplicated(self, keep="first"): if isinstance(self, ABCIndexClass): @@ -1527,9 +1522,3 @@ def duplicated(self, keep="first"): return self._constructor( duplicated(self, keep=keep), index=self.index ).__finalize__(self) - - # ---------------------------------------------------------------------- - # abstracts - - def _update_inplace(self, result, verify_is_copy=True, **kwargs): - raise AbstractMethodError(self) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d29102cbd4604..ef681cb204598 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -194,7 +194,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): See Also -------- - Categorical + Categorical : Represent a categorical variable in classic R / S-plus fashion. Notes ----- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index d461db2d05f9d..f7b0615366ba0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -16,30 +16,24 @@ ensure_object, is_bool_dtype, is_complex_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, - is_period_dtype, is_scalar, is_string_dtype, is_string_like_dtype, - is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeArray, ABCExtensionArray, ABCIndexClass, ABCMultiIndex, ABCSeries, - ABCTimedeltaArray, ) from pandas.core.dtypes.inference import is_list_like @@ -139,17 +133,7 @@ def _isna_new(obj): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, type): return False - elif isinstance( - obj, - ( - ABCSeries, - np.ndarray, - ABCIndexClass, - ABCExtensionArray, - ABCDatetimeArray, - ABCTimedeltaArray, - ), - ): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)): return _isna_ndarraylike(obj) elif isinstance(obj, ABCDataFrame): return obj.isna() @@ -158,7 +142,7 @@ def _isna_new(obj): elif hasattr(obj, "__array__"): return _isna_ndarraylike(np.asarray(obj)) else: - return obj is None + return False def _isna_old(obj): @@ -189,7 +173,7 @@ def _isna_old(obj): elif hasattr(obj, "__array__"): return _isna_ndarraylike_old(np.asarray(obj)) else: - return obj is None + return False _isna = _isna_new @@ -224,37 +208,14 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): - is_extension = is_extension_array_dtype(obj) - - if not is_extension: - # Avoid accessing `.values` on things like - # PeriodIndex, which may be expensive. - values = getattr(obj, "_values", obj) - else: - values = obj - + is_extension = is_extension_array_dtype(obj.dtype) + values = getattr(obj, "_values", obj) dtype = values.dtype if is_extension: - if isinstance(obj, (ABCIndexClass, ABCSeries)): - values = obj._values - else: - values = obj result = values.isna() - elif isinstance(obj, ABCDatetimeArray): - return obj.isna() elif is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - # object array of strings - result = np.zeros(values.shape, dtype=bool) - else: - # object array of non-strings - result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj(values.ravel()) - result[...] = vec.reshape(shape) + result = _isna_string_dtype(values, dtype, old=False) elif needs_i8_conversion(dtype): # this is the NaT pattern @@ -274,17 +235,9 @@ def _isna_ndarraylike_old(obj): dtype = values.dtype if is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj_old(values.ravel()) - result[:] = vec.reshape(shape) + result = _isna_string_dtype(values, dtype, old=True) - elif is_datetime64_dtype(dtype): + elif needs_i8_conversion(dtype): # this is the NaT pattern result = values.view("i8") == iNaT else: @@ -297,6 +250,24 @@ def _isna_ndarraylike_old(obj): return result +def _isna_string_dtype(values: np.ndarray, dtype: np.dtype, old: bool) -> np.ndarray: + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + if old: + vec = libmissing.isnaobj_old(values.ravel()) + else: + vec = libmissing.isnaobj(values.ravel()) + + result[...] = vec.reshape(shape) + + return result + + def notna(obj): """ Detect non-missing values for an array-like object. @@ -556,12 +527,7 @@ def na_value_for_dtype(dtype, compat: bool = True): if is_extension_array_dtype(dtype): return dtype.na_value - if ( - is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_period_dtype(dtype) - ): + if needs_i8_conversion(dtype): return NaT elif is_float_dtype(dtype): return np.nan diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8ada8e830b834..a4039aaa510aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3042,16 +3042,16 @@ def query(self, expr, inplace=False, **kwargs): res = self.eval(expr, **kwargs) try: - new_data = self.loc[res] + result = self.loc[res] except ValueError: # when res is multi-dimensional loc raises, but this is sometimes a # valid query - new_data = self[res] + result = self[res] if inplace: - self._update_inplace(new_data) + self._update_inplace(result) else: - return new_data + return result def eval(self, expr, inplace=False, **kwargs): """ @@ -4684,7 +4684,7 @@ def drop_duplicates( result.index = ibase.default_index(len(result)) if inplace: - self._update_inplace(result._data) + self._update_inplace(result) return None else: return result @@ -4803,10 +4803,11 @@ def sort_values( if ignore_index: new_data.axes[1] = ibase.default_index(len(indexer)) + result = self._constructor(new_data) if inplace: - return self._update_inplace(new_data) + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self) def sort_index( self, @@ -4938,10 +4939,11 @@ def sort_index( if ignore_index: new_data.axes[1] = ibase.default_index(len(indexer)) + result = self._constructor(new_data) if inplace: - return self._update_inplace(new_data) + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self) def value_counts( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62f5419c1f4c8..143e4543e7ab8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -147,7 +147,7 @@ def _single_replace(self, to_replace, method, inplace, limit): result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) if inplace: - self._update_inplace(result._data) + self._update_inplace(result) return return result @@ -988,7 +988,7 @@ def rename( result._clear_item_cache() if inplace: - self._update_inplace(result._data) + self._update_inplace(result) return None else: return result.__finalize__(self) @@ -3957,15 +3957,15 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: Parameters ---------- + result : same type as self verify_is_copy : bool, default True Provide is_copy checks. """ # NOTE: This does *not* call __finalize__ and that's an explicit # decision that we may revisit in the future. - self._reset_cache() self._clear_item_cache() - self._data = getattr(result, "_data", result) + self._data = result._data self._maybe_update_cacher(verify_is_copy=verify_is_copy) def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: @@ -6107,15 +6107,15 @@ def fillna( value=value, limit=limit, inplace=inplace, downcast=downcast ) elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value) + new_data = self.where(self.notna(), value)._data else: raise ValueError(f"invalid fill value with a {type(value)}") + result = self._constructor(new_data) if inplace: - self._update_inplace(new_data) - return None + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self) def ffill( self: FrameOrSeries, @@ -6627,10 +6627,11 @@ def replace( f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' ) + result = self._constructor(new_data) if inplace: - self._update_inplace(new_data) + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self) _shared_docs[ "interpolate" @@ -7231,7 +7232,7 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): result[mask] = np.nan if inplace: - self._update_inplace(result) + return self._update_inplace(result) else: return result @@ -8641,7 +8642,8 @@ def _where( new_data = self._data.putmask( mask=cond, new=other, align=align, axis=block_axis, ) - self._update_inplace(new_data) + result = self._constructor(new_data) + return self._update_inplace(result) else: new_data = self._data.where( @@ -8652,8 +8654,8 @@ def _where( try_cast=try_cast, axis=block_axis, ) - - return self._constructor(new_data).__finalize__(self) + result = self._constructor(new_data) + return result.__finalize__(self) _shared_docs[ "where" diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 0c5d2658978b4..15d8a996b6e7b 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -2,10 +2,4 @@ from pandas.core.groupby.groupby import GroupBy from pandas.core.groupby.grouper import Grouper -__all__ = [ - "DataFrameGroupBy", - "NamedAgg", - "SeriesGroupBy", - "GroupBy", - "Grouper", -] +__all__ = ["DataFrameGroupBy", "NamedAgg", "SeriesGroupBy", "GroupBy", "Grouper"] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ebdb0062491be..dff712ee17ea6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -211,7 +211,7 @@ class providing the base-class of operations. Parameters ---------- -func : callable or tuple of (callable, string) +func : callable or tuple of (callable, str) Function to apply to this %(klass)s object or, alternatively, a `(callable, data_keyword)` tuple where `data_keyword` is a string indicating the keyword of `callable` that expects the diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2a5dfff35e4a5..d487516ab4abe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -280,7 +280,7 @@ def _outer_indexer(self, left, right): # Constructors def __new__( - cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs, + cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs ) -> "Index": from pandas.core.indexes.range import RangeIndex @@ -534,10 +534,6 @@ def _shallow_copy_with_infer(self, values, **kwargs): attributes.pop("tz", None) return Index(values, **attributes) - def _update_inplace(self, result, **kwargs): - # guard when called from IndexOpsMixin - raise TypeError("Index can't be updated inplace") - def is_(self, other) -> bool: """ More flexible, faster check like ``is`` but that works through views. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index d5df661efa692..f54b93b52b4a8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -184,10 +184,10 @@ def func(intvidx_self, other, sort=False): ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) @inherit_names( - ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray, + ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray ) @inherit_names( - ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True, + ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True ) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b00af4653dfe3..0338a0de7d83e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1584,7 +1584,8 @@ def to_frame(self, index=True, name=None): See Also -------- - DataFrame + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous + tabular data. """ from pandas import DataFrame diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index e2be58a56018d..6714b36638f44 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -462,7 +462,7 @@ def isin(self, values, level=None): def _is_compatible_with_other(self, other) -> bool: return super()._is_compatible_with_other(other) or all( isinstance( - obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex) ) for obj in [self, other] ) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0646acab57081..8aaf828787179 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -324,8 +324,8 @@ def _formatter_func(self): @cache_readonly def _engine(self): - # To avoid a reference cycle, pass a weakref of self to _engine_type. - period = weakref.ref(self) + # To avoid a reference cycle, pass a weakref of self._values to _engine_type. + period = weakref.ref(self._values) return self._engine_type(period, len(self)) @doc(Index.__contains__) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b463b8d738d30..b1dc83823aa08 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -86,7 +86,7 @@ class RangeIndex(Int64Index): # Constructors def __new__( - cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None ): cls._validate_dtype(dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 778e2866f1eff..7f541537f2de1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1631,9 +1631,7 @@ def set(self, locs, values): assert locs.tolist() == [0] self.values[:] = values - def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, - ) -> List["Block"]: + def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): """ See Block.putmask.__doc__ """ @@ -1816,7 +1814,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: return super().diff(n, axis) def shift( - self, periods: int, axis: int = 0, fill_value: Any = None, + self, periods: int, axis: int = 0, fill_value: Any = None ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 822ab775e7e46..74f45f12e5763 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1308,7 +1308,7 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") def nancorr( - a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, + a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None ): """ a, b: ndarrays diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 10dcb59977cdd..50447624151da 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -80,14 +80,7 @@ } -COMPARISON_BINOPS: Set[str] = { - "eq", - "ne", - "lt", - "gt", - "le", - "ge", -} +COMPARISON_BINOPS: Set[str] = {"eq", "ne", "lt", "gt", "le", "ge"} # ----------------------------------------------------------------------------- # Ops Wrapping Utilities @@ -385,9 +378,7 @@ def _align_method_SERIES(left, right, align_asobject=False): return left, right -def _construct_result( - left: ABCSeries, result: ArrayLike, index: ABCIndexClass, name, -): +def _construct_result(left: ABCSeries, result: ArrayLike, index: ABCIndexClass, name): """ Construct an appropriately-labelled Series from the result of an op. diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 5dd7af454cbd1..228dbd4159f2c 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -257,7 +257,7 @@ def _can_broadcast(lvalues, rvalues) -> bool: def comparison_op( - left: ArrayLike, right: Any, op, str_rep: Optional[str] = None, + left: ArrayLike, right: Any, op, str_rep: Optional[str] = None ) -> ArrayLike: """ Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index 2463a1f58a447..e44cf2f6c533c 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -76,3 +76,36 @@ def should_series_dispatch(left, right, op): return True return False + + +def dispatch_to_extension_op( + op, left: Union[ABCExtensionArray, np.ndarray], right: Any +): + """ + Assume that left or right is a Series backed by an ExtensionArray, + apply the operator defined by op. + + Parameters + ---------- + op : binary operator + left : ExtensionArray or np.ndarray + right : object + + Returns + ------- + ExtensionArray or np.ndarray + 2-tuple of these if op is divmod or rdivmod + """ + # NB: left and right should already be unboxed, so neither should be + # a Series or Index. + + if left.dtype.kind in "mM" and isinstance(left, np.ndarray): + # We need to cast datetime64 and timedelta64 ndarrays to + # DatetimeArray/TimedeltaArray. But we avoid wrapping others in + # PandasArray as that behaves poorly with e.g. IntegerArray. + left = array(left) + + # The op calls will raise TypeError if the op is not defined + # on the ExtensionArray + res_values = op(left, right) + return res_values diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index 0cf1ac4d107f6..7c63bc43de67e 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -98,7 +98,7 @@ def f(self, other): # this makes sure that we are aligned like the input # we are updating inplace so we want to ignore is_copy self._update_inplace( - result.reindex_like(self, copy=False)._data, verify_is_copy=False + result.reindex_like(self, copy=False), verify_is_copy=False ) return self diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index efb72d1b61d1f..fbc25a0dd97db 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -395,7 +395,7 @@ def __init__( # Need to flip BlockManager axis in the DataFrame special case self._is_frame = isinstance(sample, ABCDataFrame) if self._is_frame: - axis = 1 if axis == 0 else 0 + axis = DataFrame._get_block_manager_axis(axis) self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: @@ -436,7 +436,8 @@ def __init__( self.objs.append(obj) # note: this is the BlockManager axis (since DataFrame is transposed) - self.axis = axis + self.bm_axis = axis + self.axis = 1 - self.bm_axis if self._is_frame else 0 self.keys = keys self.names = names or getattr(keys, "names", None) self.levels = levels @@ -454,7 +455,7 @@ def get_result(self): if self._is_series: # stack blocks - if self.axis == 0: + if self.bm_axis == 0: name = com.consensus_name_attr(self.objs) mgr = self.objs[0]._data.concat( @@ -477,21 +478,22 @@ def get_result(self): else: mgrs_indexers = [] for obj in self.objs: - mgr = obj._data indexers = {} for ax, new_labels in enumerate(self.new_axes): - if ax == self.axis: + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == self.bm_axis: # Suppress reindexing on concat axis continue - obj_labels = mgr.axes[ax] + # 1-ax to convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) if not self.copy: new_data._consolidate_inplace() @@ -500,7 +502,7 @@ def get_result(self): return cons(new_data).__finalize__(self, method="concat") def _get_result_dim(self) -> int: - if self._is_series and self.axis == 1: + if self._is_series and self.bm_axis == 1: return 2 else: return self.objs[0].ndim @@ -508,7 +510,7 @@ def _get_result_dim(self) -> int: def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() return [ - self._get_concat_axis() if i == self.axis else self._get_comb_axis(i) + self._get_concat_axis() if i == self.bm_axis else self._get_comb_axis(i) for i in range(ndim) ] @@ -527,7 +529,7 @@ def _get_concat_axis(self) -> Index: Return index to be used along concatenation axis. """ if self._is_series: - if self.axis == 0: + if self.bm_axis == 0: indexes = [x.index for x in self.objs] elif self.ignore_index: idx = ibase.default_index(len(self.objs)) @@ -555,7 +557,7 @@ def _get_concat_axis(self) -> Index: else: return ensure_index(self.keys).set_names(self.names) else: - indexes = [x._data.axes[self.axis] for x in self.objs] + indexes = [x.axes[self.axis] for x in self.objs] if self.ignore_index: idx = ibase.default_index(sum(len(i) for i in indexes)) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4b1fd73d9950e..e78d5ccaa30c7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -596,7 +596,11 @@ def __init__( self.left = self.orig_left = _left self.right = self.orig_right = _right self.how = how - self.axis = axis + + # bm_axis -> the axis on the BlockManager + self.bm_axis = axis + # axis --> the axis on the Series/DataFrame + self.axis = 1 - axis if self.left.ndim == 2 else 0 self.on = com.maybe_make_list(on) self.left_on = com.maybe_make_list(left_on) @@ -664,18 +668,17 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes llabels, rlabels = _items_overlap_with_suffix( - ldata.items, lsuf, rdata.items, rsuf + self.left._info_axis, lsuf, self.right._info_axis, rsuf ) lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} result_data = concatenate_block_managers( - [(ldata, lindexers), (rdata, rindexers)], + [(self.left._data, lindexers), (self.right._data, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, copy=self.copy, @@ -864,8 +867,8 @@ def _get_join_indexers(self): ) def _get_join_info(self): - left_ax = self.left._data.axes[self.axis] - right_ax = self.right._data.axes[self.axis] + left_ax = self.left.axes[self.axis] + right_ax = self.right.axes[self.axis] if self.left_index and self.right_index and self.how != "asof": join_index, left_indexer, right_indexer = left_ax.join( @@ -1478,12 +1481,10 @@ def __init__( def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - # this is a bit kludgy - ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes llabels, rlabels = _items_overlap_with_suffix( - ldata.items, lsuf, rdata.items, rsuf + self.left._info_axis, lsuf, self.right._info_axis, rsuf ) if self.fill_method == "ffill": @@ -1497,7 +1498,7 @@ def get_result(self): rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} result_data = concatenate_block_managers( - [(ldata, lindexers), (rdata, rindexers)], + [(self.left._data, lindexers), (self.right._data, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, copy=self.copy, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b3b0166334413..1a5752043b33e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -228,7 +228,7 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name, + table, data, values, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -297,7 +297,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", + table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins diff --git a/pandas/core/series.py b/pandas/core/series.py index 7e575bf9e6115..03b82365358ac 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -418,10 +418,6 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: # The ensure_index call aabove ensures we have an Index object self._data.set_axis(axis, labels) - def _update_inplace(self, result, **kwargs): - # we want to call the generic version and not the IndexOpsMixin - return generic.NDFrame._update_inplace(self, result, **kwargs) - # ndarray compatibility @property def dtype(self) -> DtypeObj: @@ -1800,7 +1796,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep="first", inplace=False) -> "Series": + def drop_duplicates(self, keep="first", inplace=False) -> Optional["Series"]: """ Return Series with duplicate values removed. @@ -1875,7 +1871,13 @@ def drop_duplicates(self, keep="first", inplace=False) -> "Series": 5 hippo Name: animal, dtype: object """ - return super().drop_duplicates(keep=keep, inplace=inplace) + inplace = validate_bool_kwarg(inplace, "inplace") + result = super().drop_duplicates(keep=keep) + if inplace: + self._update_inplace(result) + return None + else: + return result def duplicated(self, keep="first") -> "Series": """ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a2042de7f337e..207c5cc98449a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -606,9 +606,9 @@ def to_datetime( would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : bool, default False If True and no `format` is given, attempt to infer the format of the - datetime strings, and if it can be inferred, switch to a faster - method of parsing them. In some cases this can increase the parsing - speed by ~5-10x. + datetime strings based on the first non-NaN element, + and if it can be inferred, switch to a faster method of parsing them. + In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index a6198f8b752ae..f4eb16602f8a0 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -40,13 +40,13 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. - downcast : {'integer', 'signed', 'unsigned', 'float'}, default None + downcast : {'int', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'int' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 921cdb3c2523f..70298d5df3606 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -35,7 +35,7 @@ class BaseIndexer: """Base class for window bounds calculations.""" def __init__( - self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs ): """ Parameters @@ -100,7 +100,7 @@ def get_window_bounds( ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array, + num_values, self.window_size, min_periods, center, closed, self.index_array ) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 5d35ec7457ab0..aec294c3c84c2 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -57,7 +57,7 @@ def generate_numba_apply_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( - values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int ) -> np.ndarray: result = np.empty(len(begin)) for i in loop_range(len(result)): diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index b40d2a57b8106..4d6f03489725f 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -20,9 +20,7 @@ def expand(self, prop, value: str): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn( - f'Could not expand "{prop}: {value}"', CSSWarning, - ) + warnings.warn(f'Could not expand "{prop}: {value}"', CSSWarning) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] @@ -117,10 +115,7 @@ def __call__(self, declarations_str, inherited=None): props[prop] = self.size_to_pt( props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS ) - for prop in [ - f"margin-{side}", - f"padding-{side}", - ]: + for prop in [f"margin-{side}", f"padding-{side}"]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index aac1df5dcd396..527f6d738d95a 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -141,8 +141,7 @@ def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: return { side: { "style": self._border_style( - props.get(f"border-{side}-style"), - props.get(f"border-{side}-width"), + props.get(f"border-{side}-style"), props.get(f"border-{side}-width") ), "color": self.color_to_excel(props.get(f"border-{side}-color")), } diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 718534e42ec25..3feae8b0a30dd 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -292,10 +292,7 @@ def format_attr(pair): # ... except maybe the last for columns.names name = self.data.columns.names[r] - cs = [ - BLANK_CLASS if name is None else INDEX_NAME_CLASS, - f"level{r}", - ] + cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, f"level{r}"] name = BLANK_VALUE if name is None else name row_es.append( { @@ -309,11 +306,7 @@ def format_attr(pair): if clabels: for c, value in enumerate(clabels[r]): - cs = [ - COL_HEADING_CLASS, - f"level{r}", - f"col{c}", - ] + cs = [COL_HEADING_CLASS, f"level{r}", f"col{c}"] cs.extend( cell_context.get("col_headings", {}).get(r, {}).get(c, []) ) @@ -357,11 +350,7 @@ def format_attr(pair): for r, idx in enumerate(self.data.index): row_es = [] for c, value in enumerate(rlabels[r]): - rid = [ - ROW_HEADING_CLASS, - f"level{c}", - f"row{r}", - ] + rid = [ROW_HEADING_CLASS, f"level{c}", f"row{r}"] es = { "type": "th", "is_visible": (_is_visible(r, c, idx_lengths) and not hidden_index), diff --git a/pandas/io/html.py b/pandas/io/html.py index 9efdacadce83e..ce6674ffb9588 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1036,7 +1036,7 @@ def read_html( See Also -------- - read_csv + read_csv : Read a comma-separated values (csv) file into DataFrame. Notes ----- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index ea79efd0579e5..b556732e4d116 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -12,7 +12,7 @@ def read_orc( - path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs, + path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs ) -> "DataFrame": """ Load an ORC object from the file path, returning a DataFrame. diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9ae9729fc05ee..281ba1fc9e6c2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -105,9 +105,7 @@ def write( **kwargs, ) else: - self.api.parquet.write_table( - table, path, compression=compression, **kwargs, - ) + self.api.parquet.write_table(table, path, compression=compression, **kwargs) def read(self, path, columns=None, **kwargs): path, _, _, should_close = get_filepath_or_buffer(path) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8c213803170a3..189aa1e8fab6c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2824,7 +2824,7 @@ def read_index_node( # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: - data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) + data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) kind = _ensure_decoded(node._v_attrs.kind) name = None @@ -3564,10 +3564,7 @@ def _read_axes( for a in self.axes: a.set_info(self.info) res = a.convert( - values, - nan_rep=self.nan_rep, - encoding=self.encoding, - errors=self.errors, + values, nan_rep=self.nan_rep, encoding=self.encoding, errors=self.errors ) results.append(res) @@ -3990,7 +3987,7 @@ def create_description( return d def read_coordinates( - self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + self, where=None, start: Optional[int] = None, stop: Optional[int] = None ): """ select coordinates (row numbers) from a table; return the @@ -4259,7 +4256,7 @@ def write_data_chunk( self.table.flush() def delete( - self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + self, where=None, start: Optional[int] = None, stop: Optional[int] = None ): # delete all rows (and return the nrows) @@ -4690,7 +4687,7 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index if inferred_type == "date": converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) return IndexCol( - name, converted, "date", _tables().Time32Col(), index_name=index_name, + name, converted, "date", _tables().Time32Col(), index_name=index_name ) elif inferred_type == "string": @@ -4706,13 +4703,13 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index elif inferred_type in ["integer", "floating"]: return IndexCol( - name, values=converted, kind=kind, typ=atom, index_name=index_name, + name, values=converted, kind=kind, typ=atom, index_name=index_name ) else: assert isinstance(converted, np.ndarray) and converted.dtype == object assert kind == "object", kind atom = _tables().ObjectAtom() - return IndexCol(name, converted, kind, atom, index_name=index_name,) + return IndexCol(name, converted, kind, atom, index_name=index_name) def _unconvert_index( diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ff647040ebbfb..3891afa5e332d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -313,7 +313,7 @@ def read_sql_query( See Also -------- read_sql_table : Read SQL database table into a DataFrame. - read_sql + read_sql : Read SQL query or database table into a DataFrame. Notes ----- diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index d3db539084609..e466a215091ea 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1468,15 +1468,19 @@ def scatter(self, x, y, s=None, c=None, **kwargs): y : int or str The column name or column position to be used as vertical coordinates for each point. - s : scalar or array_like, optional + s : str, scalar or array_like, optional The size of each point. Possible values are: + - A string with the name of the column to be used for marker's size. + - A single scalar so all points have the same size. - A sequence of scalars, which will be used for each point's size recursively. For instance, when passing [2,14] all points size will be either 2 or 14, alternatively. + .. versionchanged:: 1.1.0 + c : str, int or array_like, optional The color of each point. Possible values are: diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 63d0b8abe59d9..bc8346fd48433 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -934,6 +934,8 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs): # hide the matplotlib default for size, in case we want to change # the handling of this argument later s = 20 + elif is_hashable(s) and s in data.columns: + s = data[s] super().__init__(data, x, y, s=s, **kwargs) if is_integer(c) and not self.data.columns.holds_integer(): c = self.data.columns[c] diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 0d24a785b0be2..30c5ba0ed94b6 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -29,10 +29,10 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): def register(): """ - Register Pandas Formatters and Converters with matplotlib. + Register pandas formatters and converters with matplotlib. This function modifies the global ``matplotlib.units.registry`` - dictionary. Pandas adds custom converters for + dictionary. pandas adds custom converters for * pd.Timestamp * pd.Period @@ -43,7 +43,7 @@ def register(): See Also -------- - deregister_matplotlib_converters + deregister_matplotlib_converters : Remove pandas formatters and converters. """ plot_backend = _get_plot_backend("matplotlib") plot_backend.register() @@ -51,7 +51,7 @@ def register(): def deregister(): """ - Remove pandas' formatters and converters. + Remove pandas formatters and converters. Removes the custom converters added by :func:`register`. This attempts to set the state of the registry back to the state before @@ -62,7 +62,8 @@ def deregister(): See Also -------- - register_matplotlib_converters + register_matplotlib_converters : Register pandas formatters and converters + with matplotlib. """ plot_backend = _get_plot_backend("matplotlib") plot_backend.deregister() @@ -155,7 +156,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): Parameters ---------- frame : `DataFrame` - Pandas object holding the data. + pandas object holding the data. class_column : str Column name containing the name of the data point category. ax : :class:`matplotlib.axes.Axes`, optional @@ -270,7 +271,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): Parameters ---------- series : pandas.Series - Pandas Series from where to get the samplings for the bootstrapping. + pandas Series from where to get the samplings for the bootstrapping. fig : matplotlib.figure.Figure, default None If given, it will use the `fig` reference for plotting instead of creating a new one with default parameters. diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index d7c312b2fda1b..14846cf252edc 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -153,9 +153,7 @@ def test_compare_scalar_other(self, op, array, other): expected = self.elementwise_comparison(op, array, other) tm.assert_numpy_array_equal(result, expected) - def test_compare_list_like_interval( - self, op, array, interval_constructor, - ): + def test_compare_list_like_interval(self, op, array, interval_constructor): # same endpoints other = interval_constructor(array.left, array.right) result = op(array, other) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 202e30287881f..85c38dcd3566b 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -97,7 +97,7 @@ class TestNumericArraylikeArithmeticWithDatetimeLike: # TODO: also check name retentention @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize( - "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype) ) def test_mul_td64arr(self, left, box_cls): # GH#22390 @@ -117,7 +117,7 @@ def test_mul_td64arr(self, left, box_cls): # TODO: also check name retentention @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize( - "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype) ) def test_div_td64arr(self, left, box_cls): # GH#22390 diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index b9ac3ce9a37ae..8b784fde1d3c5 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -43,9 +43,5 @@ def test_replace(to_replace, value, expected, flip_categories): # the replace call loses categorical dtype expected = pd.Series(np.asarray(expected)) - tm.assert_series_equal( - expected, result, check_category_order=False, - ) - tm.assert_series_equal( - expected, s, check_category_order=False, - ) + tm.assert_series_equal(expected, result, check_category_order=False) + tm.assert_series_equal(expected, s, check_category_order=False) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index cb3a70e934dcb..80562b9294126 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -676,16 +676,12 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[ - 4:, - ] # noqa: E231 + res = sparse[4:,] # noqa: E231 exp = SparseArray(dense[4:,]) # noqa: E231 tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[ - 4:, - ] # noqa: E231 + res = sparse[4:,] # noqa: E231 exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 tm.assert_sp_array_equal(res, exp) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index ad6e6e4a98057..117c42e17caf3 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -35,7 +35,7 @@ np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2]),), + (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -120,10 +120,10 @@ (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String (["a", None], "string", StringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None]),), + (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), - ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None]),), + ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -174,7 +174,7 @@ def test_array_copy(): period_array(["2000", "2001"], freq="D"), ), # interval - ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2]),), + ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])), # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py new file mode 100644 index 0000000000000..1a8b03562ead6 --- /dev/null +++ b/pandas/tests/arrays/test_boolean.py @@ -0,0 +1,934 @@ +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array +from pandas.tests.extension.base import BaseOpsUtil + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return pd.BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values must be a 1D array"): + BooleanArray(values.reshape(1, -1), mask) + + with pytest.raises(ValueError, match="mask must be a 1D array"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) + + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), + ([True, np.nan], [True, None]), + ([True, pd.NA], [True, None]), + ([np.nan, np.nan], [None, None]), + (np.array([np.nan, np.nan], dtype=float), [None, None]), + ], +) +def test_to_boolean_array_missing_indicators(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + np.array([1, 2]), + np.array([1.0, 2.0]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + msg = "Need to pass bool-like value" + with pytest.raises(TypeError, match=msg): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_from_integer_array(): + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_float_array(): + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + with pytest.raises(ValueError, match="values must be a 1D list-like"): + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="mask must be a 1D list-like"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + msg = ( + "cannot convert to 'bool'-dtype NumPy array with missing values. " + "Specify an appropriate 'na_value' for this dtype." + ) + with pytest.raises(ValueError, match=msg): + np.array(arr, dtype="bool") + + +def test_to_boolean_array_from_strings(): + result = BooleanArray._from_sequence_of_strings( + np.array(["True", "False", np.nan], dtype=object) + ) + expected = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_strings_invalid_string(): + with pytest.raises(ValueError, match="cannot be cast"): + BooleanArray._from_sequence_of_strings(["donkey"]) + + +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 " + assert repr(df) == expected + + expected = "0 True\n1 False\n2 \nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "\n[True, False, ]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype=" can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + +def test_astype(): + # with missing values + arr = pd.array([True, False, None], dtype="boolean") + + with pytest.raises(ValueError, match="cannot convert NA to integer"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert float NaN to"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("str") + expected = np.array(["True", "False", ""], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) +def test_setitem_missing_values(na): + arr = pd.array([True, False, None], dtype="boolean") + expected = pd.array([True, None, None], dtype="boolean") + arr[1] = na + tm.assert_extension_array_equal(arr, expected) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + with pytest.raises(TypeError): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="boolean") + with pytest.raises(NotImplementedError): + np.add.reduce(a) + + +class TestUnaryOps: + def test_invert(self): + a = pd.array([True, False, None], dtype="boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(~a, expected) + + expected = pd.Series(expected, index=["a", "b", "c"], name="name") + result = ~pd.Series(a, index=["a", "b", "c"], name="name") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"]) + result = ~df + expected = pd.DataFrame( + {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] + ) + tm.assert_frame_equal(result, expected) + + +class TestLogicalOps(BaseOpsUtil): + def test_numpy_scalars_ok(self, all_logical_operators): + a = pd.array([True, False, None], dtype="boolean") + op = getattr(a, all_logical_operators) + + tm.assert_extension_array_equal(op(True), op(np.bool(True))) + tm.assert_extension_array_equal(op(False), op(np.bool(False))) + + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def test_empty_ok(self, all_logical_operators): + a = pd.array([], dtype="boolean") + op_name = all_logical_operators + result = getattr(a, op_name)(True) + tm.assert_extension_array_equal(a, result) + + result = getattr(a, op_name)(False) + tm.assert_extension_array_equal(a, result) + + # TODO: pd.NA + # result = getattr(a, op_name)(pd.NA) + # tm.assert_extension_array_equal(a, result) + + def test_logical_length_mismatch_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Lengths must match to compare" + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)([True, False]) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(np.array([True, False])) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + + def test_logical_nan_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Got float instead" + + with pytest.raises(TypeError, match=msg): + getattr(a, op_name)(np.nan) + + @pytest.mark.parametrize("other", ["a", 1]) + def test_non_bool_or_na_other_raises(self, other, all_logical_operators): + a = pd.array([True, False], dtype="boolean") + with pytest.raises(TypeError, match=str(type(other).__name__)): + getattr(a, all_logical_operators)(other) + + def test_kleene_or(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a | b + expected = pd.array( + [True, True, True, True, False, None, True, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [True, None, None]), + (True, [True, True, True]), + (np.bool_(True), [True, True, True]), + (False, [True, False, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_or_scalar(self, other, expected): + # TODO: test True & False + a = pd.array([True, False, None], dtype="boolean") + result = a | other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_and(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a & b + expected = pd.array( + [True, False, None, False, False, False, None, False, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, False, None]), + (True, [True, False, None]), + (False, [False, False, False]), + (np.bool_(True), [True, False, None]), + (np.bool_(False), [False, False, False]), + ], + ) + def test_kleene_and_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a & other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_xor(self): + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a ^ b + expected = pd.array( + [False, True, None, True, False, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, None, None]), + (True, [False, True, None]), + (np.bool_(True), [False, True, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_xor_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a ^ other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + @pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3]) + def test_no_masked_assumptions(self, other, all_logical_operators): + # The logical operations should not assume that masked values are False! + a = pd.arrays.BooleanArray( + np.array([True, True, True, False, False, False, True, False, True]), + np.array([False] * 6 + [True, True, True]), + ) + b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + if isinstance(other, list): + other = pd.array(other, dtype="boolean") + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + if isinstance(other, BooleanArray): + other._data[other._mask] = True + a._data[a._mask] = False + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = expected.astype("boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + self._compare_other(data, op_name, True) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, op_name, other) + + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None], dtype="boolean") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + + result = op(a, b) + + values = op(a._data, b._data) + mask = a._mask | b._mask + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + +class TestArithmeticOps(BaseOpsUtil): + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + with pytest.raises(TypeError): + ops("foo") + with pytest.raises(TypeError): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # TODO(extension) numpy's mul with object array sees booleans as numbers + with pytest.raises(TypeError): + ops(pd.Series("foo", index=s.index)) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + with pytest.raises(NotImplementedError): + opa(np.arange(len(s)).reshape(-1, len(s))) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op in ("sum", "prod"): + assert isinstance(getattr(s, op)(), np.int64) + elif op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) + + +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + ], +) +def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip + + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + + +# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion +# manually in the indexing code +# def test_indexing_boolean_mask(): +# arr = pd.array([1, 2, 3, 4], dtype="Int64") +# mask = pd.array([True, False, True, False], dtype="boolean") +# result = arr[mask] +# expected = pd.array([1, 3], dtype="Int64") +# tm.assert_extension_array_equal(result, expected) + +# # missing values -> error +# mask = pd.array([True, False, True, None], dtype="boolean") +# with pytest.raises(IndexError): +# result = arr[mask] + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + + # TODO use to_numpy(na_value=None) here + data_object = np.array(data, dtype=object) + data_object[data.isna()] = None + expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array([True, False, None], dtype="boolean") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "bool" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.BooleanDtype) + tm.assert_frame_equal(result, df) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_diff(): + a = pd.array( + [True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = s.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index c86b4f71ee592..a32529cb58ba3 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -46,7 +46,7 @@ def test_incorrect_dtype_raises(self): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") with pytest.raises( - ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]", + ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]" ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 59f9103072fe9..370b887b78eab 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -187,7 +187,7 @@ def test_iter_box(self): PeriodArray, pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), ), - (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval",), + (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"), # This test is currently failing for datetime64[ns] and timedelta64[ns]. # The NumPy type system is sufficient for representing these types, so # we just use NumPy for Series / DataFrame columns of these types (so @@ -289,10 +289,7 @@ def test_array_multiindex_raises(): pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), ), - ( - pd.core.arrays.integer_array([0, np.nan]), - np.array([0, pd.NA], dtype=object), - ), + (pd.core.arrays.integer_array([0, np.nan]), np.array([0, pd.NA], dtype=object)), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 7ba59786bb0fa..cad46d0a23967 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -185,6 +185,21 @@ def test_isna_datetime(self): exp = np.zeros(len(mask), dtype=bool) tm.assert_numpy_array_equal(mask, exp) + def test_isna_old_datetimelike(self): + # isna_old should work for dt64tz, td64, and period, not just tznaive + dti = pd.date_range("2016-01-01", periods=3) + dta = dti._data + dta[-1] = pd.NaT + expected = np.array([False, False, True], dtype=bool) + + objs = [dta, dta.tz_localize("US/Eastern"), dta - dta, dta.to_period("D")] + + for obj in objs: + with cf.option_context("mode.use_inf_as_na", True): + result = pd.isna(obj) + + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "value, expected", [ diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 059d3453995bd..c9b3eb7baf45b 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -283,8 +283,7 @@ def _compare_other(self, s, data, op_name, other): op(data, other) @pytest.mark.parametrize( - "categories", - [["a", "b"], [0, 1], [pd.Timestamp("2019"), pd.Timestamp("2020")]], + "categories", [["a", "b"], [0, 1], [pd.Timestamp("2019"), pd.Timestamp("2020")]] ) def test_not_equal_with_na(self, categories): # https://github.com/pandas-dev/pandas/issues/32276 diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e89b2c6f1fec0..486d140849159 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -79,66 +79,6 @@ def bool_frame_with_na(): return df -@pytest.fixture -def int_frame(): - """ - Fixture for DataFrame of ints with index of unique strings - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - vpBeWjM651 1 0 1 0 - 5JyxmrP1En -1 0 0 0 - qEDaoD49U2 -1 1 0 0 - m66TkTfsFe 0 0 0 0 - EHPaNzEUFm -1 0 -1 0 - fpRJCevQhi 2 0 0 0 - OlQvnmfi3Q 0 0 -2 0 - ... .. .. .. .. - uB1FPlz4uP 0 0 0 1 - EcSe6yNzCU 0 0 -1 0 - L50VudaiI8 -1 1 -2 0 - y3bpw4nwIp 0 -1 0 0 - H0RdLLwrCT 1 1 0 0 - rY82K0vMwm 0 0 0 0 - 1OPIUjnkjk 2 0 0 0 - - [30 rows x 4 columns] - """ - df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) - # force these all to int64 to avoid platform testing issues - return DataFrame({c: s for c, s in df.items()}, dtype=np.int64) - - -@pytest.fixture -def datetime_frame(): - """ - Fixture for DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - 2000-01-03 -1.122153 0.468535 0.122226 1.693711 - 2000-01-04 0.189378 0.486100 0.007864 -1.216052 - 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 - 2000-01-06 0.430050 0.894352 0.090719 0.036939 - 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 - 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 - 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 - ... ... ... ... ... - 2000-02-03 1.642618 -0.579288 0.046005 1.385249 - 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 - 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 - 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 - 2000-02-09 1.377373 0.398619 1.008453 -0.928207 - 2000-02-10 0.473194 -0.636677 0.984058 0.511519 - 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getTimeSeriesData()) - - @pytest.fixture def float_string_frame(): """ diff --git a/pandas/tests/frame/indexing/test_get.py b/pandas/tests/frame/indexing/test_get.py new file mode 100644 index 0000000000000..5f2651eec683c --- /dev/null +++ b/pandas/tests/frame/indexing/test_get.py @@ -0,0 +1,27 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestGet: + def test_get(self, float_frame): + b = float_frame.get("B") + tm.assert_series_equal(b, float_frame["B"]) + + assert float_frame.get("foo") is None + tm.assert_series_equal( + float_frame.get("foo", float_frame["B"]), float_frame["B"] + ) + + @pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=list("AB")), + DataFrame(columns=list("AB"), index=range(3)), + ], + ) + def test_get_none(self, df): + # see gh-5652 + assert df.get(None) is None diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 4fa5e4196ae5b..9e6d41a8886b3 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -31,29 +31,6 @@ _slice_msg = "slice indices must be integers or None or have an __index__ method" -class TestGet: - def test_get(self, float_frame): - b = float_frame.get("B") - tm.assert_series_equal(b, float_frame["B"]) - - assert float_frame.get("foo") is None - tm.assert_series_equal( - float_frame.get("foo", float_frame["B"]), float_frame["B"] - ) - - @pytest.mark.parametrize( - "df", - [ - DataFrame(), - DataFrame(columns=list("AB")), - DataFrame(columns=list("AB"), index=range(3)), - ], - ) - def test_get_none(self, df): - # see gh-5652 - assert df.get(None) is None - - class TestDataFrameIndexing: def test_getitem(self, float_frame): # Slicing @@ -1639,11 +1616,6 @@ def test_reindex_methods(self, method, expected_values): actual = df.reindex(target, method=method) tm.assert_frame_equal(expected, actual) - actual = df.reindex_like(df, method=method, tolerance=0) - tm.assert_frame_equal(df, actual) - actual = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) - tm.assert_frame_equal(df, actual) - actual = df.reindex(target, method=method, tolerance=1) tm.assert_frame_equal(expected, actual) actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) @@ -1664,17 +1636,6 @@ def test_reindex_methods(self, method, expected_values): actual = df[::-1].reindex(target, method=switched_method) tm.assert_frame_equal(expected, actual) - def test_reindex_subclass(self): - # https://github.com/pandas-dev/pandas/issues/31925 - class MyDataFrame(DataFrame): - pass - - expected = DataFrame() - df = MyDataFrame() - result = df.reindex_like(expected) - - tm.assert_frame_equal(result, expected) - def test_reindex_methods_nearest_special(self): df = pd.DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 0bc234dcb39aa..177d10cdbf615 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -6,7 +6,7 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import DataFrame, Index, MultiIndex +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp import pandas._testing as tm @@ -258,3 +258,162 @@ def test_drop_non_empty_list(self, index, drop_labels): # GH# 21494 with pytest.raises(KeyError, match="not found in axis"): pd.DataFrame(index=index).drop(drop_labels) + + def test_mixed_depth_drop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + tm.assert_frame_equal(expected, result) + + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) + + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) + tm.assert_frame_equal(expected, result) + + def test_drop_multiindex_other_level_nan(self): + # GH#12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=pd.MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_drop_nonunique(self): + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + columns=["var1", "var2", "var3", "var4"], + ) + + grp_size = df.groupby("var1").size() + drop_idx = grp_size.loc[grp_size == 1] + + idf = df.set_index(["var1", "var2", "var3"]) + + # it works! GH#2101 + result = idf.drop(drop_idx.index, level=0).reset_index() + expected = df[-df.var1.isin(drop_idx.index)] + + result.index = expected.index + + tm.assert_frame_equal(result, expected) + + def test_drop_level(self): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + + result = frame.drop(["bar", "qux"], level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = frame.drop(["two"], level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]] + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["bar", "qux"], axis=1, level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]].T + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["two"], axis=1, level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T + tm.assert_frame_equal(result, expected) + + def test_drop_level_nonunique_datetime(self): + # GH#12701 + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "201603231400", + "201603231500", + "201603231600", + "201603231600", + "201603231700", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") + assert df.index.is_unique is False + + result = df.drop(ts, level="tstamp") + expected = df.loc[idx != 4] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [Series, DataFrame]) + def test_drop_tz_aware_timestamp_across_dst(self, box): + # GH#21761 + start = Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") + data = box(data=[1] * len(index), index=index) + result = data.drop(start) + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") + expected = box(data=[1] * len(expected_idx), index=expected_idx) + tm.assert_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + assert result.index.names == ("one", "two") diff --git a/pandas/tests/frame/methods/test_reindex_like.py b/pandas/tests/frame/methods/test_reindex_like.py new file mode 100644 index 0000000000000..ce68ec28eec3d --- /dev/null +++ b/pandas/tests/frame/methods/test_reindex_like.py @@ -0,0 +1,39 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestDataFrameReindexLike: + def test_reindex_like(self, float_frame): + other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) + + tm.assert_frame_equal(other, float_frame.reindex_like(other)) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_like_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + + result = df.reindex_like(df, method=method, tolerance=0) + tm.assert_frame_equal(df, result) + result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) + tm.assert_frame_equal(df, result) + + def test_reindex_like_subclass(self): + # https://github.com/pandas-dev/pandas/issues/31925 + class MyDataFrame(DataFrame): + pass + + expected = DataFrame() + df = MyDataFrame() + result = df.reindex_like(expected) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index c409b0bbe6fa9..22aa6091b1d57 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -77,8 +77,7 @@ def test_data_frame_value_counts_single_col_default(): result = df.value_counts() expected = pd.Series( - data=[2, 1, 1], - index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), + data=[2, 1, 1], index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 74fe3bfd41b8f..961c18749f055 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -776,35 +776,3 @@ def test_set_reset_index(self): df = df.set_index("B") df = df.reset_index() - - def test_set_axis_inplace(self): - # GH14636 - df = DataFrame( - {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012], - ) - - expected = {0: df.copy(), 1: df.copy()} - expected[0].index = list("abc") - expected[1].columns = list("abc") - expected["index"] = expected[0] - expected["columns"] = expected[1] - - for axis in expected: - result = df.copy() - result.set_axis(list("abc"), axis=axis, inplace=True) - tm.assert_frame_equal(result, expected[axis]) - - # inplace=False - result = df.set_axis(list("abc"), axis=axis) - tm.assert_frame_equal(expected[axis], result) - - # omitting the "axis" parameter - with tm.assert_produces_warning(None): - result = df.set_axis(list("abc")) - tm.assert_frame_equal(result, expected[0]) - - # wrong values for the "axis" parameter - for axis in 3, "foo": - with pytest.raises(ValueError, match="No axis named"): - df.set_axis(list("abc"), axis=axis) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 89f8bc433419b..22a167ce5b7fb 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -803,9 +803,7 @@ def test_zero_len_frame_with_series_corner_cases(): def test_frame_single_columns_object_sum_axis_1(): # GH 13758 - data = { - "One": pd.Series(["A", 1.2, np.nan]), - } + data = {"One": pd.Series(["A", 1.2, np.nan])} df = pd.DataFrame(data) result = df.sum(axis=1) expected = pd.Series(["A", 1.2, 0]) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 42fb722c92b26..f61512b1a62d9 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -155,11 +155,6 @@ def test_reindex_int(self, int_frame): smaller = int_frame.reindex(columns=["A", "B"]) assert smaller["A"].dtype == np.int64 - def test_reindex_like(self, float_frame): - other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) - - tm.assert_frame_equal(other, float_frame.reindex_like(other)) - def test_reindex_columns(self, float_frame): new_frame = float_frame.reindex(columns=["A", "B", "E"]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 9f40e8c6931c8..5805a465283c3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -915,7 +915,7 @@ def test_constructor_mrecarray(self): # from GH3479 assert_fr_equal = functools.partial( - tm.assert_frame_equal, check_index_type=True, check_column_type=True, + tm.assert_frame_equal, check_index_type=True, check_column_type=True ) arrays = [ ("float", np.array([1.5, 2.0])), diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 9d3c40ce926d7..7805cb6154d56 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -406,7 +406,7 @@ def test_unstack_mixed_type_name_in_multiindex( result = df.unstack(unstack_idx) expected = pd.DataFrame( - expected_values, columns=expected_columns, index=expected_index, + expected_values, columns=expected_columns, index=expected_index ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/generic/methods/test_set_axis.py b/pandas/tests/generic/methods/test_set_axis.py new file mode 100644 index 0000000000000..278d43ef93d2f --- /dev/null +++ b/pandas/tests/generic/methods/test_set_axis.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class SharedSetAxisTests: + @pytest.fixture + def obj(self): + raise NotImplementedError("Implemented by subclasses") + + def test_set_axis(self, obj): + # GH14636; this tests setting index for both Series and DataFrame + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + expected.index = new_index + + # inplace=False + result = obj.set_axis(new_index, axis=0, inplace=False) + tm.assert_equal(expected, result) + + @pytest.mark.parametrize("axis", [0, "index", 1, "columns"]) + def test_set_axis_inplace_axis(self, axis, obj): + # GH#14636 + if obj.ndim == 1 and axis in [1, "columns"]: + # Series only has [0, "index"] + return + + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + if axis in [0, "index"]: + expected.index = new_index + else: + expected.columns = new_index + + result = obj.copy() + result.set_axis(new_index, axis=axis, inplace=True) + tm.assert_equal(result, expected) + + def test_set_axis_unnamed_kwarg_warns(self, obj): + # omitting the "axis" parameter + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + expected.index = new_index + + with tm.assert_produces_warning(None): + result = obj.set_axis(new_index, inplace=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("axis", [3, "foo"]) + def test_set_axis_invalid_axis_name(self, axis, obj): + # wrong values for the "axis" parameter + with pytest.raises(ValueError, match="No axis named"): + obj.set_axis(list("abc"), axis=axis) + + +class TestDataFrameSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + return df + + +class TestSeriesSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") + return ser diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 31501f20db453..eb00fdda058ac 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -226,3 +226,88 @@ def test_unexpected_keyword(self): with pytest.raises(TypeError, match=msg): ts.fillna(0, in_place=True) +<<<<<<< HEAD + + +class TestToXArray: + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) + @pytest.mark.parametrize("index", tm.all_index_generator(3)) + def test_to_xarray_index_types(self, index): + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index = index + df.index.name = "foo" + df.columns.name = "bar" + result = df.to_xarray() + assert result.dims["foo"] == 3 + assert len(result.coords) == 1 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, Dataset) + + # idempotency + # datetimes w/tz are preserved + # column names are lost + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal(result.to_dataframe(), expected) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index.name = "foo" + result = df[0:0].to_xarray() + assert result.dims["foo"] == 0 + assert isinstance(result, Dataset) + + # available in 0.7.1 + # MultiIndex + df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) + result = df.to_xarray() + assert result.dims["one"] == 1 + assert result.dims["two"] == 3 + assert len(result.coords) == 2 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, Dataset) + + result = result.to_dataframe() + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal(result, expected, check_index_type=False) +======= +>>>>>>> master diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9ea5252b91e13..3388251ae4a28 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1364,7 +1364,7 @@ def test_groupby_agg_categorical_columns(func, expected_values): result = df.groupby("groups").agg(func) expected = pd.DataFrame( - {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"), + {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups") ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 954601ad423bf..543edc6b66ff2 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -442,18 +442,6 @@ def test_frame_repr(self): expected = " A\na 1\nb 2\nc 3" assert result == expected - def test_fillna_categorical(self): - # GH 11343 - idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") - # fill by value in categories - exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") - tm.assert_index_equal(idx.fillna(1.0), exp) - - # fill by value not in categories raises ValueError - msg = "fill value must be in categories" - with pytest.raises(ValueError, match=msg): - idx.fillna(2.0) - @pytest.mark.parametrize( "dtype, engine_type", [ diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py new file mode 100644 index 0000000000000..0d878249d3800 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -0,0 +1,19 @@ +import numpy as np +import pytest + +from pandas import CategoricalIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_categorical(self): + # GH#11343 + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") + # fill by value in categories + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") + tm.assert_index_equal(idx.fillna(1.0), exp) + + # fill by value not in categories raises ValueError + msg = "fill value must be in categories" + with pytest.raises(ValueError, match=msg): + idx.fillna(2.0) diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_fillna.py similarity index 98% rename from pandas/tests/indexes/datetimes/test_missing.py rename to pandas/tests/indexes/datetimes/test_fillna.py index 3399c8eaf6750..5fbe60bb0c50f 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_fillna.py @@ -4,7 +4,7 @@ import pandas._testing as tm -class TestDatetimeIndex: +class TestDatetimeIndexFillNA: @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) def test_fillna_datetime64(self, tz): # GH 11343 diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 1157c7f8bb962..16af884c89e9e 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -741,18 +741,18 @@ def test_raise_invalid_sortorder(): with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2, + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2 ) with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1, + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1 ) def test_datetimeindex(): idx1 = pd.DatetimeIndex( - ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo", + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" ) idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py index 1215e72be3c59..985fe5773ceed 100644 --- a/pandas/tests/indexes/multi/test_get_level_values.py +++ b/pandas/tests/indexes/multi/test_get_level_values.py @@ -89,3 +89,17 @@ def test_get_level_values_na(): result = index.get_level_values(0) expected = pd.Index([], dtype=object) tm.assert_index_equal(result, expected) + + +def test_get_level_values_when_periods(): + # GH33131. See also discussion in GH32669. + # This test can probably be removed when PeriodIndex._engine is removed. + from pandas import Period, PeriodIndex + + idx = MultiIndex.from_arrays( + [PeriodIndex([Period("2019Q1"), Period("2019Q2")], name="b")] + ) + idx2 = MultiIndex.from_arrays( + [idx._get_level_values(level) for level in range(idx.nlevels)] + ) + assert all(x.is_monotonic for x in idx2.levels) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 3b3ae074c774a..d9da059eb9e9c 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -500,6 +500,34 @@ def test_contains_with_missing_value(self): assert np.nan not in idx assert (1, np.nan) in idx + def test_multiindex_contains_dropped(self): + # GH#19027 + # test that dropped MultiIndex levels are not in the MultiIndex + # despite continuing to be in the MultiIndex's levels + idx = MultiIndex.from_product([[1, 2], [3, 4]]) + assert 2 in idx + idx = idx.drop(2) + + # drop implementation keeps 2 in the levels + assert 2 in idx.levels[0] + # but it should no longer be in the index itself + assert 2 not in idx + + # also applies to strings + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + def test_contains_td64_level(self): + # GH#24570 + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") + idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) + assert tx[0] in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx + def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 122263e6ec198..b369b9a50954e 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -78,7 +78,7 @@ def test_isin_level_kwarg(): @pytest.mark.parametrize( "labels,expected,level", [ - ([("b", np.nan)], np.array([False, False, True]), None,), + ([("b", np.nan)], np.array([False, False, True]), None), ([np.nan, "a"], np.array([True, True, False]), 0), (["d", np.nan], np.array([False, True, True]), 1), ], diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 54ffec2e03fd3..4c9d518778ceb 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -1,55 +1,16 @@ import numpy as np import pytest -from pandas._libs.tslib import iNaT - import pandas as pd -from pandas import Int64Index, MultiIndex, PeriodIndex, UInt64Index +from pandas import MultiIndex import pandas._testing as tm -from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin def test_fillna(idx): # GH 11343 - - # TODO: Remove or Refactor. Not Implemented for MultiIndex - for name, index in [("idx", idx)]: - if len(index) == 0: - pass - elif isinstance(index, MultiIndex): - idx = index.copy() - msg = "isna is not defined for MultiIndex" - with pytest.raises(NotImplementedError, match=msg): - idx.fillna(idx[0]) - else: - idx = index.copy() - result = idx.fillna(idx[0]) - tm.assert_index_equal(result, idx) - assert result is not idx - - msg = "'value' must be a scalar, passed: " - with pytest.raises(TypeError, match=msg): - idx.fillna([idx[0]]) - - idx = index.copy() - values = idx.values - - if isinstance(index, DatetimeIndexOpsMixin): - values[1] = iNaT - elif isinstance(index, (Int64Index, UInt64Index)): - continue - else: - values[1] = np.nan - - if isinstance(index, PeriodIndex): - idx = type(index)(values, freq=index.freq) - else: - idx = type(index)(values) - - expected = np.array([False] * len(idx), dtype=bool) - expected[1] = True - tm.assert_numpy_array_equal(idx._isnan, expected) - assert idx.hasnans is True + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.fillna(idx[0]) def test_dropna(): diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 4ec7ef64e2272..b76ec45ac7c1a 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -49,11 +49,7 @@ def test_base_constructor_with_period_dtype(self): ) def test_index_object_dtype(self, values_constructor): # Index(periods, dtype=object) is an Index (not an PeriodIndex) - periods = [ - Period("2011-01", freq="M"), - NaT, - Period("2011-03", freq="M"), - ] + periods = [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")] values = values_constructor(periods) result = Index(values, dtype=object) diff --git a/pandas/tests/indexes/period/test_fillna.py b/pandas/tests/indexes/period/test_fillna.py new file mode 100644 index 0000000000000..602e87333a6c1 --- /dev/null +++ b/pandas/tests/indexes/period/test_fillna.py @@ -0,0 +1,36 @@ +from pandas import Index, NaT, Period, PeriodIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_period(self): + # GH#11343 + idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") + + exp = PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ) + result = idx.fillna(Period("2011-01-01 10:00", freq="H")) + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="H"), + "x", + Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + result = idx.fillna("x") + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01", freq="D"), + Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + result = idx.fillna(Period("2011-01-01", freq="D")) + tm.assert_index_equal(result, exp) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 39688e5b92380..73d7bcdd16c44 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -466,11 +466,7 @@ def test_get_indexer2(self): idx.get_indexer(target, "nearest", tolerance="1 day"), np.array([0, 1, 1], dtype=np.intp), ) - tol_raw = [ - Timedelta("1 hour"), - Timedelta("1 hour"), - np.timedelta64(1, "D"), - ] + tol_raw = [Timedelta("1 hour"), Timedelta("1 hour"), np.timedelta64(1, "D")] tm.assert_numpy_array_equal( idx.get_indexer( target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index a62936655e09c..0ce10fb8779a1 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -67,35 +67,6 @@ def test_repeat_freqstr(self, index, use_numpy): tm.assert_index_equal(result, expected) assert result.freqstr == index.freqstr - def test_fillna_period(self): - # GH 11343 - idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") - - exp = PeriodIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" - ) - tm.assert_index_equal(idx.fillna(Period("2011-01-01 10:00", freq="H")), exp) - - exp = Index( - [ - Period("2011-01-01 09:00", freq="H"), - "x", - Period("2011-01-01 11:00", freq="H"), - ], - dtype=object, - ) - tm.assert_index_equal(idx.fillna("x"), exp) - - exp = Index( - [ - Period("2011-01-01 09:00", freq="H"), - Period("2011-01-01", freq="D"), - Period("2011-01-01 11:00", freq="H"), - ], - dtype=object, - ) - tm.assert_index_equal(idx.fillna(Period("2011-01-01", freq="D")), exp) - def test_no_millisecond_field(self): msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" with pytest.raises(AttributeError, match=msg): diff --git a/pandas/tests/indexes/timedeltas/test_fillna.py b/pandas/tests/indexes/timedeltas/test_fillna.py new file mode 100644 index 0000000000000..47b2f2ff597f4 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_fillna.py @@ -0,0 +1,17 @@ +from pandas import Index, NaT, Timedelta, TimedeltaIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_timedelta(self): + # GH#11343 + idx = TimedeltaIndex(["1 day", NaT, "3 day"]) + + exp = TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(Timedelta("2 day")), exp) + + exp = TimedeltaIndex(["1 day", "3 hour", "3 day"]) + idx.fillna(Timedelta("3 hour")) + + exp = Index([Timedelta("1 day"), "x", Timedelta("3 day")], dtype=object) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index fa00b870ca757..129bdef870a14 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -43,21 +43,6 @@ def test_shift(self): def test_pickle_compat_construction(self): pass - def test_fillna_timedelta(self): - # GH 11343 - idx = pd.TimedeltaIndex(["1 day", pd.NaT, "3 day"]) - - exp = pd.TimedeltaIndex(["1 day", "2 day", "3 day"]) - tm.assert_index_equal(idx.fillna(pd.Timedelta("2 day")), exp) - - exp = pd.TimedeltaIndex(["1 day", "3 hour", "3 day"]) - idx.fillna(pd.Timedelta("3 hour")) - - exp = pd.Index( - [pd.Timedelta("1 day"), "x", pd.Timedelta("3 day")], dtype=object - ) - tm.assert_index_equal(idx.fillna("x"), exp) - def test_isin(self): index = tm.makeTimedeltaIndex(4) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 9cc031001f81c..656d25bec2a6b 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -144,9 +144,7 @@ def check_values(self, f, func, values=False): tm.assert_almost_equal(result, expected) - def check_result( - self, method, key, typs=None, axes=None, fails=None, - ): + def check_result(self, method, key, typs=None, axes=None, fails=None): def _eq(axis, obj, key): """ compare equal for these 2 keys """ axified = _axify(obj, key, axis) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 0064187a94265..5e5fcd3db88d8 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -26,26 +26,6 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): df.loc[(0,)] - def test_multiindex_contains_dropped(self): - # GH 19027 - # test that dropped MultiIndex levels are not in the MultiIndex - # despite continuing to be in the MultiIndex's levels - idx = MultiIndex.from_product([[1, 2], [3, 4]]) - assert 2 in idx - idx = idx.drop(2) - - # drop implementation keeps 2 in the levels - assert 2 in idx.levels[0] - # but it should no longer be in the index itself - assert 2 not in idx - - # also applies to strings - idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) - assert "a" in idx - idx = idx.drop("a") - assert "a" in idx.levels[0] - assert "a" not in idx - def test_indexing_over_hashtable_size_cutoff(self): n = 10000 @@ -85,14 +65,6 @@ def test_multi_nan_indexing(self): ) tm.assert_frame_equal(result, expected) - def test_contains(self): - # GH 24570 - tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") - idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) - assert tx[0] in idx - assert "element_not_exit" not in idx - assert "0 day 09:30:00" in idx - def test_nested_tuples_duplicates(self): # GH#30892 diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 621417eb38d94..be1bd4908fc79 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,14 +17,10 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[ - lambda x: x.A > 2, - ] # noqa: E231 + res = df.loc[lambda x: x.A > 2,] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 - res = df.loc[ - lambda x: x.A > 2, - ] # noqa: E231 + res = df.loc[lambda x: x.A > 2,] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] @@ -94,9 +90,7 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[ - lambda x: ["A", "C"], - ] # noqa: E231 + res = df.loc[lambda x: ["A", "C"],] # noqa: E231 tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py index 69d4065234d93..865ecb129cdfa 100644 --- a/pandas/tests/indexing/test_check_indexer.py +++ b/pandas/tests/indexing/test_check_indexer.py @@ -32,7 +32,7 @@ def test_valid_input(indexer, expected): @pytest.mark.parametrize( - "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], + "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")] ) def test_boolean_na_returns_indexer(indexer): # https://github.com/pandas-dev/pandas/issues/31503 @@ -61,7 +61,7 @@ def test_bool_raise_length(indexer): @pytest.mark.parametrize( - "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")], + "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")] ) def test_int_raise_missing_values(indexer): array = np.array([1, 2, 3]) @@ -89,9 +89,7 @@ def test_raise_invalid_array_dtypes(indexer): check_array_indexer(array, indexer) -@pytest.mark.parametrize( - "indexer", [None, Ellipsis, slice(0, 3), (None,)], -) +@pytest.mark.parametrize("indexer", [None, Ellipsis, slice(0, 3), (None,)]) def test_pass_through_non_array_likes(indexer): array = np.array([1, 2, 3]) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 18b9898e7d800..4804bc63077be 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -181,9 +181,7 @@ def test_scalar_with_mixed(self): expected = 3 assert result == expected - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_scalar_integer(self, index_func, klass): @@ -425,9 +423,7 @@ def test_integer_positional_indexing(self, l): with pytest.raises(TypeError, match=msg): s.iloc[l] - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) def test_slice_integer_frame_getitem(self, index_func): # similar to above, but on the getitem dim (of a DataFrame) @@ -486,9 +482,7 @@ def test_slice_integer_frame_getitem(self, index_func): s[l] @pytest.mark.parametrize("l", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) def test_float_slice_getitem_with_integer_index_raises(self, l, index_func): # similar to above, but on the getitem dim (of a DataFrame) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 9664f8d7212ad..cefa3037c8407 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -56,7 +56,7 @@ def test_is_scalar_access(self): assert ser.iloc._is_scalar_access((1,)) df = ser.to_frame() - assert df.iloc._is_scalar_access((1, 0,)) + assert df.iloc._is_scalar_access((1, 0)) def test_iloc_exceeds_bounds(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d1f67981b1ec5..5382a265ef334 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -27,13 +27,11 @@ def test_loc_getitem_label_out_of_range(self): # out of range label self.check_result( - "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError, + "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError ) self.check_result("loc", "f", typs=["floats"], fails=KeyError) self.check_result("loc", "f", typs=["floats"], fails=KeyError) - self.check_result( - "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError, - ) + self.check_result("loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError) self.check_result("loc", 20, typs=["labels"], fails=KeyError) self.check_result("loc", 20, typs=["ts"], axes=0, fails=KeyError) self.check_result("loc", 20, typs=["floats"], axes=0, fails=KeyError) @@ -44,26 +42,24 @@ def test_loc_getitem_label_list(self): pass def test_loc_getitem_label_list_with_missing(self): + self.check_result("loc", [0, 1, 2], typs=["empty"], fails=KeyError) self.check_result( - "loc", [0, 1, 2], typs=["empty"], fails=KeyError, - ) - self.check_result( - "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError, + "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError ) self.check_result( - "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError, + "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError ) # GH 17758 - MultiIndex and missing keys self.check_result( - "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError, + "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError ) def test_loc_getitem_label_list_fails(self): # fails self.check_result( - "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError, + "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError ) def test_loc_getitem_label_array_like(self): @@ -93,18 +89,14 @@ def test_loc_getitem_label_slice(self): ) self.check_result( - "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError, + "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError ) - self.check_result( - "loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError, - ) - self.check_result( - "loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError, - ) + self.check_result("loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError) + self.check_result("loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError) self.check_result( - "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError, + "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError ) @@ -654,8 +646,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): (1, ["A", "B", "C"]), np.array([7, 8, 9], dtype=np.int64), pd.DataFrame( - [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], - columns=["A", "B", "C"], + [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], columns=["A", "B", "C"] ), ), ( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 91ec1c29873cf..496ef9d88a894 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -900,16 +900,16 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): fill_value, ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value, + mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value, + mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value ) assert_reindex_indexer_is_ok( mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value, + mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value ) assert_reindex_indexer_is_ok( mgr, @@ -921,7 +921,7 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): if mgr.shape[ax] >= 3: assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value, + mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value ) diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 7008cef7b28fa..f6871e7a272b3 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -101,11 +101,11 @@ def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions assert_resolves( - f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"} ) assert_resolves( - f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"} ) assert_resolves( @@ -191,9 +191,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): inherited = None else: inherited = {"font-size": relative_to} - assert_resolves( - f"font-size: {size}", {"font-size": resolved}, inherited=inherited, - ) + assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited) @pytest.mark.parametrize( @@ -227,6 +225,4 @@ def test_css_relative_font_size(size, relative_to, resolved): inherited = None else: inherited = {"font-size": relative_to} - assert_resolves( - f"font-size: {size}", {"font-size": resolved}, inherited=inherited, - ) + assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 1a5d122d732a9..2d6ad689f6703 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2389,8 +2389,7 @@ def test_east_asian_unicode_series(self): # object dtype, longer than unicode repr s = Series( - [1, 22, 3333, 44444], - index=[1, "AB", pd.Timestamp("2011-01-01"), "あああ"], + [1, 22, 3333, 44444], index=[1, "AB", pd.Timestamp("2011-01-01"), "あああ"] ) expected = ( "1 1\n" diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 877bd1650ae60..7000daeb9b575 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -299,7 +299,7 @@ def test_info_memory_usage(): DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) DataFrame(1, index=["a"], columns=["A"]).index.nbytes df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"], + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] ) df.index.nbytes df.memory_usage(index=True) @@ -336,7 +336,7 @@ def test_info_memory_usage_deep_pypy(): @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(): df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"], + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] ) mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with @@ -359,16 +359,14 @@ def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]), + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) ) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() df = DataFrame( - 1, - columns=list("ab"), - index=MultiIndex.from_product([range(3), ["foo", "bar"]]), + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) assert "+" in buf.getvalue() @@ -384,7 +382,7 @@ def memory_usage(f): N = 100 M = len(uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"], + [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"] ) df = DataFrame({"value": np.random.randn(N * M)}, index=index) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6a7a81e88d318..b74abc965f7fa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -16,20 +16,15 @@ import pandas._testing as tm _seriesd = tm.getSeriesData() -_tsd = tm.getTimeSeriesData() _frame = DataFrame(_seriesd) -_intframe = DataFrame({k: v.astype(np.int64) for k, v in _seriesd.items()}) -_tsframe = DataFrame(_tsd) _cat_frame = _frame.copy() cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15) _cat_frame.index = pd.CategoricalIndex(cat, name="E") _cat_frame["E"] = list(reversed(cat)) _cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64") -_mixed_frame = _frame.copy() - def assert_json_roundtrip_equal(result, expected, orient): if orient == "records" or orient == "values": @@ -43,17 +38,10 @@ def assert_json_roundtrip_equal(result, expected, orient): class TestPandasContainer: @pytest.fixture(autouse=True) def setup(self): - self.intframe = _intframe.copy() - self.tsframe = _tsframe.copy() - self.mixed_frame = _mixed_frame.copy() self.categorical = _cat_frame.copy() yield - del self.intframe - del self.tsframe - del self.mixed_frame - def test_frame_double_encoded_labels(self, orient): df = DataFrame( [["a", "b"], ["c", "d"]], @@ -137,12 +125,12 @@ def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype, float_frame) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype): - data = self.intframe.to_json(orient=orient) + def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype, int_frame): + data = int_frame.to_json(orient=orient) result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) - expected = self.intframe.copy() + expected = int_frame if ( numpy and (is_platform_32bit() or is_platform_windows()) @@ -236,13 +224,13 @@ def test_roundtrip_empty(self, orient, convert_axes, numpy, empty_frame): @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_timestamp(self, orient, convert_axes, numpy): + def test_roundtrip_timestamp(self, orient, convert_axes, numpy, datetime_frame): # TODO: improve coverage with date_format parameter - data = self.tsframe.to_json(orient=orient) + data = datetime_frame.to_json(orient=orient) result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy ) - expected = self.tsframe.copy() + expected = datetime_frame.copy() if not convert_axes: # one off for ts handling # DTI gets converted to epoch values @@ -730,23 +718,22 @@ def test_reconstruction_index(self): result = read_json(df.to_json()) tm.assert_frame_equal(result, df) - def test_path(self, float_frame): + def test_path(self, float_frame, int_frame, datetime_frame): with tm.ensure_clean("test.json") as path: for df in [ float_frame, - self.intframe, - self.tsframe, - self.mixed_frame, + int_frame, + datetime_frame, ]: df.to_json(path) read_json(path) - def test_axis_dates(self, datetime_series): + def test_axis_dates(self, datetime_series, datetime_frame): # frame - json = self.tsframe.to_json() + json = datetime_frame.to_json() result = read_json(json) - tm.assert_frame_equal(result, self.tsframe) + tm.assert_frame_equal(result, datetime_frame) # series json = datetime_series.to_json() @@ -754,10 +741,10 @@ def test_axis_dates(self, datetime_series): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - def test_convert_dates(self, datetime_series): + def test_convert_dates(self, datetime_series, datetime_frame): # frame - df = self.tsframe.copy() + df = datetime_frame df["date"] = Timestamp("20130101") json = df.to_json() @@ -837,8 +824,8 @@ def test_convert_dates_infer(self, infer_word): ("20130101 20:43:42.123456789", "ns"), ], ) - def test_date_format_frame(self, date, date_unit): - df = self.tsframe.copy() + def test_date_format_frame(self, date, date_unit, datetime_frame): + df = datetime_frame df["date"] = Timestamp(date) df.iloc[1, df.columns.get_loc("date")] = pd.NaT @@ -853,8 +840,8 @@ def test_date_format_frame(self, date, date_unit): expected["date"] = expected["date"].dt.tz_localize("UTC") tm.assert_frame_equal(result, expected) - def test_date_format_frame_raises(self): - df = self.tsframe.copy() + def test_date_format_frame_raises(self, datetime_frame): + df = datetime_frame msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") @@ -890,8 +877,8 @@ def test_date_format_series_raises(self, datetime_series): ts.to_json(date_format="iso", date_unit="foo") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) - def test_date_unit(self, unit): - df = self.tsframe.copy() + def test_date_unit(self, unit, datetime_frame): + df = datetime_frame df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") df.iloc[1, dl] = Timestamp("19710101 20:43:42") diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 2fcac6fa57cf8..6d4e39cd6de14 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1546,5 +1546,5 @@ def test_missing_parse_dates_column_raises( msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" with pytest.raises(ValueError, match=msg): parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates, + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates ) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 979eb4702cc84..e05575cd79ccc 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -199,7 +199,7 @@ def test_usecols_with_whitespace(all_parsers): # Column selection by index. ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), # Column selection by name. - (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),), + (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), ], ) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d0eaafb787222..3b8e0bbd87f11 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -567,7 +567,7 @@ def test_additional_extension_types(self, pa): { # Arrow does not yet support struct in writing to Parquet (ARROW-1644) # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), - "d": pd.period_range("2012-01-01", periods=3, freq="D"), + "d": pd.period_range("2012-01-01", periods=3, freq="D") } ) check_round_trip(df, pa) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index eaa92fa53d799..ccf51962f3636 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1149,7 +1149,7 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True ) pos += chunksize @@ -1247,7 +1247,7 @@ def test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True ) pos += chunksize diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 45ac18b2661c3..08b33ee547a48 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1306,6 +1306,13 @@ def test_plot_scatter_with_c(self): float_array = np.array([0.0, 1.0]) df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") + def test_plot_scatter_with_s(self): + # this refers to GH 32904 + df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],) + + ax = df.plot.scatter(x="a", y="b", s="c") + tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) + def test_scatter_colors(self): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) with pytest.raises(TypeError): diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 8795af2e11122..d56e121429e7c 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -350,11 +350,10 @@ def test_crosstab_normalize(self): tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal) tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize=1), - crosstab(df.a, df.b, normalize="columns"), + crosstab(df.a, df.b, normalize=1), crosstab(df.a, df.b, normalize="columns") ) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"), + crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index") ) row_normal_margins = DataFrame( @@ -377,7 +376,7 @@ def test_crosstab_normalize(self): crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins ) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins, + crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins ) tm.assert_frame_equal( crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_get_dummies.py similarity index 84% rename from pandas/tests/reshape/test_reshape.py rename to pandas/tests/reshape/test_get_dummies.py index 6113cfec48df9..ce13762ea8f86 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import is_integer_dtype import pandas as pd -from pandas import Categorical, DataFrame, Index, Series, get_dummies +from pandas import Categorical, CategoricalIndex, DataFrame, Series, get_dummies import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype @@ -31,11 +31,11 @@ def effective_dtype(self, dtype): return np.uint8 return dtype - def test_raises_on_dtype_object(self, df): + def test_get_dummies_raises_on_dtype_object(self, df): with pytest.raises(ValueError): get_dummies(df, dtype="object") - def test_basic(self, sparse, dtype): + def test_get_dummies_basic(self, sparse, dtype): s_list = list("abc") s_series = Series(s_list) s_series_index = Series(s_list, list("ABC")) @@ -56,7 +56,7 @@ def test_basic(self, sparse, dtype): result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) - def test_basic_types(self, sparse, dtype): + def test_get_dummies_basic_types(self, sparse, dtype): # GH 10531 s_list = list("abc") s_series = Series(s_list) @@ -106,7 +106,7 @@ def test_basic_types(self, sparse, dtype): result = result.sort_index() tm.assert_series_equal(result, expected) - def test_just_na(self, sparse): + def test_get_dummies_just_na(self, sparse): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=["A"]) @@ -123,7 +123,7 @@ def test_just_na(self, sparse): assert res_series.index.tolist() == [0] assert res_series_index.index.tolist() == ["A"] - def test_include_na(self, sparse, dtype): + def test_get_dummies_include_na(self, sparse, dtype): s = ["a", "b", np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) exp = DataFrame( @@ -152,7 +152,7 @@ def test_include_na(self, sparse, dtype): ) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - def test_unicode(self, sparse): + def test_get_dummies_unicode(self, sparse): # See GH 6885 - get_dummies chokes on unicode values import unicodedata @@ -161,7 +161,7 @@ def test_unicode(self, sparse): s = [e, eacute, eacute] res = get_dummies(s, prefix="letter", sparse=sparse) exp = DataFrame( - {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8, + {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8 ) if sparse: exp = exp.apply(SparseArray, fill_value=0) @@ -175,7 +175,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse): dtype=np.uint8, ) if sparse: - expected = pd.DataFrame( + expected = DataFrame( { "A_a": SparseArray([1, 0, 1], dtype="uint8"), "A_b": SparseArray([0, 1, 0], dtype="uint8"), @@ -223,7 +223,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected = expected[["C"] + cols] - typ = SparseArray if sparse else pd.Series + typ = SparseArray if sparse else Series expected[cols] = expected[cols].apply(lambda x: typ(x)) tm.assert_frame_equal(result, expected) @@ -242,11 +242,11 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): # https://github.com/pandas-dev/pandas/issues/14427 expected = pd.concat( [ - pd.Series([1, 2, 3], name="C"), - pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), - pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + Series([1, 2, 3], name="C"), + Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), + Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), ], axis=1, ) @@ -267,7 +267,7 @@ def test_dataframe_dummies_subset(self, df, sparse): expected[["C"]] = df[["C"]] if sparse: cols = ["from_A_a", "from_A_b"] - expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): @@ -286,7 +286,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse): expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: cols = ["A..a", "A..b", "B..b", "B..c"] - expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) tm.assert_frame_equal(result, expected) @@ -323,7 +323,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse): columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected[columns] = expected[columns].astype(np.uint8) if sparse: - expected[columns] = expected[columns].astype(pd.SparseDtype("uint8", 0)) + expected[columns] = expected[columns].astype(SparseDtype("uint8", 0)) tm.assert_frame_equal(result, expected) @@ -359,7 +359,7 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): - df["cat"] = pd.Categorical(["x", "y", "y"]) + df["cat"] = Categorical(["x", "y", "y"]) result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) if sparse: arr = SparseArray @@ -386,30 +386,30 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): "get_dummies_kwargs,expected", [ ( - {"data": pd.DataFrame(({"ä": ["a"]}))}, - pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + {"data": DataFrame(({"ä": ["a"]}))}, + DataFrame({"ä_a": [1]}, dtype=np.uint8), ), ( - {"data": pd.DataFrame({"x": ["ä"]})}, - pd.DataFrame({"x_ä": [1]}, dtype=np.uint8), + {"data": DataFrame({"x": ["ä"]})}, + DataFrame({"x_ä": [1]}, dtype=np.uint8), ), ( - {"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"}, - pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + {"data": DataFrame({"x": ["a"]}), "prefix": "ä"}, + DataFrame({"ä_a": [1]}, dtype=np.uint8), ), ( - {"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, - pd.DataFrame({"xäa": [1]}, dtype=np.uint8), + {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, + DataFrame({"xäa": [1]}, dtype=np.uint8), ), ], ) def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): - # GH22084 pd.get_dummies incorrectly encodes unicode characters + # GH22084 get_dummies incorrectly encodes unicode characters # in dataframe column names result = get_dummies(**get_dummies_kwargs) tm.assert_frame_equal(result, expected) - def test_basic_drop_first(self, sparse): + def test_get_dummies_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case s_list = list("abc") @@ -430,7 +430,7 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_series_index, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) - def test_basic_drop_first_one_level(self, sparse): + def test_get_dummies_basic_drop_first_one_level(self, sparse): # Test the case that categorical variable only has one level. s_list = list("aaa") s_series = Series(s_list) @@ -448,7 +448,7 @@ def test_basic_drop_first_one_level(self, sparse): result = get_dummies(s_series_index, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) - def test_basic_drop_first_NA(self, sparse): + def test_get_dummies_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first s_NA = ["a", "b", np.nan] res = get_dummies(s_NA, drop_first=True, sparse=sparse) @@ -481,7 +481,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): - df["cat"] = pd.Categorical(["x", "y", "y"]) + df["cat"] = Categorical(["x", "y", "y"]) result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame( {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} @@ -521,24 +521,24 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): expected = expected[["C", "A_b", "B_c"]] tm.assert_frame_equal(result, expected) - def test_int_int(self): + def test_get_dummies_int_int(self): data = Series([1, 2, 1]) - result = pd.get_dummies(data) + result = get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) tm.assert_frame_equal(result, expected) - data = Series(pd.Categorical(["a", "b", "a"])) - result = pd.get_dummies(data) + data = Series(Categorical(["a", "b", "a"])) + result = get_dummies(data) expected = DataFrame( - [[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8 + [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8 ) tm.assert_frame_equal(result, expected) - def test_int_df(self, dtype): + def test_get_dummies_int_df(self, dtype): data = DataFrame( { "A": [1, 2, 1], - "B": pd.Categorical(["a", "b", "a"]), + "B": Categorical(["a", "b", "a"]), "C": [1, 2, 1], "D": [1.0, 2.0, 1.0], } @@ -549,22 +549,22 @@ def test_int_df(self, dtype): columns=columns, ) expected[columns[2:]] = expected[columns[2:]].astype(dtype) - result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype) + result = get_dummies(data, columns=["A", "B"], dtype=dtype) tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): + @pytest.mark.parametrize("ordered", [True, False]) + def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered): # GH13854 - for ordered in [False, True]: - cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) - result = get_dummies(cat, dtype=dtype) + cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered) + result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) - cols = pd.CategoricalIndex( - cat.categories, categories=cat.categories, ordered=ordered - ) - expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) + cols = CategoricalIndex( + cat.categories, categories=cat.categories, ordered=ordered + ) + expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("sparse", [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): @@ -593,10 +593,10 @@ def test_get_dummies_duplicate_columns(self, df): tm.assert_frame_equal(result, expected) def test_get_dummies_all_sparse(self): - df = pd.DataFrame({"A": [1, 2]}) - result = pd.get_dummies(df, columns=["A"], sparse=True) + df = DataFrame({"A": [1, 2]}) + result = get_dummies(df, columns=["A"], sparse=True) dtype = SparseDtype("uint8", 0) - expected = pd.DataFrame( + expected = DataFrame( { "A_1": SparseArray([1, 0], dtype=dtype), "A_2": SparseArray([0, 1], dtype=dtype), @@ -607,7 +607,7 @@ def test_get_dummies_all_sparse(self): @pytest.mark.parametrize("values", ["baz"]) def test_get_dummies_with_string_values(self, values): # issue #28383 - df = pd.DataFrame( + df = DataFrame( { "bar": [1, 2, 3, 4, 5, 6], "foo": ["one", "one", "one", "two", "two", "two"], @@ -619,26 +619,4 @@ def test_get_dummies_with_string_values(self, values): msg = "Input must be a list-like for parameter `columns`" with pytest.raises(TypeError, match=msg): - pd.get_dummies(df, columns=values) - - -class TestCategoricalReshape: - def test_reshaping_multi_index_categorical(self): - - cols = ["ItemA", "ItemB", "ItemC"] - data = {c: tm.makeTimeDataFrame() for c in cols} - df = pd.concat({c: data[c].stack() for c in data}, axis="columns") - df.index.names = ["major", "minor"] - df["str"] = "foo" - - df["category"] = df["str"].astype("category") - result = df["category"].unstack() - - dti = df.index.levels[0] - c = Categorical(["foo"] * len(dti)) - expected = DataFrame( - {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, - columns=Index(list("ABCD"), name="minor"), - index=dti.rename("major"), - ) - tm.assert_frame_equal(result, expected) + get_dummies(df, columns=values) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index a0e3f8984fbe4..fd72db2118c0e 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -100,7 +100,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)], + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -117,9 +117,7 @@ def test_rpow_special(value, asarray): assert result == value -@pytest.mark.parametrize( - "value", [-1, -1.0, np.int_(-1), np.float_(-1)], -) +@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float_(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: @@ -182,9 +180,7 @@ def test_logical_not(): assert ~NA is NA -@pytest.mark.parametrize( - "shape", [(3,), (3, 3), (1, 2, 3)], -) +@pytest.mark.parametrize("shape", [(3,), (3, 3), (1, 2, 3)]) def test_arithmetic_ndarray(shape, all_arithmetic_functions): op = all_arithmetic_functions a = np.zeros(shape) diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index b45f831ff00aa..f2969e15fad8a 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -1,5 +1,3 @@ -from datetime import datetime - import numpy as np import pytest @@ -149,25 +147,17 @@ def test_reindex_pad(): def test_reindex_nearest(): s = Series(np.arange(10, dtype="int64")) target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method="nearest") + result = s.reindex(target, method="nearest") expected = Series(np.around(target).astype("int64"), target) - tm.assert_series_equal(expected, actual) - - actual = s.reindex_like(actual, method="nearest") - tm.assert_series_equal(expected, actual) - - actual = s.reindex_like(actual, method="nearest", tolerance=1) - tm.assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method="nearest", tolerance=[1, 2, 3, 4]) - tm.assert_series_equal(expected, actual) + tm.assert_series_equal(expected, result) - actual = s.reindex(target, method="nearest", tolerance=0.2) + result = s.reindex(target, method="nearest", tolerance=0.2) expected = Series([0, 1, np.nan, 2], target) - tm.assert_series_equal(expected, actual) + tm.assert_series_equal(expected, result) - actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) + result = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) expected = Series([0, np.nan, np.nan, 2], target) - tm.assert_series_equal(expected, actual) + tm.assert_series_equal(expected, result) def test_reindex_backfill(): @@ -237,25 +227,6 @@ def test_reindex_categorical(): tm.assert_series_equal(result, expected) -def test_reindex_like(datetime_series): - other = datetime_series[::2] - tm.assert_series_equal( - datetime_series.reindex(other.index), datetime_series.reindex_like(other) - ) - - # GH 7179 - day1 = datetime(2013, 3, 5) - day2 = datetime(2013, 5, 5) - day3 = datetime(2014, 3, 5) - - series1 = Series([5, None, None], [day1, day2, day3]) - series2 = Series([None, None], [day1, day3]) - - result = series1.reindex_like(series2, method="pad") - expected = Series([5, np.nan], index=[day1, day3]) - tm.assert_series_equal(result, expected) - - def test_reindex_fill_value(): # ----------------------------------------------------------- # floats diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 5847141a44ef5..3371c47fa1b0a 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -1,7 +1,9 @@ import numpy as np +import pytest import pandas as pd from pandas import Series +import pandas._testing as tm def test_get(): @@ -149,3 +151,44 @@ def test_get_with_default(): for other in others: assert s.get(other, "z") == "z" assert s.get(other, other) == other + + +@pytest.mark.parametrize( + "arr", + [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], +) +def test_get2(arr): + # TODO: better name, possibly split + # GH#21260 + ser = Series(arr, index=[2 * i for i in range(len(arr))]) + assert ser.get(4) == ser.iloc[2] + + result = ser.get([4, 6]) + expected = ser.iloc[[2, 3]] + tm.assert_series_equal(result, expected) + + result = ser.get(slice(2)) + expected = ser.iloc[[0, 1]] + tm.assert_series_equal(result, expected) + + assert ser.get(-1) is None + assert ser.get(ser.index.max() + 1) is None + + ser = Series(arr[:6], index=list("abcdef")) + assert ser.get("c") == ser.iloc[2] + + result = ser.get(slice("b", "d")) + expected = ser.iloc[[1, 2, 3]] + tm.assert_series_equal(result, expected) + + result = ser.get("Z") + assert result is None + + assert ser.get(4) == ser.iloc[4] + assert ser.get(-1) == ser.iloc[-1] + assert ser.get(len(ser)) is None + + # GH#21257 + ser = Series(arr) + ser2 = ser[::2] + assert ser2.get(1) is None diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 5b3786e1a0d3c..f7c7457f3a703 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -188,46 +188,6 @@ def test_getitem_box_float64(datetime_series): assert isinstance(value, np.float64) -@pytest.mark.parametrize( - "arr", - [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], -) -def test_get(arr): - # GH 21260 - s = Series(arr, index=[2 * i for i in range(len(arr))]) - assert s.get(4) == s.iloc[2] - - result = s.get([4, 6]) - expected = s.iloc[[2, 3]] - tm.assert_series_equal(result, expected) - - result = s.get(slice(2)) - expected = s.iloc[[0, 1]] - tm.assert_series_equal(result, expected) - - assert s.get(-1) is None - assert s.get(s.index.max() + 1) is None - - s = Series(arr[:6], index=list("abcdef")) - assert s.get("c") == s.iloc[2] - - result = s.get(slice("b", "d")) - expected = s.iloc[[1, 2, 3]] - tm.assert_series_equal(result, expected) - - result = s.get("Z") - assert result is None - - assert s.get(4) == s.iloc[4] - assert s.get(-1) == s.iloc[-1] - assert s.get(len(s)) is None - - # GH 21257 - s = pd.Series(arr) - s2 = s[::2] - assert s2.get(1) is None - - def test_series_box_timestamp(): rng = pd.date_range("20090415", "20090519", freq="B") ser = Series(rng) @@ -669,11 +629,8 @@ def test_timedelta_assignment(): s = s.reindex(s.index.insert(0, "A")) tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) - result = s.fillna(timedelta(1)) - expected = Series(Timedelta("1 days"), index=["A", "B"]) - tm.assert_series_equal(result, expected) - s.loc["A"] = timedelta(1) + expected = Series(Timedelta("1 days"), index=["A", "B"]) tm.assert_series_equal(s, expected) # GH 14155 diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 4353eb4c8cd64..ec9ba468c996c 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -9,7 +9,7 @@ class TestSeriesArgsort: def _check_accum_op(self, name, ser, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( - func(ser).values, func(np.array(ser)), check_dtype=check_dtype, + func(ser).values, func(np.array(ser)), check_dtype=check_dtype ) # with missing values diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index a41f893e3753f..94dc42bc764fa 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -90,7 +90,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): np.dtype("O"), + ): np.dtype("O") }, ), ( @@ -112,7 +112,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): np.dtype("float"), + ): np.dtype("float") }, ), ( @@ -129,7 +129,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): np.dtype("O"), + ): np.dtype("O") }, ), ( @@ -186,7 +186,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): pd.CategoricalDtype(), + ): pd.CategoricalDtype() }, ), ( @@ -198,7 +198,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): pd.DatetimeTZDtype(tz="UTC"), + ): pd.DatetimeTZDtype(tz="UTC") }, ), ( @@ -210,17 +210,17 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): np.dtype("datetime64[ns]"), + ): np.dtype("datetime64[ns]") }, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, { - ((True,), (True, False), (True, False), (True, False),): np.dtype( + ((True,), (True, False), (True, False), (True, False)): np.dtype( "datetime64[ns]" ), - ((False,), (True, False), (True, False), (True, False),): np.dtype( + ((False,), (True, False), (True, False), (True, False)): np.dtype( "O" ), }, @@ -234,7 +234,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): pd.PeriodDtype("M"), + ): pd.PeriodDtype("M") }, ), ( @@ -246,7 +246,7 @@ class TestSeriesConvertDtypes: (True, False), (True, False), (True, False), - ): pd.IntervalDtype("int64"), + ): pd.IntervalDtype("int64") }, ), ], diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py new file mode 100644 index 0000000000000..c34838be24fc1 --- /dev/null +++ b/pandas/tests/series/methods/test_fillna.py @@ -0,0 +1,176 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, NaT, Period, Series, Timedelta, Timestamp +import pandas._testing as tm + + +class TestSeriesFillNA: + def test_fillna_pytimedelta(self): + # GH#8209 + ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"]) + + result = ser.fillna(timedelta(1)) + expected = Series(Timedelta("1 days"), index=["A", "B"]) + tm.assert_series_equal(result, expected) + + def test_fillna_period(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + + res = ser.fillna(Period("2012-01", freq="M")) + exp = Series([Period("2011-01", freq="M"), Period("2012-01", freq="M")]) + tm.assert_series_equal(res, exp) + assert res.dtype == "Period[M]" + + def test_fillna_dt64_timestamp(self): + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + ser[2] = np.nan + + # reg fillna + result = ser.fillna(Timestamp("20130104")) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130104"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + result = ser.fillna(NaT) + expected = ser + tm.assert_series_equal(result, expected) + + def test_fillna_dt64_non_nao(self): + # GH#27419 + ser = Series([Timestamp("2010-01-01"), NaT, Timestamp("2000-01-01")]) + val = np.datetime64("1975-04-05", "ms") + + result = ser.fillna(val) + expected = Series( + [Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01")] + ) + tm.assert_series_equal(result, expected) + + def test_fillna_numeric_inplace(self): + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + y = x.copy() + + y.fillna(value=0, inplace=True) + + expected = x.fillna(value=0) + tm.assert_series_equal(y, expected) + + # --------------------------------------------------------------- + # CategoricalDtype + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + ("a", ["a", "a", "b", "a", "a"]), + ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), + ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), + ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), + (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), + (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), + (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), + (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), + ], + ) + def test_fillna_categorical(self, fill_value, expected_output): + # GH#17033 + # Test fillna for a Categorical series + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b"])) + exp = Series(Categorical(expected_output, categories=["a", "b"])) + result = ser.fillna(fill_value) + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), + (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + ( + Series( + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] + ) + ), + ["a", "d", "b", "d", "a"], + ), + ], + ) + def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): + # GH#26215 + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) + exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + result = ser.fillna(fill_value) + tm.assert_series_equal(result, exp) + + def test_fillna_categorical_raises(self): + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b"])) + + with pytest.raises(ValueError, match="fill value must be in categories"): + ser.fillna("d") + + with pytest.raises(ValueError, match="fill value must be in categories"): + ser.fillna(Series("d")) + + with pytest.raises(ValueError, match="fill value must be in categories"): + ser.fillna({1: "d", 3: "a"}) + + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + ser.fillna(["a", "b"]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + ser.fillna(("a", "b")) + + msg = ( + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) + + # --------------------------------------------------------------- + # Invalid Usages + + def test_fillna_listlike_invalid(self): + ser = Series(np.random.randint(-100, 100, 50)) + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + ser.fillna([1, 2]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + ser.fillna((1, 2)) + + def test_fillna_method_and_limit_invalid(self): + + # related GH#9217, make sure limit is an int and greater than 0 + ser = Series([1, 2, 3, None]) + msg = ( + r"Cannot specify both 'value' and 'method'\.|" + r"Limit must be greater than 0|" + "Limit must be an integer" + ) + for limit in [-1, 0, 1.0, 2.0]: + for method in ["backfill", "bfill", "pad", "ffill", None]: + with pytest.raises(ValueError, match=msg): + ser.fillna(1, limit=limit, method=method) diff --git a/pandas/tests/series/methods/test_reindex_like.py b/pandas/tests/series/methods/test_reindex_like.py new file mode 100644 index 0000000000000..7f24c778feb1b --- /dev/null +++ b/pandas/tests/series/methods/test_reindex_like.py @@ -0,0 +1,41 @@ +from datetime import datetime + +import numpy as np + +from pandas import Series +import pandas._testing as tm + + +def test_reindex_like(datetime_series): + other = datetime_series[::2] + tm.assert_series_equal( + datetime_series.reindex(other.index), datetime_series.reindex_like(other) + ) + + # GH#7179 + day1 = datetime(2013, 3, 5) + day2 = datetime(2013, 5, 5) + day3 = datetime(2014, 3, 5) + + series1 = Series([5, None, None], [day1, day2, day3]) + series2 = Series([None, None], [day1, day3]) + + result = series1.reindex_like(series2, method="pad") + expected = Series([5, np.nan], index=[day1, day3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_like_nearest(): + ser = Series(np.arange(10, dtype="int64")) + + target = [0.1, 0.9, 1.5, 2.0] + other = ser.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) + + result = ser.reindex_like(other, method="nearest") + tm.assert_series_equal(expected, result) + + result = ser.reindex_like(other, method="nearest", tolerance=1) + tm.assert_series_equal(expected, result) + result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py index fd6c6f74a9136..5a6ec0039c7cd 100644 --- a/pandas/tests/series/methods/test_searchsorted.py +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -40,6 +40,14 @@ def test_searchsorted_datetime64_scalar(self): assert is_scalar(res) assert res == 1 + def test_searchsorted_datetime64_scalar_mixed_timezones(self): + # GH 30086 + ser = Series(date_range("20120101", periods=10, freq="2D", tz="UTC")) + val = Timestamp("20120102", tz="America/New_York") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + def test_searchsorted_datetime64_list(self): ser = Series(date_range("20120101", periods=10, freq="2D")) vals = [Timestamp("20120102"), Timestamp("20120104")] diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index d4ebc9062a0c9..2d4fdfd5a3950 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -8,6 +8,10 @@ class TestSeriesSortIndex: + def test_sort_index_name(self, datetime_series): + result = datetime_series.sort_index(ascending=False) + assert result.name == datetime_series.name + def test_sort_index(self, datetime_series): rindex = list(datetime_series.index) random.shuffle(rindex) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 7645fb8759a54..d651315d64561 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -75,9 +75,7 @@ def test_unstack_tuplename_in_multiindex(): expected = pd.DataFrame( [[1, 1, 1], [1, 1, 1], [1, 1, 1]], - columns=pd.MultiIndex.from_tuples( - [("a",), ("b",), ("c",)], names=[("A", "a")], - ), + columns=pd.MultiIndex.from_tuples([("a",), ("b",), ("c",)], names=[("A", "a")]), index=pd.Index([1, 2, 3], name=("B", "b")), ) tm.assert_frame_equal(result, expected) @@ -115,6 +113,23 @@ def test_unstack_mixed_type_name_in_multiindex( result = ser.unstack(unstack_idx) expected = pd.DataFrame( - expected_values, columns=expected_columns, index=expected_index, + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_multi_index_categorical_values(): + + mi = tm.makeTimeDataFrame().stack().index.rename(["major", "minor"]) + ser = pd.Series(["foo"] * len(mi), index=mi, name="category", dtype="category") + + result = ser.unstack() + + dti = ser.index.levels[0] + c = pd.Categorical(["foo"] * len(dti)) + expected = DataFrame( + {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + columns=pd.Index(list("ABCD"), name="minor"), + index=dti.rename("major"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index c2bb498df2be2..203750757e28d 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -53,39 +53,3 @@ def test_set_index_makes_timeseries(self): s = Series(range(10)) s.index = idx assert s.index.is_all_dates - - def test_set_axis_inplace_axes(self, axis_series): - # GH14636 - ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") - - expected = ser.copy() - expected.index = list("abcd") - - # inplace=True - # The FutureWarning comes from the fact that we would like to have - # inplace default to False some day - result = ser.copy() - result.set_axis(list("abcd"), axis=axis_series, inplace=True) - tm.assert_series_equal(result, expected) - - def test_set_axis_inplace(self): - # GH14636 - - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") - - expected = s.copy() - expected.index = list("abcd") - - # inplace=False - result = s.set_axis(list("abcd"), axis=0, inplace=False) - tm.assert_series_equal(expected, result) - - # omitting the "axis" parameter - with tm.assert_produces_warning(None): - result = s.set_axis(list("abcd"), inplace=False) - tm.assert_series_equal(result, expected) - - # wrong values for the "axis" parameter - for axis in [2, "foo"]: - with pytest.raises(ValueError, match="No axis named"): - s.set_axis(list("abcd"), axis=axis, inplace=False) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 3e877cf2fc787..302ca8d1aa43e 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -110,10 +110,6 @@ def _pickle_roundtrip(self, obj): unpickled = pd.read_pickle(path) return unpickled - def test_sort_index_name(self, datetime_series): - result = datetime_series.sort_index(ascending=False) - assert result.name == datetime_series.name - def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} result = Series(d) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 0cb1c038478f5..038db0b0a70fc 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -18,7 +18,7 @@ def _check_accum_op(name, series, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( - func(series).values, func(np.array(series)), check_dtype=check_dtype, + func(series).values, func(np.array(series)), check_dtype=check_dtype ) # with missing values diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 1687f80e9f3ed..9e9b93a499487 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -122,22 +122,6 @@ def test_datetime64_fillna(self): ) s[2] = np.nan - # reg fillna - result = s.fillna(Timestamp("20130104")) - expected = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130104"), - Timestamp("20130103 9:01:01"), - ] - ) - tm.assert_series_equal(result, expected) - - result = s.fillna(NaT) - expected = s - tm.assert_series_equal(result, expected) - # ffill result = s.ffill() expected = Series( @@ -177,242 +161,228 @@ def test_datetime64_fillna(self): result = s.fillna(method="backfill") tm.assert_series_equal(result, expected) - def test_datetime64_tz_fillna(self): - - for tz in ["US/Eastern", "Asia/Tokyo"]: - # DatetimeBlock - s = Series( - [ - Timestamp("2011-01-01 10:00"), - pd.NaT, - Timestamp("2011-01-03 10:00"), - pd.NaT, - ] - ) - null_loc = pd.Series([False, True, False, True]) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - # check s is not changed - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - "AAA", - Timestamp("2011-01-03 10:00"), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00"), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # DatetimeBlockTZ - idx = pd.DatetimeIndex( - ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz - ) - s = pd.Series(idx) - assert s.dtype == f"datetime64[ns, {tz}]" - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - "AAA", - Timestamp("2011-01-03 10:00", tz=tz), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00", tz=tz), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # filling with a naive/other zone, coerce to object - result = s.fillna(Timestamp("20130101")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(Timestamp("20130101", tz="US/Pacific")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_datetime64_tz_fillna(self, tz): + # DatetimeBlock + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) + null_loc = pd.Series([False, True, False, True]) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + # check s is not changed + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + {1: pd.Timestamp("2011-01-02 10:00"), 3: pd.Timestamp("2011-01-04 10:00")} + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + # DatetimeBlockTZ + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz + ) + s = pd.Series(idx) + assert s.dtype == f"datetime64[ns, {tz}]" + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + # filling with a naive/other zone, coerce to object + result = s.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + def test_fillna_dt64tz_with_method(self): # with timezone # GH 15855 - df = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) + ser = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) exp = pd.Series( [ pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.Timestamp("2012-11-11 00:00:00+01:00"), ] ) - tm.assert_series_equal(df.fillna(method="pad"), exp) + tm.assert_series_equal(ser.fillna(method="pad"), exp) - df = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) + ser = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) exp = pd.Series( [ pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.Timestamp("2012-11-11 00:00:00+01:00"), ] ) - tm.assert_series_equal(df.fillna(method="bfill"), exp) - - def test_datetime64_non_nano_fillna(self): - # GH#27419 - ser = Series([Timestamp("2010-01-01"), pd.NaT, Timestamp("2000-01-01")]) - val = np.datetime64("1975-04-05", "ms") - - result = ser.fillna(val) - expected = Series( - [Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01")] - ) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(ser.fillna(method="bfill"), exp) def test_fillna_consistency(self): # GH 16402 @@ -486,28 +456,6 @@ def test_fillna_int(self): s.fillna(method="ffill", inplace=True) tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s) - def test_fillna_raise(self): - s = Series(np.random.randint(-100, 100, 50)) - msg = '"value" parameter must be a scalar or dict, but you passed a "list"' - with pytest.raises(TypeError, match=msg): - s.fillna([1, 2]) - - msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' - with pytest.raises(TypeError, match=msg): - s.fillna((1, 2)) - - # related GH 9217, make sure limit is an int and greater than 0 - s = Series([1, 2, 3, None]) - msg = ( - r"Cannot specify both 'value' and 'method'\.|" - r"Limit must be greater than 0|" - "Limit must be an integer" - ) - for limit in [-1, 0, 1.0, 2.0]: - for method in ["backfill", "bfill", "pad", "ffill", None]: - with pytest.raises(ValueError, match=msg): - s.fillna(1, limit=limit, method=method) - def test_categorical_nan_equality(self): cat = Series(Categorical(["a", "b", "c", np.nan])) exp = Series([True, True, True, False]) @@ -523,77 +471,6 @@ def test_categorical_nan_handling(self): s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) ) - @pytest.mark.parametrize( - "fill_value, expected_output", - [ - ("a", ["a", "a", "b", "a", "a"]), - ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), - ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), - ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), - (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), - (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), - (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), - (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), - ], - ) - def test_fillna_categorical(self, fill_value, expected_output): - # GH 17033 - # Test fillna for a Categorical series - data = ["a", np.nan, "b", np.nan, np.nan] - s = Series(Categorical(data, categories=["a", "b"])) - exp = Series(Categorical(expected_output, categories=["a", "b"])) - tm.assert_series_equal(s.fillna(fill_value), exp) - - @pytest.mark.parametrize( - "fill_value, expected_output", - [ - (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), - (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), - ( - Series( - Categorical( - ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] - ) - ), - ["a", "d", "b", "d", "a"], - ), - ], - ) - def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): - # GH 26215 - data = ["a", np.nan, "b", np.nan, np.nan] - s = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) - exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) - tm.assert_series_equal(s.fillna(fill_value), exp) - - def test_fillna_categorical_raise(self): - data = ["a", np.nan, "b", np.nan, np.nan] - s = Series(Categorical(data, categories=["a", "b"])) - - with pytest.raises(ValueError, match="fill value must be in categories"): - s.fillna("d") - - with pytest.raises(ValueError, match="fill value must be in categories"): - s.fillna(Series("d")) - - with pytest.raises(ValueError, match="fill value must be in categories"): - s.fillna({1: "d", 3: "a"}) - - msg = '"value" parameter must be a scalar or dict, but you passed a "list"' - with pytest.raises(TypeError, match=msg): - s.fillna(["a", "b"]) - - msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' - with pytest.raises(TypeError, match=msg): - s.fillna(("a", "b")) - - msg = ( - '"value" parameter must be a scalar, dict ' - 'or Series, but you passed a "DataFrame"' - ) - with pytest.raises(TypeError, match=msg): - s.fillna(DataFrame({1: ["a"], 3: ["b"]})) - def test_fillna_nat(self): series = Series([0, 1, 2, iNaT], dtype="M8[ns]") @@ -736,15 +613,6 @@ def test_fillna_bug(self): expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], x.index) tm.assert_series_equal(filled, expected) - def test_fillna_inplace(self): - x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) - y = x.copy() - - y.fillna(value=0, inplace=True) - - expected = x.fillna(value=0) - tm.assert_series_equal(y, expected) - def test_fillna_invalid_method(self, datetime_series): try: datetime_series.fillna(method="ffil") diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index d5a3efcf5757c..5c2c1db14e70f 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -38,15 +38,6 @@ def test_isna(self): tm.assert_series_equal(s.isna(), Series([False, True])) tm.assert_series_equal(s.notna(), Series([True, False])) - def test_fillna(self): - # GH 13737 - s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) - - res = s.fillna(pd.Period("2012-01", freq="M")) - exp = Series([pd.Period("2011-01", freq="M"), pd.Period("2012-01", freq="M")]) - tm.assert_series_equal(res, exp) - assert res.dtype == "Period[M]" - def test_dropna(self): # GH 13737 s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 84279d874bae1..207b397c3e1ff 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1252,92 +1252,6 @@ def test_level_with_tuples(self): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - def test_mixed_depth_drop(self): - arrays = [ - ["a", "top", "top", "routine1", "routine1", "routine2"], - ["", "OD", "OD", "result1", "result2", "result1"], - ["", "wx", "wy", "", "", ""], - ] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) - - result = df.drop("a", axis=1) - expected = df.drop([("a", "", "")], axis=1) - tm.assert_frame_equal(expected, result) - - result = df.drop(["top"], axis=1) - expected = df.drop([("top", "OD", "wx")], axis=1) - expected = expected.drop([("top", "OD", "wy")], axis=1) - tm.assert_frame_equal(expected, result) - - result = df.drop(("top", "OD", "wx"), axis=1) - expected = df.drop([("top", "OD", "wx")], axis=1) - tm.assert_frame_equal(expected, result) - - expected = df.drop([("top", "OD", "wy")], axis=1) - expected = df.drop("top", axis=1) - - result = df.drop("result1", level=1, axis=1) - expected = df.drop( - [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 - ) - tm.assert_frame_equal(expected, result) - - def test_drop_multiindex_other_level_nan(self): - # GH 12754 - df = ( - DataFrame( - { - "A": ["one", "one", "two", "two"], - "B": [np.nan, 0.0, 1.0, 2.0], - "C": ["a", "b", "c", "c"], - "D": [1, 2, 3, 4], - } - ) - .set_index(["A", "B", "C"]) - .sort_index() - ) - result = df.drop("c", level="C") - expected = DataFrame( - [2, 1], - columns=["D"], - index=pd.MultiIndex.from_tuples( - [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] - ), - ) - tm.assert_frame_equal(result, expected) - - def test_drop_nonunique(self): - df = DataFrame( - [ - ["x-a", "x", "a", 1.5], - ["x-a", "x", "a", 1.2], - ["z-c", "z", "c", 3.1], - ["x-a", "x", "a", 4.1], - ["x-b", "x", "b", 5.1], - ["x-b", "x", "b", 4.1], - ["x-b", "x", "b", 2.2], - ["y-a", "y", "a", 1.2], - ["z-b", "z", "b", 2.1], - ], - columns=["var1", "var2", "var3", "var4"], - ) - - grp_size = df.groupby("var1").size() - drop_idx = grp_size.loc[grp_size == 1] - - idf = df.set_index(["var1", "var2", "var3"]) - - # it works! #2101 - result = idf.drop(drop_idx.index, level=0).reset_index() - expected = df[-df.var1.isin(drop_idx.index)] - - result.index = expected.index - - tm.assert_frame_equal(result, expected) - def test_mixed_depth_pop(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], @@ -1380,68 +1294,6 @@ def test_reindex_level_partial_selection(self): result = self.frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) - def test_drop_level(self): - result = self.frame.drop(["bar", "qux"], level="first") - expected = self.frame.iloc[[0, 1, 2, 5, 6]] - tm.assert_frame_equal(result, expected) - - result = self.frame.drop(["two"], level="second") - expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] - tm.assert_frame_equal(result, expected) - - result = self.frame.T.drop(["bar", "qux"], axis=1, level="first") - expected = self.frame.iloc[[0, 1, 2, 5, 6]].T - tm.assert_frame_equal(result, expected) - - result = self.frame.T.drop(["two"], axis=1, level="second") - expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T - tm.assert_frame_equal(result, expected) - - def test_drop_level_nonunique_datetime(self): - # GH 12701 - idx = Index([2, 3, 4, 4, 5], name="id") - idxdt = pd.to_datetime( - [ - "201603231400", - "201603231500", - "201603231600", - "201603231600", - "201603231700", - ] - ) - df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) - df["tstamp"] = idxdt - df = df.set_index("tstamp", append=True) - ts = Timestamp("201603231600") - assert df.index.is_unique is False - - result = df.drop(ts, level="tstamp") - expected = df.loc[idx != 4] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("box", [Series, DataFrame]) - def test_drop_tz_aware_timestamp_across_dst(self, box): - # GH 21761 - start = Timestamp("2017-10-29", tz="Europe/Berlin") - end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") - index = pd.date_range(start, end, freq="15min") - data = box(data=[1] * len(index), index=index) - result = data.drop(start) - expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") - expected_idx = pd.date_range(expected_start, end, freq="15min") - expected = box(data=[1] * len(expected_idx), index=expected_idx) - tm.assert_equal(result, expected) - - def test_drop_preserve_names(self): - index = MultiIndex.from_arrays( - [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] - ) - - df = DataFrame(np.random.randn(6, 3), index=index) - - result = df.drop([(0, 2)]) - assert result.index.names == ("one", "two") - def test_unicode_repr_level_names(self): index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) @@ -2292,7 +2144,7 @@ def test_multilevel_index_loc_order(self, dim, keys, expected): # GH 22797 # Try to respect order of keys given for MultiIndex.loc kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} - df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs,) + df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs) exp_index = MultiIndex.from_arrays(expected) if dim == "index": res = df.loc[keys, :] diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index cac6a59527a6e..9b440a8a84892 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -285,7 +285,7 @@ def test_nansum(self, skipna): def test_nanmean(self, skipna): self.check_funs( - nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False, + nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False ) def test_nanmean_overflow(self): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 6289c2efea7f1..84b5d64ea1f84 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3556,9 +3556,7 @@ def test_string_array(any_string_method): result = getattr(b.str, method_name)(*args, **kwargs) if isinstance(expected, Series): - if expected.dtype == "object" and lib.is_string_array( - expected.dropna().values, - ): + if expected.dtype == "object" and lib.is_string_array(expected.dropna().values): assert result.dtype == "string" result = result.astype(object) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a751182dbf7af..f3b68caf3dcf5 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -602,7 +602,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): pd.to_datetime(dts_with_oob, errors="coerce", cache=cache), pd.DatetimeIndex( [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [pd.NaT], + + [pd.NaT] ), ) @@ -1862,6 +1862,18 @@ def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): pd.to_datetime(s, infer_datetime_format=True, cache=cache), ) + @pytest.mark.parametrize( + "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] + ) + def test_infer_datetime_format_tz_name(self, tz_name, offset): + # GH 33133 + s = pd.Series([f"2019-02-02 08:07:13 {tz_name}"]) + result = to_datetime(s, infer_datetime_format=True) + expected = pd.Series( + [pd.Timestamp("2019-02-02 08:07:13").tz_localize(pytz.FixedOffset(offset))] + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 22c0f455fa3ac..12320cd52cec8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -91,7 +91,7 @@ def to_offset(freq) -> Optional[DateOffset]: See Also -------- - DateOffset + DateOffset : Standard kind of date increment used for a date range. Examples -------- diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 7fc85a04e7d84..72003eeddf5ee 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -88,6 +88,20 @@ def _get_dependency_info() -> Dict[str, JSONSerializable]: def show_versions(as_json: Union[str, bool] = False) -> None: + """ + Provide useful information, important for bug reports. + + It comprises info about hosting operation system, pandas version, + and versions of other installed relative packages. + + Parameters + ---------- + as_json : str or bool, default False + * If False, outputs info in a human readable form to the console. + * If str, it will be considered as a path to a file. + Info will be written to that file in JSON format. + * If True, outputs info in JSON format to the console. + """ sys_info = _get_sys_info() deps = _get_dependency_info() diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 25394dc6775d8..8b3424b70d809 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -182,10 +182,10 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: is_platform_windows(), reason="not used on win32" ) skip_if_has_locale = pytest.mark.skipif( - _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", + _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_not_us_locale = pytest.mark.skipif( - _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", + _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_no_scipy = pytest.mark.skipif( _skip_if_no_scipy(), reason="Missing SciPy requirement"