pandas-dev · znicholls · Feb 16, 2021 · Feb 16, 2021 · Feb 16, 2021 · Feb 16, 2021
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -153,3 +153,9 @@ jobs:
       run: |
         source activate pandas-dev
         pytest pandas/tests/frame/methods --array-manager
+
+        # indexing iset related (temporary since other tests don't pass yet)
+        pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_multi_index --array-manager
+        pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_listlike_indexer_duplicate_columns --array-manager
+        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups --array-manager
+        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column --array-manager
diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml
@@ -170,3 +170,11 @@ jobs:
 
     - name: Print skipped tests
       run: python ci/print_skipped.py
+
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v1
+      with:
+        files: /tmp/test_coverage.xml
+        flags: unittests
+        name: codecov-pandas
+        fail_ci_if_error: true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -180,6 +180,12 @@ repos:
         language: pygrep
         types: [python]
         files: ^pandas/tests/
+    -   id: title-capitalization
+        name: Validate correct capitalization among titles in documentation
+        entry: python scripts/validate_rst_title_capitalization.py
+        language: python
+        types: [rst]
+        files: ^doc/source/(development|reference)/
 -   repo: https://github.com/asottile/yesqa
     rev: v1.2.2
     hooks:

diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py
@@ -0,0 +1,42 @@
+"""
+Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs,
+which has its own directory
+"""
+import numpy as np
+
+from pandas._libs.lib import (
+    is_list_like,
+    is_scalar,
+)
+
+from pandas import (
+    NA,
+    NaT,
+)
+
+# TODO: share with something in pd._testing?
+scalars = [
+    0,
+    1.0,
+    1 + 2j,
+    True,
+    "foo",
+    b"bar",
+    None,
+    np.datetime64(123, "ns"),
+    np.timedelta64(123, "ns"),
+    NaT,
+    NA,
+]
+zero_dims = [np.array("123")]
+listlikes = [np.array([1, 2, 3]), {0: "foo"}, set(1, 2, 3), [1, 2, 3], (1, 2, 3)]
+
+
+class ScalarListLike:
+    params = scalars + zero_dims + listlikes
+
+    def time_is_list_like(self, param):
+        is_list_like(param)
+
+    def time_is_scalar(self, param):
+        is_scalar(param)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -233,10 +233,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS02,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
-    MSG='Validate correct capitalization among titles in documentation' ; echo $MSG
-    $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development $BASE_DIR/doc/source/reference
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
 fi
 
 ### TYPING ###

diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml
@@ -18,7 +18,7 @@ dependencies:
   - lxml
   - matplotlib=3.0.0
   - numpy=1.16.*
-  - openpyxl=2.6.0
+  - openpyxl=3.0.0
   - python-dateutil
   - python-blosc
   - pytz=2017.3

diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml
@@ -19,7 +19,7 @@ dependencies:
   - numba=0.46.0
   - numexpr=2.6.8
   - numpy=1.16.5
-  - openpyxl=2.6.0
+  - openpyxl=3.0.0
   - pytables=3.5.1
   - python-dateutil=2.7.3
   - pytz=2017.3

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
@@ -476,6 +476,14 @@ storing numeric arrays with units. These arrays can be stored inside pandas'
 Series and DataFrame. Operations between Series and DataFrame columns which
 use pint's extension array are then units aware.
 
+`Text Extensions for Pandas`_
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`Text Extensions for Pandas <https://ibm.biz/text-extensions-for-pandas>`_
+provides extension types to cover common data structures for representing natural language
+data, plus library integrations that convert the outputs of popular natural language
+processing libraries into Pandas DataFrames.
+
 .. _ecosystem.accessors:
 
 Accessors

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -274,7 +274,7 @@ html5lib                  1.0.1              HTML parser for read_html (see :ref
 lxml                      4.3.0              HTML parser for read_html (see :ref:`note <optional_html>`)
 matplotlib                2.2.3              Visualization
 numba                     0.46.0             Alternative execution engine for rolling operations
-openpyxl                  2.6.0              Reading / writing for xlsx files
+openpyxl                  3.0.0              Reading / writing for xlsx files
 pandas-gbq                0.12.0             Google Big Query access
 psycopg2                  2.7                PostgreSQL engine for sqlalchemy
 pyarrow                   0.15.0             Parquet, ORC, and feather reading / writing

diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
@@ -178,6 +178,75 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin`
 For ``DataFrames``, likewise, ``in`` applies to the column axis,
 testing for membership in the list of column names.
 
+.. _udf-mutation:
+
+Mutating with User Defined Function (UDF) methods
+-------------------------------------------------
+
+It is a general rule in programming that one should not mutate a container
+while it is being iterated over. Mutation will invalidate the iterator,
+causing unexpected behavior. Consider the example:
+
+.. ipython:: python
+
+   values = [0, 1, 2, 3, 4, 5]
+   n_removed = 0
+   for k, value in enumerate(values):
+       idx = k - n_removed
+       if value % 2 == 1:
+           del values[idx]
+           n_removed += 1
+       else:
+           values[idx] = value + 1
+   values
+
+One probably would have expected that the result would be ``[1, 3, 5]``.
+When using a pandas method that takes a UDF, internally pandas is often
+iterating over the
+``DataFrame`` or other pandas object. Therefore, if the UDF mutates (changes)
+the ``DataFrame``, unexpected behavior can arise.
+
+Here is a similar example with :meth:`DataFrame.apply`:
+
+.. ipython:: python
+
+   def f(s):
+       s.pop("a")
+       return s
+
+   df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+   try:
+       df.apply(f, axis="columns")
+   except Exception as err:
+       print(repr(err))
+
+To resolve this issue, one can make a copy so that the mutation does
+not apply to the container being iterated over.
+
+.. ipython:: python
+
+   values = [0, 1, 2, 3, 4, 5]
+   n_removed = 0
+   for k, value in enumerate(values.copy()):
+       idx = k - n_removed
+       if value % 2 == 1:
+           del values[idx]
+           n_removed += 1
+       else:
+           values[idx] = value + 1
+   values
+
+.. ipython:: python
+
+   def f(s):
+       s = s.copy()
+       s.pop("a")
+       return s
+
+   df = pd.DataFrame({"a": [1, 2, 3], 'b': [4, 5, 6]})
+   df.apply(f, axis="columns")
+
+
 ``NaN``, Integer ``NA`` values and ``NA`` type promotions
 ---------------------------------------------------------
 

diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst
@@ -176,7 +176,7 @@ New plotting methods
 Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot
 types. For example, ``'kde'`` is a new option:
 
-.. ipython:: python
+.. code-block:: python
 
    s = pd.Series(
        np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))

diff --git a/doc/source/whatsnew/v1.2.3.rst b/doc/source/whatsnew/v1.2.3.rst
@@ -15,7 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 
--
+- Fixed regression in :meth:`DataFrame.to_excel` raising ``KeyError`` when giving duplicate columns with ``columns`` attribute (:issue:`39695`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -186,7 +186,7 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | numba           | 0.46.0          |         |
 +-----------------+-----------------+---------+
-| openpyxl        | 2.6.0           |         |
+| openpyxl        | 3.0.0           |    X    |
 +-----------------+-----------------+---------+
 | pyarrow         | 0.15.0          |         |
 +-----------------+-----------------+---------+
@@ -239,7 +239,7 @@ Deprecations
 - Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`)
 - Deprecated :meth:`core.window.ewm.ExponentialMovingWindow.vol` (:issue:`39220`)
 - Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`)
--
+- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
 
 .. ---------------------------------------------------------------------------
 
@@ -346,7 +346,9 @@ Indexing
 - Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, issue:`39619`)
 - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`)
 - Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrect casting the datetime64 values to integers (:issue:`39266`)
+- Bug in setting ``np.datetime64("NaT")`` into a :class:`Series` with :class:`DatetimeTZDtype` incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`)
 - Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`)
+- Bug in :meth:`DatetimeIndex.insert` when inserting ``np.datetime64("NaT")`` into a timezone-aware index incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`)
 - Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`)
 - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`)
 - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1044,11 +1044,15 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool:
 
 cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1:
     return (
-        isinstance(obj, abc.Iterable)
+        # equiv: `isinstance(obj, abc.Iterable)`
+        hasattr(obj, "__iter__") and not isinstance(obj, type)
         # we do not count strings/unicode/bytes as list-like
         and not isinstance(obj, (str, bytes))
         # exclude zero-dimensional numpy arrays, effectively scalars
-        and not (util.is_array(obj) and obj.ndim == 0)
+        and not cnp.PyArray_IsZeroDim(obj)
+        # extra check for numpy-like objects which aren't captured by
+        # the above
+        and not (hasattr(obj, "ndim") and obj.ndim == 0)
         # exclude sets if allow_sets is False
         and not (allow_sets is False and isinstance(obj, abc.Set))
     )

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -559,7 +559,7 @@ def makeCustomIndex(
         "p": makePeriodIndex,
     }.get(idx_type)
     if idx_func:
-        # pandas\_testing.py:2120: error: Cannot call function of unknown type
+        # error: Cannot call function of unknown type
         idx = idx_func(nentries)  # type: ignore[operator]
         # but we need to fill in the name
         if names:

diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
@@ -82,9 +82,8 @@ def dec(f):
         is_decorating = not kwargs and len(args) == 1 and callable(args[0])
         if is_decorating:
             f = args[0]
-            # pandas\_testing.py:2331: error: Incompatible types in assignment
-            # (expression has type "List[<nothing>]", variable has type
-            # "Tuple[Any, ...]")
+            # error: Incompatible types in assignment (expression has type
+            # "List[<nothing>]", variable has type "Tuple[Any, ...]")
             args = []  # type: ignore[assignment]
             return dec(f)
         else:
@@ -205,8 +204,7 @@ def wrapper(*args, **kwargs):
         except Exception as err:
             errno = getattr(err, "errno", None)
             if not errno and hasattr(errno, "reason"):
-                # pandas\_testing.py:2521: error: "Exception" has no attribute
-                # "reason"
+                # error: "Exception" has no attribute "reason"
                 errno = getattr(err.reason, "errno", None)  # type: ignore[attr-defined]
 
             if errno in skip_errnos:

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -17,7 +17,7 @@
     "matplotlib": "2.2.3",
     "numexpr": "2.6.8",
     "odfpy": "1.3.0",
-    "openpyxl": "2.6.0",
+    "openpyxl": "3.0.0",
     "pandas_gbq": "0.12.0",
     "pyarrow": "0.15.0",
     "pytest": "5.0.1",

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1565,6 +1565,14 @@ def indexer_si(request):
     return request.param
 
 
+@pytest.fixture(params=[tm.setitem, tm.loc])
+def indexer_sl(request):
+    """
+    Parametrize over __setitem__, loc.__setitem__
+    """
+    return request.param
+
+
 @pytest.fixture
 def using_array_manager(request):
     """

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -2208,7 +2208,7 @@ def _sort_mixed(values):
     return np.concatenate([nums, np.asarray(strs, dtype=object)])
 
 
-def _sort_tuples(values: np.ndarray[tuple]):
+def _sort_tuples(values: np.ndarray):
     """
     Convert array of tuples (1d) to array or array (2d).
     We need to keep the columns separately as they contain different types and

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -147,18 +147,14 @@ def index(self) -> Index:
     def apply(self) -> FrameOrSeriesUnion:
         pass
 
-    def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]:
+    def agg(self) -> Optional[FrameOrSeriesUnion]:
         """
         Provide an implementation for the aggregators.
 
         Returns
         -------
-        tuple of result, how.
-
-        Notes
-        -----
-        how can be a string describe the required post-processing, or
-        None if not required.
+        Result of aggregation, or None if agg cannot be performed by
+        this method.
         """
         obj = self.obj
         arg = self.f
@@ -171,23 +167,21 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]:
 
         result = self.maybe_apply_str()
         if result is not None:
-            return result, None
+            return result
 
         if is_dict_like(arg):
-            return self.agg_dict_like(_axis), True
+            return self.agg_dict_like(_axis)
         elif is_list_like(arg):
             # we require a list, but not a 'str'
-            return self.agg_list_like(_axis=_axis), None
-        else:
-            result = None
+            return self.agg_list_like(_axis=_axis)
 
         if callable(arg):
             f = obj._get_cython_func(arg)
             if f and not args and not kwargs:
-                return getattr(obj, f)(), None
+                return getattr(obj, f)()
 
         # caller can react
-        return result, True
+        return None
 
     def agg_list_like(self, _axis: int) -> FrameOrSeriesUnion:
         """