Merge remote-tracking branch 'upstream/master' into string-use-inf-as-na

dsaxton · dsaxton · commit 2b4d109d2f3d · 2020-04-27T21:32:39.000-05:00
diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml
@@ -14,8 +14,7 @@ dependencies:
   - pytz
   - pip
   - pip:
-    - cython==0.29.16
-    # GH#33507 cython 3.0a1 is causing TypeErrors 2020-04-13
+    - cython>=0.29.16
     - "git+git://github.com/dateutil/dateutil.git"
     - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com"
     - "--pre"
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
@@ -396,7 +396,7 @@ Consider the following toy example of doubling each observation:
    1000 loops, best of 3: 233 us per loop
 
    # Custom function with numba
-   In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
+   In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
    1000 loops, best of 3: 145 us per loop
 
 Caveats
@@ -599,13 +599,6 @@ identifier.
 The ``inplace`` keyword determines whether this assignment will be performed
 on the original ``DataFrame`` or return a copy with the new column.
 
-.. warning::
-
-   For backwards compatibility, ``inplace`` defaults to ``True`` if not
-   specified. This will change in a future version of pandas - if your
-   code depends on an inplace assignment you should update to explicitly
-   set ``inplace=True``.
-
 .. ipython:: python
 
    df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
@@ -614,7 +607,7 @@ on the original ``DataFrame`` or return a copy with the new column.
    df.eval('a = 1', inplace=True)
    df
 
-When ``inplace`` is set to ``False``, a copy of the ``DataFrame`` with the
+When ``inplace`` is set to ``False``, the default, a copy of the ``DataFrame`` with the
 new or modified columns is returned and the original frame is unchanged.
 
 .. ipython:: python
@@ -653,11 +646,6 @@ whether the query modifies the original frame.
    df.query('a > 2', inplace=True)
    df
 
-.. warning::
-
-   Unlike with ``eval``, the default value for ``inplace`` for ``query``
-   is ``False``.  This is consistent with prior versions of pandas.
-
 Local variables
 ~~~~~~~~~~~~~~~
 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -569,7 +569,7 @@ Numeric
 - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`)
 - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`)
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
--
+- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
 
 Conversion
 ^^^^^^^^^^
@@ -732,7 +732,7 @@ ExtensionArray
 
 - Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
 - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`)
-
+-
 
 Other
 ^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -84,7 +84,6 @@
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
-    ensure_float64,
     ensure_int64,
     ensure_platform_int,
     infer_dtype_from_object,
@@ -7871,16 +7870,16 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame":
         numeric_df = self._get_numeric_data()
         cols = numeric_df.columns
         idx = cols.copy()
-        mat = numeric_df.values
+        mat = numeric_df.astype(float, copy=False).to_numpy()
 
         if method == "pearson":
-            correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
+            correl = libalgos.nancorr(mat, minp=min_periods)
         elif method == "spearman":
-            correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
+            correl = libalgos.nancorr_spearman(mat, minp=min_periods)
         elif method == "kendall" or callable(method):
             if min_periods is None:
                 min_periods = 1
-            mat = ensure_float64(mat).T
+            mat = mat.T
             corrf = nanops.get_corr_func(method)
             K = len(cols)
             correl = np.empty((K, K), dtype=float)
@@ -8006,19 +8005,19 @@ def cov(self, min_periods=None) -> "DataFrame":
         numeric_df = self._get_numeric_data()
         cols = numeric_df.columns
         idx = cols.copy()
-        mat = numeric_df.values
+        mat = numeric_df.astype(float, copy=False).to_numpy()
 
         if notna(mat).all():
             if min_periods is not None and min_periods > len(mat):
-                baseCov = np.empty((mat.shape[1], mat.shape[1]))
-                baseCov.fill(np.nan)
+                base_cov = np.empty((mat.shape[1], mat.shape[1]))
+                base_cov.fill(np.nan)
             else:
-                baseCov = np.cov(mat.T)
-            baseCov = baseCov.reshape((len(cols), len(cols)))
+                base_cov = np.cov(mat.T)
+            base_cov = base_cov.reshape((len(cols), len(cols)))
         else:
-            baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
+            base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
 
-        return self._constructor(baseCov, index=idx, columns=cols)
+        return self._constructor(base_cov, index=idx, columns=cols)
 
     def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
         """
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -1,7 +1,7 @@
 """
 Base and utility classes for tseries type pandas objects.
 """
-from datetime import datetime, timedelta
+from datetime import datetime
 from typing import Any, List, Optional, Union, cast
 
 import numpy as np
@@ -16,18 +16,14 @@
 from pandas.core.dtypes.common import (
     ensure_int64,
     is_bool_dtype,
-    is_datetime64_any_dtype,
     is_dtype_equal,
     is_integer,
     is_list_like,
-    is_object_dtype,
     is_period_dtype,
     is_scalar,
-    is_timedelta64_dtype,
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
-from pandas.core.dtypes.missing import isna
 
 from pandas.core import algorithms
 from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
@@ -46,7 +42,6 @@
 from pandas.core.tools.timedeltas import to_timedelta
 
 from pandas.tseries.frequencies import DateOffset
-from pandas.tseries.offsets import Tick
 
 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
 
@@ -77,33 +72,13 @@ def wrapper(left, right):
     return wrapper
 
 
-def _make_wrapped_arith_op_with_freq(opname: str):
-    """
-    Dispatch the operation to the underlying ExtensionArray, and infer
-    the appropriate frequency for the result.
-    """
-    meth = make_wrapped_arith_op(opname)
-
-    def wrapped(self, other):
-        result = meth(self, other)
-        if result is NotImplemented:
-            return NotImplemented
-
-        new_freq = self._get_addsub_freq(other, result)
-        result._freq = new_freq
-        return result
-
-    wrapped.__name__ = opname
-    return wrapped
-
-
 @inherit_names(
     ["inferred_freq", "_isnan", "_resolution", "resolution"],
     DatetimeLikeArrayMixin,
     cache=True,
 )
 @inherit_names(
-    ["mean", "asi8", "_box_func"], DatetimeLikeArrayMixin,
+    ["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin,
 )
 class DatetimeIndexOpsMixin(ExtensionIndex):
     """
@@ -437,44 +412,8 @@ def _partial_date_slice(
     # --------------------------------------------------------------------
     # Arithmetic Methods
 
-    def _get_addsub_freq(self, other, result) -> Optional[DateOffset]:
-        """
-        Find the freq we expect the result of an addition/subtraction operation
-        to have.
-        """
-        if is_period_dtype(self.dtype):
-            if is_period_dtype(result.dtype):
-                # Only used for ops that stay PeriodDtype
-                return self.freq
-            return None
-        elif self.freq is None:
-            return None
-        elif lib.is_scalar(other) and isna(other):
-            return None
-
-        elif isinstance(other, (Tick, timedelta, np.timedelta64)):
-            new_freq = None
-            if isinstance(self.freq, Tick):
-                new_freq = self.freq
-            return new_freq
-
-        elif isinstance(other, DateOffset):
-            # otherwise just DatetimeArray
-            return None  # TODO: Should we infer if it matches self.freq * n?
-        elif isinstance(other, (datetime, np.datetime64)):
-            return self.freq
-
-        elif is_timedelta64_dtype(other):
-            return None  # TODO: shouldnt we be able to do self.freq + other.freq?
-        elif is_object_dtype(other):
-            return None  # TODO: is this quite right?  sometimes we unpack singletons
-        elif is_datetime64_any_dtype(other):
-            return None  # TODO: shouldnt we be able to do self.freq + other.freq?
-        else:
-            raise NotImplementedError
-
-    __add__ = _make_wrapped_arith_op_with_freq("__add__")
-    __sub__ = _make_wrapped_arith_op_with_freq("__sub__")
+    __add__ = make_wrapped_arith_op("__add__")
+    __sub__ = make_wrapped_arith_op("__sub__")
     __radd__ = make_wrapped_arith_op("__radd__")
     __rsub__ = make_wrapped_arith_op("__rsub__")
     __pow__ = make_wrapped_arith_op("__pow__")
@@ -643,25 +582,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index):
     _is_monotonic_increasing = Index.is_monotonic_increasing
     _is_monotonic_decreasing = Index.is_monotonic_decreasing
     _is_unique = Index.is_unique
-    _freq = lib.no_default
-
-    @property
-    def freq(self):
-        """
-        In limited circumstances, our freq may differ from that of our _data.
-        """
-        if self._freq is not lib.no_default:
-            return self._freq
-        return self._data.freq
-
-    @property
-    def freqstr(self):
-        """
-        Return the frequency object as a string if its set, otherwise None.
-        """
-        if self.freq is None:
-            return None
-        return self.freq.freqstr
 
     def _with_freq(self, freq):
         arr = self._data._with_freq(freq)
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -70,7 +70,7 @@ def _new_PeriodIndex(cls, **d):
     PeriodArray,
     wrap=True,
 )
-@inherit_names(["is_leap_year", "freq", "freqstr", "_format_native_types"], PeriodArray)
+@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray)
 class PeriodIndex(DatetimeIndexOpsMixin, Int64Index):
     """
     Immutable ndarray holding ordinal values indicating regular periods in time.
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -58,6 +58,17 @@ def test_cov(self, float_frame, float_string_frame):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
+    )
+    def test_cov_nullable_integer(self, other_column):
+        # https://github.com/pandas-dev/pandas/issues/33803
+        data = pd.DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
+        result = data.cov()
+        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
+        expected = pd.DataFrame(arr, columns=["a", "b"], index=["a", "b"])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameCorr:
     # DataFrame.corr(), as opposed to DataFrame.corrwith
@@ -153,6 +164,22 @@ def test_corr_int(self):
         df3.cov()
         df3.corr()
 
+    @td.skip_if_no_scipy
+    @pytest.mark.parametrize(
+        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
+    )
+    @pytest.mark.parametrize(
+        "other_column",
+        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
+    )
+    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
+    def test_corr_nullable_integer(self, nullable_column, other_column, method):
+        # https://github.com/pandas-dev/pandas/issues/33803
+        data = pd.DataFrame({"a": nullable_column, "b": other_column})
+        result = data.corr(method=method)
+        expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameCorrWith:
     def test_corrwith(self, datetime_frame):
diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py
@@ -96,3 +96,10 @@ def test_map_dictlike(self, mapper):
         expected = pd.Index([np.nan] * len(index))
         result = index.map(mapper([], []))
         tm.assert_index_equal(result, expected)
+
+    def test_getitem_preserves_freq(self):
+        index = self.create_index()
+        assert index.freq is not None
+
+        result = index[:]
+        assert result.freq == index.freq

Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ def _new_PeriodIndex(cls, **d):`
`70`	`70`	`PeriodArray,`
`71`	`71`	`wrap=True,`
`72`	`72`	`)`
`73`		`-@inherit_names(["is_leap_year", "freq", "freqstr", "_format_native_types"], PeriodArray)`
	`73`	`+@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray)`
`74`	`74`	`class PeriodIndex(DatetimeIndexOpsMixin, Int64Index):`
`75`	`75`	`"""`
`76`	`76`	`Immutable ndarray holding ordinal values indicating regular periods in time.`