Merge branch 'master' into feature/fix_wom

Maximiliano Greco · web-flow · commit bb3f5410ed61 · 2018-03-31T00:52:54.000+02:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -404,6 +404,8 @@ Other Enhancements
 - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
 - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
 - Now is possible create a :class:`WeekOfMonth` offset with `n=0` (:issue:`20517`).
+- :class:`DataFrame` and :class:`Series` now support the matrix multiplication operator (``@``) for Python>=3.5 (:issue:`10259`)
+
 
 .. _whatsnew_0230.api_breaking:
 
@@ -902,6 +904,22 @@ Performance Improvements
 Documentation Changes
 ~~~~~~~~~~~~~~~~~~~~~
 
+Thanks to all of the contributors who participated in the Pandas Documentation
+Sprint, which took place on March 10th. We had about 500 participants from over
+30 locations across the world. You should notice that many of the
+:ref:`API docstrings <api>` have greatly improved.
+
+There were too many simultaneous contributions to include a release note for each
+improvement, but this `GitHub search`_ should give you an idea of how many docstrings
+were improved.
+
+Special thanks to Marc Garcia for organizing the sprint. For more information,
+read the `NumFOCUS blogpost`_ recapping the sprint.
+
+.. _GitHub search: https://github.com/pandas-dev/pandas/pulls?utf8=%E2%9C%93&q=is%3Apr+label%3ADocs+created%3A2018-03-10..2018-03-15+
+.. _NumFOCUS blogpost: https://www.numfocus.org/blog/worldwide-pandas-sprint/
+
+
 - Changed spelling of "numpy" to "NumPy", and "python" to "Python". (:issue:`19017`)
 - Consistency when introducing code samples, using either colon or period.
   Rewrote some sentences for greater clarity, added more dynamic references
@@ -1023,6 +1041,7 @@ Numeric
 - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`)
 - Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`)
 - Multiplication and division of numeric-dtyped :class:`Index` objects with timedelta-like scalars returns ``TimedeltaIndex`` instead of raising ``TypeError`` (:issue:`19333`)
+- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` where ``ascending=False`` failed to return correct ranks for infinity if ``NaN`` values were present (:issue:`19538`)
 - Bug where ``NaN`` was returned instead of 0 by :func:`Series.pct_change` and :func:`DataFrame.pct_change` when ``fill_method`` is not ``None`` (:issue:`19873`)
 
 
@@ -1134,6 +1153,7 @@ Reshaping
 - Bug in :class:`Series` constructor with ``Categorical`` where a ```ValueError`` is not raised when an index of different length is given (:issue:`19342`)
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
+- Bug in :class:`Series` constructor with ``dtype=str``, which previously raised in some cases (:issue:`19853`)
 
 Other
 ^^^^^
diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
@@ -135,7 +135,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
 
     sorted_data = values.take(_as)
     sorted_mask = mask.take(_as)
-    _indices = order[1].take(_as).nonzero()[0]
+    _indices = np.diff(sorted_mask).nonzero()[0]
     non_na_idx = _indices[0] if len(_indices) > 0 else -1
     argsorted = _as.astype('i8')
 
@@ -153,7 +153,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
 
         if (i == n - 1 or
                 are_diff(util.get_value_at(sorted_data, i + 1), val) or
-                i == non_na_idx - 1):
+                i == non_na_idx):
             if tiebreak == TIEBREAK_AVERAGE:
                 for j in range(i - dups + 1, i + 1):
                     ranks[argsorted[j]] = sum_ranks / dups
@@ -190,7 +190,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
             count += 1.0
 
             if (i == n - 1 or sorted_data[i + 1] != val or
-                i == non_na_idx - 1):
+                i == non_na_idx):
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = sum_ranks / dups
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -863,7 +863,8 @@ def __len__(self):
 
     def dot(self, other):
         """
-        Matrix multiplication with DataFrame or Series objects
+        Matrix multiplication with DataFrame or Series objects.  Can also be
+        called using `self @ other` in Python >= 3.5.
 
         Parameters
         ----------
@@ -905,6 +906,14 @@ def dot(self, other):
         else:  # pragma: no cover
             raise TypeError('unsupported type: %s' % type(other))
 
+    def __matmul__(self, other):
+        """ Matrix product ``self @ other`` (Python>=3.5); same as `dot`. """
+        return self.dot(other)
+
+    def __rmatmul__(self, other):
+        """ Reflected matmul: ``other @ self == (self.T @ other.T).T``. """
+        return self.T.dot(np.transpose(other)).T
+
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1994,7 +1994,7 @@ def autocorr(self, lag=1):
     def dot(self, other):
         """
         Matrix multiplication with DataFrame or inner-product with Series
-        objects
+        objects. Can also be called using `self @ other` in Python >= 3.5.
 
         Parameters
         ----------
@@ -2033,6 +2033,14 @@ def dot(self, other):
         else:  # pragma: no cover
             raise TypeError('unsupported type: %s' % type(other))
 
+    def __matmul__(self, other):
+        """ Matrix product ``self @ other`` (Python>=3.5); same as `dot`. """
+        return self.dot(other)
+
+    def __rmatmul__(self, other):
+        """ Reflected matmul: ``other @ self == self.dot(other.T)``. """
+        return self.dot(np.transpose(other))
+
     @Substitution(klass='Series')
     @Appender(base._shared_docs['searchsorted'])
     @deprecate_kwarg(old_arg_name='v', new_arg_name='value')
@@ -4156,9 +4164,10 @@ def _try_cast(arr, take_fast_path):
     if issubclass(subarr.dtype.type, compat.string_types):
         # GH 16605
         # If not empty convert the data to dtype
-        if not isna(data).all():
-            data = np.array(data, dtype=dtype, copy=False)
-
-        subarr = np.array(data, dtype=object, copy=copy)
+        # GH 19853: If data is a scalar, subarr has already the result
+        if not is_scalar(data):
+            if not np.all(isna(data)):
+                data = np.array(data, dtype=dtype, copy=False)
+            subarr = np.array(data, dtype=object, copy=copy)
 
     return subarr
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -5,6 +5,7 @@
 import warnings
 from datetime import timedelta
 from distutils.version import LooseVersion
+import operator
 import sys
 import pytest
 
@@ -13,7 +14,7 @@
 from numpy.random import randn
 import numpy as np
 
-from pandas.compat import lrange, product
+from pandas.compat import lrange, product, PY35
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
                     _np_version_under1p15)
@@ -2091,7 +2092,6 @@ def test_clip_with_na_args(self):
                               self.frame)
 
     # Matrix-like
-
     def test_dot(self):
         a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
                       columns=['p', 'q', 'r', 's'])
@@ -2144,6 +2144,63 @@ def test_dot(self):
         with tm.assert_raises_regex(ValueError, 'aligned'):
             df.dot(df2)
 
+    @pytest.mark.skipif(not PY35,
+                        reason='matmul supported for Python>=3.5')
+    def test_matmul(self):
+        # GH 10259: check the ``@`` operator against np.dot on the raw values
+        a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
+                      columns=['p', 'q', 'r', 's'])
+        b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
+                      columns=['one', 'two'])
+
+        # DataFrame @ DataFrame
+        result = operator.matmul(a, b)
+        expected = DataFrame(np.dot(a.values, b.values),
+                             index=['a', 'b', 'c'],
+                             columns=['one', 'two'])
+        tm.assert_frame_equal(result, expected)
+
+        # DataFrame @ Series
+        result = operator.matmul(a, b.one)
+        expected = Series(np.dot(a.values, b.one.values),
+                          index=['a', 'b', 'c'])
+        tm.assert_series_equal(result, expected)
+
+        # np.array @ DataFrame
+        result = operator.matmul(a.values, b)
+        expected = np.dot(a.values, b.values)
+        tm.assert_almost_equal(result, expected)
+
+        # nested list @ DataFrame (__rmatmul__); only the values are compared
+        result = operator.matmul(a.values.tolist(), b)
+        expected = DataFrame(np.dot(a.values, b.values),
+                             index=['a', 'b', 'c'],
+                             columns=['one', 'two'])
+        tm.assert_almost_equal(result.values, expected.values)
+
+        # mixed dtype (one int column in ``a``) DataFrame @ DataFrame
+        a['q'] = a.q.round().astype(int)
+        result = operator.matmul(a, b)
+        expected = DataFrame(np.dot(a.values, b.values),
+                             index=['a', 'b', 'c'],
+                             columns=['one', 'two'])
+        tm.assert_frame_equal(result, expected)
+
+        # different dtypes (``a`` cast to int, ``b`` from randn)
+        a = a.astype(int)
+        result = operator.matmul(a, b)
+        expected = DataFrame(np.dot(a.values, b.values),
+                             index=['a', 'b', 'c'],
+                             columns=['one', 'two'])
+        tm.assert_frame_equal(result, expected)
+
+        # unaligned: df columns and df2 index differ, so ValueError is raised
+        df = DataFrame(randn(3, 4), index=[1, 2, 3], columns=lrange(4))
+        df2 = DataFrame(randn(5, 3), index=lrange(5), columns=[1, 2, 3])
+
+        with tm.assert_raises_regex(ValueError, 'aligned'):
+            operator.matmul(df, df2)
+
 
 @pytest.fixture
 def df_duplicates():
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -3,7 +3,7 @@
 
 from itertools import product
 from distutils.version import LooseVersion
-
+import operator
 import pytest
 
 from numpy import nan
@@ -18,7 +18,7 @@
 from pandas.core.indexes.timedeltas import Timedelta
 import pandas.core.nanops as nanops
 
-from pandas.compat import lrange, range
+from pandas.compat import lrange, range, PY35
 from pandas import compat
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, assert_index_equal)
@@ -921,6 +921,52 @@ def test_dot(self):
         pytest.raises(Exception, a.dot, a.values[:3])
         pytest.raises(ValueError, a.dot, b.T)
 
+    @pytest.mark.skipif(not PY35,
+                        reason='matmul supported for Python>=3.5')
+    def test_matmul(self):
+        # GH 10259: check ``@`` results against np.dot, then its error paths
+        a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
+        b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'],
+                      columns=['p', 'q', 'r', 's']).T
+
+        # Series @ DataFrame
+        result = operator.matmul(a, b)
+        expected = Series(np.dot(a.values, b.values), index=['1', '2', '3'])
+        assert_series_equal(result, expected)
+
+        # DataFrame @ Series
+        result = operator.matmul(b.T, a)
+        expected = Series(np.dot(b.T.values, a.T.values),
+                          index=['1', '2', '3'])
+        assert_series_equal(result, expected)
+
+        # Series @ Series
+        result = operator.matmul(a, a)
+        expected = np.dot(a.values, a.values)
+        assert_almost_equal(result, expected)
+
+        # np.array @ Series (__rmatmul__)
+        result = operator.matmul(a.values, a)
+        expected = np.dot(a.values, a.values)
+        assert_almost_equal(result, expected)
+
+        # mixed dtype DataFrame @ Series
+        a['p'] = int(a.p)
+        result = operator.matmul(b.T, a)
+        expected = Series(np.dot(b.T.values, a.T.values),
+                          index=['1', '2', '3'])
+        assert_series_equal(result, expected)
+
+        # different dtypes DataFrame @ Series
+        a = a.astype(int)
+        result = operator.matmul(b.T, a)
+        expected = Series(np.dot(b.T.values, a.T.values),
+                          index=['1', '2', '3'])
+        assert_series_equal(result, expected)
+
+        pytest.raises(Exception, operator.matmul, a, a.values[:3])
+        pytest.raises(ValueError, operator.matmul, a, b.T)
+
     def test_value_counts_nunique(self):
 
         # basics.rst doc example
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -110,6 +110,11 @@ def test_constructor_empty(self, input_class):
             empty2 = Series(input_class(), index=lrange(10), dtype='float64')
             assert_series_equal(empty, empty2)
 
+            # GH 19853 : with empty string, index and dtype str
+            empty = Series('', dtype=str, index=range(3))
+            empty2 = Series('', index=range(3))
+            assert_series_equal(empty, empty2)
+
     @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
     def test_constructor_nan(self, input_arg):
         empty = Series(dtype='float64', index=lrange(10))
diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py
@@ -16,6 +16,8 @@
 from pandas.tests.series.common import TestData
 from pandas._libs.tslib import iNaT
 from pandas._libs.algos import Infinity, NegInfinity
+from itertools import chain
+import pandas.util._test_decorators as td
 
 
 class TestSeriesRank(TestData):
@@ -257,38 +259,52 @@ def _check(s, expected, method='average'):
             series = s if dtype is None else s.astype(dtype)
             _check(series, results[method], method=method)
 
-    def test_rank_tie_methods_on_infs_nans(self):
+    @td.skip_if_no_scipy
+    @pytest.mark.parametrize('ascending', [True, False])
+    @pytest.mark.parametrize('method', ['average', 'min', 'max', 'first',
+                                        'dense'])
+    @pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep'])
+    def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
         dtypes = [('object', None, Infinity(), NegInfinity()),
                   ('float64', np.nan, np.inf, -np.inf)]
         chunk = 3
         disabled = set([('object', 'first')])
 
-        def _check(s, expected, method='average', na_option='keep'):
-            result = s.rank(method=method, na_option=na_option)
+        def _check(s, method, na_option, ascending):
+            exp_ranks = {
+                'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
+                'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
+                'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
+                'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
+                'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
+            }
+            ranks = exp_ranks[method]
+            if na_option == 'top':
+                order = [ranks[1], ranks[0], ranks[2]]
+            elif na_option == 'bottom':
+                order = [ranks[0], ranks[2], ranks[1]]
+            else:
+                order = [ranks[0], [np.nan] * chunk, ranks[1]]
+            expected = order if ascending else order[::-1]
+            expected = list(chain.from_iterable(expected))
+            result = s.rank(method=method, na_option=na_option,
+                            ascending=ascending)
             tm.assert_series_equal(result, Series(expected, dtype='float64'))
 
-        exp_ranks = {
-            'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
-            'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
-            'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
-            'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
-            'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
-        }
-        na_options = ('top', 'bottom', 'keep')
         for dtype, na_value, pos_inf, neg_inf in dtypes:
             in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
             iseries = Series(in_arr, dtype=dtype)
-            for method, na_opt in product(exp_ranks.keys(), na_options):
-                ranks = exp_ranks[method]
-                if (dtype, method) in disabled:
-                    continue
-                if na_opt == 'top':
-                    order = ranks[1] + ranks[0] + ranks[2]
-                elif na_opt == 'bottom':
-                    order = ranks[0] + ranks[2] + ranks[1]
-                else:
-                    order = ranks[0] + [np.nan] * chunk + ranks[1]
-                _check(iseries, order, method, na_opt)
+            if (dtype, method) in disabled:
+                continue
+            _check(iseries, method, na_option, ascending)
+
+    def test_rank_desc_mix_nans_infs(self):
+        # GH 19538
+        # descending rank: NaN stays NaN, inf ranks first and -inf ranks last
+        iseries = Series([1, np.nan, np.inf, -np.inf, 25])
+        result = iseries.rank(ascending=False)
+        exp = Series([3, np.nan, 1, 4, 2], dtype='float64')
+        tm.assert_series_equal(result, exp)
 
     def test_rank_methods_series(self):
         pytest.importorskip('scipy.stats.special')
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py