Merge branch 'master' into crosstable_margins_name

cmohl2013 · web-flow · commit 42a089e31a1d · 2017-05-25T12:09:57.000+02:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1738,6 +1738,7 @@ application to columns of a specific data type.
    DataFrameGroupBy.diff
    DataFrameGroupBy.ffill
    DataFrameGroupBy.fillna
+   DataFrameGroupBy.filter
    DataFrameGroupBy.hist
    DataFrameGroupBy.idxmax
    DataFrameGroupBy.idxmin
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -37,11 +37,13 @@ Bug Fixes
 ~~~~~~~~~
 
 - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`)
+- Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN`` values (:issue:`15593`)
 
 Conversion
 ^^^^^^^^^^
 
 - Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`)
+- Silence numpy warnings when broadcasting DataFrame to Series with comparison ops (:issue:`16378`, :issue:`16306`)
 
 
 Indexing
@@ -85,6 +87,7 @@ Reshaping
 - Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`)
 - Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`)
 - Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`)
+- Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. (:issue:`16395`)
 
 
 Numeric
@@ -98,3 +101,5 @@ Categorical
 
 Other
 ^^^^^
+
+- Bug in ``DataFrame.drop([])`` for DataFrame with non-unique indices (:issue:`16270`)
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -36,6 +36,7 @@ Other Enhancements
 - :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL <https://docs.python.org/3/library/pickle.html#data-stream-format>`__
 - :func:`api.types.infer_dtype` now infers decimals. (:issue: `15690`)
 - ``Crosstab`` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins is True. (:issue:`15972`)
+- :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`)
 
 .. _whatsnew_0210.api_breaking:
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -837,7 +837,7 @@ def try_timedelta(v):
         try:
             return to_timedelta(v)._values.reshape(shape)
         except:
-            return v
+            return v.reshape(shape)
 
     inferred_type = lib.infer_datetimelike_array(_ensure_object(v))
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2947,9 +2947,44 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
 
         Examples
         --------
-        >>> indexed_df = df.set_index(['A', 'B'])
-        >>> indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]])
-        >>> indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]])
+        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
+        ...                    'year': [2012, 2014, 2013, 2014],
+        ...                    'sale':[55, 40, 84, 31]})
+           month  sale  year
+        0  1      55    2012
+        1  4      40    2014
+        2  7      84    2013
+        3  10     31    2014
+
+        Set the index to become the 'month' column:
+
+        >>> df.set_index('month')
+               sale  year
+        month
+        1      55    2012
+        4      40    2014
+        7      84    2013
+        10     31    2014
+
+        Create a multi-index using columns 'year' and 'month':
+
+        >>> df.set_index(['year', 'month'])
+                    sale
+        year  month
+        2012  1     55
+        2014  4     40
+        2013  7     84
+        2014  10    31
+
+        Create a multi-index using a set of values and a column:
+
+        >>> df.set_index([[1, 2, 3, 4], 'year'])
+                 month  sale
+           year
+        1  2012  1      55
+        2  2014  4      40
+        3  2013  7      84
+        4  2014  10     31
 
         Returns
         -------
@@ -3921,13 +3956,13 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
 
                 if overwrite:
                     mask = isnull(that)
-
-                    # don't overwrite columns unecessarily
-                    if mask.all():
-                        continue
                 else:
                     mask = notnull(this)
 
+            # don't overwrite columns unnecessarily
+            if mask.all():
+                continue
+
             self[col] = expressions.where(mask, this, that,
                                           raise_on_error=True)
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -12,6 +12,7 @@
 from pandas._libs import tslib, lib
 from pandas.core.dtypes.common import (
     _ensure_int64,
+    _ensure_object,
     needs_i8_conversion,
     is_scalar,
     is_number,
@@ -2076,7 +2077,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
             result = dropped
 
         else:
-            labels = com._index_labels_to_array(labels)
+            labels = _ensure_object(com._index_labels_to_array(labels))
             if level is not None:
                 if not isinstance(axis, MultiIndex):
                     raise AssertionError('axis must be a MultiIndex')
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
@@ -1250,7 +1250,8 @@ def _flex_comp_method_FRAME(op, name, str_rep=None, default_axis='columns',
                             masker=False):
     def na_op(x, y):
         try:
-            result = op(x, y)
+            with np.errstate(invalid='ignore'):
+                result = op(x, y)
         except TypeError:
             xrav = x.ravel()
             result = np.empty(x.size, dtype=bool)
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -43,6 +43,7 @@ def to_feather(df, path):
     df : DataFrame
     path : string
         File path
+
     """
     path = _stringify_path(path)
     if not isinstance(df, DataFrame):
@@ -83,7 +84,7 @@ def to_feather(df, path):
     feather.write_dataframe(df, path)
 
 
-def read_feather(path):
+def read_feather(path, nthreads=1):
     """
     Load a feather-format object from the file path
 
@@ -93,6 +94,10 @@ def read_feather(path):
     ----------
     path : string
         File path
+    nthreads : int, default 1
+        Number of CPU threads to use when reading to pandas.DataFrame
+
+        .. versionadded:: 0.21.0
 
     Returns
     -------
@@ -102,4 +107,8 @@ def read_feather(path):
 
     feather = _try_import()
     path = _stringify_path(path)
-    return feather.read_dataframe(path)
+
+    if feather.__version__ < LooseVersion('0.4.0'):
+        return feather.read_dataframe(path)
+
+    return feather.read_dataframe(path, nthreads=nthreads)
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
@@ -9,7 +9,7 @@
 from datetime import datetime, timedelta, date
 import numpy as np
 
-from pandas import Timedelta, Timestamp, DatetimeIndex
+from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT
 
 from pandas.core.dtypes.cast import (
     maybe_downcast_to_dtype,
@@ -213,6 +213,17 @@ def test_maybe_convert_scalar(self):
         result = maybe_convert_scalar(Timedelta('1 day 1 min'))
         assert result == Timedelta('1 day 1 min').value
 
+    def test_maybe_infer_to_datetimelike(self):
+        # GH16362
+        # pandas=0.20.1 raises IndexError: tuple index out of range
+        result = DataFrame(np.array([[NaT, 'a', 'b', 0],
+                                     [NaT, 'b', 'c', 1]]))
+        assert result.size == 8
+        # this construction was fine
+        result = DataFrame(np.array([[NaT, 'a', 0],
+                                     [NaT, 'b', 1]]))
+        assert result.size == 6
+
 
 class TestConvert(object):
 
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -2081,3 +2081,16 @@ def test_n_duplicate_index(self, df_duplicates, n, order):
         result = df.nlargest(n, order)
         expected = df.sort_values(order, ascending=False).head(n)
         tm.assert_frame_equal(result, expected)
+
+    def test_series_broadcasting(self):
+        # smoke test for numpy warnings
+        # GH 16378, GH 16306
+        df = DataFrame([1.0, 1.0, 1.0])
+        df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]})
+        s = Series([1, 1, 1])
+        s_nan = Series([np.nan, np.nan, 1])
+
+        with tm.assert_produces_warning(None):
+            df_nan.clip_lower(s, axis=0)
+            for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
+                getattr(df, op)(s_nan, axis=0)
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
@@ -61,6 +61,11 @@ def test_drop_names(self):
         expected = Index(['e', 'f'], name='second')
         tm.assert_index_equal(dropped.columns, expected)
 
+        # GH 16398
+        dropped = df.drop([], errors='ignore')
+        expected = Index(['a', 'b', 'c'], name='first')
+        tm.assert_index_equal(dropped.index, expected)
+
     def test_drop_col_still_multiindex(self):
         arrays = [['a', 'b', 'c', 'top'],
                   ['', '', '', 'OD'],
@@ -100,6 +105,7 @@ def test_drop(self):
                           columns=['a', 'a', 'b'])
         assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']])
         assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a'])
+        assert_frame_equal(nu_df.drop([]), nu_df)  # GH 16398
 
         nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X']))
         nu_df.columns = list('abc')
diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py
@@ -763,3 +763,26 @@ def test_concat_datetime_datetime64_frame(self):
 
         # it works!
         pd.concat([df1, df2_obj])
+
+
+class TestDataFrameUpdate(TestData):
+
+    def test_update_nan(self):
+        # #15593 #15617
+        # test 1
+        df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
+        df2 = DataFrame({'A': [None, 2, 3]})
+        expected = df1.copy()
+        df1.update(df2, overwrite=False)
+
+        tm.assert_frame_equal(df1, expected)
+
+        # test 2
+        df1 = DataFrame({'A': [1.0, None, 3],
+                         'B': date_range('2000', periods=3)})
+        df2 = DataFrame({'A': [None, 2, 3]})
+        expected = DataFrame({'A': [1.0, 2, 3],
+                              'B': date_range('2000', periods=3)})
+        df1.update(df2, overwrite=False)
+
+        tm.assert_frame_equal(df1, expected)
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -27,11 +27,11 @@ def check_error_on_write(self, df, exc):
             with ensure_clean() as path:
                 to_feather(df, path)
 
-    def check_round_trip(self, df):
+    def check_round_trip(self, df, **kwargs):
 
         with ensure_clean() as path:
             to_feather(df, path)
-            result = read_feather(path)
+            result = read_feather(path, **kwargs)
             assert_frame_equal(result, df)
 
     def test_error(self):
@@ -98,6 +98,12 @@ def test_unsupported_other(self):
         df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
         self.check_error_on_write(df, ValueError)
 
+    @pytest.mark.skipif(fv < '0.4.0', reason='new in 0.4.0')
+    def test_rw_nthreads(self):
+
+        df = pd.DataFrame({'A': np.arange(100000)})
+        self.check_round_trip(df, nthreads=2)
+
     def test_write_with_index(self):
 
         df = pd.DataFrame({'A': [1, 2, 3]})