
Commit 24ef8ef

Merge branch 'master' into PR_TOOL_MERGE_PR_22952

2 parents: 2816459 + cc712c9

File tree: 10 files changed (+107 -82 lines)


doc/source/whatsnew/v0.24.0.rst (+3)

@@ -1008,6 +1008,8 @@ Other API Changes
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 - Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`)
+- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`).
+- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError`` if a searched-for key is not found in its categories (:issue:`23466`).
 - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`).
 - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
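
Note: the two searchsorted entries added above change both the return type and the exception. A minimal sketch of the new behavior (the categorical below is invented for illustration and is not part of the diff):

    import pandas as pd

    cat = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk'],
                         ordered=True)

    # A scalar search value now returns a scalar position, not a length-1 array.
    cat.searchsorted('bread')              # 1

    # Array-like input still returns an array of positions.
    cat.searchsorted(['bread', 'milk'])    # array([1, 4])

    # A value missing from the categories now raises KeyError (was ValueError).
    try:
        cat.searchsorted('cucumber')
    except KeyError as exc:
        print(exc)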

@@ -1382,6 +1384,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
 - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
 - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
+- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
 - Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
 - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
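
Note: the read_excel entry added above (:issue:`11733`) concerns multi-row headers when no index column is present. A minimal usage sketch of the fixed behavior; the file name and sheet name here are invented for illustration:

    import pandas as pd

    # Sheet whose first two rows form a MultiIndex column header and which
    # has no index column.
    df = pd.read_excel("multi_header.xlsx", sheet_name="no_index",
                       header=[0, 1], index_col=None)

    # With the fix, no stray header name is pulled out of the data;
    # both column levels stay unnamed.
    print(df.columns.names)    # [None, None]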

pandas/core/arrays/categorical.py (+5 -7)

@@ -1344,15 +1344,13 @@ def searchsorted(self, value, side='left', sorter=None):
                              "ordered one")
 
         from pandas.core.series import Series
+        codes = _get_codes_for_values(Series(value).values, self.categories)
+        if -1 in codes:
+            raise KeyError("Value(s) to be inserted must be in categories.")
 
-        values_as_codes = _get_codes_for_values(Series(value).values,
-                                                self.categories)
+        codes = codes[0] if is_scalar(value) else codes
 
-        if -1 in values_as_codes:
-            raise ValueError("Value(s) to be inserted must be in categories.")
-
-        return self.codes.searchsorted(values_as_codes, side=side,
-                                       sorter=sorter)
+        return self.codes.searchsorted(codes, side=side, sorter=sorter)
 
     def isna(self):
         """

pandas/io/excel.py (+32 -16)

@@ -630,11 +630,12 @@ def _parse_cell(cell_contents, cell_typ):
                     if is_integer(skiprows):
                         row += skiprows
 
-                    data[row], control_row = _fill_mi_header(
-                        data[row], control_row)
-                    header_name, _ = _pop_header_name(
-                        data[row], index_col)
-                    header_names.append(header_name)
+                    data[row], control_row = _fill_mi_header(data[row],
+                                                             control_row)
+
+                    if index_col is not None:
+                        header_name, _ = _pop_header_name(data[row], index_col)
+                        header_names.append(header_name)
 
             if is_list_like(index_col):
                 # Forward fill values for MultiIndex index.
@@ -682,7 +683,8 @@ def _parse_cell(cell_contents, cell_typ):
 
                 output[asheetname] = parser.read(nrows=nrows)
 
-                if not squeeze or isinstance(output[asheetname], DataFrame):
+                if ((not squeeze or isinstance(output[asheetname], DataFrame))
+                        and header_names):
                     output[asheetname].columns = output[
                         asheetname].columns.set_names(header_names)
             except EmptyDataError:
@@ -863,16 +865,30 @@ def _fill_mi_header(row, control_row):
 
 
 def _pop_header_name(row, index_col):
-    """ (header, new_data) for header rows in MultiIndex parsing"""
-    none_fill = lambda x: None if x == '' else x
-
-    if index_col is None:
-        # no index col specified, trim data for inference path
-        return none_fill(row[0]), row[1:]
-    else:
-        # pop out header name and fill w/ blank
-        i = index_col if not is_list_like(index_col) else max(index_col)
-        return none_fill(row[i]), row[:i] + [''] + row[i + 1:]
+    """
+    Pop the header name for MultiIndex parsing.
+
+    Parameters
+    ----------
+    row : list
+        The data row to parse for the header name.
+    index_col : int, list
+        The index columns for our data. Assumed to be non-null.
+
+    Returns
+    -------
+    header_name : str
+        The extracted header name.
+    trimmed_row : list
+        The original data row with the header name removed.
+    """
+    # Pop out header name and fill w/blank.
+    i = index_col if not is_list_like(index_col) else max(index_col)
+
+    header_name = row[i]
+    header_name = None if header_name == "" else header_name
+
+    return header_name, row[:i] + [''] + row[i + 1:]
 
 
 @add_metaclass(abc.ABCMeta)
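
Note: the rewritten _pop_header_name above assumes index_col is non-null; the caller now guards the call with "if index_col is not None". A standalone sketch of the documented contract, written in plain Python (the helper name and sample rows are made up for illustration):

    def pop_header_name(row, index_col):
        # Take the cell at the (right-most) index column, treat "" as
        # "no name", and return the row with that slot blanked out.
        i = index_col if not isinstance(index_col, (list, tuple)) else max(index_col)
        header_name = None if row[i] == "" else row[i]
        return header_name, row[:i] + [""] + row[i + 1:]

    print(pop_header_name(["year", "A", "B"], 0))       # ('year', ['', 'A', 'B'])
    print(pop_header_name(["", "A", "B"], 0))           # (None, ['', 'A', 'B'])
    print(pop_header_name(["x", "y", "name"], [0, 2]))  # ('name', ['x', 'y', ''])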

pandas/io/formats/html.py (+9 -45)

@@ -78,7 +78,7 @@ def _write_cell(self, s, kind='td', indent=0, tags=None):
         self.write(u'{start}{rs}</{kind}>'
                    .format(start=start_tag, rs=rs, kind=kind), indent)
 
-    def write_tr(self, line, indent=0, indent_delta=4, header=False,
+    def write_tr(self, line, indent=0, indent_delta=0, header=False,
                  align=None, tags=None, nindex_levels=0):
         if tags is None:
             tags = {}
@@ -200,26 +200,6 @@ def _write_header(self, indent):
             # write nothing
             return indent
 
-        def _column_header():
-            if self.fmt.index:
-                row = [''] * (self.frame.index.nlevels - 1)
-            else:
-                row = []
-
-            if isinstance(self.columns, ABCMultiIndex):
-                if self.fmt.has_column_names and self.fmt.index:
-                    row.append(single_column_table(self.columns.names))
-                else:
-                    row.append('')
-                style = "text-align: {just};".format(just=self.fmt.justify)
-                row.extend([single_column_table(c, self.fmt.justify, style)
-                            for c in self.columns])
-            else:
-                if self.fmt.index:
-                    row.append(self.columns.name or '')
-                row.extend(self.columns)
-            return row
-
         self.write('<thead>', indent)
 
         indent += self.indent_delta
@@ -301,16 +281,21 @@ def _column_header():
             self.write_tr(row, indent, self.indent_delta, tags=tags,
                           header=True)
         else:
-            col_row = _column_header()
+            if self.fmt.index:
+                row = [''] * (self.frame.index.nlevels - 1)
+                row.append(self.columns.name or '')
+            else:
+                row = []
+            row.extend(self.columns)
             align = self.fmt.justify
 
             if truncate_h:
                 if not self.fmt.index:
                     row_levels = 0
                 ins_col = row_levels + self.fmt.tr_col_num
-                col_row.insert(ins_col, '...')
+                row.insert(ins_col, '...')
 
-            self.write_tr(col_row, indent, self.indent_delta, header=True,
+            self.write_tr(row, indent, self.indent_delta, header=True,
                           align=align)
 
         if all((self.fmt.has_index_names,
@@ -486,24 +471,3 @@ def _write_hierarchical_rows(self, fmt_values, indent):
                     row.insert(row_levels + self.fmt.tr_col_num, '...')
                 self.write_tr(row, indent, self.indent_delta, tags=None,
                               nindex_levels=frame.index.nlevels)
-
-
-def single_column_table(column, align=None, style=None):
-    table = '<table'
-    if align is not None:
-        table += (' align="{align}"'.format(align=align))
-    if style is not None:
-        table += (' style="{style}"'.format(style=style))
-    table += '><tbody>'
-    for i in column:
-        table += ('<tr><td>{i!s}</td></tr>'.format(i=i))
-    table += '</tbody></table>'
-    return table
-
-
-def single_row_table(row):  # pragma: no cover
-    table = '<table><tbody><tr>'
-    for i in row:
-        table += ('<td>{i!s}</td>'.format(i=i))
-    table += '</tr></tbody></table>'
-    return table

pandas/tests/arrays/categorical/test_analytics.py (+7 -7)

@@ -85,10 +85,10 @@ def test_searchsorted(self):
 
         # Searching for single item argument, side='left' (default)
         res_cat = c1.searchsorted('apple')
+        assert res_cat == 2
+
         res_ser = s1.searchsorted('apple')
-        exp = np.array([2], dtype=np.intp)
-        tm.assert_numpy_array_equal(res_cat, exp)
-        tm.assert_numpy_array_equal(res_ser, exp)
+        assert res_ser == 2
 
         # Searching for single item array, side='left' (default)
         res_cat = c1.searchsorted(['bread'])
@@ -105,13 +105,13 @@ def test_searchsorted(self):
         tm.assert_numpy_array_equal(res_ser, exp)
 
         # Searching for a single value that is not from the Categorical
-        pytest.raises(ValueError, lambda: c1.searchsorted('cucumber'))
-        pytest.raises(ValueError, lambda: s1.searchsorted('cucumber'))
+        pytest.raises(KeyError, lambda: c1.searchsorted('cucumber'))
+        pytest.raises(KeyError, lambda: s1.searchsorted('cucumber'))
 
         # Searching for multiple values one of each is not from the Categorical
-        pytest.raises(ValueError,
+        pytest.raises(KeyError,
                       lambda: c1.searchsorted(['bread', 'cucumber']))
-        pytest.raises(ValueError,
+        pytest.raises(KeyError,
                       lambda: s1.searchsorted(['bread', 'cucumber']))
 
         # searchsorted call for unordered Categorical
3 binary files changed (not shown): 10.5 KB, 2.82 KB, 2.65 KB

pandas/tests/io/test_excel.py (+11)

@@ -896,6 +896,17 @@ def test_read_excel_multiindex(self, ext):
                              header=[0, 1], skiprows=2)
         tm.assert_frame_equal(actual, expected)
 
+    def test_read_excel_multiindex_header_only(self, ext):
+        # see gh-11733.
+        #
+        # Don't try to parse a header name if there isn't one.
+        mi_file = os.path.join(self.dirpath, "testmultiindex" + ext)
+        result = read_excel(mi_file, "index_col_none", header=[0, 1])
+
+        exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
+        expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
+        tm.assert_frame_equal(result, expected)
+
     @td.skip_if_no("xlsxwriter")
     def test_read_excel_multiindex_empty_level(self, ext):
         # see gh-12453

pandas/tests/scalar/timestamp/test_arithmetic.py (+40 -7)

@@ -7,25 +7,58 @@
 import pandas.util.testing as tm
 from pandas.compat import long
 from pandas.tseries import offsets
+from pandas.tseries.frequencies import to_offset
 from pandas import Timestamp, Timedelta
 
 
 class TestTimestampArithmetic(object):
     def test_overflow_offset(self):
+        # no overflow expected
+
+        stamp = Timestamp("2000/1/1")
+        offset_no_overflow = to_offset("D") * 100
+
+        expected = Timestamp("2000/04/10")
+        assert stamp + offset_no_overflow == expected
+
+        assert offset_no_overflow + stamp == expected
+
+        expected = Timestamp("1999/09/23")
+        assert stamp - offset_no_overflow == expected
+
+    def test_overflow_offset_raises(self):
         # xref https://github.com/statsmodels/statsmodels/issues/3374
         # ends up multiplying really large numbers which overflow
 
         stamp = Timestamp('2017-01-13 00:00:00', freq='D')
-        offset = 20169940 * offsets.Day(1)
+        offset_overflow = 20169940 * offsets.Day(1)
+        msg = ("the add operation between "
+               r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} "
+               "will overflow")
+
+        with pytest.raises(OverflowError, match=msg):
+            stamp + offset_overflow
+
+        with pytest.raises(OverflowError, match=msg):
+            offset_overflow + stamp
+
+        with pytest.raises(OverflowError, match=msg):
+            stamp - offset_overflow
+
+        # xref https://github.com/pandas-dev/pandas/issues/14080
+        # used to crash, so check for proper overflow exception
+
+        stamp = Timestamp("2000/1/1")
+        offset_overflow = to_offset("D") * 100 ** 25
 
-        with pytest.raises(OverflowError):
-            stamp + offset
+        with pytest.raises(OverflowError, match=msg):
+            stamp + offset_overflow
 
-        with pytest.raises(OverflowError):
-            offset + stamp
+        with pytest.raises(OverflowError, match=msg):
+            offset_overflow + stamp
 
-        with pytest.raises(OverflowError):
-            stamp - offset
+        with pytest.raises(OverflowError, match=msg):
+            stamp - offset_overflow
 
     def test_delta_preserve_nanos(self):
         val = Timestamp(long(1337299200000000123))
