diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 111ba6b020bc7..dc51597a33209 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -34,3 +34,6 @@ dependencies: - xlsxwriter - xlwt - pyarrow>=0.15 + - pip + - pip: + - pyxlsb diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 3bbbdb4cf32ad..90980133b31c1 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -33,3 +33,4 @@ dependencies: - pip - pip: - pyreadstat + - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 62be1075b3337..6b3ad6f560292 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -35,3 +35,6 @@ dependencies: - xlsxwriter - xlwt - pyreadstat + - pip + - pip: + - pyxlsb diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index a46001c58d165..869d2ab683f0c 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -51,3 +51,4 @@ dependencies: - coverage - pandas-datareader - python-dateutil + - pyxlsb diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b3fd443e662a9..b5c512cdc8328 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -264,6 +264,7 @@ pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing +pyxlsb 1.0.5 Reading for xlsb files qtpy Clipboard I/O s3fs 0.3.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 55bbf6848820b..a8d84469fbf74 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML 
`__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` + binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` binary;`OpenDocument `__;:ref:`read_excel`; binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` @@ -2768,7 +2768,8 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files -can be read using either ``xlrd`` or ``openpyxl``. +can be read using either ``xlrd`` or ``openpyxl``. Binary Excel (``.xlsb``) +files can be read using ``pyxlsb``. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. @@ -3229,6 +3230,30 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using Currently pandas only supports *reading* OpenDocument spreadsheets. Writing is not implemented. +.. _io.xlsb: + +Binary Excel (.xlsb) files +-------------------------- + +.. versionadded:: 1.0.0 + +The :func:`~pandas.read_excel` method can also read binary Excel files +using the ``pyxlsb`` module. The semantics and features for reading +binary Excel files mostly match what can be done for `Excel files`_ using +``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types +in files and will return floats instead. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel('path_to_file.xlsb', engine='pyxlsb') + +.. note:: + + Currently pandas only supports *reading* binary Excel files. Writing + is not implemented. + + ..
_io.clipboard: Clipboard diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fa562838c8f7c..1623aca49b9c3 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -215,7 +215,8 @@ Other enhancements - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) - Roundtripping DataFrames with nullable integer, string and period data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine - now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). + now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). +- :func:`read_excel` can now read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation <io.xlsb>`. Closes :issue:`8540`. - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine.
(:issue:`30270`) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 7aeb0327139f1..d561ab9a10548 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -19,6 +19,7 @@ "pyarrow": "0.13.0", "pytables": "3.4.2", "pytest": "5.0.1", + "pyxlsb": "1.0.5", "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index afdd8a01ee003..eb1587313910d 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -479,6 +479,7 @@ def use_inf_as_na_cb(key): _xlsm_options = ["xlrd", "openpyxl"] _xlsx_options = ["xlrd", "openpyxl"] _ods_options = ["odf"] +_xlsb_options = ["pyxlsb"] with cf.config_prefix("io.excel.xls"): @@ -515,6 +516,13 @@ def use_inf_as_na_cb(key): validator=str, ) +with cf.config_prefix("io.excel.xlsb"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)), + validator=str, + ) # Set up the io.excel specific writer configuration. writer_engine_doc = """ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 04015a08bce2f..2a91381b7fbeb 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -35,8 +35,9 @@ """ Read an Excel file into a pandas DataFrame. -Support both `xls` and `xlsx` file extensions from a local filesystem or URL. -Support an option to read a single sheet or a list of sheets. +Supports `xls`, `xlsx`, `xlsm`, `xlsb`, and `odf` file extensions +read from a local filesystem or URL. Supports an option to read +a single sheet or a list of sheets. Parameters ---------- @@ -789,15 +790,21 @@ class ExcelFile: If a string or path object, expected to be a path to xls, xlsx or odf file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None, ``xlrd``, ``openpyxl`` or ``odf``. + Acceptable values are None, ``xlrd``, ``openpyxl``, ``odf``, or ``pyxlsb``. 
class _PyxlsbReader(_BaseExcelReader):
    """
    Excel reader engine for binary ``.xlsb`` workbooks, backed by ``pyxlsb``.

    Only reading is implemented. Cell values are taken from ``cell.v``;
    pyxlsb gives no way to tell datetimes apart from floats, so date cells
    come back as plain numbers.
    """

    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
        """
        Reader using pyxlsb engine.

        Parameters
        ----------
        filepath_or_buffer : str, path object, or Workbook
            Object to be parsed.
        """
        # Fail early with the standard optional-dependency message when
        # pyxlsb is not installed.
        import_optional_dependency("pyxlsb")
        # This will call load_workbook on the filepath or buffer
        # and set the result to the book-attribute
        super().__init__(filepath_or_buffer)

    @property
    def _workbook_class(self):
        """Return the ``pyxlsb.Workbook`` class (imported lazily)."""
        from pyxlsb import Workbook

        return Workbook

    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
        """Open ``filepath_or_buffer`` with ``pyxlsb.open_workbook``."""
        from pyxlsb import open_workbook

        # TODO: hack in buffer capability
        # This might need some modifications to the Pyxlsb library
        # Actual work for opening it is in xlsbpackage.py, line 20-ish

        return open_workbook(filepath_or_buffer)

    @property
    def sheet_names(self) -> List[str]:
        """Return the workbook's ``sheets`` attribute (the sheet names)."""
        return self.book.sheets

    def get_sheet_by_name(self, name: str):
        """Return the sheet called ``name``."""
        return self.book.get_sheet(name)

    def get_sheet_by_index(self, index: int):
        """Return the sheet at zero-based position ``index``."""
        # pyxlsb sheets are indexed from 1 onwards
        # There's a fix for this in the source, but the pypi package doesn't have it
        return self.book.get_sheet(index + 1)

    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
        """
        Convert a single pyxlsb cell to a Python scalar.

        Parameters
        ----------
        cell : object
            Cell object exposing its value as ``cell.v``.
        convert_float : bool
            If True, floats holding an integral value are returned as ``int``.

        Returns
        -------
        scalar
            ``""`` for empty cells, otherwise the (possibly converted) value.
        """
        # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
        # This means that there is no way to read datetime types from an xlsb file yet
        value = cell.v
        if value is None:
            return ""  # Prevents non-named columns from not showing up as Unnamed: i
        if convert_float and isinstance(value, float):
            # float.is_integer() is False for NaN and +/-inf, so those values
            # are passed through as floats instead of raising from int().
            if value.is_integer():
                return int(value)
            return float(value)

        return value

    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
        """Return every row of ``sheet`` as a list of converted cell values."""
        return [
            [self._convert_cell(c, convert_float) for c in r]
            for r in sheet.rows(sparse=False)
        ]
0000000000000..3c241513d221a Binary files /dev/null and b/pandas/tests/io/data/excel/blank_with_header.xlsb differ diff --git a/pandas/tests/io/data/excel/test1.xlsb b/pandas/tests/io/data/excel/test1.xlsb new file mode 100644 index 0000000000000..d0b8a1f2735bd Binary files /dev/null and b/pandas/tests/io/data/excel/test1.xlsb differ diff --git a/pandas/tests/io/data/excel/test2.xlsb b/pandas/tests/io/data/excel/test2.xlsb new file mode 100644 index 0000000000000..e19a0f1e067c8 Binary files /dev/null and b/pandas/tests/io/data/excel/test2.xlsb differ diff --git a/pandas/tests/io/data/excel/test3.xlsb b/pandas/tests/io/data/excel/test3.xlsb new file mode 100644 index 0000000000000..617d27630e8a0 Binary files /dev/null and b/pandas/tests/io/data/excel/test3.xlsb differ diff --git a/pandas/tests/io/data/excel/test4.xlsb b/pandas/tests/io/data/excel/test4.xlsb new file mode 100644 index 0000000000000..2e5bb229939be Binary files /dev/null and b/pandas/tests/io/data/excel/test4.xlsb differ diff --git a/pandas/tests/io/data/excel/test5.xlsb b/pandas/tests/io/data/excel/test5.xlsb new file mode 100644 index 0000000000000..022ebef25aee2 Binary files /dev/null and b/pandas/tests/io/data/excel/test5.xlsb differ diff --git a/pandas/tests/io/data/excel/test_converters.xlsb b/pandas/tests/io/data/excel/test_converters.xlsb new file mode 100644 index 0000000000000..c39c33d8dd94f Binary files /dev/null and b/pandas/tests/io/data/excel/test_converters.xlsb differ diff --git a/pandas/tests/io/data/excel/test_index_name_pre17.xlsb b/pandas/tests/io/data/excel/test_index_name_pre17.xlsb new file mode 100644 index 0000000000000..5251b8f3b3194 Binary files /dev/null and b/pandas/tests/io/data/excel/test_index_name_pre17.xlsb differ diff --git a/pandas/tests/io/data/excel/test_multisheet.xlsb b/pandas/tests/io/data/excel/test_multisheet.xlsb new file mode 100644 index 0000000000000..39b15568a7121 Binary files /dev/null and b/pandas/tests/io/data/excel/test_multisheet.xlsb differ diff 
--git a/pandas/tests/io/data/excel/test_squeeze.xlsb b/pandas/tests/io/data/excel/test_squeeze.xlsb new file mode 100644 index 0000000000000..6aadd727e957b Binary files /dev/null and b/pandas/tests/io/data/excel/test_squeeze.xlsb differ diff --git a/pandas/tests/io/data/excel/test_types.xlsb b/pandas/tests/io/data/excel/test_types.xlsb new file mode 100644 index 0000000000000..e7403aa288263 Binary files /dev/null and b/pandas/tests/io/data/excel/test_types.xlsb differ diff --git a/pandas/tests/io/data/excel/testdateoverflow.xlsb b/pandas/tests/io/data/excel/testdateoverflow.xlsb new file mode 100644 index 0000000000000..3d279396924b9 Binary files /dev/null and b/pandas/tests/io/data/excel/testdateoverflow.xlsb differ diff --git a/pandas/tests/io/data/excel/testdtype.xlsb b/pandas/tests/io/data/excel/testdtype.xlsb new file mode 100644 index 0000000000000..1c1d45f0d783b Binary files /dev/null and b/pandas/tests/io/data/excel/testdtype.xlsb differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsb b/pandas/tests/io/data/excel/testmultiindex.xlsb new file mode 100644 index 0000000000000..b66d6dab17ee0 Binary files /dev/null and b/pandas/tests/io/data/excel/testmultiindex.xlsb differ diff --git a/pandas/tests/io/data/excel/testskiprows.xlsb b/pandas/tests/io/data/excel/testskiprows.xlsb new file mode 100644 index 0000000000000..a5ff4ed22e70c Binary files /dev/null and b/pandas/tests/io/data/excel/testskiprows.xlsb differ diff --git a/pandas/tests/io/data/excel/times_1900.xlsb b/pandas/tests/io/data/excel/times_1900.xlsb new file mode 100644 index 0000000000000..ceb7bccb0c66e Binary files /dev/null and b/pandas/tests/io/data/excel/times_1900.xlsb differ diff --git a/pandas/tests/io/data/excel/times_1904.xlsb b/pandas/tests/io/data/excel/times_1904.xlsb new file mode 100644 index 0000000000000..e426dc959da49 Binary files /dev/null and b/pandas/tests/io/data/excel/times_1904.xlsb differ diff --git a/pandas/tests/io/excel/conftest.py 
b/pandas/tests/io/excel/conftest.py index a257735dc1ec5..0455e0d61ad97 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -35,7 +35,7 @@ def df_ref(datapath): return df_ref -@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods"]) +@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"]) def read_ext(request): """ Valid extensions for reading Excel files. diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 629d3d02028bd..f8ff3567b8b64 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -31,7 +31,7 @@ def ignore_xlrd_time_clock_warning(): yield -read_ext_params = [".xls", ".xlsx", ".xlsm", ".ods"] +read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ # Add any engines to test here # When defusedxml is installed it triggers deprecation warnings for @@ -57,6 +57,7 @@ def ignore_xlrd_time_clock_warning(): pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), ], ), + pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), ] @@ -73,6 +74,10 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if read_ext == ".ods" and engine != "odf": return False + if engine == "pyxlsb" and read_ext != ".xlsb": + return False + if read_ext == ".xlsb" and engine != "pyxlsb": + return False return True @@ -120,7 +125,6 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for read_excel calls. 
""" - func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "read_excel", func) @@ -142,6 +146,8 @@ def test_usecols_int(self, read_ext, df_ref): ) def test_usecols_list(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") df_ref = df_ref.reindex(columns=["B", "C"]) df1 = pd.read_excel( @@ -156,6 +162,8 @@ def test_usecols_list(self, read_ext, df_ref): tm.assert_frame_equal(df2, df_ref, check_names=False) def test_usecols_str(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") df1 = df_ref.reindex(columns=["A", "B", "C"]) df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A:D") @@ -188,6 +196,9 @@ def test_usecols_str(self, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = df_ref[["A", "C"]] result = pd.read_excel( "test1" + read_ext, "Sheet1", index_col=0, usecols=usecols @@ -203,11 +214,17 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r tm.assert_frame_equal(result, expected, check_names=False) def test_read_excel_without_slicing(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = df_ref result = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + 
expected = df_ref[["C", "D"]] result = pd.read_excel( "test1" + read_ext, "Sheet1", index_col=0, usecols="A,D:E" @@ -274,12 +291,16 @@ def test_excel_stop_iterator(self, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") parsed = pd.read_excel("test3" + read_ext, "Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) def test_excel_table(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0) @@ -291,6 +312,8 @@ def test_excel_table(self, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") expected = DataFrame.from_dict( OrderedDict( @@ -488,6 +511,9 @@ def test_read_excel_blank_with_header(self, read_ext): def test_date_conversion_overflow(self, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = pd.DataFrame( [ [pd.Timestamp("2016-03-12"), "Marc Johnson"], @@ -504,9 +530,14 @@ def test_date_conversion_overflow(self, read_ext): tm.assert_frame_equal(result, expected) def test_sheet_name(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") filename = "test1" sheet_name = "Sheet1" + if pd.read_excel.keywords["engine"] == "openpyxl": + pytest.xfail("Maybe not supported by openpyxl") + df1 = pd.read_excel( filename + 
read_ext, sheet_name=sheet_name, index_col=0 ) # doc @@ -531,6 +562,10 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): + if read_ext == ".xlsb": + pytest.xfail("xlsb files not present in master repo yet") + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/master/" @@ -599,6 +634,8 @@ def test_read_from_py_localpath(self, read_ext): tm.assert_frame_equal(expected, actual) def test_reader_seconds(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( @@ -627,6 +664,9 @@ def test_reader_seconds(self, read_ext): def test_read_excel_multiindex(self, read_ext): # see gh-4679 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -786,6 +826,9 @@ def test_read_excel_chunksize(self, read_ext): def test_read_excel_skiprows_list(self, read_ext): # GH 4903 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + actual = pd.read_excel( "testskiprows" + read_ext, "skiprows_list", skiprows=[0, 2] ) @@ -851,13 +894,11 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for ExcelFile objects. 
""" - func = partial(pd.ExcelFile, engine=engine) monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "ExcelFile", func) def test_excel_passes_na(self, read_ext): - with pd.ExcelFile("test4" + read_ext) as excel: parsed = pd.read_excel( excel, "Sheet1", keep_default_na=False, na_values=["apple"] @@ -928,6 +969,10 @@ def test_unexpected_kwargs_raises(self, read_ext, arg): pd.read_excel(excel, **kwarg) def test_excel_table_sheet_by_index(self, read_ext, df_ref): + # For some reason pd.read_excel has no attribute 'keywords' here. + # Skipping based on read_ext instead. + if read_ext == ".xlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, 0, index_col=0) @@ -951,6 +996,11 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, read_ext, df_ref): + # For some reason pd.read_excel has no attribute 'keywords' here. + # Skipping based on read_ext instead. + if read_ext == ".xlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + filename = "test1" sheet_name = "Sheet1" diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index d1f900a2dc58b..cc7e2311f362a 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -10,9 +10,11 @@ @pytest.fixture(autouse=True) -def skip_ods_files(read_ext): +def skip_ods_and_xlsb_files(read_ext): if read_ext == ".ods": pytest.skip("Not valid for xlrd") + if read_ext == ".xlsb": + pytest.skip("Not valid for xlrd") def test_read_xlrd_book(read_ext, frame):