diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 60aa1759958f6..ec3af524083c3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -316,6 +316,7 @@ Other enhancements - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). - :meth:`Dataframe.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). +- :meth:`DataFrame.to_excel` can now also write OpenDocument spreadsheet (.ods) files (:issue:`27222`). .. --------------------------------------------------------------------------- @@ -1018,6 +1019,7 @@ I/O - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. 
(:issue:`31544`) - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) +- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) Plotting ^^^^^^^^ diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 5089445c79897..54d23fe8829e6 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -553,6 +553,7 @@ def use_inf_as_na_cb(key): _xls_options = ["xlwt"] _xlsm_options = ["openpyxl"] _xlsx_options = ["openpyxl", "xlsxwriter"] +_ods_options = ["odf"] with cf.config_prefix("io.excel.xls"): @@ -581,6 +582,15 @@ def use_inf_as_na_cb(key): ) +with cf.config_prefix("io.excel.ods"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=str, + ) + + # Set up the io.parquet specific configuration. parquet_engine_doc = """ : string diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 455abaa7fb589..d035223957a76 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -1,4 +1,5 @@ from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel +from pandas.io.excel._odswriter import _ODSWriter from pandas.io.excel._openpyxl import _OpenpyxlWriter from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import _XlsxWriter @@ -14,3 +15,6 @@ register_writer(_XlsxWriter) + + +register_writer(_ODSWriter) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6c3b49b9afc68..4fa4f158e9c3c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,8 +1,9 @@ import abc import datetime -from io import BytesIO +from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill +from typing import Union from pandas._config import config @@ -533,13 +534,13 @@ class ExcelWriter(metaclass=abc.ABCMeta): """ Class for writing DataFrame objects into excel 
def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
    """
    Tell whether a binary stream holds an OpenDocument Spreadsheet (.ods).

    The decision is made from magic bytes only: the four-byte zip signature
    at offset 0, followed at offset 30 by the entry name ``mimetype`` and the
    ODS MIME type.

    Parameters
    ----------
    stream : Union[BufferedIOBase, RawIOBase]
        IO stream with data which might be an ODS file. It is repositioned
        with ``seek``, so it has to support seeking.

    Returns
    -------
    is_ods : bool
        True when both magic values match, False otherwise. The stream is
        rewound to position 0 before returning.
    """
    zip_signature = b"PK\003\004"
    ods_marker = b"mimetype" b"application/vnd.oasis.opendocument.spreadsheet"

    stream.seek(0)
    if stream.read(4) != zip_signature:
        # not even a zip container; rewind for whoever reads next
        stream.seek(0)
        return False

    # the first entry's file name starts right after the 30-byte local header
    stream.seek(30)
    matches = stream.read(54) == ods_marker
    stream.seek(0)
    return matches
def __init__(self, path_or_buffer, engine=None):
    """
    Select a reader engine and open ``path_or_buffer`` with it.

    Parameters
    ----------
    path_or_buffer : str, path object, file-like object or workbook
        Source to read from.
    engine : str, default None
        Reader engine name. When None, "odf" is chosen if the input looks
        like an ODS file (magic bytes for binary streams, a ".ods" suffix
        otherwise) and "xlrd" is chosen in every other case.

    Raises
    ------
    ValueError
        If the resulting engine is not a key of ``self._engines``.
    """
    # NOTE(review): the nesting of the ODS sniffing under ``engine is None``
    # is reconstructed from a whitespace-collapsed diff - confirm.
    if engine is None:
        if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
            looks_like_ods = _is_ods_stream(path_or_buffer)
        else:
            suffix = os.path.splitext(str(path_or_buffer))[-1]
            looks_like_ods = suffix == ".ods"
        engine = "odf" if looks_like_ods else "xlrd"

    if engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    self.engine = engine
    # Could be a str, ExcelFile, Book, etc.
    self.io = path_or_buffer
    # Always a string
    self._io = stringify_path(path_or_buffer)

    reader_cls = self._engines[engine]
    self._reader = reader_cls(self._io)
class _ODSWriter(ExcelWriter):
    """
    ExcelWriter engine that writes OpenDocument Spreadsheet (.ods) files with odfpy.

    Each sheet is built as an ``odf.table.Table`` stored in ``self.sheets``;
    the tables are attached to the document when :meth:`save` is called.
    """

    engine = "odf"
    supported_extensions = (".ods",)

    def __init__(
        self, path: str, engine: Optional[str] = None, mode: str = "w", **engine_kwargs
    ):
        """
        Create an empty spreadsheet document for ``path``.

        Parameters
        ----------
        path : str
            Destination handed to the base class and later to ``book.save``.
        engine : str, optional
            Forwarded to the base class through ``engine_kwargs``.
        mode : str, default "w"
            File mode. Only writing is supported.
        **engine_kwargs
            Extra keyword arguments forwarded to ``ExcelWriter.__init__``.

        Raises
        ------
        ValueError
            If ``mode`` is "a".
        """
        from odf.opendocument import OpenDocumentSpreadsheet

        engine_kwargs["engine"] = engine

        if mode == "a":
            raise ValueError("Append mode is not supported with odf!")

        super().__init__(path, mode=mode, **engine_kwargs)

        self.book: OpenDocumentSpreadsheet = OpenDocumentSpreadsheet()
        # maps the JSON dump of a style dict to its generated style name
        self._style_dict: Dict[str, str] = {}

    def save(self) -> None:
        """
        Save workbook to disk.
        """
        # NOTE(review): every call attaches all sheets again, so calling
        # save() twice on one writer presumably duplicates them - confirm.
        for sheet in self.sheets.values():
            self.book.spreadsheet.addElement(sheet)
        self.book.save(self.path)

    def write_cells(
        self,
        cells: List[ExcelCell],
        sheet_name: Optional[str] = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: Optional[List] = None,
    ) -> None:
        """
        Write the frame cells using odf

        Parameters
        ----------
        cells : iterable of ExcelCell
            Cells to write; they are processed in (row, col) order.
        sheet_name : str, optional
            Target sheet, resolved through ``_get_sheet_name``.
        startrow : int, default 0
            Number of empty rows inserted before the data.
        startcol : int, default 0
            Number of empty cells inserted at the start of each written row.
        freeze_panes : list, optional
            Freeze pane location, applied when it passes validation.
        """
        from odf.table import Table, TableCell, TableRow
        from odf.text import P

        sheet_name = self._get_sheet_name(sheet_name)
        assert sheet_name is not None

        if sheet_name in self.sheets:
            wks = self.sheets[sheet_name]
        else:
            wks = Table(name=sheet_name)
            self.sheets[sheet_name] = wks

        if _validate_freeze_panes(freeze_panes):
            assert freeze_panes is not None
            self._create_freeze_panes(sheet_name, freeze_panes)

        for _ in range(startrow):
            wks.addElement(TableRow())

        rows: DefaultDict = defaultdict(TableRow)
        col_count: DefaultDict = defaultdict(int)

        for cell in sorted(cells, key=lambda cell: (cell.row, cell.col)):
            row = rows[cell.row]

            # only add empty cells if the row is still empty
            if not col_count[cell.row]:
                for _ in range(startcol):
                    row.addElement(TableCell())

            # fill with empty cells if needed
            for _ in range(cell.col - col_count[cell.row]):
                row.addElement(TableCell())
                col_count[cell.row] += 1

            pvalue, tc = self._make_table_cell(cell)
            row.addElement(tc)
            col_count[cell.row] += 1
            # the paragraph child carries the text shown for the cell
            tc.addElement(P(text=pvalue))

        # Add all rows to the sheet. Without any cells ``rows`` is empty and
        # max() would raise ValueError, so there is nothing to add then.
        if rows:
            for row_nr in range(max(rows) + 1):
                # the defaultdict supplies an empty TableRow for skipped rows
                wks.addElement(rows[row_nr])

    def _make_table_cell_attributes(self, cell) -> Dict[str, Union[int, str]]:
        """Convert cell attributes to OpenDocument attributes

        Parameters
        ----------
        cell : ExcelCell
            Spreadsheet cell data

        Returns
        -------
        attributes : Dict[str, Union[int, str]]
            Dictionary with attributes and attribute values
        """
        attributes: Dict[str, Union[int, str]] = {}
        style_name = self._process_style(cell.style)
        if style_name is not None:
            attributes["stylename"] = style_name
        if cell.mergestart is not None and cell.mergeend is not None:
            # NOTE(review): mergestart/mergeend are used directly as span
            # counts; if they are absolute end positions the spans would be
            # too large for cells away from the origin - confirm.
            attributes["numberrowsspanned"] = max(1, cell.mergestart)
            attributes["numbercolumnsspanned"] = cell.mergeend
        return attributes

    def _make_table_cell(self, cell) -> Tuple[object, Any]:
        """Convert cell data to an OpenDocument spreadsheet cell

        Parameters
        ----------
        cell : ExcelCell
            Spreadsheet cell data

        Returns
        -------
        pvalue, cell : Tuple[object, TableCell]
            Display value, Cell value
        """
        from odf.table import TableCell

        attributes = self._make_table_cell_attributes(cell)
        val, _ = self._value_with_fmt(cell.val)
        pvalue = value = val

        # bool has to be handled before numbers: bool is a subclass of int
        if isinstance(val, bool):
            value = str(val).lower()
            pvalue = str(val).upper()
            # boolean cells carry their value in office:boolean-value
            return (
                pvalue,
                TableCell(
                    valuetype="boolean", booleanvalue=value, attributes=attributes
                ),
            )
        # datetime has to be handled before date: datetime subclasses date
        if isinstance(val, datetime.datetime):
            value = val.isoformat()
            pvalue = val.strftime("%c")
            return (
                pvalue,
                TableCell(valuetype="date", datevalue=value, attributes=attributes),
            )
        if isinstance(val, datetime.date):
            value = val.strftime("%Y-%m-%d")
            pvalue = val.strftime("%x")
            return (
                pvalue,
                TableCell(valuetype="date", datevalue=value, attributes=attributes),
            )
        if isinstance(val, str):
            # string cells carry their value in office:string-value
            return (
                pvalue,
                TableCell(
                    valuetype="string", stringvalue=value, attributes=attributes
                ),
            )
        # remaining values (int and float) are numeric
        return (
            pvalue,
            TableCell(valuetype="float", value=value, attributes=attributes),
        )

    def _process_style(self, style: Dict[str, Any]) -> Optional[str]:
        """Convert a style dictionary to a OpenDocument style sheet

        Parameters
        ----------
        style : Dict
            Style dictionary

        Returns
        -------
        style_key : str or None
            Unique style key for later reference in sheet, or None when
            ``style`` is None.
        """
        from odf.style import (
            ParagraphProperties,
            Style,
            TableCellProperties,
            TextProperties,
        )

        if style is None:
            return None
        style_key = json.dumps(style)
        if style_key in self._style_dict:
            # identical styles share one definition in the document
            return self._style_dict[style_key]
        name = f"pd{len(self._style_dict)+1}"
        self._style_dict[style_key] = name
        odf_style = Style(name=name, family="table-cell")
        if "font" in style:
            font = style["font"]
            if font.get("bold", False):
                odf_style.addElement(TextProperties(fontweight="bold"))
        if "borders" in style:
            # only the "thin" thickness has a translation; others raise KeyError
            thickness_translation = {"thin": "0.75pt solid #000000"}
            for side, thickness in style["borders"].items():
                odf_style.addElement(
                    TableCellProperties(
                        attributes={f"border{side}": thickness_translation[thickness]}
                    )
                )
        if "alignment" in style:
            alignment = style["alignment"]
            horizontal = alignment.get("horizontal")
            if horizontal:
                odf_style.addElement(ParagraphProperties(textalign=horizontal))
            vertical = alignment.get("vertical")
            if vertical:
                odf_style.addElement(TableCellProperties(verticalalign=vertical))
        self.book.styles.addElement(odf_style)
        return name

    def _create_freeze_panes(self, sheet_name: str, freeze_panes: List[int]) -> None:
        """Create freeze panes in the sheet

        Parameters
        ----------
        sheet_name : str
            Name of the spreadsheet
        freeze_panes : list
            Freeze pane location x and y
        """
        from odf.config import (
            ConfigItem,
            ConfigItemMapEntry,
            ConfigItemMapIndexed,
            ConfigItemMapNamed,
            ConfigItemSet,
        )

        # settings are nested as: view-settings > Views > entry > Tables > sheet
        config_item_set = ConfigItemSet(name="ooo:view-settings")
        self.book.settings.addElement(config_item_set)

        views = ConfigItemMapIndexed(name="Views")
        config_item_set.addElement(views)

        view_entry = ConfigItemMapEntry()
        views.addElement(view_entry)

        tables = ConfigItemMapNamed(name="Tables")
        view_entry.addElement(tables)

        sheet_entry = ConfigItemMapEntry(name=sheet_name)
        tables.addElement(sheet_entry)

        first = str(freeze_panes[0])
        second = str(freeze_panes[1])
        # (name, type, text) of every config item, in document order
        sheet_settings = (
            ("HorizontalSplitMode", "short", "2"),
            ("VerticalSplitMode", "short", "2"),
            ("HorizontalSplitPosition", "int", first),
            ("VerticalSplitPosition", "int", second),
            ("PositionRight", "int", first),
            ("PositionBottom", "int", second),
        )
        for item_name, item_type, item_text in sheet_settings:
            sheet_entry.addElement(
                ConfigItem(name=item_name, type=item_type, text=item_text)
            )
def test_write_append_mode_raises(ext):
    """Constructing an odf writer in append mode must raise ValueError."""
    expected_message = "Append mode is not supported with odf!"

    with tm.ensure_clean(ext) as target:
        with pytest.raises(ValueError, match=expected_message):
            ExcelWriter(target, engine="odf", mode="a")
test_read_one_empty_col_no_header(self, ext, header, expected): tm.assert_frame_equal(result, expected) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") @pytest.mark.parametrize( "header,expected", [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], @@ -88,8 +94,6 @@ def test_read_one_empty_col_with_header(self, ext, header, expected): tm.assert_frame_equal(result, expected) - @td.skip_if_no("openpyxl") - @td.skip_if_no("xlwt") def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names @@ -116,8 +120,6 @@ def test_set_column_names_in_parameter(self, ext): tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") def test_creating_and_reading_multiple_sheets(self, ext): # see gh-9450 # @@ -142,7 +144,6 @@ def tdf(col_sheet_name): for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) - @td.skip_if_no("xlsxwriter") def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453 with tm.ensure_clean(ext) as path: @@ -190,7 +191,6 @@ def test_read_excel_multiindex_empty_level(self, ext): actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - @td.skip_if_no("xlsxwriter") @pytest.mark.parametrize("c_idx_names", [True, False]) @pytest.mark.parametrize("r_idx_names", [True, False]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @@ -240,8 +240,6 @@ def test_excel_multindex_roundtrip( ) tm.assert_frame_equal(df, act, check_names=check_names) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( @@ -296,14 +294,28 @@ def test_multiindex_interval_datetimes(self, ext): tm.assert_frame_equal(result, expected) -@td.skip_if_no("xlrd") @pytest.mark.parametrize( "engine,ext", [ - pytest.param("openpyxl", ".xlsx", marks=td.skip_if_no("openpyxl")), - 
pytest.param("openpyxl", ".xlsm", marks=td.skip_if_no("openpyxl")), - pytest.param("xlwt", ".xls", marks=td.skip_if_no("xlwt")), - pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param( + "openpyxl", + ".xlsx", + marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], + ), + pytest.param( + "openpyxl", + ".xlsm", + marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], + ), + pytest.param( + "xlwt", ".xls", marks=[td.skip_if_no("xlwt"), td.skip_if_no("xlrd")] + ), + pytest.param( + "xlsxwriter", + ".xlsx", + marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")], + ), + pytest.param("odf", ".ods", marks=td.skip_if_no("odf")), ], ) @pytest.mark.usefixtures("set_engine") @@ -326,9 +338,7 @@ def test_excel_sheet_size(self, path): with pytest.raises(ValueError, match=msg): col_df.to_excel(path) - def test_excel_sheet_by_name_raise(self, path): - import xlrd - + def test_excel_sheet_by_name_raise(self, path, engine): gt = DataFrame(np.random.randn(10, 2)) gt.to_excel(path) @@ -337,9 +347,16 @@ def test_excel_sheet_by_name_raise(self, path): tm.assert_frame_equal(gt, df) - msg = "No sheet named <'0'>" - with pytest.raises(xlrd.XLRDError, match=msg): - pd.read_excel(xl, sheet_name="0") + if engine == "odf": + msg = "sheet 0 not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, "0") + else: + import xlrd + + msg = "No sheet named <'0'>" + with pytest.raises(xlrd.XLRDError, match=msg): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1246,7 +1263,7 @@ def test_path_path_lib(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, path=f"foo.{ext}") + result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): @@ -1254,7 +1271,7 @@ def 
test_path_local_path(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, path=f"foo.{ext}") + result = tm.round_trip_localpath(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) def test_merged_cell_custom_objects(self, merge_cells, path):