From 9a7b29ae7a2945ee07fcd956f6a54bd1089de6a7 Mon Sep 17 00:00:00 2001
From: NickFillot <40593450+NickFillot@users.noreply.github.com>
Date: Sun, 3 Oct 2021 16:23:02 +0200
Subject: [PATCH 01/49] [ENH] to_orc
pandas.io.orc.to_orc method definition
---
pandas/io/orc.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index b02660c089382..06d9563aa080f 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,7 +1,9 @@
""" orc compat """
from __future__ import annotations
+import os
from typing import TYPE_CHECKING
+from tempfile import gettempdir
from pandas._typing import (
FilePath,
@@ -11,8 +13,10 @@
from pandas.io.common import get_handle
-if TYPE_CHECKING:
- from pandas import DataFrame
+from pandas.core import generic
+from pandas.util._decorators import doc
+
+from pandas import DataFrame
def read_orc(
@@ -52,3 +56,78 @@ def read_orc(
with get_handle(path, "rb", is_text=False) as handles:
orc_file = orc.ORCFile(handles.handle)
return orc_file.read(columns=columns, **kwargs).to_pandas()
+
+
+def to_orc(
+ df: DataFrame,
+ path: FilePathOrBuffer = None,
+ engine: str = 'pyarrow',
+ index: bool = None,
+ **kwargs
+) -> bytes:
+ """
+ Write a DataFrame to the orc/arrow format.
+ Parameters
+ ----------
+ df : DataFrame
+ path : str or file-like object, default None
+ If a string, it will be used as Root Directory path
+ when writing a partitioned dataset. By file-like object,
+ we refer to objects with a write() method, such as a file handle
+ (e.g. via builtin open function) or io.BytesIO. The engine
+ fastparquet does not accept file-like objects. If path is None,
+ a bytes object is returned.
+ engine : {{'pyarrow'}}, default 'pyarrow'
+ Parquet library to use, or library it self, checked with 'pyarrow' name
+ and version > 4.0.0
+ index : bool, default None
+ If ``True``, include the dataframe's index(es) in the file output. If
+ ``False``, they will not be written to the file.
+ If ``None``, similar to ``infer`` the dataframe's index(es)
+ will be saved. However, instead of being saved as values,
+ the RangeIndex will be stored as a range in the metadata so it
+ doesn't require much space and is faster. Other indexes will
+ be included as columns in the file output.
+ kwargs
+ Additional keyword arguments passed to the engine
+ Returns
+ -------
+ bytes if no path argument is provided else None
+ """
+ if index is None:
+ index = df.index.names[0] is not None
+
+ if isinstance(engine, str):
+ engine = import_optional_dependency(engine, min_version='4.0.0')
+ else:
+ try:
+ assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
+ assert hasattr(engine, 'orc'), "'pyarrow' module must have version > 4.0.0 with orc module"
+ except Exception as e:
+ raise ValueError("Wrong engine passed, %s" % (
+ e,
+ ))
+
+ if path is None:
+ # to bytes: tmp path, pyarrow auto closes buffers
+ path = os.path.join(gettempdir(), os.urandom(12).hex())
+ try:
+ engine.orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index),
+ path, **kwargs
+ )
+ with open(path, 'rb') as path:
+ return path.read()
+ except BaseException as e:
+ raise e
+ finally:
+ try:
+ os.remove(path)
+ except Exception as e:
+ pass
+ else:
+ engine.orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index),
+ path, **kwargs
+ )
+ return
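
As a standalone illustration of the temporary-file strategy used above, the snippet below writes an ORC file to a throwaway path and reads the raw bytes back. This is only a sketch: it assumes pyarrow >= 7.0.0, whose pyarrow.orc.write_table takes the table first and the destination second (the argument-order caveat that later patches in this series address by raising the minimum version).

    import os
    from tempfile import gettempdir

    import pandas as pd
    import pyarrow
    import pyarrow.orc  # "import pyarrow" alone does not load the orc submodule

    df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    tmp_path = os.path.join(gettempdir(), os.urandom(12).hex())
    try:
        # Write the ORC file to a throwaway path, then read the bytes back,
        # mimicking the path=None branch of the patch above.
        table = pyarrow.Table.from_pandas(df, preserve_index=False)
        pyarrow.orc.write_table(table, tmp_path)
        with open(tmp_path, "rb") as fh:
            orc_bytes = fh.read()
    finally:
        # Always clean up the temporary file, even if the write fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(len(orc_bytes))
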
From d11026f0a310a24a09f0357407835a03ccd2a7bf Mon Sep 17 00:00:00 2001
From: NickFillot <40593450+NickFillot@users.noreply.github.com>
Date: Sun, 3 Oct 2021 16:34:37 +0200
Subject: [PATCH 02/49] pandas.DataFrame.to_orc
Add the to_orc method to pandas.DataFrame
---
pandas/core/frame.py | 74 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 54ee5ed2f35d1..694cfdf9f8e82 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2900,7 +2900,81 @@ def to_parquet(
storage_options=storage_options,
**kwargs,
)
+
+ def to_orc(
+ self,
+ path: FilePathOrBuffer = None,
+ engine: str = 'pyarrow',
+ index: bool = None,
+ **kwargs
+ ) -> bytes:
+ """
+ Write a DataFrame to the orc/arrow format.
+ Parameters
+ ----------
+ df : DataFrame
+ path : str or file-like object, default None
+ If a string, it will be used as Root Directory path
+ when writing a partitioned dataset. By file-like object,
+ we refer to objects with a write() method, such as a file handle
+ (e.g. via builtin open function) or io.BytesIO. The engine
+ fastparquet does not accept file-like objects. If path is None,
+ a bytes object is returned.
+ engine : {{'pyarrow'}}, default 'pyarrow'
+ Parquet library to use, or library it self, checked with 'pyarrow' name
+ and version > 4.0.0
+ index : bool, default None
+ If ``True``, include the dataframe's index(es) in the file output. If
+ ``False``, they will not be written to the file.
+ If ``None``, similar to ``infer`` the dataframe's index(es)
+ will be saved. However, instead of being saved as values,
+ the RangeIndex will be stored as a range in the metadata so it
+ doesn't require much space and is faster. Other indexes will
+ be included as columns in the file output.
+ kwargs
+ Additional keyword arguments passed to the engine
+ Returns
+ -------
+ bytes if no path argument is provided else None
+
+ See Also
+ --------
+    read_orc : Read an ORC file.
+ DataFrame.to_parquet : Write a parquet file.
+ DataFrame.to_csv : Write a csv file.
+ DataFrame.to_sql : Write to a sql table.
+ DataFrame.to_hdf : Write to hdf.
+ Notes
+ -----
+    This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ library.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
+ >>> df.to_orc('df.orc', compression='gzip') # doctest: +SKIP
+ >>> pd.read_orc('df.orc') # doctest: +SKIP
+ col1 col2
+ 0 1 3
+ 1 2 4
+
+ If you want to get a buffer to the orc content you can write it to io.BytesIO
+ >>> import io
+ >>> b = io.BytesIO(df.to_orc())
+ >>> b.seek(0)
+ 0
+ >>> content = b.read()
+ """
+ from pandas.io.orc import to_orc
+
+ return to_orc(
+ self,
+ path,
+ engine,
+ index=index,
+ **kwargs
+ )
+
@Substitution(
header_type="bool",
header="Whether to print column labels, default True",
From 0146ac3aea9f87cb0e053af0784a17efd8230e3f Mon Sep 17 00:00:00 2001
From: NickFillot <40593450+NickFillot@users.noreply.github.com>
Date: Sun, 3 Oct 2021 16:47:11 +0200
Subject: [PATCH 03/49] Cleaning
---
pandas/io/orc.py | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 06d9563aa080f..15161ae202ad3 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -2,6 +2,8 @@
from __future__ import annotations
import os
+import pandas._testing as tm
+
from typing import TYPE_CHECKING
from tempfile import gettempdir
@@ -13,10 +15,8 @@
from pandas.io.common import get_handle
-from pandas.core import generic
-from pandas.util._decorators import doc
-
-from pandas import DataFrame
+if TYPE_CHECKING:
+ from pandas import DataFrame
def read_orc(
@@ -102,29 +102,19 @@ def to_orc(
else:
try:
assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
- assert hasattr(engine, 'orc'), "'pyarrow' module must have version > 4.0.0 with orc module"
+ assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module"
except Exception as e:
- raise ValueError("Wrong engine passed, %s" % (
- e,
- ))
+ raise ValueError("Wrong engine passed, %s" % e)
if path is None:
# to bytes: tmp path, pyarrow auto closes buffers
- path = os.path.join(gettempdir(), os.urandom(12).hex())
- try:
+ with tm.ensure_clean(os.path.join(gettempdir(), os.urandom(12).hex())) as path:
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
path, **kwargs
)
with open(path, 'rb') as path:
return path.read()
- except BaseException as e:
- raise e
- finally:
- try:
- os.remove(path)
- except Exception as e:
- pass
else:
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
From 057160250d4038e3cd35f883e7505c57b4c28fc5 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 21 Nov 2021 04:29:12 -0500
Subject: [PATCH 04/49] Fix style & edit comments & change min dependency
version to 5.0.0
---
pandas/core/frame.py | 16 ++++++++--------
pandas/io/orc.py | 45 +++++++++++++++++++++++---------------------
2 files changed, 32 insertions(+), 29 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 694cfdf9f8e82..24991bd09e118 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2900,7 +2900,7 @@ def to_parquet(
storage_options=storage_options,
**kwargs,
)
-
+
def to_orc(
self,
path: FilePathOrBuffer = None,
@@ -2909,7 +2909,7 @@ def to_orc(
**kwargs
) -> bytes:
"""
- Write a DataFrame to the orc/arrow format.
+ Write a DataFrame to the ORC format.
Parameters
----------
df : DataFrame
@@ -2917,12 +2917,12 @@ def to_orc(
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function) or io.BytesIO. The engine
- fastparquet does not accept file-like objects. If path is None,
- a bytes object is returned.
+ (e.g. via builtin open function). If path is None,
+ a bytes object is returned. Note that currently the pyarrow
+ engine doesn't work with io.BytesIO.
engine : {{'pyarrow'}}, default 'pyarrow'
Parquet library to use, or library it self, checked with 'pyarrow' name
- and version > 4.0.0
+ and version >= 5.0.0
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -2952,7 +2952,7 @@ def to_orc(
Examples
--------
>>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
- >>> df.to_orc('df.orc', compression='gzip') # doctest: +SKIP
+ >>> df.to_orc('df.orc') # doctest: +SKIP
>>> pd.read_orc('df.orc') # doctest: +SKIP
col1 col2
0 1 3
@@ -2974,7 +2974,7 @@ def to_orc(
index=index,
**kwargs
)
-
+
@Substitution(
header_type="bool",
header="Whether to print column labels, default True",
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 15161ae202ad3..6664348656c84 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,9 +1,6 @@
""" orc compat """
from __future__ import annotations
-import os
-import pandas._testing as tm
-
from typing import TYPE_CHECKING
from tempfile import gettempdir
@@ -66,7 +63,7 @@ def to_orc(
**kwargs
) -> bytes:
"""
- Write a DataFrame to the orc/arrow format.
+ Write a DataFrame to the ORC format.
Parameters
----------
df : DataFrame
@@ -74,12 +71,12 @@ def to_orc(
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function) or io.BytesIO. The engine
- fastparquet does not accept file-like objects. If path is None,
- a bytes object is returned.
+ (e.g. via builtin open function). If path is None,
+ a bytes object is returned. Note that currently the pyarrow
+ engine doesn't work with io.BytesIO.
engine : {{'pyarrow'}}, default 'pyarrow'
Parquet library to use, or library it self, checked with 'pyarrow' name
- and version > 4.0.0
+ and version >= 5.0.0
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -96,25 +93,31 @@ def to_orc(
"""
if index is None:
index = df.index.names[0] is not None
-
+
if isinstance(engine, str):
- engine = import_optional_dependency(engine, min_version='4.0.0')
+ engine = import_optional_dependency(engine, min_version='5.0.0')
else:
try:
assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module"
- except Exception as e:
- raise ValueError("Wrong engine passed, %s" % e)
-
+ except ImportError as e:
+ raise ValueError (
+ "Unable to find a usable engine; "
+ "tried using: 'pyarrow'.\n"
+ "A suitable version of "
+ "pyarrow is required for ORC support.\n"
+ "Trying to import the above resulted in these errors:"
+ f"\n - {e}"
+ )
+
if path is None:
- # to bytes: tmp path, pyarrow auto closes buffers
- with tm.ensure_clean(os.path.join(gettempdir(), os.urandom(12).hex())) as path:
- engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index),
- path, **kwargs
- )
- with open(path, 'rb') as path:
- return path.read()
+ # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
+ stream = engine.BufferOutputStream()
+ engine.orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index),
+ stream, **kwargs
+ )
+ return stream.getvalue().to_pybytes()
else:
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
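
The pyarrow.BufferOutputStream approach introduced in this patch can be exercised on its own. A minimal sketch, again assuming pyarrow >= 7.0.0 for the write_table argument order:

    import pandas as pd
    import pyarrow
    import pyarrow.orc

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # An in-memory pyarrow stream stands in for a file on disk.
    stream = pyarrow.BufferOutputStream()
    pyarrow.orc.write_table(
        pyarrow.Table.from_pandas(df, preserve_index=False), stream
    )

    # getvalue() returns a pyarrow.Buffer; to_pybytes() copies it into plain bytes.
    orc_bytes = stream.getvalue().to_pybytes()
    print(len(orc_bytes))
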
From d970b5832d73f682dcddc63646cf55669d4d2a0e Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 21 Nov 2021 04:32:15 -0500
Subject: [PATCH 05/49] Fix style & add to see also
---
pandas/core/frame.py | 4 +++-
pandas/io/orc.py | 16 ++++++++--------
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 24991bd09e118..255cd2388dc1b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2858,6 +2858,7 @@ def to_parquet(
See Also
--------
read_parquet : Read a parquet file.
+ DataFrame.to_orc : Write an orc file.
DataFrame.to_csv : Write a csv file.
DataFrame.to_sql : Write to a sql table.
DataFrame.to_hdf : Write to hdf.
@@ -2947,7 +2948,8 @@ def to_orc(
Notes
-----
-    This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ library.
+    This function requires `pyarrow <https://arrow.apache.org/docs/python/>`
+    _ library.
Examples
--------
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 6664348656c84..06a41912a73fa 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -101,14 +101,14 @@ def to_orc(
assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module"
except ImportError as e:
- raise ValueError (
- "Unable to find a usable engine; "
- "tried using: 'pyarrow'.\n"
- "A suitable version of "
- "pyarrow is required for ORC support.\n"
- "Trying to import the above resulted in these errors:"
- f"\n - {e}"
- )
+ raise ValueError(
+ "Unable to find a usable engine; "
+ "tried using: 'pyarrow'.\n"
+ "A suitable version of "
+ "pyarrow is required for ORC support.\n"
+ "Trying to import the above resulted in these errors:"
+ f"\n - {e}"
+ )
if path is None:
# to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
From 8b12e9f82e70e805881c9e39bccfba06370982a7 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 21 Nov 2021 04:53:23 -0500
Subject: [PATCH 06/49] Add ORC to documentation
---
doc/source/reference/frame.rst | 1 +
doc/source/reference/io.rst | 1 +
doc/source/user_guide/io.rst | 59 +++++++++++++++++++++++++++++++--
doc/source/user_guide/scale.rst | 17 ++++++++++
pandas/core/generic.py | 1 +
5 files changed, 76 insertions(+), 3 deletions(-)
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index ea27d1efbb235..e71ee80767d29 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -373,6 +373,7 @@ Serialization / IO / conversion
DataFrame.from_dict
DataFrame.from_records
+ DataFrame.to_orc
DataFrame.to_parquet
DataFrame.to_pickle
DataFrame.to_csv
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 70fd381bffd2c..425b5f81be966 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -159,6 +159,7 @@ ORC
:toctree: api/
read_orc
+ DataFrame.to_orc
SAS
~~~
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 4e19deb84487f..f3e712197f9c5 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -30,7 +30,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf`
binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather`
binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet`
-    binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc`;
+    binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc`;:ref:`to_orc`
binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata`
binary;`SAS `__;:ref:`read_sas`;
binary;`SPSS `__;:ref:`read_spss`;
@@ -5562,14 +5562,67 @@ ORC
.. versionadded:: 1.0.0
Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization
-for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the
-ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
+for data frames. It is designed to make reading data frames efficient. pandas provides both the reader and the writer for the
+ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
.. warning::
* It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow.
* :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `.
+.. ipython:: python
+
+ df = pd.DataFrame(
+ {
+ "a": list("abc"),
+ "b": list(range(1, 4)),
+ "c": np.arange(3, 6).astype("u1"),
+ "d": np.arange(4.0, 7.0, dtype="float64"),
+ "e": [True, False, True],
+ "f": pd.date_range("20130101", periods=3),
+ "g": pd.date_range("20130101", periods=3, tz="US/Eastern"),
+ "h": pd.Categorical(list("abc")),
+ "i": pd.Categorical(list("abc"), ordered=True),
+ }
+ )
+
+ df
+ df.dtypes
+
+Write to an orc file.
+
+.. ipython:: python
+ :okwarning:
+
+ df.to_orc("example_pa.orc", engine="pyarrow")
+
+Read from an orc file.
+
+.. ipython:: python
+ :okwarning:
+
+    result = pd.read_orc("example_pa.orc")
+
+ result.dtypes
+
+Read only certain columns of an orc file.
+
+.. ipython:: python
+
+ result = pd.read_orc(
+ "example_pa.orc",
+ columns=["a", "b"],
+ )
+ result.dtypes
+
+
+.. ipython:: python
+ :suppress:
+
+ os.remove("example_pa.orc")
+
+
.. _io.sql:
SQL queries
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 129f43dd36930..cf8a0c9845e62 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -18,6 +18,23 @@ tool for all situations. If you're working with very large datasets and a tool
like PostgreSQL fits your needs, then you should probably be using that.
Assuming you want or need the expressiveness and power of pandas, let's carry on.
+.. ipython:: python
+
+ import pandas as pd
+ import numpy as np
+
+.. ipython:: python
+ :suppress:
+
+ from pandas._testing import _make_timeseries
+
+ # Make a random in-memory dataset
+ ts = _make_timeseries(freq="30S", seed=0)
+ ts.to_csv("timeseries.csv")
+ ts.to_orc("timeseries.orc")
+ ts.to_parquet("timeseries.parquet")
+
+
Load less data
--------------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 89a590f291356..78edaf15fe7ce 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2629,6 +2629,7 @@ def to_hdf(
See Also
--------
read_hdf : Read from HDF file.
+ DataFrame.to_orc : Write a DataFrame to the binary orc format.
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
DataFrame.to_sql : Write to a SQL table.
DataFrame.to_feather : Write out feather-format for DataFrames.
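
A condensed, script-style version of the new user-guide example. It needs a pandas build that already contains this series (DataFrame.to_orc) plus pyarrow >= 7.0.0; the file name example_pa.orc is just the one used in the docs, and the column set mirrors the trimmed example a later patch settles on, so categorical and unsigned dtypes are avoided.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(4.0, 7.0, dtype="float64"),
            "d": [True, False, True],
            "e": pd.date_range("20130101", periods=3),
        }
    )

    # Write with the new writer, read it back, optionally selecting columns.
    df.to_orc("example_pa.orc")
    result = pd.read_orc("example_pa.orc", columns=["a", "b"])
    print(result.dtypes)
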
From 65e6b7a0d1ff00ffe7dd9cdac3420f874eacea82 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 21 Nov 2021 21:28:35 -0500
Subject: [PATCH 07/49] Changes according to review
---
pandas/core/frame.py | 4 ++--
pandas/io/orc.py | 29 +++++++++++++++++++----------
2 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 255cd2388dc1b..fc078cd29cf9d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2904,8 +2904,8 @@ def to_parquet(
def to_orc(
self,
- path: FilePathOrBuffer = None,
- engine: str = 'pyarrow',
+ path: FilePath | WriteBuffer[bytes] | None = None,
+ engine: Literal['pyarrow'] = 'pyarrow',
index: bool = None,
**kwargs
) -> bytes:
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 06a41912a73fa..f352a54b1fc2a 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,12 +1,17 @@
""" orc compat """
from __future__ import annotations
-from typing import TYPE_CHECKING
+from types import ModuleType
+from typing import (
+ TYPE_CHECKING,
+ Literal,
+)
from tempfile import gettempdir
from pandas._typing import (
FilePath,
ReadBuffer,
+ WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
@@ -57,8 +62,8 @@ def read_orc(
def to_orc(
df: DataFrame,
- path: FilePathOrBuffer = None,
- engine: str = 'pyarrow',
+ path: FilePath | WriteBuffer[bytes] | None = None,
+ engine: Literal['pyarrow'] = 'pyarrow', # type: ignore[arg-type]
index: bool = None,
**kwargs
) -> bytes:
@@ -96,7 +101,7 @@ def to_orc(
if isinstance(engine, str):
engine = import_optional_dependency(engine, min_version='5.0.0')
- else:
+ elif isinstance(engine, ModuleType):
try:
assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module"
@@ -109,18 +114,22 @@ def to_orc(
"Trying to import the above resulted in these errors:"
f"\n - {e}"
)
+ else:
+ raise TypeError(
+ f"unsuported type for engine: {type(engine)}"
+ )
- if path is None:
- # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
- stream = engine.BufferOutputStream()
+ if hasattr(path, "write"):
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
- stream, **kwargs
+ path, **kwargs
)
- return stream.getvalue().to_pybytes()
else:
+ # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
+ stream = engine.BufferOutputStream()
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
- path, **kwargs
+ stream, **kwargs
)
+ return stream.getvalue().to_pybytes()
return
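
The hasattr(path, "write") branch added here means an already-open binary handle is passed straight through to pyarrow. A small sketch of that path, assuming pyarrow >= 7.0.0, whose ORC writer accepts file-like sinks (the behaviour the later get_handle-based patches also rely on); the out.orc file name is arbitrary.

    import pandas as pd
    import pyarrow
    import pyarrow.orc

    df = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
    table = pyarrow.Table.from_pandas(df, preserve_index=False)

    # Any object with a write() method is handed to pyarrow unchanged.
    with open("out.orc", "wb") as fh:
        pyarrow.orc.write_table(table, fh)
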
From 2114616e4313a86c43761500253d4171d9282a64 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Tue, 23 Nov 2021 21:48:33 -0500
Subject: [PATCH 08/49] Fix problems mentioned in review comments
---
pandas/core/frame.py | 2 +-
pandas/io/orc.py | 44 +++++++++++++++++---------------------------
2 files changed, 18 insertions(+), 28 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index fc078cd29cf9d..49ba0f4cbba5f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2922,7 +2922,7 @@ def to_orc(
a bytes object is returned. Note that currently the pyarrow
engine doesn't work with io.BytesIO.
engine : {{'pyarrow'}}, default 'pyarrow'
- Parquet library to use, or library it self, checked with 'pyarrow' name
+ ORC library to use, or library itself, checked with 'pyarrow' name
and version >= 5.0.0
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index f352a54b1fc2a..c919867811752 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,7 +1,6 @@
""" orc compat """
from __future__ import annotations
-from types import ModuleType
from typing import (
TYPE_CHECKING,
Literal,
@@ -63,7 +62,7 @@ def read_orc(
def to_orc(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
- engine: Literal['pyarrow'] = 'pyarrow', # type: ignore[arg-type]
+ engine: Literal['pyarrow'] = 'pyarrow',
index: bool = None,
**kwargs
) -> bytes:
@@ -99,37 +98,28 @@ def to_orc(
if index is None:
index = df.index.names[0] is not None
- if isinstance(engine, str):
- engine = import_optional_dependency(engine, min_version='5.0.0')
- elif isinstance(engine, ModuleType):
- try:
- assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module"
- assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module"
- except ImportError as e:
- raise ValueError(
- "Unable to find a usable engine; "
- "tried using: 'pyarrow'.\n"
- "A suitable version of "
- "pyarrow is required for ORC support.\n"
- "Trying to import the above resulted in these errors:"
- f"\n - {e}"
- )
+ if engine == "pyarrow":
+ engine = import_optional_dependency(engine, min_version='5.0.0') )
else:
- raise TypeError(
- f"unsuported type for engine: {type(engine)}"
+ raise ValueError(
+ f"engine must be 'pyarrow'"
)
- if hasattr(path, "write"):
+ if not hasattr(path, "write"):
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
path, **kwargs
)
else:
# to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
- stream = engine.BufferOutputStream()
- engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index),
- stream, **kwargs
- )
- return stream.getvalue().to_pybytes()
- return
+ with engine.BufferOutputStream() as stream: # if that is possible
+ engine.orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index),
+ stream, **kwargs
+ )
+ # allows writing to any (fsspec) URL
+ with get_handle(path, "wb", is_text=False) as handles:
+ orc_bytes = stream.getvalue().to_pybytes()
+ handles.handle.write(orc_bytes)
+ if path is None:
+ return orc_bytes
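
Outside of the function, the flow this patch sketches, rendering the table into an in-memory pyarrow buffer and then pushing the bytes through pandas' internal get_handle so that plain paths and fsspec-style URLs both work, looks roughly like the following. get_handle is a private pandas helper and the out.orc path is arbitrary, so treat this as illustrative only.

    import pandas as pd
    import pyarrow
    import pyarrow.orc
    from pandas.io.common import get_handle

    df = pd.DataFrame({"a": [1, 2]})

    stream = pyarrow.BufferOutputStream()
    pyarrow.orc.write_table(
        pyarrow.Table.from_pandas(df, preserve_index=False), stream
    )
    orc_bytes = stream.getvalue().to_pybytes()

    # get_handle understands local paths as well as URLs such as "s3://bucket/key".
    with get_handle("out.orc", "wb", is_text=False) as handles:
        handles.handle.write(orc_bytes)
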
From e4b40ef861dbccbfa31eb2c9ba277f766b764ad9 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Tue, 23 Nov 2021 22:11:03 -0500
Subject: [PATCH 09/49] Linter compliance
---
pandas/io/orc.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index c919867811752..81721c8b02c80 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -99,7 +99,7 @@ def to_orc(
index = df.index.names[0] is not None
if engine == "pyarrow":
- engine = import_optional_dependency(engine, min_version='5.0.0') )
+ engine = import_optional_dependency(engine, min_version='5.0.0')
else:
raise ValueError(
f"engine must be 'pyarrow'"
From a7aa3e0d409cadce7f3c1f325e142ddc57e03e68 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Wed, 24 Nov 2021 05:54:12 -0500
Subject: [PATCH 10/49] Address comments
---
pandas/io/orc.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 81721c8b02c80..bedc7580d698a 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -105,7 +105,7 @@ def to_orc(
f"engine must be 'pyarrow'"
)
- if not hasattr(path, "write"):
+ if hasattr(path, "write"):
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
path, **kwargs
@@ -117,9 +117,9 @@ def to_orc(
engine.Table.from_pandas(df, preserve_index=index),
stream, **kwargs
)
+ orc_bytes = stream.getvalue().to_pybytes()
+ if path is None:
+ return orc_bytes
# allows writing to any (fsspec) URL
with get_handle(path, "wb", is_text=False) as handles:
- orc_bytes = stream.getvalue().to_pybytes()
handles.handle.write(orc_bytes)
- if path is None:
- return orc_bytes
From 1ab9b6c836a44e73c3ccf348d93f1a99652b134b Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Thu, 2 Dec 2021 06:36:29 -0500
Subject: [PATCH 11/49] Add orc test
---
pandas/io/orc.py | 6 +++---
pandas/tests/io/test_orc.py | 21 +++++++++++++++++++++
2 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index bedc7580d698a..02bf9f70406dc 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -98,12 +98,12 @@ def to_orc(
if index is None:
index = df.index.names[0] is not None
- if engine == "pyarrow":
- engine = import_optional_dependency(engine, min_version='5.0.0')
- else:
+ if engine != "pyarrow":
raise ValueError(
f"engine must be 'pyarrow'"
)
+ engine = import_optional_dependency(engine, min_version='5.0.0')
+
if hasattr(path, "write"):
engine.orc.write_table(
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index f34e9b940317d..211352cebcb73 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -224,3 +224,24 @@ def test_orc_reader_snappy_compressed(dirpath):
got = read_orc(inputfile).iloc[:10]
tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip(dirpath):
+ data = {
+ "boolean1": np.array([False, True], dtype="bool"),
+ "byte1": np.array([1, 100], dtype="int8"),
+ "short1": np.array([1024, 2048], dtype="int16"),
+ "int1": np.array([65536, 65536], dtype="int32"),
+ "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+ "float1": np.array([1.0, 2.0], dtype="float32"),
+ "double1": np.array([-15.0, -5.0], dtype="float64"),
+ "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+ "string1": np.array(["hi", "bye"], dtype="object"),
+ }
+ expected = pd.DataFrame.from_dict(data)
+
+ outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc")
+ expected.to_orc(outputfile)
+ got = read_orc(outputfile)
+
+ tm.assert_equal(expected, got)
From 96969d50bf12f35f368065062e5719c88e05568a Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Fri, 3 Dec 2021 07:58:47 +0000
Subject: [PATCH 12/49] Fixes from pre-commit [automated commit]
---
pandas/core/frame.py | 12 +++---------
pandas/io/orc.py | 19 +++++++------------
2 files changed, 10 insertions(+), 21 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 49ba0f4cbba5f..9a3e2ddc6b463 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2905,9 +2905,9 @@ def to_parquet(
def to_orc(
self,
path: FilePath | WriteBuffer[bytes] | None = None,
- engine: Literal['pyarrow'] = 'pyarrow',
+ engine: Literal["pyarrow"] = "pyarrow",
index: bool = None,
- **kwargs
+ **kwargs,
) -> bytes:
"""
Write a DataFrame to the ORC format.
@@ -2969,13 +2969,7 @@ def to_orc(
"""
from pandas.io.orc import to_orc
- return to_orc(
- self,
- path,
- engine,
- index=index,
- **kwargs
- )
+ return to_orc(self, path, engine, index=index, **kwargs)
@Substitution(
header_type="bool",
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 02bf9f70406dc..526124e209fa7 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,11 +1,11 @@
""" orc compat """
from __future__ import annotations
+from tempfile import gettempdir
from typing import (
TYPE_CHECKING,
Literal,
)
-from tempfile import gettempdir
from pandas._typing import (
FilePath,
@@ -62,9 +62,9 @@ def read_orc(
def to_orc(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
- engine: Literal['pyarrow'] = 'pyarrow',
+ engine: Literal["pyarrow"] = "pyarrow",
index: bool = None,
- **kwargs
+ **kwargs,
) -> bytes:
"""
Write a DataFrame to the ORC format.
@@ -99,23 +99,18 @@ def to_orc(
index = df.index.names[0] is not None
if engine != "pyarrow":
- raise ValueError(
- f"engine must be 'pyarrow'"
- )
- engine = import_optional_dependency(engine, min_version='5.0.0')
-
+ raise ValueError(f"engine must be 'pyarrow'")
+ engine = import_optional_dependency(engine, min_version="5.0.0")
if hasattr(path, "write"):
engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index),
- path, **kwargs
+ engine.Table.from_pandas(df, preserve_index=index), path, **kwargs
)
else:
# to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
with engine.BufferOutputStream() as stream: # if that is possible
engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index),
- stream, **kwargs
+ engine.Table.from_pandas(df, preserve_index=index), stream, **kwargs
)
orc_bytes = stream.getvalue().to_pybytes()
if path is None:
From 2a54b8c11beb956c8e59095aecb7608cb002d095 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 20 Mar 2022 00:44:02 -0400
Subject: [PATCH 13/49] Fix issues according to comments
---
pandas/core/frame.py | 3 +--
pandas/io/orc.py | 9 ++++-----
pandas/tests/io/test_orc.py | 3 +++
3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9a3e2ddc6b463..b300b8c714a1c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2919,8 +2919,7 @@ def to_orc(
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function). If path is None,
- a bytes object is returned. Note that currently the pyarrow
- engine doesn't work with io.BytesIO.
+ a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
ORC library to use, or library itself, checked with 'pyarrow' name
and version >= 5.0.0
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 526124e209fa7..2d89573982b39 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -76,8 +76,7 @@ def to_orc(
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function). If path is None,
- a bytes object is returned. Note that currently the pyarrow
- engine doesn't work with io.BytesIO.
+ a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
Parquet library to use, or library it self, checked with 'pyarrow' name
and version >= 5.0.0
@@ -100,7 +99,7 @@ def to_orc(
if engine != "pyarrow":
raise ValueError(f"engine must be 'pyarrow'")
- engine = import_optional_dependency(engine, min_version="5.0.0")
+ engine = import_optional_dependency(engine, min_version="4.0.1")
if hasattr(path, "write"):
engine.orc.write_table(
@@ -112,9 +111,9 @@ def to_orc(
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index), stream, **kwargs
)
- orc_bytes = stream.getvalue().to_pybytes()
+ orc_bytes = stream.getvalue()
if path is None:
- return orc_bytes
+ return orc_bytes.to_pybytes()
# allows writing to any (fsspec) URL
with get_handle(path, "wb", is_text=False) as handles:
handles.handle.write(orc_bytes)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 211352cebcb73..986f02fb9a215 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -227,6 +227,9 @@ def test_orc_reader_snappy_compressed(dirpath):
def test_orc_roundtrip(dirpath):
+ # GH44554
+ # PyArrow gained ORC write support with the current argument order
+ pytest.importorskip("pyarrow", minversion="7.0.0")
data = {
"boolean1": np.array([False, True], dtype="bool"),
"byte1": np.array([1, 100], dtype="int8"),
From 1caec9ee5661d8f7d1afaea81c88dc6ef89ba493 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Mon, 21 Mar 2022 04:57:49 -0400
Subject: [PATCH 14/49] Simplify the code base after raising Arrow version to
7.0.0
---
pandas/io/orc.py | 28 +++++++---------------------
1 file changed, 7 insertions(+), 21 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 2d89573982b39..21af6fe9fb84b 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,7 +1,6 @@
""" orc compat """
from __future__ import annotations
-from tempfile import gettempdir
from typing import (
TYPE_CHECKING,
Literal,
@@ -79,7 +78,7 @@ def to_orc(
a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
Parquet library to use, or library it self, checked with 'pyarrow' name
- and version >= 5.0.0
+ and version >= 7.0.0
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -98,22 +97,9 @@ def to_orc(
index = df.index.names[0] is not None
if engine != "pyarrow":
- raise ValueError(f"engine must be 'pyarrow'")
- engine = import_optional_dependency(engine, min_version="4.0.1")
-
- if hasattr(path, "write"):
- engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index), path, **kwargs
- )
- else:
- # to bytes: pyarrow auto closes buffers hence we read a pyarrow buffer
- with engine.BufferOutputStream() as stream: # if that is possible
- engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index), stream, **kwargs
- )
- orc_bytes = stream.getvalue()
- if path is None:
- return orc_bytes.to_pybytes()
- # allows writing to any (fsspec) URL
- with get_handle(path, "wb", is_text=False) as handles:
- handles.handle.write(orc_bytes)
+ raise ValueError("engine must be 'pyarrow'")
+ engine = import_optional_dependency(engine, min_version="7.0.0")
+
+ engine.orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index), path, **kwargs
+ )
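
With the simplification above, the writer reduces to a version gate plus one pyarrow call. A sketch of the same gate using the pandas helper the patch relies on; import_optional_dependency is internal to pandas, and the orc submodule still has to be imported separately, which patch 27 below deals with, so this is illustrative only.

    import pandas as pd
    from pandas.compat._optional import import_optional_dependency

    # Raises ImportError if pyarrow is missing or older than 7.0.0.
    pa = import_optional_dependency("pyarrow", min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    df = pd.DataFrame({"a": [1, 2, 3]})
    orc.write_table(pa.Table.from_pandas(df, preserve_index=False), "out.orc")
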
From 6f0a5380c08c6972bf6c7213bf22fcce3463f6bd Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Mon, 21 Mar 2022 05:36:01 -0400
Subject: [PATCH 15/49] Fix min arrow version in to_orc
---
pandas/core/frame.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b300b8c714a1c..e95ca119e6057 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2922,7 +2922,7 @@ def to_orc(
a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
ORC library to use, or library itself, checked with 'pyarrow' name
- and version >= 5.0.0
+ and version >= 7.0.0
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -2952,7 +2952,7 @@ def to_orc(
Examples
--------
- >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
+ >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_orc('df.orc') # doctest: +SKIP
>>> pd.read_orc('df.orc') # doctest: +SKIP
col1 col2
From ae65214a58f8eef63166119dbc5c990e8f1e7119 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Mon, 21 Mar 2022 05:44:43 -0400
Subject: [PATCH 16/49] Add to_orc test in line with other formats
---
pandas/tests/io/test_common.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index fc605637dbc11..66905d7b7112f 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -222,6 +222,7 @@ def test_read_non_existent(self, reader, module, error_class, fn_ext):
(pd.DataFrame.to_html, "os", OSError, "html"),
(pd.DataFrame.to_excel, "xlrd", OSError, "xlsx"),
(pd.DataFrame.to_feather, "pyarrow", OSError, "feather"),
+ (pd.DataFrame.to_orc, "pyarrow", OSError, "orc"),
(pd.DataFrame.to_parquet, "pyarrow", OSError, "parquet"),
(pd.DataFrame.to_stata, "os", OSError, "dta"),
(pd.DataFrame.to_json, "os", OSError, "json"),
From 045c411d8640a002e2463c1df1b0ced498ca3bd9 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Tue, 22 Mar 2022 02:27:27 -0400
Subject: [PATCH 17/49] Add BytesIO support & test
---
doc/source/user_guide/scale.rst | 17 -----------------
pandas/io/orc.py | 11 ++++++++++-
pandas/tests/io/test_orc.py | 25 ++++++++++++++++++++++++-
3 files changed, 34 insertions(+), 19 deletions(-)
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index cf8a0c9845e62..129f43dd36930 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -18,23 +18,6 @@ tool for all situations. If you're working with very large datasets and a tool
like PostgreSQL fits your needs, then you should probably be using that.
Assuming you want or need the expressiveness and power of pandas, let's carry on.
-.. ipython:: python
-
- import pandas as pd
- import numpy as np
-
-.. ipython:: python
- :suppress:
-
- from pandas._testing import _make_timeseries
-
- # Make a random in-memory dataset
- ts = _make_timeseries(freq="30S", seed=0)
- ts.to_csv("timeseries.csv")
- ts.to_orc("timeseries.orc")
- ts.to_parquet("timeseries.parquet")
-
-
Load less data
--------------
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 21af6fe9fb84b..08645a87f09dd 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,6 +1,7 @@
""" orc compat """
from __future__ import annotations
+import io
from typing import (
TYPE_CHECKING,
Literal,
@@ -100,6 +101,14 @@ def to_orc(
raise ValueError("engine must be 'pyarrow'")
engine = import_optional_dependency(engine, min_version="7.0.0")
+ path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index), path, **kwargs
+ engine.Table.from_pandas(df, preserve_index=index), path_or_buf, **kwargs
)
+
+ if path is None:
+ assert isinstance(path_or_buf, io.BytesIO)
+ return path_or_buf.getvalue()
+ else:
+ return None
+
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 986f02fb9a215..2eeed0adc379c 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -226,7 +226,7 @@ def test_orc_reader_snappy_compressed(dirpath):
tm.assert_equal(expected, got)
-def test_orc_roundtrip(dirpath):
+def test_orc_roundtrip_file(dirpath):
# GH44554
# PyArrow gained ORC write support with the current argument order
pytest.importorskip("pyarrow", minversion="7.0.0")
@@ -248,3 +248,26 @@ def test_orc_roundtrip(dirpath):
got = read_orc(outputfile)
tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip_bytesio():
+ # GH44554
+ # PyArrow gained ORC write support with the current argument order
+ pytest.importorskip("pyarrow", minversion="7.0.0")
+ data = {
+ "boolean1": np.array([False, True], dtype="bool"),
+ "byte1": np.array([1, 100], dtype="int8"),
+ "short1": np.array([1024, 2048], dtype="int16"),
+ "int1": np.array([65536, 65536], dtype="int32"),
+ "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+ "float1": np.array([1.0, 2.0], dtype="float32"),
+ "double1": np.array([-15.0, -5.0], dtype="float64"),
+ "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+ "string1": np.array(["hi", "bye"], dtype="object"),
+ }
+ expected = pd.DataFrame.from_dict(data)
+
+ bytesio = expected.to_orc()
+ got = read_orc(bytesio)
+
+ tm.assert_equal(expected, got)
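
The io.BytesIO-as-default-sink idea from this patch is easy to reproduce in isolation. A minimal sketch, assuming pyarrow >= 7.0.0 (whose ORC writer accepts file-like objects); write_orc_bytes is a hypothetical helper name used only for this example.

    import io

    import pandas as pd
    import pyarrow
    import pyarrow.orc


    def write_orc_bytes(df: pd.DataFrame) -> bytes:
        # Use an in-memory buffer as the sink when no path is given.
        buf = io.BytesIO()
        pyarrow.orc.write_table(
            pyarrow.Table.from_pandas(df, preserve_index=False), buf
        )
        return buf.getvalue()


    payload = write_orc_bytes(pd.DataFrame({"a": [1, 2]}))
    print(len(payload))
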
From c00ed0f039594d48fe80243afed27882b9dbf33e Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Tue, 22 Mar 2022 03:16:12 -0400
Subject: [PATCH 18/49] Fix some docs issues
---
pandas/core/frame.py | 8 ++++----
pandas/io/orc.py | 8 ++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e95ca119e6057..14d0e052a0f8f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2906,15 +2906,15 @@ def to_orc(
self,
path: FilePath | WriteBuffer[bytes] | None = None,
engine: Literal["pyarrow"] = "pyarrow",
- index: bool = None,
+ index: bool | None = None,
**kwargs,
- ) -> bytes:
+ ) -> bytes | None:
"""
Write a DataFrame to the ORC format.
Parameters
----------
df : DataFrame
- path : str or file-like object, default None
+ path : str, file-like object or None, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
@@ -2923,7 +2923,7 @@ def to_orc(
engine : {{'pyarrow'}}, default 'pyarrow'
ORC library to use, or library itself, checked with 'pyarrow' name
and version >= 7.0.0
- index : bool, default None
+ index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
If ``None``, similar to ``infer`` the dataframe's index(es)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 08645a87f09dd..61d7cdbccd53a 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -63,15 +63,15 @@ def to_orc(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
engine: Literal["pyarrow"] = "pyarrow",
- index: bool = None,
+ index: bool | None = None,
**kwargs,
-) -> bytes:
+) -> bytes | None:
"""
Write a DataFrame to the ORC format.
Parameters
----------
df : DataFrame
- path : str or file-like object, default None
+ path : str, file-like object or None, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
@@ -80,7 +80,7 @@ def to_orc(
engine : {{'pyarrow'}}, default 'pyarrow'
Parquet library to use, or library it self, checked with 'pyarrow' name
and version >= 7.0.0
- index : bool, default None
+ index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
If ``None``, similar to ``infer`` the dataframe's index(es)
From fe275d7f21390127414905a1eb4c3791c6d98663 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Fri, 25 Mar 2022 16:29:56 -0400
Subject: [PATCH 19/49] Use keyword only arguments
---
pandas/core/frame.py | 1 +
pandas/io/orc.py | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 14d0e052a0f8f..97661df2cef61 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2905,6 +2905,7 @@ def to_parquet(
def to_orc(
self,
path: FilePath | WriteBuffer[bytes] | None = None,
+ *,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
**kwargs,
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 61d7cdbccd53a..f49579425b387 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -62,6 +62,7 @@ def read_orc(
def to_orc(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
+ *,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
**kwargs,
@@ -109,6 +110,5 @@ def to_orc(
if path is None:
assert isinstance(path_or_buf, io.BytesIO)
return path_or_buf.getvalue()
- else:
- return None
+ return None
From 9d3e0dfd464e41224f3a7a47d3b344b51d562f0d Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Thu, 12 May 2022 01:24:53 -0400
Subject: [PATCH 20/49] Fix bug
---
pandas/io/orc.py | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index f49579425b387..d3a683ae93aa2 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -102,13 +102,14 @@ def to_orc(
raise ValueError("engine must be 'pyarrow'")
engine = import_optional_dependency(engine, min_version="7.0.0")
- path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
- engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index), path_or_buf, **kwargs
- )
-
- if path is None:
- assert isinstance(path_or_buf, io.BytesIO)
- return path_or_buf.getvalue()
+ was_none = path is None
+ if was_none:
+ path = io.BytesIO()
+ with get_handle(path, "wb") as handles:
+ engine.orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
+ )
+
+ if was_none:
+ return path.getvalue()
return None
-
From 971f31c14abce5fdd03e813619fc07b2bbe2f4d8 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sat, 28 May 2022 22:24:18 -0400
Subject: [PATCH 21/49] Fix param issue
---
pandas/core/frame.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 97661df2cef61..aed78178ffbfd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2969,7 +2969,7 @@ def to_orc(
"""
from pandas.io.orc import to_orc
- return to_orc(self, path, engine, index=index, **kwargs)
+ return to_orc(self, path, engine=engine, index=index, **kwargs)
@Substitution(
header_type="bool",
From 52b68a0f8eeaa1cbbc50d92cea8b6baf765e0171 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sat, 28 May 2022 23:09:31 -0400
Subject: [PATCH 22/49] Doctest skipping due to minimal versions
---
pandas/core/frame.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index aed78178ffbfd..14a0b52308e59 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2962,7 +2962,7 @@ def to_orc(
If you want to get a buffer to the orc content you can write it to io.BytesIO
>>> import io
- >>> b = io.BytesIO(df.to_orc())
+ >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
>>> b.seek(0)
0
>>> content = b.read()
From 76437ba361b014dd998d9ae1d33b40a72f19b538 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sat, 28 May 2022 23:28:12 -0400
Subject: [PATCH 23/49] Doctest skipping due to minimal versions
---
pandas/core/frame.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 14a0b52308e59..7f17df9b9580f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2963,9 +2963,9 @@ def to_orc(
If you want to get a buffer to the orc content you can write it to io.BytesIO
>>> import io
>>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
- >>> b.seek(0)
+ >>> b.seek(0) # doctest: +SKIP
0
- >>> content = b.read()
+ >>> content = b.read() # doctest: +SKIP
"""
from pandas.io.orc import to_orc
From c5d585267f2bcd76e894e1134a34e494867cea76 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 29 May 2022 00:40:41 -0400
Subject: [PATCH 24/49] Improve spacing in docstring & remove the orc test from
 test_common, which has an unusual pyarrow version requirement and sits among
 many unrelated tests
---
pandas/core/frame.py | 2 ++
pandas/tests/io/test_common.py | 1 -
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7f17df9b9580f..2d00857a14895 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2912,6 +2912,7 @@ def to_orc(
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
+
Parameters
----------
df : DataFrame
@@ -2934,6 +2935,7 @@ def to_orc(
be included as columns in the file output.
kwargs
Additional keyword arguments passed to the engine
+
Returns
-------
bytes if no path argument is provided else None
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 66905d7b7112f..fc605637dbc11 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -222,7 +222,6 @@ def test_read_non_existent(self, reader, module, error_class, fn_ext):
(pd.DataFrame.to_html, "os", OSError, "html"),
(pd.DataFrame.to_excel, "xlrd", OSError, "xlsx"),
(pd.DataFrame.to_feather, "pyarrow", OSError, "feather"),
- (pd.DataFrame.to_orc, "pyarrow", OSError, "orc"),
(pd.DataFrame.to_parquet, "pyarrow", OSError, "parquet"),
(pd.DataFrame.to_stata, "os", OSError, "dta"),
(pd.DataFrame.to_json, "os", OSError, "json"),
From b5cd02212be25297c0bcb9e8b114bd21c80ce99e Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Sun, 29 May 2022 01:28:37 -0400
Subject: [PATCH 25/49] Fix docstring syntax
---
pandas/core/frame.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2d00857a14895..b7492d9a31bb1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2924,7 +2924,7 @@ def to_orc(
a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
ORC library to use, or library itself, checked with 'pyarrow' name
- and version >= 7.0.0
+ and version >= 7.0.0.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -2933,8 +2933,8 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- kwargs
- Additional keyword arguments passed to the engine
+ **kwargs
+ Additional keyword arguments passed to the engine.
Returns
-------
From 7ad3df937c872849806d43428792884beca6aed5 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 02:21:56 -0400
Subject: [PATCH 26/49] ORC is not text
---
pandas/io/orc.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index d3a683ae93aa2..635d81a112dd4 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -105,7 +105,7 @@ def to_orc(
was_none = path is None
if was_none:
path = io.BytesIO()
- with get_handle(path, "wb") as handles:
+ with get_handle(path, "wb", is_text=False) as handles:
engine.orc.write_table(
engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
)
From a73bb706e190eebfd9e6e4274064503cb2d6f8c0 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 04:48:10 -0400
Subject: [PATCH 27/49] Fix BytesIO bug, stop requiring pyarrow.orc to be
 imported explicitly before use; all pytest tests pass
---
pandas/io/orc.py | 7 +++++--
pandas/tests/io/test_orc.py | 5 +++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 635d81a112dd4..e2b63eaaedadf 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -69,6 +69,7 @@ def to_orc(
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
+
Parameters
----------
df : DataFrame
@@ -89,8 +90,9 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- kwargs
+ **kwargs
Additional keyword arguments passed to the engine
+
Returns
-------
bytes if no path argument is provided else None
@@ -101,12 +103,13 @@ def to_orc(
if engine != "pyarrow":
raise ValueError("engine must be 'pyarrow'")
engine = import_optional_dependency(engine, min_version="7.0.0")
+ orc = import_optional_dependency("pyarrow.orc")
was_none = path is None
if was_none:
path = io.BytesIO()
with get_handle(path, "wb", is_text=False) as handles:
- engine.orc.write_table(
+ orc.write_table(
engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 2eeed0adc379c..826514d2615a8 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -1,5 +1,6 @@
""" test orc compat """
import datetime
+from io import BytesIO
import os
import numpy as np
@@ -267,7 +268,7 @@ def test_orc_roundtrip_bytesio():
}
expected = pd.DataFrame.from_dict(data)
- bytesio = expected.to_orc()
- got = read_orc(bytesio)
+ bytes = expected.to_orc()
+ got = read_orc(BytesIO(bytes))
tm.assert_equal(expected, got)
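
From the user's side, the fixed test above boils down to a bytes round trip: to_orc() with no path returns raw ORC bytes, and read_orc wants a path or file-like object, hence the BytesIO wrapper. A short usage sketch, assuming a pandas build containing this series and pyarrow >= 7.0.0:

    from io import BytesIO

    import numpy as np
    import pandas as pd

    expected = pd.DataFrame(
        {
            "int1": np.array([65536, 65536], dtype="int32"),
            "string1": np.array(["hi", "bye"], dtype="object"),
        }
    )

    # No path -> raw bytes; wrap them in BytesIO before handing them to read_orc.
    payload = expected.to_orc()
    got = pd.read_orc(BytesIO(payload))
    pd.testing.assert_frame_equal(expected, got)
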
From 20aefe79ed4bf2fb4a77f2858fbb5b678895ebc7 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 10:43:25 -0400
Subject: [PATCH 28/49] ORC writer does not work for categorical columns yet
---
doc/source/user_guide/io.rst | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f3e712197f9c5..e0999d1ef85ce 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5569,6 +5569,7 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This
* It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow.
* :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `.
+ * Categorical columns are not supported yet.
.. ipython:: python
@@ -5581,8 +5582,6 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This
"e": [True, False, True],
"f": pd.date_range("20130101", periods=3),
"g": pd.date_range("20130101", periods=3, tz="US/Eastern"),
- "h": pd.Categorical(list("abc")),
- "i": pd.Categorical(list("abc"), ordered=True),
}
)
From e7e81fee7a23f30946169613139880aa84b104ee Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 10:49:34 -0400
Subject: [PATCH 29/49] Appease mypy
---
pandas/io/orc.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index e2b63eaaedadf..356a82d2947ab 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -108,11 +108,13 @@ def to_orc(
was_none = path is None
if was_none:
path = io.BytesIO()
+ assert path is not None # For mypy
with get_handle(path, "wb", is_text=False) as handles:
orc.write_table(
engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
)
if was_none:
+ assert isinstance(path, io.BytesIO) # For mypy
return path.getvalue()
return None
From 6b659f7007d10a2ecab925988fb6e5b6cf8a446e Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 15:18:55 -0400
Subject: [PATCH 30/49] Appease mypy
---
pandas/io/orc.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 356a82d2947ab..918f75de00c58 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -2,6 +2,7 @@
from __future__ import annotations
import io
+from types import ModuleType
from typing import (
TYPE_CHECKING,
Literal,
@@ -110,6 +111,7 @@ def to_orc(
path = io.BytesIO()
assert path is not None # For mypy
with get_handle(path, "wb", is_text=False) as handles:
+ assert isinstance(engine, ModuleType) # For mypy
orc.write_table(
engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
)
From 18e5429968c7e7ad653cdf46c13ae863efaaa203 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 20:23:11 -0400
Subject: [PATCH 31/49] Edit according to reviews
---
doc/source/user_guide/io.rst | 11 ++---
pandas/core/frame.py | 30 +++++++++---
pandas/io/orc.py | 45 +++++++++++++++++-
.../io/data/orc/TestOrcFile.testReadWrite.orc | Bin 0 -> 1344 bytes
pandas/tests/io/test_orc.py | 41 ++++++++++++++++
5 files changed, 112 insertions(+), 15 deletions(-)
create mode 100644 pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index e0999d1ef85ce..a7f26e53620f8 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5569,7 +5569,8 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This
* It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow.
* :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `.
- * Categorical columns are not supported yet.
+ * Unsigned integers, intervals, periods, sparse and categorical Dtypes are not supported yet.
+ * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
.. ipython:: python
@@ -5577,11 +5578,9 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This
{
"a": list("abc"),
"b": list(range(1, 4)),
- "c": np.arange(3, 6).astype("u1"),
- "d": np.arange(4.0, 7.0, dtype="float64"),
- "e": [True, False, True],
- "f": pd.date_range("20130101", periods=3),
- "g": pd.date_range("20130101", periods=3, tz="US/Eastern"),
+ "c": np.arange(4.0, 7.0, dtype="float64"),
+ "d": [True, False, True],
+ "e": pd.date_range("20130101", periods=3),
}
)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b7492d9a31bb1..20b130191e0b9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2913,9 +2913,10 @@ def to_orc(
"""
Write a DataFrame to the ORC format.
+ .. versionadded:: 1.5.0
+
Parameters
----------
- df : DataFrame
path : str, file-like object or None, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
@@ -2923,23 +2924,32 @@ def to_orc(
(e.g. via builtin open function). If path is None,
a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
- ORC library to use, or library itself, checked with 'pyarrow' name
- and version >= 7.0.0.
+ ORC library to use, or the library itself, checked with 'pyarrow' name
+ and version >= 7.0.0. Raises ValueError if it is anything but
+ 'pyarrow'.
index : bool, optional
- If ``True``, include the dataframe's index(es) in the file output. If
- ``False``, they will not be written to the file.
+ If ``True``, include the dataframe's index(es) in the file output.
+ If ``False``, they will not be written to the file.
If ``None``, similar to ``infer`` the dataframe's index(es)
will be saved. However, instead of being saved as values,
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
**kwargs
- Additional keyword arguments passed to the engine.
+ Additional keyword arguments passed to the engine
Returns
-------
bytes if no path argument is provided else None
+ Raises
+ ------
+ NotImplementedError
+ * Dtype of one or more columns is unsigned integers, intervals,
+ periods, sparse or categorical.
+ ValueError
+ * engine is not pyarrow.
+
See Also
--------
read_orc : Read a ORC file.
@@ -2950,8 +2960,14 @@ def to_orc(
Notes
-----
- This function requires `pyarrow `
+ * Before using this function you should read the :ref:`user guide about
+ ORC ` and :ref:`install optional dependencies `.
+ * This function requires `pyarrow `
_ library.
+ * Unsigned integers, intervals, periods, sparse and categorical Dtypes
+ are not supported yet.
+ * Currently timezones in datetime columns are not preserved when a
+ dataframe is converted into ORC files.
Examples
--------
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 918f75de00c58..bc14a90d463cf 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -71,9 +71,14 @@ def to_orc(
"""
Write a DataFrame to the ORC format.
+ .. versionadded:: 1.5.0
+
Parameters
----------
df : DataFrame
+ The dataframe to be written to ORC. Raises NotImplementedError
+ if dtype of one or more columns is category, unsigned integers,
+ intervals, periods or sparse.
path : str, file-like object or None, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
@@ -81,8 +86,9 @@ def to_orc(
(e.g. via builtin open function). If path is None,
a bytes object is returned.
engine : {{'pyarrow'}}, default 'pyarrow'
- Parquet library to use, or library it self, checked with 'pyarrow' name
- and version >= 7.0.0
+ ORC library to use, or the library itself, checked with 'pyarrow' name
+ and version >= 7.0.0. Raises ValueError if it is anything but
+ 'pyarrow'.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -97,10 +103,45 @@ def to_orc(
Returns
-------
bytes if no path argument is provided else None
+
+ Raises
+ ------
+ NotImplementedError
+ * Dtype of one or more columns is unsigned integers, intervals,
+ periods, sparse or categorical.
+ ValueError
+ * engine is not pyarrow.
+
+ Notes
+ -----
+ * Before using this function you should read the
+ :ref:`user guide about ORC ` and
+ :ref:`install optional dependencies `.
+ * This function requires `pyarrow `
+ _ library.
+ * Unsigned integers, intervals, periods, sparse and categorical Dtypes
+ are not supported yet.
+ * Currently timezones in datetime columns are not preserved when a
+ dataframe is converted into ORC files.
"""
if index is None:
index = df.index.names[0] is not None
+ # If unsupported dtypes are found raise NotImplementedError
+ for dtype in df.dtypes:
+ dtype_str = dtype.__str__().lower()
+ if (
+ "category" in dtype_str
+ or "interval" in dtype_str
+ or "sparse" in dtype_str
+ or "period" in dtype_str
+ or "uint" in dtype_str
+ ):
+ raise NotImplementedError(
+ """The dtype of one or more columns is unsigned integers,
+intervals, periods, sparse or categorical which is not supported yet."""
+ )
+
if engine != "pyarrow":
raise ValueError("engine must be 'pyarrow'")
engine = import_optional_dependency(engine, min_version="7.0.0")
diff --git a/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc b/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc
new file mode 100644
index 0000000000000000000000000000000000000000..852360ecad74cb3545fd160f895fca67d68ef276
GIT binary patch
literal 1344
zcma)6&5qJg6uv*b?P*25Q&BHP6C{}oYC<5Im~=B_CQ&y!X5!Ao1TrWjgeC(nW@8$k
zz&FrGaI35CH1jkrTvHFXmkRfFi(;;QU=Mge
zDVnzfQFzyYg#b+cqUwAes6}uc10PZ;)0#?Km{b=@h)D_36ePY?no10J7vUWYZ^0}#
z4=R@w%kBq`?&Kb@sOswu$nX7~f{;$7Akev>w3(UGa%M^&XIL8AE5s`~U=W~JwcZY}
zI^K}pSwCchp~5iF@Y%_;89m9fw-ks(YfKp-_{y`7)HJ_neU@j0r2E($eyC`%X|SB-
z*;Bl4lyN7|4(PYR5WgsJ!IeZ^)kxJ%jZ9TEausQG)Lo52?P+w?Lyew6P-U%sU57m?
zhMGvKks2hD8O;>eIX9XsS8K;;M}gMDXrb`F;rU#nU6Xr8drImYv}O|Y`6;5%zHfZr
z_`VXjaoFeDk!ff3xLRZVh(CMrI`~P0Vry4+b$vxQHgnrTUcQI_v45*27wp|~+ctL$
zxcnia62e!ey0!1D!Rl|I6#DZ`5v~V*1id5~^}FMCBzkAlemim}gXDDDj-25bk7AUJ
z-=q0%lyp&gC&`E2XkIlho}e+Ao({31MjE6=>Hx$tY8kVPTjp3MEORXb%e;cMSgc19
zFGmuujU-+lNxWqw@uee)SMT@szl48EwvQyXBZ)Uh()s-IIqivd^GWj&eBtiuK
SNU3p7TkfYnoF5$@R{jEaEZds^
literal 0
HcmV?d00001
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 826514d2615a8..5f075a118017c 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -272,3 +272,44 @@ def test_orc_roundtrip_bytesio():
got = read_orc(BytesIO(bytes))
tm.assert_equal(expected, got)
+
+
+testdata = [
+ (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), dirpath),
+ (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), dirpath),
+ (
+ pd.DataFrame(
+ {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
+ ),
+ dirpath,
+ ),
+ (
+ pd.DataFrame(
+ {
+ "unimpl": [
+ pd.Period("2022-01-03", freq="D"),
+ pd.Period("2022-01-04", freq="D"),
+ ]
+ }
+ ),
+ dirpath,
+ ),
+ (
+ pd.DataFrame({"unimpl": [np.nan] * 100}).astype(
+ pd.SparseDtype("float", np.nan)
+ ),
+ dirpath,
+ ),
+]
+
+
+@pytest.mark.parametrize("unimplemented, dirpath", testdata)
+def test_orc_writer_unimplemented_dtypes(unimplemented, dirpath):
+ # GH44554
+ # PyArrow gained ORC write support with the current argument order
+ pytest.importorskip("pyarrow", minversion="7.0.0")
+ outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc")
+ msg = """The dtype of one or more columns is unsigned integers,
+intervals, periods, sparse or categorical which is not supported yet."""
+ with pytest.raises(NotImplementedError, match=msg):
+ unimplemented.to_orc(outputfile)
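The new guard works by string-matching dtype names before handing anything to pyarrow. A standalone, hedged sketch of the same check (check_orc_writable and its message are illustrative, not the pandas API):

import numpy as np
import pandas as pd


def check_orc_writable(df: pd.DataFrame) -> None:
    # mirrors the substring check in the patch: these dtypes have no ORC mapping yet
    unsupported = ("category", "interval", "sparse", "period", "uint")
    for dtype in df.dtypes:
        name = str(dtype).lower()
        if any(token in name for token in unsupported):
            raise NotImplementedError(
                f"column dtype {dtype} is not supported by the ORC writer yet"
            )


check_orc_writable(pd.DataFrame({"ok": np.arange(3)}))  # passes silently
# check_orc_writable(pd.DataFrame({"bad": pd.Categorical(list("abc"))}))  # would raise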
From 21cba6ed7e9196dd24cf2e6509ab6f44ad47e8eb Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 20:30:06 -0400
Subject: [PATCH 32/49] Fix path bug in test_orc
---
pandas/tests/io/test_orc.py | 20 ++++++--------------
1 file changed, 6 insertions(+), 14 deletions(-)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 5f075a118017c..d0fe5325a1c44 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -275,13 +275,12 @@ def test_orc_roundtrip_bytesio():
testdata = [
- (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}), dirpath),
- (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}), dirpath),
+ (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")})),
+ (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")})),
(
pd.DataFrame(
{"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
),
- dirpath,
),
(
pd.DataFrame(
@@ -292,24 +291,17 @@ def test_orc_roundtrip_bytesio():
]
}
),
- dirpath,
- ),
- (
- pd.DataFrame({"unimpl": [np.nan] * 100}).astype(
- pd.SparseDtype("float", np.nan)
- ),
- dirpath,
),
+ (pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)),),
]
-@pytest.mark.parametrize("unimplemented, dirpath", testdata)
-def test_orc_writer_unimplemented_dtypes(unimplemented, dirpath):
+@pytest.mark.parametrize("unimplemented", testdata)
+def test_orc_writer_unimplemented_dtypes(unimplemented):
# GH44554
# PyArrow gained ORC write support with the current argument order
pytest.importorskip("pyarrow", minversion="7.0.0")
- outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc")
msg = """The dtype of one or more columns is unsigned integers,
intervals, periods, sparse or categorical which is not supported yet."""
with pytest.raises(NotImplementedError, match=msg):
- unimplemented.to_orc(outputfile)
+ unimplemented.to_orc()
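Dropping the dirpath fixture works because to_orc() with no path returns bytes, so the failing write never needs a filesystem location. A hedged sketch of the resulting test shape (the test and variable names are illustrative):

import numpy as np
import pandas as pd
import pytest

unsupported_frames = [
    pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
    pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
]


@pytest.mark.parametrize("frame", unsupported_frames)
def test_writer_rejects_unsupported_dtypes(frame):
    with pytest.raises(NotImplementedError):
        frame.to_orc()  # no output path needed: the check fires before any write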
From c7bf39ff2c400deecfd41b216e52dfd1321d1c58 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 20:59:25 -0400
Subject: [PATCH 33/49] Fix testdata tuple bug in test_orc
---
pandas/tests/io/test_orc.py | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index d0fe5325a1c44..bac17c2f88584 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -275,24 +275,20 @@ def test_orc_roundtrip_bytesio():
testdata = [
- (pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")})),
- (pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")})),
- (
- pd.DataFrame(
- {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
- ),
+ pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
+ pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
+ pd.DataFrame(
+ {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
),
- (
- pd.DataFrame(
- {
- "unimpl": [
- pd.Period("2022-01-03", freq="D"),
- pd.Period("2022-01-04", freq="D"),
- ]
- }
- ),
+ pd.DataFrame(
+ {
+ "unimpl": [
+ pd.Period("2022-01-03", freq="D"),
+ pd.Period("2022-01-04", freq="D"),
+ ]
+ }
),
- (pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)),),
+ pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)),
]
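The tuple bug fixed here is a common parametrize pitfall: with a single parameter name pytest passes each list entry as-is, so an entry wrapped in a one-element tuple hands the tuple, not the DataFrame, to the test. A hypothetical minimal reproduction:

import pandas as pd
import pytest

cases = [
    pd.DataFrame({"a": [1]}),     # correct: the test receives the DataFrame itself
    (pd.DataFrame({"a": [1]}),),  # bug: the trailing comma makes this a 1-tuple
]


@pytest.mark.parametrize("df", cases)
def test_example(df):
    # the second case fails with AttributeError: 'tuple' object has no attribute 'to_orc'
    df.to_orc()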
From e43c6dd73bc2ac30aac771a4924dc0568ceccd28 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 21:28:39 -0400
Subject: [PATCH 34/49] Fix docstrings for check compliance
---
pandas/core/frame.py | 2 +-
pandas/io/orc.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 20b130191e0b9..4682ac4878bca 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2936,7 +2936,7 @@ def to_orc(
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
**kwargs
- Additional keyword arguments passed to the engine
+ Additional keyword arguments passed to the engine.
Returns
-------
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index bc14a90d463cf..51b29ce8144e3 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -98,7 +98,7 @@ def to_orc(
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
**kwargs
- Additional keyword arguments passed to the engine
+ Additional keyword arguments passed to the engine.
Returns
-------
From afa0a8a3735c082c8855e9e721caafc7e751922e Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 22:14:37 -0400
Subject: [PATCH 35/49] read_orc does not have engine as a param
---
doc/source/user_guide/io.rst | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index a7f26e53620f8..973e978a1453f 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5599,7 +5599,7 @@ Read from an orc file.
.. ipython:: python
:okwarning:
- result = pd.read_orc("example_pa.orc", engine="pyarrow")
+ result = pd.read_orc("example_pa.orc")
result.dtypes
@@ -5609,7 +5609,6 @@ Read only certain columns of an orc file.
result = pd.read_orc(
"example_pa.orc",
- engine="pyarrow",
columns=["a", "b"],
)
result.dtypes
From cd585e678432c5359fab1bd07dd5a0277fdf0e6b Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 29 May 2022 23:54:54 -0400
Subject: [PATCH 36/49] Fix sphinx warnings
---
pandas/core/frame.py | 16 ++++++++--------
pandas/io/orc.py | 18 +++++++++---------
2 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4682ac4878bca..51c5c4a7d802a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2945,10 +2945,10 @@ def to_orc(
Raises
------
NotImplementedError
- * Dtype of one or more columns is unsigned integers, intervals,
- periods, sparse or categorical.
+ Dtype of one or more columns is category, unsigned integers, interval,
+ period or sparse.
ValueError
- * engine is not pyarrow.
+ engine is not pyarrow.
See Also
--------
@@ -2961,13 +2961,13 @@ def to_orc(
Notes
-----
* Before using this function you should read the :ref:`user guide about
- ORC ` and :ref:`install optional dependencies `.
- * This function requires `pyarrow `
- _ library.
+ ORC ` and :ref:`install optional dependencies `.
+ * This function requires `pyarrow `_
+ library.
* Unsigned integers, intervals, periods, sparse and categorical Dtypes
- are not supported yet.
+ are not supported yet.
* Currently timezones in datetime columns are not preserved when a
- dataframe is converted into ORC files.
+ dataframe is converted into ORC files.
Examples
--------
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 51b29ce8144e3..02f43855c4340 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -107,22 +107,22 @@ def to_orc(
Raises
------
NotImplementedError
- * Dtype of one or more columns is unsigned integers, intervals,
- periods, sparse or categorical.
+ Dtype of one or more columns is category, unsigned integers, interval,
+ period or sparse.
ValueError
- * engine is not pyarrow.
+ engine is not pyarrow.
Notes
-----
* Before using this function you should read the
- :ref:`user guide about ORC ` and
- :ref:`install optional dependencies `.
- * This function requires `pyarrow `
- _ library.
+ :ref:`user guide about ORC ` and
+ :ref:`install optional dependencies `.
+ * This function requires `pyarrow `_
+ library.
* Unsigned integers, intervals, periods, sparse and categorical Dtypes
- are not supported yet.
+ are not supported yet.
* Currently timezones in datetime columns are not preserved when a
- dataframe is converted into ORC files.
+ dataframe is converted into ORC files.
"""
if index is None:
index = df.index.names[0] is not None
From b509c3c22c5be1eaba6af400cc585b71c4939d26 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 30 May 2022 17:56:30 -0400
Subject: [PATCH 37/49] Improve docs & rerun tests
---
pandas/core/frame.py | 2 +-
pandas/io/orc.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 51c5c4a7d802a..38ee793f283cd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2964,7 +2964,7 @@ def to_orc(
ORC ` and :ref:`install optional dependencies `.
* This function requires `pyarrow `_
library.
- * Unsigned integers, intervals, periods, sparse and categorical Dtypes
+ * Category, unsigned integers, interval, period and sparse Dtypes
are not supported yet.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 02f43855c4340..655753e22cd05 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -119,7 +119,7 @@ def to_orc(
:ref:`install optional dependencies `.
* This function requires `pyarrow `_
library.
- * Unsigned integers, intervals, periods, sparse and categorical Dtypes
+ * Category, unsigned integers, interval, period and sparse Dtypes
are not supported yet.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
From 1001002907fa8892d65f3ab7fe7df5a1cd8d7d00 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 30 May 2022 19:37:22 -0400
Subject: [PATCH 38/49] Force retrigger
---
pandas/tests/io/test_orc.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index bac17c2f88584..d09ecf6207926 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -288,7 +288,7 @@ def test_orc_roundtrip_bytesio():
]
}
),
- pd.DataFrame({"unimpl": [np.nan] * 100}).astype(pd.SparseDtype("float", np.nan)),
+ pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
]
From 55cab6ee3551eb3efd68783dab267299e889b993 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Tue, 7 Jun 2022 02:07:30 -0400
Subject: [PATCH 39/49] Fix test_orc according to review
---
pandas/tests/io/test_orc.py | 48 ++++++++++++++++++++-----------------
1 file changed, 26 insertions(+), 22 deletions(-)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index d09ecf6207926..5364b25b4a61f 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -6,6 +6,8 @@
import numpy as np
import pytest
+import pandas.util._test_decorators as td
+
import pandas as pd
from pandas import read_orc
import pandas._testing as tm
@@ -22,6 +24,26 @@ def dirpath(datapath):
return datapath("io", "data", "orc")
+# Examples of dataframes with dtypes for which conversion to ORC
+# hasn't been implemented yet.
+orc_writer_not_implemented = [
+ pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
+ pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
+ pd.DataFrame(
+ {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
+ ),
+ pd.DataFrame(
+ {
+ "unimpl": [
+ pd.Period("2022-01-03", freq="D"),
+ pd.Period("2022-01-04", freq="D"),
+ ]
+ }
+ ),
+ pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
+]
+
+
def test_orc_reader_empty(dirpath):
columns = [
"boolean1",
@@ -227,10 +249,10 @@ def test_orc_reader_snappy_compressed(dirpath):
tm.assert_equal(expected, got)
+@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_roundtrip_file(dirpath):
# GH44554
# PyArrow gained ORC write support with the current argument order
- pytest.importorskip("pyarrow", minversion="7.0.0")
data = {
"boolean1": np.array([False, True], dtype="bool"),
"byte1": np.array([1, 100], dtype="int8"),
@@ -251,10 +273,10 @@ def test_orc_roundtrip_file(dirpath):
tm.assert_equal(expected, got)
+@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_roundtrip_bytesio():
# GH44554
# PyArrow gained ORC write support with the current argument order
- pytest.importorskip("pyarrow", minversion="7.0.0")
data = {
"boolean1": np.array([False, True], dtype="bool"),
"byte1": np.array([1, 100], dtype="int8"),
@@ -274,29 +296,11 @@ def test_orc_roundtrip_bytesio():
tm.assert_equal(expected, got)
-testdata = [
- pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
- pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
- pd.DataFrame(
- {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
- ),
- pd.DataFrame(
- {
- "unimpl": [
- pd.Period("2022-01-03", freq="D"),
- pd.Period("2022-01-04", freq="D"),
- ]
- }
- ),
- pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
-]
-
-
-@pytest.mark.parametrize("unimplemented", testdata)
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+@pytest.mark.parametrize("unimplemented", orc_writer_not_implemented)
def test_orc_writer_unimplemented_dtypes(unimplemented):
# GH44554
# PyArrow gained ORC write support with the current argument order
- pytest.importorskip("pyarrow", minversion="7.0.0")
msg = """The dtype of one or more columns is unsigned integers,
intervals, periods, sparse or categorical which is not supported yet."""
with pytest.raises(NotImplementedError, match=msg):
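Both spellings skip the test when pyarrow is missing or too old; the decorator simply moves the check out of the test body and onto the function as a skip marker. A short sketch of the two forms:

import pytest

import pandas.util._test_decorators as td


def test_with_importorskip():
    # skip decided inside the test body when it runs
    pytest.importorskip("pyarrow", minversion="7.0.0")
    ...


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_with_decorator():
    # skip expressed as a marker applied to the whole test
    ...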
From 89283e0d2e7a9bf80894e0edd07b6d3aeeafe6c8 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Tue, 7 Jun 2022 08:46:02 -0400
Subject: [PATCH 40/49] Rename some variables and func
---
pandas/tests/io/test_orc.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 5364b25b4a61f..932918c75dec5 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -25,8 +25,9 @@ def dirpath(datapath):
# Examples of dataframes with dtypes for which conversion to ORC
-# hasn't been implemented yet.
-orc_writer_not_implemented = [
+# hasn't been implemented yet, that is, Category, unsigned integers,
+# interval, period and sparse.
+orc_writer_dtypes_not_supported = [
pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
pd.DataFrame(
@@ -297,11 +298,11 @@ def test_orc_roundtrip_bytesio():
@td.skip_if_no("pyarrow", min_version="7.0.0")
-@pytest.mark.parametrize("unimplemented", orc_writer_not_implemented)
-def test_orc_writer_unimplemented_dtypes(unimplemented):
+@pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported)
+def test_orc_writer_dtypes_not_supported(df_not_supported):
# GH44554
# PyArrow gained ORC write support with the current argument order
msg = """The dtype of one or more columns is unsigned integers,
intervals, periods, sparse or categorical which is not supported yet."""
with pytest.raises(NotImplementedError, match=msg):
- unimplemented.to_orc()
+ df_not_supported.to_orc()
From 989468a4b66637df12337ae0846d858fab06a0d0 Mon Sep 17 00:00:00 2001
From: Ian Alexander Joiner
Date: Tue, 7 Jun 2022 12:11:35 -0400
Subject: [PATCH 41/49] Update pandas/core/frame.py
Co-authored-by: Matthew Roeschke
---
pandas/core/frame.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 38ee793f283cd..364fda763c718 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2923,7 +2923,7 @@ def to_orc(
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function). If path is None,
a bytes object is returned.
- engine : {{'pyarrow'}}, default 'pyarrow'
+ engine : str, default 'pyarrow'
ORC library to use, or library it self, checked with 'pyarrow' name
and version >= 7.0.0. Raises ValueError if it is anything but
'pyarrow'.
From a7fca36785c8e7a747f0d4d6e6031706b5e43e58 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sat, 11 Jun 2022 23:58:24 -0400
Subject: [PATCH 42/49] Fix issues according to review
---
doc/source/user_guide/io.rst | 5 +++--
doc/source/whatsnew/v1.5.0.rst | 22 ++++++++++++++++++++++
pandas/core/frame.py | 7 +++----
pandas/io/orc.py | 4 ++--
pandas/tests/io/test_orc.py | 8 ++++----
5 files changed, 34 insertions(+), 12 deletions(-)
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 973e978a1453f..4c5d189e1bba3 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5568,8 +5568,9 @@ ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This
.. warning::
* It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow.
- * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `.
- * Unsigned integers, intervals, periods, sparse and categorical Dtypes are not supported yet.
+ * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0.
+ * :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc` are not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `.
+ * For supported dtypes please refer to `supported ORC features in Arrow `__.
* Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
.. ipython:: python
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 8a7ad077c2a90..2719d415dedc0 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -100,6 +100,28 @@ as seen in the following example.
1 2021-01-02 08:00:00 4
2 2021-01-02 16:00:00 5
+.. _whatsnew_150.enhancements.orc:
+
+Writing to ORC files
+^^^^^^^^^^^^^^^^^^^^
+
+The new method :meth:`DataFrame.to_orc` allows writing to ORC files (:issue:`43864`).
+
+This functionality depends on the `pyarrow `__ library. For more details, see :ref:`the IO docs on ORC `.
+
+.. warning::
+
+ * It is *highly recommended* to install pyarrow using conda due to some issues caused by pyarrow.
+ * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0.
+ * :func:`~pandas.DataFrame.to_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `.
+ * For supported dtypes please refer to `supported ORC features in Arrow `__.
+ * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
+
+.. code-block:: python
+
+ df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
+ df.to_orc("./out.orc")
+
.. _whatsnew_150.enhancements.tar:
Reading directly from TAR archives
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 364fda763c718..6626b7dcad24d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2925,8 +2925,7 @@ def to_orc(
a bytes object is returned.
engine : str, default 'pyarrow'
ORC library to use, or library it self, checked with 'pyarrow' name
- and version >= 7.0.0. Raises ValueError if it is anything but
- 'pyarrow'.
+ and version >= 7.0.0.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output.
If ``False``, they will not be written to the file.
@@ -2964,8 +2963,8 @@ def to_orc(
ORC ` and :ref:`install optional dependencies `.
* This function requires `pyarrow `_
library.
- * Category, unsigned integers, interval, period and sparse Dtypes
- are not supported yet.
+ * For supported dtypes please refer to
+ `this article `__.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 655753e22cd05..e679097ec3600 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -119,8 +119,8 @@ def to_orc(
:ref:`install optional dependencies `.
* This function requires `pyarrow `_
library.
- * Category, unsigned integers, interval, period and sparse Dtypes
- are not supported yet.
+ * For supported dtypes please refer to
+ `this article `__.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
"""
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 932918c75dec5..d5a0d4cc4fff0 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -267,11 +267,11 @@ def test_orc_roundtrip_file(dirpath):
}
expected = pd.DataFrame.from_dict(data)
- outputfile = os.path.join(dirpath, "TestOrcFile.testReadWrite.orc")
- expected.to_orc(outputfile)
- got = read_orc(outputfile)
+ with tm.ensure_clean() as path:
+ expected.to_orc(path)
+ got = read_orc(path)
- tm.assert_equal(expected, got)
+ tm.assert_equal(expected, got)
@td.skip_if_no("pyarrow", min_version="7.0.0")
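tm.ensure_clean() used in this hunk is a pandas testing helper that yields a temporary file path and removes the file on exit, so the round trip no longer writes into the checked-in test-data directory. Roughly, as a hedged sketch:

import pandas as pd
import pandas._testing as tm
from pandas import read_orc

expected = pd.DataFrame({"int1": [1, 2], "float1": [1.0, 2.0]})

with tm.ensure_clean() as path:   # yields a unique temporary file path
    expected.to_orc(path)         # write the ORC file there
    got = read_orc(path)          # and read it back inside the context

tm.assert_frame_equal(expected, got)  # the temporary file is already gone here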
From 7fc338c6ab8434651cf70b6e4821a0accd832e7c Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Sun, 12 Jun 2022 01:13:29 -0400
Subject: [PATCH 43/49] Forced reruns
---
pandas/core/frame.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 6626b7dcad24d..8d6357cc9ad57 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2970,12 +2970,12 @@ def to_orc(
Examples
--------
- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+ >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
>>> df.to_orc('df.orc') # doctest: +SKIP
>>> pd.read_orc('df.orc') # doctest: +SKIP
col1 col2
- 0 1 3
- 1 2 4
+ 0 1 4
+ 1 2 3
If you want to get a buffer to the orc content you can write it to io.BytesIO
>>> import io
From 91d15560330c00a0e6253d8ca76bb6f94f230fa4 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 13 Jun 2022 05:12:33 -0400
Subject: [PATCH 44/49] Fix issues according to review
---
pandas/core/frame.py | 15 +++---
pandas/io/orc.py | 44 +++++++++++-------
.../io/data/orc/TestOrcFile.testReadWrite.orc | Bin 1344 -> 0 bytes
3 files changed, 33 insertions(+), 26 deletions(-)
delete mode 100644 pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8d6357cc9ad57..183a45e8dca05 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2908,7 +2908,7 @@ def to_orc(
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
- **kwargs,
+ engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
@@ -2924,8 +2924,7 @@ def to_orc(
(e.g. via builtin open function). If path is None,
a bytes object is returned.
engine : str, default 'pyarrow'
- ORC library to use, or library it self, checked with 'pyarrow' name
- and version >= 7.0.0.
+ ORC library to use. Pyarrow must be >= 7.0.0.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output.
If ``False``, they will not be written to the file.
@@ -2934,8 +2933,8 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- **kwargs
- Additional keyword arguments passed to the engine.
+ engine_kwargs: dict[str, Any], optional
+ Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
-------
@@ -2963,8 +2962,8 @@ def to_orc(
ORC ` and :ref:`install optional dependencies `.
* This function requires `pyarrow `_
library.
- * For supported dtypes please refer to
- `this article `__.
+ * For supported dtypes please refer to `supported ORC features in Arrow
+ `__.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
@@ -2986,7 +2985,7 @@ def to_orc(
"""
from pandas.io.orc import to_orc
- return to_orc(self, path, engine=engine, index=index, **kwargs)
+ return to_orc(self, path, engine=engine, index=index, **engine_kwargs)
@Substitution(
header_type="bool",
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index e679097ec3600..793c0356894fb 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -5,6 +5,7 @@
from types import ModuleType
from typing import (
TYPE_CHECKING,
+ Any,
Literal,
)
@@ -15,6 +16,14 @@
)
from pandas.compat._optional import import_optional_dependency
+from pandas.core.dtypes.common import (
+ is_categorical,
+ is_interval_dtype,
+ is_period_dtype,
+ is_sparse,
+ is_unsigned_integer_dtype,
+)
+
from pandas.io.common import get_handle
if TYPE_CHECKING:
@@ -66,7 +75,7 @@ def to_orc(
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
- **kwargs,
+ engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
@@ -85,10 +94,8 @@ def to_orc(
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function). If path is None,
a bytes object is returned.
- engine : {{'pyarrow'}}, default 'pyarrow'
- ORC library to use, or library it self, checked with 'pyarrow' name
- and version >= 7.0.0. Raises ValueError if it is anything but
- 'pyarrow'.
+ engine : str, default 'pyarrow'
+ ORC library to use. Pyarrow must be >= 7.0.0.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
@@ -97,8 +104,8 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- **kwargs
- Additional keyword arguments passed to the engine.
+ engine_kwargs: dict[str, Any], optional
+ Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
-------
@@ -119,8 +126,8 @@ def to_orc(
:ref:`install optional dependencies `.
* This function requires `pyarrow `_
library.
- * For supported dtypes please refer to
- `this article `__.
+ * For supported dtypes please refer to `supported ORC features in Arrow
+ `__.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
"""
@@ -128,18 +135,17 @@ def to_orc(
index = df.index.names[0] is not None
# If unsupported dtypes are found raise NotImplementedError
+ # In Pyarrow 9.0.0 this check will no longer be needed
for dtype in df.dtypes:
- dtype_str = dtype.__str__().lower()
if (
- "category" in dtype_str
- or "interval" in dtype_str
- or "sparse" in dtype_str
- or "period" in dtype_str
- or "uint" in dtype_str
+ is_categorical(dtype)
+ or is_interval_dtype(dtype)
+ or is_period_dtype(dtype)
+ or is_sparse(dtype)
+ or is_unsigned_integer_dtype(dtype)
):
raise NotImplementedError(
- """The dtype of one or more columns is unsigned integers,
-intervals, periods, sparse or categorical which is not supported yet."""
+ """The dtype of one or more columns is not supported yet."""
)
if engine != "pyarrow":
@@ -154,7 +160,9 @@ def to_orc(
with get_handle(path, "wb", is_text=False) as handles:
assert isinstance(engine, ModuleType) # For mypy
orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index), handles.handle, **kwargs
+ engine.Table.from_pandas(df, preserve_index=index),
+ handles.handle,
+ **engine_kwargs,
)
if was_none:
diff --git a/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc b/pandas/tests/io/data/orc/TestOrcFile.testReadWrite.orc
deleted file mode 100644
index 852360ecad74cb3545fd160f895fca67d68ef276..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 1344
zcma)6&5qJg6uv*b?P*25Q&BHP6C{}oYC<5Im~=B_CQ&y!X5!Ao1TrWjgeC(nW@8$k
zz&FrGaI35CH1jkrTvHFXmkRfFi(;;QU=Mge
zDVnzfQFzyYg#b+cqUwAes6}uc10PZ;)0#?Km{b=@h)D_36ePY?no10J7vUWYZ^0}#
z4=R@w%kBq`?&Kb@sOswu$nX7~f{;$7Akev>w3(UGa%M^&XIL8AE5s`~U=W~JwcZY}
zI^K}pSwCchp~5iF@Y%_;89m9fw-ks(YfKp-_{y`7)HJ_neU@j0r2E($eyC`%X|SB-
z*;Bl4lyN7|4(PYR5WgsJ!IeZ^)kxJ%jZ9TEausQG)Lo52?P+w?Lyew6P-U%sU57m?
zhMGvKks2hD8O;>eIX9XsS8K;;M}gMDXrb`F;rU#nU6Xr8drImYv}O|Y`6;5%zHfZr
z_`VXjaoFeDk!ff3xLRZVh(CMrI`~P0Vry4+b$vxQHgnrTUcQI_v45*27wp|~+ctL$
zxcnia62e!ey0!1D!Rl|I6#DZ`5v~V*1id5~^}FMCBzkAlemim}gXDDDj-25bk7AUJ
z-=q0%lyp&gC&`E2XkIlho}e+Ao({31MjE6=>Hx$tY8kVPTjp3MEORXb%e;cMSgc19
zFGmuujU-+lNxWqw@uee)SMT@szl48EwvQyXBZ)Uh()s-IIqivd^GWj&eBtiuK
SNU3p7TkfYnoF5$@R{jEaEZds^
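With this refactor the extra writer options travel in a single engine_kwargs dict instead of **kwargs. A hypothetical usage sketch; the compression option is assumed here to be accepted by pyarrow.orc.write_table in recent pyarrow versions, so check the installed version's documentation before relying on it:

import pandas as pd

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# engine_kwargs is forwarded verbatim to pyarrow.orc.write_table
df.to_orc("out_snappy.orc", engine_kwargs={"compression": "snappy"})

# omitting it (or passing None) means no extra options are forwarded
df.to_orc("out_default.orc")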
From a28c5a8786697f990612efaae8beb46e00871944 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 13 Jun 2022 05:26:51 -0400
Subject: [PATCH 45/49] Reraise Pyarrow TypeError as NotImplementedError
---
pandas/io/orc.py | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 793c0356894fb..078b9c7a9af84 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -20,7 +20,6 @@
is_categorical,
is_interval_dtype,
is_period_dtype,
- is_sparse,
is_unsigned_integer_dtype,
)
@@ -141,7 +140,6 @@ def to_orc(
is_categorical(dtype)
or is_interval_dtype(dtype)
or is_period_dtype(dtype)
- or is_sparse(dtype)
or is_unsigned_integer_dtype(dtype)
):
raise NotImplementedError(
@@ -159,11 +157,16 @@ def to_orc(
assert path is not None # For mypy
with get_handle(path, "wb", is_text=False) as handles:
assert isinstance(engine, ModuleType) # For mypy
- orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index),
- handles.handle,
- **engine_kwargs,
- )
+ try:
+ orc.write_table(
+ engine.Table.from_pandas(df, preserve_index=index),
+ handles.handle,
+ **engine_kwargs,
+ )
+ except TypeError as e:
+ raise NotImplementedError(
+ """The dtype of one or more columns is not supported yet."""
+ ) from e
if was_none:
assert isinstance(path, io.BytesIO) # For mypy
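Re-raising pyarrow's TypeError as NotImplementedError keeps a single, documented error for unsupported dtypes, while exception chaining preserves the original traceback. The shape of the pattern, as a standalone sketch (write_with_engine is illustrative):

def write_with_engine(write, *args, **kwargs):
    # `write` stands in for pyarrow.orc.write_table in this sketch
    try:
        write(*args, **kwargs)
    except TypeError as err:
        # `from err` keeps the engine's error visible in the chained traceback
        raise NotImplementedError(
            "The dtype of one or more columns is not supported yet."
        ) from err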
From 162e5bb36461b47248b4a8d47555e05994658b29 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 13 Jun 2022 06:02:05 -0400
Subject: [PATCH 46/49] Fix bugs
---
pandas/core/frame.py | 4 ++--
pandas/io/orc.py | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 183a45e8dca05..fd853d6603a2c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2908,7 +2908,7 @@ def to_orc(
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
- engine_kwargs: dict[str, Any] | None = None,
+ engine_kwargs: dict[str, Any] = {},
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
@@ -2933,7 +2933,7 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- engine_kwargs: dict[str, Any], optional
+ engine_kwargs : dict[str, Any], default {}
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 078b9c7a9af84..b80c6635776c9 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -74,7 +74,7 @@ def to_orc(
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
- engine_kwargs: dict[str, Any] | None = None,
+ engine_kwargs: dict[str, Any] = {},
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
@@ -103,7 +103,7 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- engine_kwargs: dict[str, Any], optional
+ engine_kwargs : dict[str, Any], default {}
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
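Using {} as a default argument is the classic mutable-default pitfall: the dict is created once at definition time and shared across calls. Here it is only read, so the risk is latent, and the final patch in the series switches back to a None sentinel anyway. A minimal illustration of why the sentinel form is preferred:

def risky(value, acc={}):   # one dict, created when the function is defined
    acc[value] = True
    return acc


def safe(value, acc=None):  # None sentinel: a fresh dict for every call
    if acc is None:
        acc = {}
    acc[value] = True
    return acc


print(risky("a"))  # {'a': True}
print(risky("b"))  # {'a': True, 'b': True}  <- state leaked from the first call
print(safe("a"))   # {'a': True}
print(safe("b"))   # {'b': True}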
From b23058350fe42a239c5d58cd47fa92069be82c03 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 13 Jun 2022 06:12:57 -0400
Subject: [PATCH 47/49] Fix expected error msg in orc tests
---
pandas/io/orc.py | 4 ++--
pandas/tests/io/test_orc.py | 3 +--
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index b80c6635776c9..36bc72fa4c936 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -143,7 +143,7 @@ def to_orc(
or is_unsigned_integer_dtype(dtype)
):
raise NotImplementedError(
- """The dtype of one or more columns is not supported yet."""
+ "The dtype of one or more columns is not supported yet."
)
if engine != "pyarrow":
@@ -165,7 +165,7 @@ def to_orc(
)
except TypeError as e:
raise NotImplementedError(
- """The dtype of one or more columns is not supported yet."""
+ "The dtype of one or more columns is not supported yet."
) from e
if was_none:
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index d5a0d4cc4fff0..0bb320907b813 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -302,7 +302,6 @@ def test_orc_roundtrip_bytesio():
def test_orc_writer_dtypes_not_supported(df_not_supported):
# GH44554
# PyArrow gained ORC write support with the current argument order
- msg = """The dtype of one or more columns is unsigned integers,
-intervals, periods, sparse or categorical which is not supported yet."""
+ msg = "The dtype of one or more columns is not supported yet."
with pytest.raises(NotImplementedError, match=msg):
df_not_supported.to_orc()
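pytest.raises applies match= as a regular expression via re.search, which is why the earlier triple-quoted, multi-line message was fragile and the single-line string is easier to match. A short sketch (boom is illustrative):

import re

import pytest


def boom():
    raise NotImplementedError("The dtype of one or more columns is not supported yet.")


def test_boom_message():
    # match is a regex; re.escape keeps the '.' from acting as a wildcard
    msg = re.escape("The dtype of one or more columns is not supported yet.")
    with pytest.raises(NotImplementedError, match=msg):
        boom()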
From e16edabf733255f23991017be58e98db036f4204 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 13 Jun 2022 06:22:13 -0400
Subject: [PATCH 48/49] Avoid deprecated functions
---
pandas/io/orc.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 36bc72fa4c936..5e1d3f7c86b23 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -17,7 +17,7 @@
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.common import (
- is_categorical,
+ is_categorical_dtype,
is_interval_dtype,
is_period_dtype,
is_unsigned_integer_dtype,
@@ -137,7 +137,7 @@ def to_orc(
# In Pyarrow 9.0.0 this check will no longer be needed
for dtype in df.dtypes:
if (
- is_categorical(dtype)
+ is_categorical_dtype(dtype)
or is_interval_dtype(dtype)
or is_period_dtype(dtype)
or is_unsigned_integer_dtype(dtype)
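is_categorical was deprecated in pandas in favor of is_categorical_dtype, which also fits better here because the guard inspects dtypes rather than arrays. A quick sketch using the public pandas.api.types entry point:

import pandas as pd
from pandas.api.types import is_categorical_dtype

s = pd.Series(["a", "b", "a"], dtype="category")

print(is_categorical_dtype(s.dtype))  # True: accepts a dtype object
print(is_categorical_dtype(s))        # True: also accepts array-likes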
From e4770b8cd6c49cf88c63931adbec40a747a55b84 Mon Sep 17 00:00:00 2001
From: Ian Joiner
Date: Mon, 13 Jun 2022 17:34:47 -0400
Subject: [PATCH 49/49] Replace {} with None in arg
---
pandas/core/frame.py | 8 +++++---
pandas/io/orc.py | 6 ++++--
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index fd853d6603a2c..00cfd0e0f8fd7 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2908,7 +2908,7 @@ def to_orc(
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
- engine_kwargs: dict[str, Any] = {},
+ engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
@@ -2933,7 +2933,7 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- engine_kwargs : dict[str, Any], default {}
+ engine_kwargs : dict[str, Any] or None, default None
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
@@ -2985,7 +2985,9 @@ def to_orc(
"""
from pandas.io.orc import to_orc
- return to_orc(self, path, engine=engine, index=index, **engine_kwargs)
+ return to_orc(
+ self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
+ )
@Substitution(
header_type="bool",
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 5e1d3f7c86b23..40754a56bbe8b 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -74,7 +74,7 @@ def to_orc(
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
- engine_kwargs: dict[str, Any] = {},
+ engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
@@ -103,7 +103,7 @@ def to_orc(
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
- engine_kwargs : dict[str, Any], default {}
+ engine_kwargs : dict[str, Any] or None, default None
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
@@ -132,6 +132,8 @@ def to_orc(
"""
if index is None:
index = df.index.names[0] is not None
+ if engine_kwargs is None:
+ engine_kwargs = {}
# If unsupported dtypes are found raise NotImplementedError
# In Pyarrow 9.0.0 this check will no longer be needed