|
1 | 1 | """ orc compat """
|
2 | 2 | from __future__ import annotations
|
3 | 3 |
|
4 |
| -from typing import TYPE_CHECKING |
| 4 | +import io |
| 5 | +from types import ModuleType |
| 6 | +from typing import ( |
| 7 | + TYPE_CHECKING, |
| 8 | + Any, |
| 9 | + Literal, |
| 10 | +) |
5 | 11 |
|
6 | 12 | from pandas._typing import (
|
7 | 13 | FilePath,
|
8 | 14 | ReadBuffer,
|
| 15 | + WriteBuffer, |
9 | 16 | )
|
10 | 17 | from pandas.compat._optional import import_optional_dependency
|
11 | 18 |
|
| 19 | +from pandas.core.dtypes.common import ( |
| 20 | + is_categorical_dtype, |
| 21 | + is_interval_dtype, |
| 22 | + is_period_dtype, |
| 23 | + is_unsigned_integer_dtype, |
| 24 | +) |
| 25 | + |
12 | 26 | from pandas.io.common import get_handle
|
13 | 27 |
|
14 | 28 | if TYPE_CHECKING:
|
@@ -52,3 +66,111 @@ def read_orc(
|
52 | 66 | with get_handle(path, "rb", is_text=False) as handles:
|
53 | 67 | orc_file = orc.ORCFile(handles.handle)
|
54 | 68 | return orc_file.read(columns=columns, **kwargs).to_pandas()
|
| 69 | + |
| 70 | + |
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals or periods, or if pyarrow raises a ``TypeError`` while
        converting or writing the data.
    path : str, file-like object or None, default None
        Destination of the ORC data. It is opened for binary writing
        through ``get_handle``. By file-like object, we refer to objects
        with a write() method, such as a file handle (e.g. via builtin
        open function). If path is None, the data is written to an
        in-memory buffer and returned as a bytes object.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, the index is written only when its first level has a
        name, i.e. ``df.index.names[0] is not None``. The resulting boolean
        is forwarded as ``preserve_index`` to ``Table.from_pandas``.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval
        or period, or pyarrow raised a ``TypeError`` during conversion or
        writing.
    ValueError
        engine is not pyarrow. This is checked before anything else, so it
        takes precedence over the dtype check.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # Validate the cheap, purely-argument condition first so that a wrong
    # engine is always reported as the documented ValueError, regardless of
    # what the frame contains.
    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    if any(
        is_categorical_dtype(dtype)
        or is_interval_dtype(dtype)
        or is_period_dtype(dtype)
        or is_unsigned_integer_dtype(dtype)
        for dtype in df.dtypes
    ):
        raise NotImplementedError(
            "The dtype of one or more columns is not supported yet."
        )

    # Keep the imported module under its own name instead of rebinding the
    # ``engine`` parameter, which is typed as a string literal.
    pyarrow = import_optional_dependency(engine, min_version="7.0.0")
    assert isinstance(pyarrow, ModuleType)  # For mypy
    orc = import_optional_dependency("pyarrow.orc")

    # With no destination given, write into an in-memory buffer whose
    # contents are returned to the caller.
    buffer = io.BytesIO() if path is None else None
    target = buffer if buffer is not None else path
    assert target is not None  # For mypy
    with get_handle(target, "wb", is_text=False) as handles:
        try:
            orc.write_table(
                pyarrow.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            # NOTE(review): every TypeError raised by the conversion or the
            # writer call is reported as an unsupported dtype; presumably a
            # TypeError caused by a bad ``engine_kwargs`` entry would be
            # relabelled too -- confirm whether that is intended.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if buffer is not None:
        return buffer.getvalue()
    return None
0 commit comments