|
| 1 | +""" orc compat """ |
| 2 | + |
| 3 | +from pandas.compat._optional import import_optional_dependency |
| 4 | +from pandas.errors import AbstractMethodError |
| 5 | + |
| 6 | +from pandas import DataFrame, get_option |
| 7 | + |
| 8 | +from pandas.io.common import get_filepath_or_buffer |
| 9 | + |
| 10 | + |
def get_engine(engine):
    """
    Return the ORC engine implementation selected by ``engine``.

    Parameters
    ----------
    engine : str
        ``"pyarrow"`` or ``"auto"``. ``"auto"`` is first replaced by the
        value of the ``io.orc.engine`` option; if that is still ``"auto"``,
        the known engines (currently only pyarrow) are tried in order.

    Returns
    -------
    PyArrowImpl
        A freshly constructed engine instance.

    Raises
    ------
    ImportError
        If ``engine`` resolves to ``"auto"`` and no engine can be set up.
        The error raised by the failed engine is chained as ``__cause__``
        so the underlying reason stays visible in the traceback.
    ValueError
        If ``engine`` resolves to anything other than ``"pyarrow"``.
    """
    if engine == "auto":
        engine = get_option("io.orc.engine")

    if engine == "auto":
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError as err:
            # Chain the original failure instead of discarding it: it is
            # the only place that says *why* pyarrow could not be used.
            raise ImportError(
                "Unable to find a usable engine; "
                "tried using: 'pyarrow'.\n"
                "pyarrow is required for orc "
                "support"
            ) from err

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    return PyArrowImpl()
| 36 | + |
| 37 | + |
class BaseImpl:
    """
    Base class for ORC engine implementations.

    Provides the DataFrame validation helper and declares ``write`` and
    ``read``, both of which raise ``AbstractMethodError`` unless overridden.
    """

    # Engine module used by the implementation; None on the base class.
    api = None

    @staticmethod
    def validate_dataframe(df):
        """
        Check that ``df`` is a DataFrame with string column and index names.

        Raises
        ------
        ValueError
            If ``df`` is not a DataFrame, if the inferred type of its
            columns is not one of "string", "unicode" or "empty", or if
            any index level name that is not None is not a ``str``.
        """
        if not isinstance(df, DataFrame):
            raise ValueError("to_orc only supports IO with DataFrames")

        # column labels must be strings (an empty column index is accepted)
        column_kind = df.columns.inferred_type
        if column_kind not in {"string", "unicode", "empty"}:
            raise ValueError("ORC must have string column names")

        # unnamed (None) index levels are skipped; named ones must be str
        for level_name in df.index.names:
            if level_name is not None and not isinstance(level_name, str):
                raise ValueError("Index level names must be strings")

    def write(self, df, path, compression, **kwargs):
        """Engine hook for writing; not implemented on the base class."""
        raise AbstractMethodError(self)

    def read(self, path, columns=None, **kwargs):
        """Engine hook for reading; not implemented on the base class."""
        raise AbstractMethodError(self)
| 64 | + |
| 65 | + |
class PyArrowImpl(BaseImpl):
    """ORC engine backed by ``pyarrow`` and its ``pyarrow.orc`` submodule."""

    def __init__(self):
        # An ImportError raised here is what get_engine's "auto" branch
        # catches when pyarrow cannot be used.
        pyarrow = import_optional_dependency(
            "pyarrow", extra="pyarrow is required for orc support."
        )
        # Import the submodule explicitly so that ``self.api.orc`` resolves
        # when ``read`` is called.
        import pyarrow.orc  # noqa: F401

        self.api = pyarrow

    def read(self, path, columns=None, **kwargs):
        """
        Read an ORC file into a DataFrame.

        Parameters
        ----------
        path : str, path object or file-like object
            Resolved through ``get_filepath_or_buffer`` before being handed
            to ``pyarrow.input_stream``.
        columns : list, default None
            Forwarded to ``ORCFile.read``.
        **kwargs
            Forwarded to ``ORCFile.read``.

        Returns
        -------
        DataFrame
            The result of calling ``to_pandas()`` on the table that was read.
        """
        # The fourth value reports whether get_filepath_or_buffer opened a
        # handle that this method is now responsible for closing.
        path, _, _, should_close = get_filepath_or_buffer(path)

        try:
            py_file = self.api.input_stream(path)
            orc_file = self.api.orc.ORCFile(py_file)
            return orc_file.read(columns=columns, **kwargs).to_pandas()
        finally:
            # Release the handle even when reading fails part-way; buffers
            # supplied by the caller are left open.
            if should_close:
                path.close()
| 84 | + |
| 85 | + |
def read_orc(path, engine="auto", columns=None, **kwargs):
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs,
        a host is expected. A local file could be:
        ``file://localhost/path/to/table.orc``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function) or an
        in-memory buffer.
    engine : {'auto', 'pyarrow'}, default 'auto'
        ORC library to use. If 'auto', then the option ``io.orc.engine``
        is used. The default ``io.orc.engine`` behavior is to try
        'pyarrow'.
    columns : list, default=None
        If not None, only these columns will be read from the file.

        .. versionadded:: 0.21.1
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    DataFrame
    """
    # NOTE(review): the versionadded markers above look carried over from
    # read_parquet -- confirm the release that actually ships read_orc.
    return get_engine(engine).read(path, columns=columns, **kwargs)
0 commit comments