Skip to content

Commit 99bbf44

Browse files
committed
Merge branch 'orc-reader' of https://github.com/kkraus14/pandas into kkraus14-orc-reader
2 parents 797732a + 074c2a0 commit 99bbf44

11 files changed

+411
-0
lines changed

pandas/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@
165165
# misc
166166
read_clipboard,
167167
read_parquet,
168+
read_orc,
168169
read_feather,
169170
read_gbq,
170171
read_html,

pandas/core/config_init.py

+16
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,22 @@ def use_inf_as_na_cb(key):
568568
validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
569569
)
570570

571+
572+
# Set up the io.orc specific configuration.
# Only an ORC reader is provided, so the option text describes a reader
# engine rather than a reader/writer engine.
orc_engine_doc = """
: string
    The default orc reader engine. Available options:
    'auto', 'pyarrow', the default is 'auto'
"""

# Registers the option as "io.orc.engine"; the validator rejects any value
# other than the two listed engine names.
with cf.config_prefix("io.orc"):
    cf.register_option(
        "engine",
        "auto",
        orc_engine_doc,
        validator=is_one_of_factory(["auto", "pyarrow"]),
    )
586+
571587
# --------
572588
# Plotting
573589
# ---------

pandas/io/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.io.gbq import read_gbq
1111
from pandas.io.html import read_html
1212
from pandas.io.json import read_json
13+
from pandas.io.orc import read_orc
1314
from pandas.io.packers import read_msgpack, to_msgpack
1415
from pandas.io.parquet import read_parquet
1516
from pandas.io.parsers import read_csv, read_fwf, read_table

pandas/io/orc.py

+122
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
""" orc compat """
2+
3+
from pandas.compat._optional import import_optional_dependency
4+
from pandas.errors import AbstractMethodError
5+
6+
from pandas import DataFrame, get_option
7+
8+
from pandas.io.common import get_filepath_or_buffer
9+
10+
11+
def get_engine(engine):
    """
    Return the ORC engine implementation selected by ``engine``.

    Parameters
    ----------
    engine : str
        Either ``'auto'`` or ``'pyarrow'``. ``'auto'`` is first replaced by
        the value of the ``io.orc.engine`` option; if that is also
        ``'auto'``, pyarrow is tried.

    Returns
    -------
    PyArrowImpl

    Raises
    ------
    ImportError
        If ``'auto'`` was requested and the pyarrow engine could not be
        constructed. The underlying import failure is chained as the cause.
    ValueError
        If ``engine`` is neither ``'auto'`` nor ``'pyarrow'``.
    """
    if engine == "auto":
        engine = get_option("io.orc.engine")

    if engine == "auto":
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError as err:
            # Chain the original failure so the user can see why pyarrow
            # was unusable instead of only the generic summary below.
            raise ImportError(
                "Unable to find a usable engine; "
                "tried using: 'pyarrow'.\n"
                "pyarrow is required for orc "
                "support"
            ) from err

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    return PyArrowImpl()
36+
37+
38+
class BaseImpl:
39+
40+
api = None # module
41+
42+
@staticmethod
43+
def validate_dataframe(df):
44+
45+
if not isinstance(df, DataFrame):
46+
raise ValueError("to_orc only supports IO with DataFrames")
47+
48+
# must have value column names (strings only)
49+
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
50+
raise ValueError("ORC must have string column names")
51+
52+
# index level names must be strings
53+
valid_names = all(
54+
isinstance(name, str) for name in df.index.names if name is not None
55+
)
56+
if not valid_names:
57+
raise ValueError("Index level names must be strings")
58+
59+
def write(self, df, path, compression, **kwargs):
60+
raise AbstractMethodError(self)
61+
62+
def read(self, path, columns=None, **kwargs):
63+
raise AbstractMethodError(self)
64+
65+
66+
class PyArrowImpl(BaseImpl):
    """ORC engine that reads files through ``pyarrow.orc``."""

    def __init__(self):
        # Raises ImportError, with the extra hint appended, when pyarrow
        # cannot be imported.
        pyarrow = import_optional_dependency(
            "pyarrow", extra="pyarrow is required for orc support."
        )
        # Import the orc submodule explicitly so that ``self.api.orc``
        # resolves in ``read``.
        import pyarrow.orc

        self.api = pyarrow

    def read(self, path, columns=None, **kwargs):
        """
        Read an ORC file into a DataFrame.

        Parameters
        ----------
        path : str, path object or file-like object
            Location of the ORC data; resolved via ``get_filepath_or_buffer``.
        columns : list, default None
            If not None, only these columns are read.
        **kwargs
            Forwarded to ``pyarrow.orc.ORCFile.read``.

        Returns
        -------
        DataFrame
        """
        source, _, _, should_close = get_filepath_or_buffer(path)

        stream = None
        try:
            stream = self.api.input_stream(source)
            orc_file = self.api.orc.ORCFile(stream)
            return orc_file.read(columns=columns, **kwargs).to_pandas()
        finally:
            # A stream opened from a string path belongs to this call and
            # is closed here. A stream wrapping a handle is left alone so
            # that a caller-supplied file object is not closed behind the
            # caller's back.
            if stream is not None and isinstance(source, str):
                stream.close()
            # Buffers that get_filepath_or_buffer opened on our behalf are
            # flagged with should_close and must be released here.
            if should_close:
                source.close()
84+
85+
86+
def read_orc(path, engine="auto", columns=None, **kwargs):
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a binary ``read()``
        method, such as a file handle (e.g. via builtin ``open`` function
        in ``"rb"`` mode) or ``BytesIO``.
    engine : {'auto', 'pyarrow'}, default 'auto'
        ORC library to use. If 'auto', then the option ``io.orc.engine`` is
        used. The default ``io.orc.engine`` behavior is to try 'pyarrow'.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    DataFrame

    Raises
    ------
    ImportError
        If the selected engine cannot be imported.
    ValueError
        If ``engine`` is not one of the supported values.
    """
    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs)
Binary file not shown.
523 Bytes
Binary file not shown.
1.67 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)