Skip to content

Commit 9b202d3

Browse files
kkraus14jorisvandenbossche
authored andcommitted
ENH: Add ORC reader (#29447)
1 parent b22a0f7 commit 9b202d3

14 files changed

+307
-4
lines changed

doc/source/getting_started/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ matplotlib 2.2.2 Visualization
258258
openpyxl 2.4.8 Reading / writing for xlsx files
259259
pandas-gbq 0.8.0 Google Big Query access
260260
psycopg2 PostgreSQL engine for sqlalchemy
261-
pyarrow 0.12.0 Parquet and feather reading / writing
261+
pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing
262262
pymysql 0.7.11 MySQL engine for sqlalchemy
263263
pyreadstat SPSS files (.sav) reading
264264
pytables 3.4.2 HDF5 reading / writing

doc/source/reference/io.rst

+7
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,13 @@ Parquet
9898

9999
read_parquet
100100

101+
ORC
102+
~~~
103+
.. autosummary::
104+
:toctree: api/
105+
106+
read_orc
107+
101108
SAS
102109
~~~
103110
.. autosummary::

doc/source/user_guide/io.rst

+12-3
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
2828
binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
2929
binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
3030
binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
31+
binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
3132
binary;`Msgpack <https://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
3233
binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
3334
binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
@@ -4858,6 +4859,17 @@ The above example creates a partitioned dataset that may look like:
48584859
except OSError:
48594860
pass
48604861
4862+
.. _io.orc:
4863+
4864+
ORC
4865+
---
4866+
4867+
.. versionadded:: 1.0.0
4868+
4869+
Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <https://orc.apache.org/>`__ is a binary columnar serialization
4870+
for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the
4871+
ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
4872+
48614873
.. _io.sql:
48624874

48634875
SQL queries
@@ -5761,6 +5773,3 @@ Space on disk (in bytes)
57615773
24009288 Oct 10 06:43 test_fixed_compress.hdf
57625774
24458940 Oct 10 06:44 test_table.hdf
57635775
24458940 Oct 10 06:44 test_table_compress.hdf
5764-
5765-
5766-

pandas/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@
168168
# misc
169169
read_clipboard,
170170
read_parquet,
171+
read_orc,
171172
read_feather,
172173
read_gbq,
173174
read_html,

pandas/io/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.io.gbq import read_gbq
1111
from pandas.io.html import read_html
1212
from pandas.io.json import read_json
13+
from pandas.io.orc import read_orc
1314
from pandas.io.packers import read_msgpack, to_msgpack
1415
from pandas.io.parquet import read_parquet
1516
from pandas.io.parsers import read_csv, read_fwf, read_table

pandas/io/orc.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
""" orc compat """
2+
3+
import distutils
4+
from typing import TYPE_CHECKING, List, Optional
5+
6+
from pandas._typing import FilePathOrBuffer
7+
8+
from pandas.io.common import get_filepath_or_buffer
9+
10+
if TYPE_CHECKING:
11+
from pandas import DataFrame
12+
13+
14+
def read_orc(
15+
path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs,
16+
) -> "DataFrame":
17+
"""
18+
Load an ORC object from the file path, returning a DataFrame.
19+
20+
.. versionadded:: 1.0.0
21+
22+
Parameters
23+
----------
24+
path : str, path object or file-like object
25+
Any valid string path is acceptable. The string could be a URL. Valid
26+
URL schemes include http, ftp, s3, and file. For file URLs, a host is
27+
expected. A local file could be:
28+
``file://localhost/path/to/table.orc``.
29+
30+
If you want to pass in a path object, pandas accepts any
31+
``os.PathLike``.
32+
33+
By file-like object, we refer to objects with a ``read()`` method,
34+
such as a file handler (e.g. via builtin ``open`` function)
35+
or ``BytesIO``.
36+
columns : list, default None
37+
If not None, only these columns will be read from the file.
38+
**kwargs
39+
Any additional kwargs are passed to pyarrow.
40+
41+
Returns
42+
-------
43+
DataFrame
44+
"""
45+
46+
# we require a newer version of pyarrow than we support for parquet
47+
import pyarrow
48+
49+
if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0":
50+
raise ImportError("pyarrow must be >= 0.13.0 for read_orc")
51+
52+
import pyarrow.orc
53+
54+
path, _, _, _ = get_filepath_or_buffer(path)
55+
orc_file = pyarrow.orc.ORCFile(path)
56+
result = orc_file.read(columns=columns, **kwargs).to_pandas()
57+
return result

pandas/tests/api/test_api.py

+1
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ class TestPDApi(Base):
167167
"read_table",
168168
"read_feather",
169169
"read_parquet",
170+
"read_orc",
170171
"read_spss",
171172
]
172173

Binary file not shown.
523 Bytes
Binary file not shown.
1.67 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

pandas/tests/io/test_orc.py

+227
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
""" test orc compat """
2+
import datetime
3+
import os
4+
5+
import numpy as np
6+
import pytest
7+
8+
import pandas as pd
9+
from pandas import read_orc
10+
import pandas.util.testing as tm
11+
12+
pytest.importorskip("pyarrow", minversion="0.13.0")
13+
pytest.importorskip("pyarrow.orc")
14+
15+
pytestmark = pytest.mark.filterwarnings(
16+
"ignore:RangeIndex.* is deprecated:DeprecationWarning"
17+
)
18+
19+
20+
@pytest.fixture
21+
def dirpath(datapath):
22+
return datapath("io", "data", "orc")
23+
24+
25+
def test_orc_reader_empty(dirpath):
26+
columns = [
27+
"boolean1",
28+
"byte1",
29+
"short1",
30+
"int1",
31+
"long1",
32+
"float1",
33+
"double1",
34+
"bytes1",
35+
"string1",
36+
]
37+
dtypes = [
38+
"bool",
39+
"int8",
40+
"int16",
41+
"int32",
42+
"int64",
43+
"float32",
44+
"float64",
45+
"object",
46+
"object",
47+
]
48+
expected = pd.DataFrame(index=pd.RangeIndex(0))
49+
for colname, dtype in zip(columns, dtypes):
50+
expected[colname] = pd.Series(dtype=dtype)
51+
52+
inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
53+
got = read_orc(inputfile, columns=columns)
54+
55+
tm.assert_equal(expected, got)
56+
57+
58+
def test_orc_reader_basic(dirpath):
59+
data = {
60+
"boolean1": np.array([False, True], dtype="bool"),
61+
"byte1": np.array([1, 100], dtype="int8"),
62+
"short1": np.array([1024, 2048], dtype="int16"),
63+
"int1": np.array([65536, 65536], dtype="int32"),
64+
"long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
65+
"float1": np.array([1.0, 2.0], dtype="float32"),
66+
"double1": np.array([-15.0, -5.0], dtype="float64"),
67+
"bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
68+
"string1": np.array(["hi", "bye"], dtype="object"),
69+
}
70+
expected = pd.DataFrame.from_dict(data)
71+
72+
inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
73+
got = read_orc(inputfile, columns=data.keys())
74+
75+
tm.assert_equal(expected, got)
76+
77+
78+
def test_orc_reader_decimal(dirpath):
79+
from decimal import Decimal
80+
81+
# Only testing the first 10 rows of data
82+
data = {
83+
"_col0": np.array(
84+
[
85+
Decimal("-1000.50000"),
86+
Decimal("-999.60000"),
87+
Decimal("-998.70000"),
88+
Decimal("-997.80000"),
89+
Decimal("-996.90000"),
90+
Decimal("-995.10000"),
91+
Decimal("-994.11000"),
92+
Decimal("-993.12000"),
93+
Decimal("-992.13000"),
94+
Decimal("-991.14000"),
95+
],
96+
dtype="object",
97+
)
98+
}
99+
expected = pd.DataFrame.from_dict(data)
100+
101+
inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
102+
got = read_orc(inputfile).iloc[:10]
103+
104+
tm.assert_equal(expected, got)
105+
106+
107+
def test_orc_reader_date_low(dirpath):
108+
data = {
109+
"time": np.array(
110+
[
111+
"1900-05-05 12:34:56.100000",
112+
"1900-05-05 12:34:56.100100",
113+
"1900-05-05 12:34:56.100200",
114+
"1900-05-05 12:34:56.100300",
115+
"1900-05-05 12:34:56.100400",
116+
"1900-05-05 12:34:56.100500",
117+
"1900-05-05 12:34:56.100600",
118+
"1900-05-05 12:34:56.100700",
119+
"1900-05-05 12:34:56.100800",
120+
"1900-05-05 12:34:56.100900",
121+
],
122+
dtype="datetime64[ns]",
123+
),
124+
"date": np.array(
125+
[
126+
datetime.date(1900, 12, 25),
127+
datetime.date(1900, 12, 25),
128+
datetime.date(1900, 12, 25),
129+
datetime.date(1900, 12, 25),
130+
datetime.date(1900, 12, 25),
131+
datetime.date(1900, 12, 25),
132+
datetime.date(1900, 12, 25),
133+
datetime.date(1900, 12, 25),
134+
datetime.date(1900, 12, 25),
135+
datetime.date(1900, 12, 25),
136+
],
137+
dtype="object",
138+
),
139+
}
140+
expected = pd.DataFrame.from_dict(data)
141+
142+
inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
143+
got = read_orc(inputfile).iloc[:10]
144+
145+
tm.assert_equal(expected, got)
146+
147+
148+
def test_orc_reader_date_high(dirpath):
149+
data = {
150+
"time": np.array(
151+
[
152+
"2038-05-05 12:34:56.100000",
153+
"2038-05-05 12:34:56.100100",
154+
"2038-05-05 12:34:56.100200",
155+
"2038-05-05 12:34:56.100300",
156+
"2038-05-05 12:34:56.100400",
157+
"2038-05-05 12:34:56.100500",
158+
"2038-05-05 12:34:56.100600",
159+
"2038-05-05 12:34:56.100700",
160+
"2038-05-05 12:34:56.100800",
161+
"2038-05-05 12:34:56.100900",
162+
],
163+
dtype="datetime64[ns]",
164+
),
165+
"date": np.array(
166+
[
167+
datetime.date(2038, 12, 25),
168+
datetime.date(2038, 12, 25),
169+
datetime.date(2038, 12, 25),
170+
datetime.date(2038, 12, 25),
171+
datetime.date(2038, 12, 25),
172+
datetime.date(2038, 12, 25),
173+
datetime.date(2038, 12, 25),
174+
datetime.date(2038, 12, 25),
175+
datetime.date(2038, 12, 25),
176+
datetime.date(2038, 12, 25),
177+
],
178+
dtype="object",
179+
),
180+
}
181+
expected = pd.DataFrame.from_dict(data)
182+
183+
inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
184+
got = read_orc(inputfile).iloc[:10]
185+
186+
tm.assert_equal(expected, got)
187+
188+
189+
def test_orc_reader_snappy_compressed(dirpath):
190+
data = {
191+
"int1": np.array(
192+
[
193+
-1160101563,
194+
1181413113,
195+
2065821249,
196+
-267157795,
197+
172111193,
198+
1752363137,
199+
1406072123,
200+
1911809390,
201+
-1308542224,
202+
-467100286,
203+
],
204+
dtype="int32",
205+
),
206+
"string1": np.array(
207+
[
208+
"f50dcb8",
209+
"382fdaaa",
210+
"90758c6",
211+
"9e8caf3f",
212+
"ee97332b",
213+
"d634da1",
214+
"2bea4396",
215+
"d67d89e8",
216+
"ad71007e",
217+
"e8c82066",
218+
],
219+
dtype="object",
220+
),
221+
}
222+
expected = pd.DataFrame.from_dict(data)
223+
224+
inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
225+
got = read_orc(inputfile).iloc[:10]
226+
227+
tm.assert_equal(expected, got)

0 commit comments

Comments
 (0)