Skip to content

ENH: Add ORC reader #29447

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Dec 11, 2019
2 changes: 1 addition & 1 deletion doc/source/getting_started/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ matplotlib 2.2.2 Visualization
openpyxl 2.4.8 Reading / writing for xlsx files
pandas-gbq 0.8.0 Google Big Query access
psycopg2 PostgreSQL engine for sqlalchemy
pyarrow 0.12.0 Parquet and feather reading / writing
pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing
pymysql 0.7.11 MySQL engine for sqlalchemy
pyreadstat SPSS files (.sav) reading
pytables 3.4.2 HDF5 reading / writing
Expand Down
7 changes: 7 additions & 0 deletions doc/source/reference/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ Parquet

read_parquet

ORC
~~~
.. autosummary::
:toctree: api/

read_orc

SAS
~~~
.. autosummary::
Expand Down
15 changes: 12 additions & 3 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
binary;`Msgpack <https://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
Expand Down Expand Up @@ -4858,6 +4859,17 @@ The above example creates a partitioned dataset that may look like:
except OSError:
pass

.. _io.orc:

ORC
---

.. versionadded:: 1.0.0

Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <https://orc.apache.org/>`__ is a binary columnar serialization
for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the
ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.

.. _io.sql:

SQL queries
Expand Down Expand Up @@ -5761,6 +5773,3 @@ Space on disk (in bytes)
24009288 Oct 10 06:43 test_fixed_compress.hdf
24458940 Oct 10 06:44 test_table.hdf
24458940 Oct 10 06:44 test_table_compress.hdf



1 change: 1 addition & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@
# misc
read_clipboard,
read_parquet,
read_orc,
read_feather,
read_gbq,
read_html,
Expand Down
1 change: 1 addition & 0 deletions pandas/io/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.orc import read_orc
from pandas.io.packers import read_msgpack, to_msgpack
from pandas.io.parquet import read_parquet
from pandas.io.parsers import read_csv, read_fwf, read_table
Expand Down
57 changes: 57 additions & 0 deletions pandas/io/orc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
""" orc compat """

import distutils
from typing import TYPE_CHECKING, List, Optional

from pandas._typing import FilePathOrBuffer

from pandas.io.common import get_filepath_or_buffer

if TYPE_CHECKING:
from pandas import DataFrame


def read_orc(
    path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs,
) -> "DataFrame":
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``BytesIO``.
    columns : list, default None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Raises
    ------
    ImportError
        If pyarrow cannot be imported, or if the installed pyarrow is older
        than 0.13.0.
    """
    # Import the class from its submodule explicitly: a bare
    # ``import distutils`` does not guarantee that ``distutils.version`` has
    # been loaded, so ``distutils.version.LooseVersion`` may raise
    # AttributeError depending on what else has been imported.
    from distutils.version import LooseVersion

    # we require a newer version of pyarrow than we support for parquet
    import pyarrow

    if LooseVersion(pyarrow.__version__) < "0.13.0":
        raise ImportError("pyarrow must be >= 0.13.0 for read_orc")

    # Imported only after the version check so that an outdated pyarrow
    # produces the explicit message above.
    import pyarrow.orc

    # Only the first element of the returned 4-tuple (the resolved path or
    # buffer) is needed here.
    path, _, _, _ = get_filepath_or_buffer(path)
    orc_file = pyarrow.orc.ORCFile(path)
    result = orc_file.read(columns=columns, **kwargs).to_pandas()
    return result
1 change: 1 addition & 0 deletions pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ class TestPDApi(Base):
"read_table",
"read_feather",
"read_parquet",
"read_orc",
"read_spss",
]

Expand Down
Binary file not shown.
Binary file added pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc
Binary file not shown.
Binary file added pandas/tests/io/data/orc/TestOrcFile.test1.orc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
227 changes: 227 additions & 0 deletions pandas/tests/io/test_orc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
""" test orc compat """
import datetime
import os

import numpy as np
import pytest

import pandas as pd
from pandas import read_orc
import pandas.util.testing as tm

# Skip every test in this module unless pyarrow >= 0.13.0 can be imported.
pytest.importorskip("pyarrow", minversion="0.13.0")
# Also skip when the pyarrow.orc submodule itself cannot be imported.
pytest.importorskip("pyarrow.orc")

# Module-wide marker: ignore DeprecationWarnings whose message matches
# "RangeIndex.* is deprecated" for all tests below.
pytestmark = pytest.mark.filterwarnings(
"ignore:RangeIndex.* is deprecated:DeprecationWarning"
)


@pytest.fixture
def dirpath(datapath):
    """Return the ``io/data/orc`` location resolved through ``datapath``."""
    orc_data_dir = datapath("io", "data", "orc")
    return orc_data_dir


def test_orc_reader_empty(dirpath):
    """Read ``TestOrcFile.emptyFile.orc`` and compare with an empty typed frame."""
    # (column name, expected dtype) pairs, in the order they are requested.
    schema = [
        ("boolean1", "bool"),
        ("byte1", "int8"),
        ("short1", "int16"),
        ("int1", "int32"),
        ("long1", "int64"),
        ("float1", "float32"),
        ("double1", "float64"),
        ("bytes1", "object"),
        ("string1", "object"),
    ]
    columns = [name for name, _ in schema]

    # Build the expected frame one empty, typed column at a time.
    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for name, dtype in schema:
        expected[name] = pd.Series(dtype=dtype)

    orc_path = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(orc_path, columns=columns)

    tm.assert_equal(expected, got)


def test_orc_reader_basic(dirpath):
    """Read selected columns of ``TestOrcFile.test1.orc`` and check the values."""
    # (column name, expected values, expected dtype) for each requested column.
    column_spec = [
        ("boolean1", [False, True], "bool"),
        ("byte1", [1, 100], "int8"),
        ("short1", [1024, 2048], "int16"),
        ("int1", [65536, 65536], "int32"),
        ("long1", [9223372036854775807, 9223372036854775807], "int64"),
        ("float1", [1.0, 2.0], "float32"),
        ("double1", [-15.0, -5.0], "float64"),
        ("bytes1", [b"\x00\x01\x02\x03\x04", b""], "object"),
        ("string1", ["hi", "bye"], "object"),
    ]
    data = {
        name: np.array(values, dtype=dtype) for name, values, dtype in column_spec
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.test1.orc")
    got = read_orc(orc_path, columns=data.keys())

    tm.assert_equal(expected, got)


def test_orc_reader_decimal(dirpath):
    """Compare the first 10 rows of ``TestOrcFile.decimal.orc`` with known values."""
    from decimal import Decimal

    # Only testing the first 10 rows of data
    first_ten = [
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    ]
    decimals = [Decimal(text) for text in first_ten]
    expected = pd.DataFrame.from_dict({"_col0": np.array(decimals, dtype="object")})

    orc_path = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    got = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, got)


def test_orc_reader_date_low(dirpath):
    """Compare the first 10 rows of ``TestOrcFile.testDate1900.orc`` with known values."""
    n_rows = 10

    # Timestamps step by 100 microseconds: ...56.100000, ...56.100100, ...
    timestamps = [f"1900-05-05 12:34:56.100{step}00" for step in range(n_rows)]
    # Every one of the checked rows carries the same calendar date.
    dates = [datetime.date(1900, 12, 25)] * n_rows

    data = {
        "time": np.array(timestamps, dtype="datetime64[ns]"),
        "date": np.array(dates, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
    got = read_orc(orc_path).iloc[:n_rows]

    tm.assert_equal(expected, got)


def test_orc_reader_date_high(dirpath):
    """Compare the first 10 rows of ``TestOrcFile.testDate2038.orc`` with known values."""
    n_rows = 10

    # Timestamps step by 100 microseconds: ...56.100000, ...56.100100, ...
    timestamps = [f"2038-05-05 12:34:56.100{step}00" for step in range(n_rows)]
    # Every one of the checked rows carries the same calendar date.
    dates = [datetime.date(2038, 12, 25)] * n_rows

    data = {
        "time": np.array(timestamps, dtype="datetime64[ns]"),
        "date": np.array(dates, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(orc_path).iloc[:n_rows]

    tm.assert_equal(expected, got)


def test_orc_reader_snappy_compressed(dirpath):
    """Compare the first 10 rows of ``TestOrcFile.testSnappy.orc`` with known values."""
    int_values = [
        -1160101563,
        1181413113,
        2065821249,
        -267157795,
        172111193,
        1752363137,
        1406072123,
        1911809390,
        -1308542224,
        -467100286,
    ]
    string_values = [
        "f50dcb8",
        "382fdaaa",
        "90758c6",
        "9e8caf3f",
        "ee97332b",
        "d634da1",
        "2bea4396",
        "d67d89e8",
        "ad71007e",
        "e8c82066",
    ]
    expected = pd.DataFrame.from_dict(
        {
            "int1": np.array(int_values, dtype="int32"),
            "string1": np.array(string_values, dtype="object"),
        }
    )

    orc_path = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, got)