pandas-dev · jorisvandenbossche · Dec 11, 2019 · Nov 6, 2019 · Nov 6, 2019 · Nov 6, 2019
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -258,7 +258,7 @@ matplotlib                2.2.2              Visualization
 openpyxl                  2.4.8              Reading / writing for xlsx files
 pandas-gbq                0.8.0              Google Big Query access
 psycopg2                                     PostgreSQL engine for sqlalchemy
-pyarrow                   0.12.0             Parquet and feather reading / writing
+pyarrow                   0.12.0             Parquet, ORC (requires 0.13.0), and feather reading / writing
 pymysql                   0.7.11             MySQL engine for sqlalchemy
 pyreadstat                                   SPSS files (.sav) reading
 pytables                  3.4.2              HDF5 reading / writing

diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
@@ -98,6 +98,13 @@ Parquet
 
    read_parquet
 
+ORC
+~~~
+.. autosummary::
+   :toctree: api/
+
+   read_orc
+
 SAS
 ~~~
 .. autosummary::

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -28,6 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
     binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
     binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
+    binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
     binary;`Msgpack <https://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
     binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
     binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
@@ -4858,6 +4859,17 @@ The above example creates a partitioned dataset that may look like:
    except OSError:
        pass
 
+.. _io.orc:
+
+ORC
+---
+
+.. versionadded:: 1.0.0
+
+Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <https://orc.apache.org/>`__ is a binary columnar serialization
+for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the
+ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
+
 .. _io.sql:
 
 SQL queries
@@ -5761,6 +5773,3 @@ Space on disk (in bytes)
     24009288 Oct 10 06:43 test_fixed_compress.hdf
     24458940 Oct 10 06:44 test_table.hdf
     24458940 Oct 10 06:44 test_table_compress.hdf
-
-
-
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -168,6 +168,7 @@
     # misc
     read_clipboard,
     read_parquet,
+    read_orc,
     read_feather,
     read_gbq,
     read_html,

diff --git a/pandas/io/api.py b/pandas/io/api.py
@@ -10,6 +10,7 @@
 from pandas.io.gbq import read_gbq
 from pandas.io.html import read_html
 from pandas.io.json import read_json
+from pandas.io.orc import read_orc
 from pandas.io.packers import read_msgpack, to_msgpack
 from pandas.io.parquet import read_parquet
 from pandas.io.parsers import read_csv, read_fwf, read_table

diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -0,0 +1,57 @@
+""" orc compat """
+
+import distutils
+from typing import TYPE_CHECKING, List, Optional
+
+from pandas._typing import FilePathOrBuffer
+
+from pandas.io.common import get_filepath_or_buffer
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+def read_orc(
+    path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs,
+) -> "DataFrame":
+    """
+    Load an ORC object from the file path, returning a DataFrame.
+
+    .. versionadded:: 1.0.0
+
+    Parameters
+    ----------
+    path : str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        expected. A local file could be:
+        ``file://localhost/path/to/table.orc``.
+
+        If you want to pass in a path object, pandas accepts any
+        ``os.PathLike``.
+
+        By file-like object, we refer to objects with a ``read()`` method,
+        such as a file handle (e.g. via builtin ``open`` function)
+        or ``BytesIO``.
+    columns : list, default None
+        If not None, only these columns will be read from the file.
+    **kwargs
+        Any additional kwargs are passed to pyarrow.
+
+    Returns
+    -------
+    DataFrame
+    """
+
+    # we require a newer version of pyarrow than we support for parquet
+    import pyarrow
+
+    if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0":
+        raise ImportError("pyarrow must be >= 0.13.0 for read_orc")
+
+    import pyarrow.orc
+
+    path, _, _, _ = get_filepath_or_buffer(path)
+    orc_file = pyarrow.orc.ORCFile(path)
+    result = orc_file.read(columns=columns, **kwargs).to_pandas()
+    return result
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
@@ -167,6 +167,7 @@ class TestPDApi(Base):
         "read_table",
         "read_feather",
         "read_parquet",
+        "read_orc",
         "read_spss",
     ]
 

diff --git a/pandas/tests/io/data/orc/TestOrcFile.decimal.orc b/pandas/tests/io/data/orc/TestOrcFile.decimal.orc
diff --git a/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc b/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc
diff --git a/pandas/tests/io/data/orc/TestOrcFile.test1.orc b/pandas/tests/io/data/orc/TestOrcFile.test1.orc
diff --git a/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc b/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc
diff --git a/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc b/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc
diff --git a/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc b/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -0,0 +1,227 @@
+""" test orc compat """
+import datetime
+import os
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import read_orc
+import pandas.util.testing as tm
+
+pytest.importorskip("pyarrow", minversion="0.13.0")
+pytest.importorskip("pyarrow.orc")
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:RangeIndex.* is deprecated:DeprecationWarning"
+)
+
+
+@pytest.fixture
+def dirpath(datapath):
+    return datapath("io", "data", "orc")
+
+
+def test_orc_reader_empty(dirpath):
+    columns = [
+        "boolean1",
+        "byte1",
+        "short1",
+        "int1",
+        "long1",
+        "float1",
+        "double1",
+        "bytes1",
+        "string1",
+    ]
+    dtypes = [
+        "bool",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "float32",
+        "float64",
+        "object",
+        "object",
+    ]
+    expected = pd.DataFrame(index=pd.RangeIndex(0))
+    for colname, dtype in zip(columns, dtypes):
+        expected[colname] = pd.Series(dtype=dtype)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
+    got = read_orc(inputfile, columns=columns)
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_basic(dirpath):
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
+    got = read_orc(inputfile, columns=data.keys())
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_decimal(dirpath):
+    from decimal import Decimal
+
+    # Only testing the first 10 rows of data
+    data = {
+        "_col0": np.array(
+            [
+                Decimal("-1000.50000"),
+                Decimal("-999.60000"),
+                Decimal("-998.70000"),
+                Decimal("-997.80000"),
+                Decimal("-996.90000"),
+                Decimal("-995.10000"),
+                Decimal("-994.11000"),
+                Decimal("-993.12000"),
+                Decimal("-992.13000"),
+                Decimal("-991.14000"),
+            ],
+            dtype="object",
+        )
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_date_low(dirpath):
+    data = {
+        "time": np.array(
+            [
+                "1900-05-05 12:34:56.100000",
+                "1900-05-05 12:34:56.100100",
+                "1900-05-05 12:34:56.100200",
+                "1900-05-05 12:34:56.100300",
+                "1900-05-05 12:34:56.100400",
+                "1900-05-05 12:34:56.100500",
+                "1900-05-05 12:34:56.100600",
+                "1900-05-05 12:34:56.100700",
+                "1900-05-05 12:34:56.100800",
+                "1900-05-05 12:34:56.100900",
+            ],
+            dtype="datetime64[ns]",
+        ),
+        "date": np.array(
+            [
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_date_high(dirpath):
+    data = {
+        "time": np.array(
+            [
+                "2038-05-05 12:34:56.100000",
+                "2038-05-05 12:34:56.100100",
+                "2038-05-05 12:34:56.100200",
+                "2038-05-05 12:34:56.100300",
+                "2038-05-05 12:34:56.100400",
+                "2038-05-05 12:34:56.100500",
+                "2038-05-05 12:34:56.100600",
+                "2038-05-05 12:34:56.100700",
+                "2038-05-05 12:34:56.100800",
+                "2038-05-05 12:34:56.100900",
+            ],
+            dtype="datetime64[ns]",
+        ),
+        "date": np.array(
+            [
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_snappy_compressed(dirpath):
+    data = {
+        "int1": np.array(
+            [
+                -1160101563,
+                1181413113,
+                2065821249,
+                -267157795,
+                172111193,
+                1752363137,
+                1406072123,
+                1911809390,
+                -1308542224,
+                -467100286,
+            ],
+            dtype="int32",
+        ),
+        "string1": np.array(
+            [
+                "f50dcb8",
+                "382fdaaa",
+                "90758c6",
+                "9e8caf3f",
+                "ee97332b",
+                "d634da1",
+                "2bea4396",
+                "d67d89e8",
+                "ad71007e",
+                "e8c82066",
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)