diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 9f3ab22496ae7..14530a9010a1c 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -258,7 +258,7 @@ matplotlib 2.2.2 Visualization openpyxl 2.4.8 Reading / writing for xlsx files pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy -pyarrow 0.12.0 Parquet and feather reading / writing +pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 91f4942d03b0d..6d2d405a15850 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -98,6 +98,13 @@ Parquet read_parquet +ORC +~~~ +.. autosummary:: + :toctree: api/ + + read_orc + SAS ~~~ .. autosummary:: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fa47a5944f7bf..972f36aecad24 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -28,6 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` + binary;`ORC Format `__;:ref:`read_orc`; binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; @@ -4858,6 +4859,17 @@ The above example creates a partitioned dataset that may look like: except OSError: pass +.. _io.orc: + +ORC +--- + +.. versionadded:: 1.0.0 + +Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization +for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the +ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. + .. 
_io.sql: SQL queries @@ -5761,6 +5773,3 @@ Space on disk (in bytes) 24009288 Oct 10 06:43 test_fixed_compress.hdf 24458940 Oct 10 06:44 test_table.hdf 24458940 Oct 10 06:44 test_table_compress.hdf - - - diff --git a/pandas/__init__.py b/pandas/__init__.py index a60aa08b89f84..f72a12b58edcb 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -168,6 +168,7 @@ # misc read_clipboard, read_parquet, + read_orc, read_feather, read_gbq, read_html, diff --git a/pandas/io/api.py b/pandas/io/api.py index 725e82604ca7f..e20aa18324a34 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -10,6 +10,7 @@ from pandas.io.gbq import read_gbq from pandas.io.html import read_html from pandas.io.json import read_json +from pandas.io.orc import read_orc from pandas.io.packers import read_msgpack, to_msgpack from pandas.io.parquet import read_parquet from pandas.io.parsers import read_csv, read_fwf, read_table diff --git a/pandas/io/orc.py b/pandas/io/orc.py new file mode 100644 index 0000000000000..bbefe447cb7fe --- /dev/null +++ b/pandas/io/orc.py @@ -0,0 +1,57 @@ +""" orc compat """ + +import distutils +from typing import TYPE_CHECKING, List, Optional + +from pandas._typing import FilePathOrBuffer + +from pandas.io.common import get_filepath_or_buffer + +if TYPE_CHECKING: + from pandas import DataFrame + + +def read_orc( + path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs, +) -> "DataFrame": + """ + Load an ORC object from the file path, returning a DataFrame. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.orc``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + columns : list, default None + If not None, only these columns will be read from the file. + **kwargs + Any additional kwargs are passed to pyarrow. 
+ + Returns + ------- + DataFrame + """ + + # we require a newer version of pyarrow than we support for parquet + import pyarrow + + if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": + raise ImportError("pyarrow must be >= 0.13.0 for read_orc") + + import pyarrow.orc + + path, _, _, _ = get_filepath_or_buffer(path) + orc_file = pyarrow.orc.ORCFile(path) + result = orc_file.read(columns=columns, **kwargs).to_pandas() + return result diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 76141dceae930..870d7fd6e44c1 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -167,6 +167,7 @@ class TestPDApi(Base): "read_table", "read_feather", "read_parquet", + "read_orc", "read_spss", ] diff --git a/pandas/tests/io/data/orc/TestOrcFile.decimal.orc b/pandas/tests/io/data/orc/TestOrcFile.decimal.orc new file mode 100644 index 0000000000000..cb0f7b9d767a3 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.decimal.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc b/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc new file mode 100644 index 0000000000000..ecdadcbff1346 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.test1.orc b/pandas/tests/io/data/orc/TestOrcFile.test1.orc new file mode 100644 index 0000000000000..4fb0beff86897 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.test1.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc b/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc new file mode 100644 index 0000000000000..f51ffdbd03a43 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc b/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc new file mode 100644 index 0000000000000..cd11fa8a4e91d Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc b/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc new file mode 100644 index 0000000000000..aa6cc9c9ba1a7 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc differ diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py new file mode 100644 index 0000000000000..9f3ec274007d0 --- /dev/null +++ b/pandas/tests/io/test_orc.py @@ -0,0 +1,227 @@ +""" test orc compat """ +import datetime +import os + +import numpy as np +import pytest + +import pandas as pd +from pandas import read_orc +import pandas.util.testing as tm + +pytest.importorskip("pyarrow", minversion="0.13.0") +pytest.importorskip("pyarrow.orc") + +pytestmark = pytest.mark.filterwarnings( + "ignore:RangeIndex.* is deprecated:DeprecationWarning" +) + + +@pytest.fixture +def dirpath(datapath): + return datapath("io", "data", "orc") + + +def test_orc_reader_empty(dirpath): + columns = [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + "bytes1", + "string1", + ] + dtypes = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "object", + "object", + ] + expected = pd.DataFrame(index=pd.RangeIndex(0)) + for colname, dtype in zip(columns, dtypes): + expected[colname] = pd.Series(dtype=dtype) + + inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") + got = read_orc(inputfile, columns=columns) + + tm.assert_equal(expected, got) + + +def 
test_orc_reader_basic(dirpath): + data = { + "boolean1": np.array([False, True], dtype="bool"), + "byte1": np.array([1, 100], dtype="int8"), + "short1": np.array([1024, 2048], dtype="int16"), + "int1": np.array([65536, 65536], dtype="int32"), + "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), + "float1": np.array([1.0, 2.0], dtype="float32"), + "double1": np.array([-15.0, -5.0], dtype="float64"), + "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + "string1": np.array(["hi", "bye"], dtype="object"), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc") + got = read_orc(inputfile, columns=data.keys()) + + tm.assert_equal(expected, got) + + +def test_orc_reader_decimal(dirpath): + from decimal import Decimal + + # Only testing the first 10 rows of data + data = { + "_col0": np.array( + [ + Decimal("-1000.50000"), + Decimal("-999.60000"), + Decimal("-998.70000"), + Decimal("-997.80000"), + Decimal("-996.90000"), + Decimal("-995.10000"), + Decimal("-994.11000"), + Decimal("-993.12000"), + Decimal("-992.13000"), + Decimal("-991.14000"), + ], + dtype="object", + ) + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_date_low(dirpath): + data = { + "time": np.array( + [ + "1900-05-05 12:34:56.100000", + "1900-05-05 12:34:56.100100", + "1900-05-05 12:34:56.100200", + "1900-05-05 12:34:56.100300", + "1900-05-05 12:34:56.100400", + "1900-05-05 12:34:56.100500", + "1900-05-05 12:34:56.100600", + "1900-05-05 12:34:56.100700", + "1900-05-05 12:34:56.100800", + "1900-05-05 12:34:56.100900", + ], + dtype="datetime64[ns]", + ), + "date": np.array( + [ + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_date_high(dirpath): + data = { + "time": np.array( + [ + "2038-05-05 12:34:56.100000", + "2038-05-05 12:34:56.100100", + "2038-05-05 12:34:56.100200", + "2038-05-05 12:34:56.100300", + "2038-05-05 12:34:56.100400", + "2038-05-05 12:34:56.100500", + "2038-05-05 12:34:56.100600", + "2038-05-05 12:34:56.100700", + "2038-05-05 12:34:56.100800", + "2038-05-05 12:34:56.100900", + ], + dtype="datetime64[ns]", + ), + "date": np.array( + [ + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_snappy_compressed(dirpath): + data = { + "int1": np.array( + [ + -1160101563, + 1181413113, + 2065821249, + -267157795, + 172111193, + 1752363137, + 1406072123, + 1911809390, + -1308542224, + 
-467100286, + ], + dtype="int32", + ), + "string1": np.array( + [ + "f50dcb8", + "382fdaaa", + "90758c6", + "9e8caf3f", + "ee97332b", + "d634da1", + "2bea4396", + "d67d89e8", + "ad71007e", + "e8c82066", + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got)
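
For reference, a minimal usage sketch of the reader this diff adds. It mirrors the documented behaviour (pandas provides only a reader for ORC, backed by pyarrow >= 0.13.0); the file name is hypothetical and only used for illustration.

    import pandas as pd

    # Read a whole ORC file into a DataFrame.
    # "example.orc" is a placeholder path; any local ORC file works.
    df = pd.read_orc("example.orc")

    # Read only a subset of columns; the `columns` keyword is forwarded
    # to pyarrow.orc.ORCFile.read(), as in pandas/io/orc.py above.
    df_subset = pd.read_orc("example.orc", columns=["int1", "string1"])

Note that, unlike the Parquet and Feather formats, no ``to_orc`` writer is added here, so round-tripping a DataFrame through ORC is not covered by this change.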