diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 53bcf6ffd7a8a..6189396ab3ea9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -36,6 +36,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`SPSS `__;:ref:`read_spss`; binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql` + SQL;`Delta Lake `__;:ref:`read_deltalake`; SQL;`Google BigQuery `__;:ref:`read_gbq`;:ref:`to_gbq` :ref:`Here ` is an informal performance comparison for some of these IO methods. @@ -5901,6 +5902,17 @@ And then issue the following queries: data.to_sql("data", con) pd.read_sql_query("SELECT * FROM data", con) +.. _io.deltalake: + +Delta Lake +---------- +The ``deltalake`` package provides functionality to read/write from Delta Lake. + +pandas integrates with this external package. If ``deltalake`` is installed, you can +use the pandas method ``pd.read_deltalake``, which will call the +respective function from ``deltalake``. + +Full documentation can be found `here `__. ..
""" Delta Lake support """
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

from pandas.compat._optional import import_optional_dependency

if TYPE_CHECKING:
    import pyarrow.fs as pa_fs

    from pandas import DataFrame


def _try_import():
    # deltalake itself depends on pandas, so importing it at module level
    # would create a circular import; import lazily on first use instead.
    msg = (
        "deltalake is required to load data from Delta Lake. "
        "See the docs: https://delta-io.github.io/delta-rs/python."
    )
    deltalake = import_optional_dependency("deltalake", extra=msg)
    return deltalake


def read_deltalake(
    table_uri: str,
    version: int | None = None,
    storage_options: dict[str, str] | None = None,
    without_files: bool = False,
    partitions: list[tuple[str, str, Any]] | None = None,
    columns: list[str] | None = None,
    filesystem: str | pa_fs.FileSystem | None = None,
) -> DataFrame:
    """
    Load data from Delta Lake.

    This function requires the `deltalake package
    <https://delta-io.github.io/delta-rs/python>`__.

    See the `How to load a Delta table
    <https://delta-io.github.io/delta-rs/python/usage.html>`__
    guide for loading instructions.

    Parameters
    ----------
    table_uri : str
        The path of the DeltaTable.
    version : int, optional
        The version of the DeltaTable.
    storage_options : dict[str, str], optional
        A dictionary of the options to use for the storage backend.
    without_files : bool, default False
        If True, load the table without tracking files.
        Some append-only applications might have no need to track files,
        so the DeltaTable can be loaded with a significant memory reduction.
    partitions : list[tuple[str, str, Any]], optional
        A list of partition filters; see help(DeltaTable.files_by_partitions)
        for filter syntax.
    columns : list[str], optional
        The columns to project. This can be a list of column names to include
        (order and duplicates will be preserved).
    filesystem : str or pa_fs.FileSystem, optional
        A concrete implementation of the Pyarrow FileSystem or
        a fsspec-compatible interface. If None, the first file path will be
        used to determine the right FileSystem.

    Returns
    -------
    DataFrame
        DataFrame including the results.

    Raises
    ------
    ImportError
        If the ``deltalake`` package is not installed.

    See Also
    --------
    deltalake.DeltaTable : Create a DeltaTable instance with the deltalake library.
    """
    deltalake = _try_import()

    table = deltalake.DeltaTable(
        table_uri=table_uri,
        version=version,
        storage_options=storage_options,
        without_files=without_files,
    )
    return table.to_pandas(
        partitions=partitions, columns=columns, filesystem=filesystem
    )