From b3cadf3b0bd7f5d3f962206f14ba8cf1fc8c9676 Mon Sep 17 00:00:00 2001 From: Vijay Vaidyanathan Date: Sat, 15 Jul 2023 14:35:55 -0700 Subject: [PATCH 1/6] DOC: Provide examples of using read_parquet #49739 --- pandas/io/parquet.py | 57 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9bb000c363684..60c255fe99737 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -556,7 +556,64 @@ def read_parquet( Returns ------- DataFrame + + See Also + -------- + DataFrame.to_parquet : Create a parquet object that serializes a DataFrame. + + Examples + -------- + >>> import pandas as pd + >>> original_df = pd.DataFrame( + ... {{"foo": range(5), "bar": range(5, 10)}} + ... ) + >>> original_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> df_parquet_bytes = original_df.to_parquet() + >>> from io import BytesIO + >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes)) + >>> restored_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> restored_df.equals(original_df) + True + >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"]) + bar + 0 5 + 1 6 + 2 7 + 3 8 + 4 9 + >>> restored_bar.equals(original_df[['bar']]) + True + + The function uses `kwargs` that are passed directly to the engine. + In the following example, we use the `filters` argument of the pyarrow + engine to filter the rows of the DataFrame. + + Since `pyarrow` is the default engine, we can omit the `engine` argument. + Note that the `filters` argument is implemented by the `pyarrow` engine, + which can benefit from multithreading and also potentially be more + economical in terms of memory. + + >>> sel = [("bar", ">", 2)] + >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel) + >>> restored_part + foo bar + 0 3 8 + 1 4 9 + """ + impl = get_engine(engine) if use_nullable_dtypes is not lib.no_default: From 694700ee3a8bfb5ad771bf81145a1007e2ce5746 Mon Sep 17 00:00:00 2001 From: Vijay Vaidyanathan Date: Sat, 15 Jul 2023 15:46:21 -0700 Subject: [PATCH 2/6] DOC: Provide examples of using read_parquet #49739 --- pandas/io/parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 60c255fe99737..c07f5700417db 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -587,6 +587,7 @@ def read_parquet( >>> restored_df.equals(original_df) True >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"]) + >>> restored_bar bar 0 5 1 6 From 0ec5d45c4023ca4456837bcad2ca109f23a8610b Mon Sep 17 00:00:00 2001 From: Vijay Vaidyanathan Date: Sat, 15 Jul 2023 15:52:05 -0700 Subject: [PATCH 3/6] DOC: Provide examples of using read_parquet #49739 (with minor fixes) --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index c07f5700417db..92f09c45e3c90 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -588,7 +588,7 @@ def read_parquet( True >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"]) >>> restored_bar - bar + bar 0 5 1 6 2 7 From bc3c5d45c08de691cb43b141bd3350adbca6a195 Mon Sep 17 00:00:00 2001 From: Vijay Vaidyanathan Date: Sat, 15 Jul 2023 20:50:01 -0700 Subject: [PATCH 4/6] DOC: Provide examples of using read_parquet #49739 Fixed typos that were causing tests to fail. Oops. --- pandas/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 92f09c45e3c90..19eaaac700fb9 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -576,7 +576,7 @@ def read_parquet( 4 4 9 >>> df_parquet_bytes = original_df.to_parquet() >>> from io import BytesIO - >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes)) + >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes)) >>> restored_df foo bar 0 0 5 @@ -606,7 +606,7 @@ def read_parquet( which can benefit from multithreading and also potentially be more economical in terms of memory. - >>> sel = [("bar", ">", 2)] + >>> sel = [("foo", ">", 2)] >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel) >>> restored_part foo bar From 4bb74344d193f85bcf02af483149a026e6516b5c Mon Sep 17 00:00:00 2001 From: Vijay Vaidyanathan Date: Mon, 17 Jul 2023 18:55:55 -0700 Subject: [PATCH 5/6] DOC: Provide examples of using read_parquet #49739 - fix formatting failed checks --- pandas/io/parquet.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 19eaaac700fb9..61112542fb9d8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -563,7 +563,6 @@ def read_parquet( Examples -------- - >>> import pandas as pd >>> original_df = pd.DataFrame( ... {{"foo": range(5), "bar": range(5, 10)}} ... ) @@ -612,7 +611,6 @@ def read_parquet( foo bar 0 3 8 1 4 9 - """ impl = get_engine(engine) From b2d3a5d70872b900d7dcb166f18dedb646dd5237 Mon Sep 17 00:00:00 2001 From: Vijay Vaidyanathan Date: Tue, 18 Jul 2023 11:54:55 -0700 Subject: [PATCH 6/6] DOC: Provide examples of using read_parquet #49739 - removed read_parquet from code_checks.sh as requested by @mroeschke --- ci/code_checks.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7dd347327f3cc..add06b34f9359 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -90,7 +90,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.NaT \ pandas.read_feather \ pandas.DataFrame.to_feather \ - pandas.read_parquet \ pandas.read_orc \ pandas.read_sas \ pandas.read_spss \