From 83288d6c41c9d0dd6bdbbeab619525f7c18fc49f Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Wed, 16 Jun 2021 12:47:26 +0900 Subject: [PATCH 1/3] Fix hash_pandas_object does not recognize hash_key --- pandas/core/util/hashing.py | 5 ++++- pandas/tests/util/test_hashing.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 962728b2f38c4..09213b9e37aa2 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -139,7 +139,10 @@ def hash_pandas_object( ser = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): - hashes = (hash_array(series._values) for _, series in obj.items()) + hashes = ( + hash_array(series._values, encoding, hash_key, categorize) + for _, series in obj.items() + ) num_items = len(obj.columns) if index: index_hash_generator = ( diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 8ce24dc963dc5..1567deca93eca 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -255,6 +255,16 @@ def test_hash_keys(): assert (a != b).all() +def test_df_hash_keys(): + # DataFrame version of the test_hash_keys + obj = DataFrame({"x": np.arange(3), "y": list("abc")}) + + a = hash_pandas_object(obj, hash_key="9876543210123456") + b = hash_pandas_object(obj, hash_key="9876543210123465") + + assert (a != b).all() + + def test_invalid_key(): # This only matters for object dtypes. 
msg = "key should be a 16-byte string encoded" From 6b797deb95d2edd2a4d02efb4f4c3b8930ce3654 Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Thu, 17 Jun 2021 12:04:30 +0900 Subject: [PATCH 2/3] Add encoding test case and links to issues --- pandas/tests/util/test_hashing.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 1567deca93eca..e4a46de11ceb7 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -256,7 +256,8 @@ def test_hash_keys(): def test_df_hash_keys(): - # DataFrame version of the test_hash_keys + # DataFrame version of the test_hash_keys. + # https://github.com/pandas-dev/pandas/issues/41404 obj = DataFrame({"x": np.arange(3), "y": list("abc")}) a = hash_pandas_object(obj, hash_key="9876543210123456") @@ -265,6 +266,21 @@ def test_df_hash_keys(): assert (a != b).all() +def test_df_encoding(): + # Check that DataFrame recognizes optional encoding. + # https://github.com/pandas-dev/pandas/issues/41404 + # https://github.com/pandas-dev/pandas/pull/42049 + obj = DataFrame({"x": np.arange(3), "y": list("a+c")}) + + a = hash_pandas_object(obj, encoding="utf8") + b = hash_pandas_object(obj, encoding="utf7") + + # Note that the "+" is encoded as "+-" in utf-7. + assert a[0] == b[0] + assert a[1] != b[1] + assert a[2] == b[2] + + def test_invalid_key(): # This only matters for object dtypes. msg = "key should be a 16-byte string encoded" From ceb689dc30d8df751cc8e769c085534f3badede8 Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Fri, 18 Jun 2021 09:27:19 +0900 Subject: [PATCH 3/3] Added whatsnew entry. 
--- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 438313f3e58e2..7269a184358b7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1205,6 +1205,7 @@ Other - Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) - Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) - Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) +- Bug in :func:`pandas.util.hash_pandas_object` ignoring the ``hash_key``, ``encoding`` and ``categorize`` arguments when the input is a :class:`DataFrame` (:issue:`41404`) .. ---------------------------------------------------------------------------