Skip to content

CLN: .values->._values in hashing #33529

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 6 additions & 28 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ def hash_pandas_object(
return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

elif isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key, categorize).astype(
h = hash_array(obj._values, encoding, hash_key, categorize).astype(
"uint64", copy=False
)
h = Series(h, index=obj, dtype="uint64", copy=False)

elif isinstance(obj, ABCSeries):
h = hash_array(obj.values, encoding, hash_key, categorize).astype(
h = hash_array(obj._values, encoding, hash_key, categorize).astype(
"uint64", copy=False
)
if index:
Expand All @@ -107,7 +107,7 @@ def hash_pandas_object(
encoding=encoding,
hash_key=hash_key,
categorize=categorize,
).values
)._values
for _ in [None]
)
arrays = itertools.chain([h], index_iter)
Expand All @@ -116,7 +116,7 @@ def hash_pandas_object(
h = Series(h, index=obj.index, dtype="uint64", copy=False)

elif isinstance(obj, ABCDataFrame):
hashes = (hash_array(series.values) for _, series in obj.items())
hashes = (hash_array(series._values) for _, series in obj.items())
num_items = len(obj.columns)
if index:
index_hash_generator = (
Expand All @@ -126,7 +126,7 @@ def hash_pandas_object(
encoding=encoding,
hash_key=hash_key,
categorize=categorize,
).values # noqa
)._values
for _ in [None]
)
num_items += 1
Expand Down Expand Up @@ -185,28 +185,6 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
return h


def hash_tuple(val, encoding: str = "utf8", hash_key: str = _default_hash_key):
    """
    Hash one tuple by hashing each element and combining the results.

    Parameters
    ----------
    val : single tuple
        The tuple whose elements are hashed one by one.
    encoding : str, default 'utf8'
        Forwarded unchanged to ``_hash_scalar`` for every element.
    hash_key : str, default _default_hash_key
        Forwarded unchanged to ``_hash_scalar`` for every element.

    Returns
    -------
    hash
        The first entry of the array produced by ``_combine_hash_arrays``.
    """
    # Lazily hash every element; the combiner consumes the generator and is
    # told up front how many per-element hashes to expect.
    element_hashes = (
        _hash_scalar(item, encoding=encoding, hash_key=hash_key) for item in val
    )
    return _combine_hash_arrays(element_hashes, len(val))[0]


def _hash_categorical(c, encoding: str, hash_key: str):
"""
Hash a Categorical by hashing its categories, and then mapping the codes
Expand All @@ -223,7 +201,7 @@ def _hash_categorical(c, encoding: str, hash_key: str):
ndarray of hashed values array, same size as len(c)
"""
# Convert ExtensionArrays to ndarrays
values = np.asarray(c.categories.values)
values = np.asarray(c.categories._values)
hashed = hash_array(values, encoding, hash_key, categorize=False)

# we have uint64, as we don't directly support missing values
Expand Down
44 changes: 1 addition & 43 deletions pandas/tests/util/test_hashing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm
from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples
from pandas.core.util.hashing import hash_tuples
from pandas.util import hash_array, hash_pandas_object


Expand Down Expand Up @@ -111,46 +109,6 @@ def test_hash_tuples():
assert result == expected[0]


@pytest.mark.parametrize(
    "tup",
    [(1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), ("A", pd.Timestamp("2012-01-01"))],
)
def test_hash_tuple(tup):
    # hash_tuple on a single tuple must agree with the first (and only)
    # entry hash_tuples yields for a one-element list holding that tuple.
    single = hash_tuple(tup)
    batched = hash_tuples([tup])

    assert single == batched[0]


@pytest.mark.parametrize(
    "val",
    [
        1,
        1.4,
        "A",
        b"A",
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz="Europe/Brussels"),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(),
        pd.Timedelta("1 days"),
        datetime.timedelta(1),
        pd.Period("2012-01-01", freq="D"),
        pd.Interval(0, 1),
        np.nan,
        pd.NaT,
        None,
    ],
)
def test_hash_scalar(val):
    # _hash_scalar on a bare value must match hash_array applied to the same
    # value boxed in a one-element object array.
    result = _hash_scalar(val)
    boxed = np.array([val], dtype=object)
    expected = hash_array(boxed, categorize=True)

    assert result[0] == expected[0]


@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
def test_hash_tuples_err(val):
msg = "must be convertible to a list-of-tuples"
Expand Down