Skip to content

[WIP] Quick fix to provide complex data type support for hashmap based algorithms #27599

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
9 changes: 5 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,11 @@ def _ensure_data(values, dtype=None):

# ignore the fact that we are casting to float
# which discards complex parts
with catch_warnings():
simplefilter("ignore", np.ComplexWarning)
values = ensure_float64(values)
return values, "float64", "float64"
# with catch_warnings():
# simplefilter("ignore", np.ComplexWarning)
# values = ensure_float64(values)
# return values, "float64", "float64"
return ensure_object(np.asarray(values)), "object", "object"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we just return non-object values? Regardless, definitely don't leave the commented-out code above; just delete it.


except (TypeError, ValueError, OverflowError):
# if we are trying to coerce to a dtype
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,11 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=T
dtype, _ = infer_dtype_from_array(values)
values = np.asarray(values, dtype=dtype)

elif isinstance(values, np.ndarray) and values.dtype == object:
dtype = lib.infer_dtype(values)
if dtype == 'complex':
values = np.asarray(values, dtype=dtype)

def sort_mixed(values):
# order ints before strings, safe in py3
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
Expand Down
85 changes: 85 additions & 0 deletions pandas/tests/test_complex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
Index,
Series,
DataFrame,
)
import pandas.core.algorithms as algos
import pandas.util.testing as tm


class TestComplexSupportBasic:
@pytest.mark.parametrize(
    "array,expected",
    [
        (
            [1 + 1j, 0, 1, 1j, 1 + 2j],
            Series([1, 1, 1, 1, 1], index=[1 + 2j, 1 + 1j, 1j, 1, 0]),
        ),
        (
            [1 + 2j, 0, 1j, 1, 1j, 1 + 1j],
            # index is sorted by value counts in descending order by default
            Series([2, 1, 1, 1, 1], index=[1j, 1 + 2j, 1 + 1j, 1, 0]),
        ),
    ],
)
def test_value_counts(self, array, expected):
    """Compare ``algos.value_counts`` on complex input with ``expected``.

    Parameters
    ----------
    array : list
        Input values, mixing real and complex numbers.
    expected : Series
        Counts indexed by the distinct values of ``array``.
    """
    counts = algos.value_counts(array)
    tm.assert_series_equal(counts, expected)

@pytest.mark.parametrize("array,expected", [
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For any tests that have just one set of parameters, you can define the values in the function body; there is no need to parametrize.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just an initial set of test cases. I plan on adding more test cases for each of the algos.

(
[1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object)
),
])
def test_unique(self, array, expected):
result = algos.unique(array)
assert np.array_equal(result, expected)

@pytest.mark.parametrize(
    "array,expected",
    [
        (
            [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j],
            Series(
                [False, False, True, False, False, False, True],
                dtype=bool,
            ),
        ),
    ],
)
def test_duplicated(self, array, expected):
    """Compare ``Series.duplicated`` on complex input with ``expected``.

    Parameters
    ----------
    array : list
        Input values containing repeated complex numbers.
    expected : Series
        Boolean mask the result is compared against.
    """
    series = Series(array)
    duplicate_mask = series.duplicated()
    tm.assert_series_equal(duplicate_mask, expected)

@pytest.mark.parametrize(
    "array,expected",
    [
        (
            [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j],
            Series(
                [False, True, True, False, True, True, True],
                dtype=bool,
            ),
        ),
    ],
)
def test_isin(self, array, expected):
    """Compare ``Series.isin`` against a fixed list of complex candidates.

    Parameters
    ----------
    array : list
        Input values, mixing real and complex numbers.
    expected : Series
        Boolean mask the result is compared against.
    """
    candidates = [1j, 1 + 1j, 1 + 2j]
    membership = Series(array).isin(candidates)
    tm.assert_series_equal(membership, expected)

@pytest.mark.parametrize("array,expected", [
(
[1, 2, 2 + 1j],
(np.array([0, 1, 2]), np.array([(1 + 0j), (2 + 0j), (2 + 1j)],
dtype=object))
),
])
def test_factorize(self, array, expected):
result = pd.factorize(array)
assert len(result) == 2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's easier to read at a quick glance if you just compare the two elements rather than using the enumerating loop.


for i, r in enumerate(result):
assert np.array_equal(r, expected[i])

@pytest.mark.parametrize(
    "frame,expected",
    [
        (
            DataFrame([{"a": 1, "b": 1 + 1j}, {"a": 1, "b": 1 + 2j}]),
            DataFrame(
                np.array([1, 1]),
                index=Index([(1 + 1j), (1 + 2j)], dtype="object", name="b"),
                columns=Index(["a"], dtype="object"),
            ),
        ),
    ],
)
def test_groupby(self, frame, expected):
    """Group ``frame`` by its complex column ``b`` and compare the counts.

    Parameters
    ----------
    frame : DataFrame
        Input frame whose ``b`` column holds complex values.
    expected : DataFrame
        Frame the ``groupby("b").count()`` result is compared against.
    """
    grouped = frame.groupby("b")
    counted = grouped.count()
    tm.assert_frame_equal(counted, expected)