Skip to content

[WIP] Quick fix to provide complex data type support for hashmap based algorithms #27599

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
10 changes: 2 additions & 8 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
from textwrap import dedent
from typing import Dict
from warnings import catch_warnings, simplefilter, warn
from warnings import warn

import numpy as np

Expand Down Expand Up @@ -101,13 +101,7 @@ def _ensure_data(values, dtype=None):
elif is_object_dtype(values) and dtype is None:
return ensure_object(np.asarray(values)), "object", "object"
elif is_complex_dtype(values) or is_complex_dtype(dtype):

# ignore the fact that we are casting to float
# which discards complex parts
with catch_warnings():
simplefilter("ignore", np.ComplexWarning)
values = ensure_float64(values)
return values, "float64", "float64"
raise TypeError("Complex data types not supported...Coercing to object")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to do the object coercion here than to fall back to the except branch; it's clearer.


except (TypeError, ValueError, OverflowError):
# if we are trying to coerce to a dtype
Expand Down
122 changes: 122 additions & 0 deletions pandas/tests/test_complex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
Index,
Series,
DataFrame,
)
import pandas.core.algorithms as algos
import pandas.util.testing as tm


class TestComplexSupportBasic:
@pytest.mark.parametrize(
    "array,expected",
    [
        pytest.param(
            [1 + 1j, 0, 1, 1j, 1 + 2j],
            Series(data=[1, 1, 1, 1, 1], index=[1 + 2j, 1 + 1j, 1j, 1, 0]),
        ),
        pytest.param(
            [1 + 2j, 0, 1j, 1, 1j, 1 + 1j],
            # by default the result is ordered by count, largest first
            Series(data=[2, 1, 1, 1, 1], index=[1j, 1 + 2j, 1 + 1j, 1, 0]),
        ),
    ],
)
def test_value_counts(self, array, expected):
    """Check ``algos.value_counts`` on lists that contain complex values."""
    counts = algos.value_counts(array)
    tm.assert_series_equal(counts, expected)

# Test case for algos.unique on a list containing complex values: the
# repeated 1 + 2j is expected only once, the remaining values keep the
# order they have in the input, and the expected array has object dtype.
@pytest.mark.parametrize("array,expected", [
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For any that are just one set of parameters here you can just define in the function body. No need to parametrize

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just an initial set of test cases. I plan on adding more test cases for each of the algos.

(
[1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object)
),
])
# Compare the array returned by algos.unique with the expected array.
def test_unique(self, array, expected):
result = algos.unique(array)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize(
    "array,expected",
    [
        pytest.param(
            [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j],
            Series(
                [False, False, True, False, False, False, True],
                dtype=bool,
            ),
        ),
    ],
)
def test_duplicated(self, array, expected):
    """Check that ``Series.duplicated`` flags repeated complex values."""
    flags = Series(array).duplicated()
    tm.assert_series_equal(flags, expected)

@pytest.mark.parametrize(
    "array,expected",
    [
        pytest.param(
            [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j],
            Series(
                [False, True, True, False, True, True, True],
                dtype=bool,
            ),
        ),
    ],
)
def test_isin(self, array, expected):
    """Check ``Series.isin`` membership against a list of complex values."""
    candidates = [1j, 1 + 1j, 1 + 2j]
    membership = Series(array).isin(candidates)
    tm.assert_series_equal(membership, expected)

# Test case for pd.factorize on a list mixing real and complex values.
# `expected` is a pair of arrays: an int64 array followed by an
# object-dtype array holding the values as complex numbers.
@pytest.mark.parametrize("array,expected", [
(
[1, 2, 2 + 1j],
(np.array([0, 1, 2], dtype=np.int64),
np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object))
),
])
def test_factorize(self, array, expected):
result = pd.factorize(array)
# the result must have exactly two elements, matching the expected pair
assert len(result) == 2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think easier to read at a quick glance if you just compare the two elements rather than the enumerating loop


# compare each element of the result with the expected array at the
# same position
for i, r in enumerate(result):
tm.assert_numpy_array_equal(r, expected[i])

@pytest.mark.parametrize(
    "frame,expected",
    [
        pytest.param(
            DataFrame([{"a": 1, "b": 1 + 1j}, {"a": 1, "b": 1 + 2j}]),
            DataFrame(
                np.array([1, 1], dtype=np.int64),
                index=Index([(1 + 1j), (1 + 2j)], dtype="object", name="b"),
                columns=Index(["a"], dtype="object"),
            ),
        ),
    ],
)
def test_groupby(self, frame, expected):
    """Check grouping on a complex-valued column, unsorted and sorted."""
    unsorted_counts = frame.groupby("b", sort=False).count()
    tm.assert_frame_equal(unsorted_counts, expected)

    # complex numbers have no ordering, so asking for sorted group keys
    # is expected to raise
    with pytest.raises(TypeError):
        frame.groupby("b", sort=True).count()

@pytest.mark.parametrize(
    "array,expected",
    [
        pytest.param(
            [0, 1j, 1, 1, 1 + 1j, 1 + 2j],
            Series([1], dtype=np.complex128),
        ),
        pytest.param(
            [1 + 1j, 2j, 1 + 1j],
            Series([1 + 1j], dtype=np.complex128),
        ),
    ],
)
def test_unimode(self, array, expected):
    """Check ``Series.mode`` when exactly one value is the most frequent."""
    modes = Series(array).mode()
    tm.assert_series_equal(modes, expected)

# Test cases for Series.mode when more than one value shares the highest
# count; the expected result is a complex128 Series.
@pytest.mark.parametrize("array,expected", [
(
# every value occurs once, so all five values are expected back
[0, 1j, 1, 1 + 1j, 1 + 2j],
Series([0, 1, 1j, 1 + 1j, 1 + 2j], dtype=np.complex128)
),
(
# 1 + 1j and 2j both occur twice
[1 + 1j, 2j, 1 + 1j, 2j, 3],
Series([1 + 1j, 2j], dtype=np.complex128)
),
])
def test_multimode(self, array, expected):
# mode tries to sort a multimodal result.
# A UserWarning is expected because complex numbers
# are not ordered.
with pytest.warns(UserWarning):
result = Series(array).mode()
tm.assert_series_equal(result, expected)