Skip to content

Commit b81d444

Browse files
pijucha authored and jreback committed
BUG/TST: Empty input arrays in cartesian_product and MultiIndex (pandas-dev#12258)
Closes pandas-dev#12258.
1. Fixes logic (and division by 0) in `cartesian_product` when some input arrays are empty.
2. Adds tests for MultiIndex empty level construction with `.from_arrays` and `.from_product`.
Author: Piotr Jucha <[email protected]>
Closes pandas-dev#14151 from pijucha/cartesian and squashes the following commits:
b831516 [Piotr Jucha] BUG/TST: Empty input arrays in cartesian_product and MultiIndex (pandas-dev#12258)
1 parent 7dedbed commit b81d444

File tree

5 files changed

+135
-3
lines changed

5 files changed

+135
-3
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1471,6 +1471,7 @@ Bug Fixes
14711471
- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
14721472
- Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`)
14731473
- Bug in ``MultiIndex.from_arrays`` which didn't check for input array lengths matching (:issue:`13599`)
1474+
- Bug in ``cartesian_product`` and ``MultiIndex.from_product`` which may raise with empty input arrays (:issue:`12258`)
14741475

14751476

14761477
- Bug in ``pd.read_csv()`` which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)

pandas/core/categorical.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -1979,13 +1979,16 @@ def _factorize_from_iterable(values):
19791979
19801980
Returns
19811981
-------
1982-
codes : np.array
1982+
codes : ndarray
19831983
categories : Index
19841984
If `values` has a categorical dtype, then `categories` is
19851985
a CategoricalIndex keeping the categories and order of `values`.
19861986
"""
19871987
from pandas.indexes.category import CategoricalIndex
19881988

1989+
if not is_list_like(values):
1990+
raise TypeError("Input must be list-like")
1991+
19891992
if is_categorical(values):
19901993
if isinstance(values, (ABCCategoricalIndex, ABCSeries)):
19911994
values = values._values
@@ -2003,8 +2006,23 @@ def _factorize_from_iterable(values):
20032006
def _factorize_from_iterables(iterables):
20042007
"""
20052008
A higher-level wrapper over `_factorize_from_iterable`.
2006-
See `_factorize_from_iterable` for more info.
20072009
20082010
*This is an internal function*
2011+
2012+
Parameters
2013+
----------
2014+
iterables : list-like of list-likes
2015+
2016+
Returns
2017+
-------
2018+
codes_tuple : tuple of ndarrays
2019+
categories_tuple : tuple of Indexes
2020+
2021+
Notes
2022+
-----
2023+
See `_factorize_from_iterable` for more info.
20092024
"""
2025+
if len(iterables) == 0:
2026+
# For consistency, it should return a list of 2 tuples.
2027+
return [(), ()]
20102028
return lzip(*[_factorize_from_iterable(it) for it in iterables])

pandas/tests/indexes/test_multi.py

+63
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,32 @@ def test_from_arrays_index_series_categorical(self):
691691
tm.assert_index_equal(result3.get_level_values(0), idx1)
692692
tm.assert_index_equal(result3.get_level_values(1), idx2)
693693

694+
def test_from_arrays_empty(self):
695+
# 0 levels
696+
with tm.assertRaisesRegexp(
697+
ValueError, "Must pass non-zero number of levels/labels"):
698+
MultiIndex.from_arrays(arrays=[])
699+
700+
# 1 level
701+
result = MultiIndex.from_arrays(arrays=[[]], names=['A'])
702+
expected = Index([], name='A')
703+
tm.assert_index_equal(result, expected)
704+
705+
# N levels
706+
for N in [2, 3]:
707+
arrays = [[]] * N
708+
names = list('ABC')[:N]
709+
result = MultiIndex.from_arrays(arrays=arrays, names=names)
710+
expected = MultiIndex(levels=[np.array([])] * N, labels=[[]] * N,
711+
names=names)
712+
tm.assert_index_equal(result, expected)
713+
714+
def test_from_arrays_invalid_input(self):
715+
invalid_inputs = [1, [1], [1, 2], [[1], 2],
716+
'a', ['a'], ['a', 'b'], [['a'], 'b']]
717+
for i in invalid_inputs:
718+
tm.assertRaises(TypeError, MultiIndex.from_arrays, arrays=i)
719+
694720
def test_from_arrays_different_lengths(self):
695721
# GH13599
696722
idx1 = [1, 2, 3]
@@ -723,6 +749,43 @@ def test_from_product(self):
723749
tm.assert_index_equal(result, expected)
724750
self.assertEqual(result.names, names)
725751

752+
def test_from_product_empty(self):
753+
# 0 levels
754+
with tm.assertRaisesRegexp(
755+
ValueError, "Must pass non-zero number of levels/labels"):
756+
MultiIndex.from_product([])
757+
758+
# 1 level
759+
result = MultiIndex.from_product([[]], names=['A'])
760+
expected = pd.Float64Index([], name='A')
761+
tm.assert_index_equal(result, expected)
762+
763+
# 2 levels
764+
l1 = [[], ['foo', 'bar', 'baz'], []]
765+
l2 = [[], [], ['a', 'b', 'c']]
766+
names = ['A', 'B']
767+
for first, second in zip(l1, l2):
768+
result = MultiIndex.from_product([first, second], names=names)
769+
expected = MultiIndex(levels=[np.array(first), np.array(second)],
770+
labels=[[], []], names=names)
771+
tm.assert_index_equal(result, expected)
772+
773+
# GH12258
774+
names = ['A', 'B', 'C']
775+
for N in range(4):
776+
lvl2 = lrange(N)
777+
result = MultiIndex.from_product([[], lvl2, []], names=names)
778+
expected = MultiIndex(levels=[np.array(A)
779+
for A in [[], lvl2, []]],
780+
labels=[[], [], []], names=names)
781+
tm.assert_index_equal(result, expected)
782+
783+
def test_from_product_invalid_input(self):
784+
invalid_inputs = [1, [1], [1, 2], [[1], 2],
785+
'a', ['a'], ['a', 'b'], [['a'], 'b']]
786+
for i in invalid_inputs:
787+
tm.assertRaises(TypeError, MultiIndex.from_product, iterables=i)
788+
726789
def test_from_product_datetimeindex(self):
727790
dt_index = date_range('2000-01-01', periods=2)
728791
mi = pd.MultiIndex.from_product([[1, 2], dt_index])

pandas/tools/tests/test_util.py

+23
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,29 @@ def test_datetimeindex(self):
3434
tm.assert_numpy_array_equal(result1, expected1)
3535
tm.assert_numpy_array_equal(result2, expected2)
3636

37+
def test_empty(self):
38+
# product of empty factors
39+
X = [[], [0, 1], []]
40+
Y = [[], [], ['a', 'b', 'c']]
41+
for x, y in zip(X, Y):
42+
expected1 = np.array([], dtype=np.asarray(x).dtype)
43+
expected2 = np.array([], dtype=np.asarray(y).dtype)
44+
result1, result2 = cartesian_product([x, y])
45+
tm.assert_numpy_array_equal(result1, expected1)
46+
tm.assert_numpy_array_equal(result2, expected2)
47+
48+
# empty product (empty input):
49+
result = cartesian_product([])
50+
expected = []
51+
tm.assert_equal(result, expected)
52+
53+
def test_invalid_input(self):
54+
invalid_inputs = [1, [1], [1, 2], [[1], 2],
55+
'a', ['a'], ['a', 'b'], [['a'], 'b']]
56+
msg = "Input must be a list-like of list-likes"
57+
for X in invalid_inputs:
58+
tm.assertRaisesRegexp(TypeError, msg, cartesian_product, X=X)
59+
3760

3861
class TestLocaleUtils(tm.TestCase):
3962

pandas/tools/util.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pandas.types.common import (is_number,
55
is_numeric_dtype,
66
is_datetime_or_timedelta_dtype,
7+
is_list_like,
78
_ensure_object)
89
from pandas.types.cast import _possibly_downcast_to_dtype
910

@@ -24,21 +25,47 @@ def cartesian_product(X):
2425
Numpy version of itertools.product or pandas.compat.product.
2526
Sometimes faster (for large inputs)...
2627
28+
Parameters
29+
----------
30+
X : list-like of list-likes
31+
32+
Returns
33+
-------
34+
product : list of ndarrays
35+
2736
Examples
2837
--------
2938
>>> cartesian_product([list('ABC'), [1, 2]])
3039
[array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
3140
array([1, 2, 1, 2, 1, 2])]
3241
42+
See also
43+
--------
44+
itertools.product : Cartesian product of input iterables. Equivalent to
45+
nested for-loops.
46+
pandas.compat.product : An alias for itertools.product.
3347
"""
48+
msg = "Input must be a list-like of list-likes"
49+
if not is_list_like(X):
50+
raise TypeError(msg)
51+
for x in X:
52+
if not is_list_like(x):
53+
raise TypeError(msg)
54+
55+
if len(X) == 0:
56+
return []
3457

3558
lenX = np.fromiter((len(x) for x in X), dtype=int)
3659
cumprodX = np.cumproduct(lenX)
3760

3861
a = np.roll(cumprodX, 1)
3962
a[0] = 1
4063

41-
b = cumprodX[-1] / cumprodX
64+
if cumprodX[-1] != 0:
65+
b = cumprodX[-1] / cumprodX
66+
else:
67+
# if any factor is empty, the cartesian product is empty
68+
b = np.zeros_like(cumprodX)
4269

4370
return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]),
4471
np.product(a[i]))

0 commit comments

Comments
 (0)