# pandas/tests/strings/test_cat.py

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    _testing as tm,
    concat,
)
from pandas.tests.strings.test_strings import assert_series_or_index_equal


@pytest.mark.parametrize("other", [None, Series, Index])
def test_str_cat_name(index_or_series, other):
    """The caller's ``name`` is kept by ``str.cat`` for each kind of ``others``.

    GH 21053
    """
    strings = ["a", "b"]
    # ``other`` is either None (pass the plain list) or a constructor to wrap it
    others = other(strings) if other else strings
    caller = index_or_series(strings, name="name")
    assert caller.str.cat(others, sep=",").name == "name"


def test_str_cat(index_or_series):
    """Exercise ``str.cat`` called on a Series/Index.

    Covers the no-``others`` form (everything joined into one string),
    ndarray and list ``others``, and ``others`` of the wrong length.
    """
    caller = index_or_series(["a", "a", "b", "b", "c", np.nan])

    # without ``others`` the elements are joined into a single string
    assert caller.str.cat() == "aabbc"
    assert caller.str.cat(na_rep="-") == "aabbc-"
    assert caller.str.cat(sep="_", na_rep="NA") == "a_a_b_b_c_NA"

    other = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
    expected = index_or_series(["aa", "a-", "bb", "bd", "cfoo", "--"])

    # ``others`` given as an ndarray, then as a list
    for others in (other, list(other)):
        result = caller.str.cat(others, na_rep="-")
        assert_series_or_index_equal(result, expected)

    # ``others`` whose length differs from the caller must be rejected
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    short = Series(["1", "2", "3"])
    for others in (short.values, list(short)):
        with pytest.raises(ValueError, match=rgx):
            caller.str.cat(others)


def test_str_cat_raises_intuitive_error(index_or_series):
    """A bare string passed as ``others`` raises with a hint about ``sep``.

    GH 11334
    """
    caller = index_or_series(["a", "b", "c", "d"])
    hint = "Did you mean to supply a `sep` keyword?"
    for bad_others in ("|", "    "):
        with pytest.raises(ValueError, match=hint):
            caller.str.cat(bad_others)


@pytest.mark.parametrize("sep", ["", None])
@pytest.mark.parametrize("dtype_target", ["object", "category"])
@pytest.mark.parametrize("dtype_caller", ["object", "category"])
def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep):
    """``str.cat`` with object/categorical dtypes on both caller and target."""
    wants_index = index_or_series == Index

    caller = Index(["a", "a", "b", "a"], dtype=dtype_caller)
    if not wants_index:
        caller = Series(caller, index=caller)
    target = Index(["b", "a", "b", "c"], dtype=dtype_target)

    expected = Index(["ab", "aa", "bb", "ac"])
    if not wants_index:
        expected = Series(expected, index=caller)

    # raw values of the (unaligned) target Index
    result = caller.str.cat(target.values, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series target whose index matches the caller
    target = Series(target.values, index=caller)
    result = caller.str.cat(target, sep=sep)
    assert_series_or_index_equal(result, expected)

    # raw values of that Series
    result = caller.str.cat(target.values, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series target carrying a different index (its own values)
    target = Series(target.values, index=target.values)
    expected = Index(["aa", "aa", "aa", "bb", "bb"])
    if not wants_index:
        expected = Series(expected, index=expected.str[:1])

    result = caller.str.cat(target, sep=sep)
    assert_series_or_index_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
    ids=["integers", "floats", "mixed"],
)
# dtype=object keeps np.array from casting [1, 2, 'b'] to ['1', '2', 'b']
@pytest.mark.parametrize(
    "box",
    [Series, Index, list, lambda x: np.array(x, dtype=object)],
    ids=["Series", "Index", "list", "np.array"],
)
def test_str_cat_wrong_dtype_raises(box, data):
    """``others`` holding non-string values is rejected with a TypeError.

    GH 22722
    """
    caller = Series(["a", "b", "c"])
    others = box(data)

    expected_msg = "Concatenation requires list-likes containing only strings.*"
    # join="outer" together with na_rep is needed: otherwise Index would not raise
    with pytest.raises(TypeError, match=expected_msg):
        caller.str.cat(others, join="outer", na_rep="-")


def test_str_cat_mixed_inputs(index_or_series):
    """Check ``str.cat`` with DataFrame, 2-D ndarray and list ``others``.

    Runs first with matching labels, then with shuffled labels on the Series
    and on the DataFrame, and finally with inputs that are expected to raise
    (wrong lengths -> ValueError, unsupported types/nesting -> TypeError).
    """
    wants_index = index_or_series == Index

    caller = Index(["a", "b", "c", "d"])
    if not wants_index:
        caller = Series(caller, index=caller)

    upper = Series(["A", "B", "C", "D"], index=caller.values)
    frame = concat([upper, Series(caller, index=caller)], axis=1)

    expected = Index(["aAa", "bBb", "cCc", "dDd"])
    if not wants_index:
        expected = Series(expected.values, index=caller.values)

    same_labels = (
        frame,  # DataFrame
        frame.values,  # two-dimensional ndarray
        [upper, caller],  # list of Series
        [upper, caller.values],  # mixed list of Series/array
    )
    for others in same_labels:
        result = caller.str.cat(others)
        assert_series_or_index_equal(result, expected)

    # give the Series in ``others`` a different index
    upper.index = ["b", "c", "d", "a"]
    expected = index_or_series(["aDa", "bAb", "cBc", "dCd"])
    if not wants_index:
        expected = Series(expected.values, index=caller.values)
    # list of Series, then mixed list of Series/array
    for others in ([upper, caller], [upper, caller.values]):
        result = caller.str.cat(others)
        assert_series_or_index_equal(result, expected)

    # give the DataFrame a different index
    frame.index = ["b", "c", "d", "a"]
    expected = index_or_series(["aDd", "bAa", "cBb", "dCc"])
    if not wants_index:
        expected = Series(expected.values, index=caller.values)
    result = caller.str.cat(frame)
    assert_series_or_index_equal(result, expected)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    short = Series(["1", "2", "3"])
    short_frame = concat([short, short], axis=1)
    wrong_length = (
        short_frame.values,  # two-dimensional ndarray
        [short.values, caller.values],  # list of list-likes
        [short.values, caller],  # mixed list of Series/list-like
    )
    for others in wrong_length:
        with pytest.raises(ValueError, match=rgx):
            caller.str.cat(others)

    # errors for incorrect arguments in list-like
    rgx = "others must be Series, Index, DataFrame,.*"
    # None/NaN entries must not crash the checks in _get_series_list
    with_missing = Series(["a", np.nan, "c", None])
    forbidden = (
        [with_missing, "u"],  # mix of string and Series
        [with_missing, frame],  # DataFrame in list
        [with_missing, frame.values],  # 2-dim ndarray in list
        [with_missing, [with_missing, frame]],  # nested lists
        set(with_missing),  # set; GH 23009
        [with_missing, set(with_missing)],  # set in list; GH 23009
        1,  # other forbidden input type, e.g. int
        iter([upper.values, list(caller)]),  # nested list-likes
    )
    for others in forbidden:
        with pytest.raises(TypeError, match=rgx):
            caller.str.cat(others)


@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_indexed(index_or_series, join):
    """``str.cat(..., join=...)`` equals concatenating manually aligned inputs.

    See https://github.com/pandas-dev/pandas/issues/18657
    """
    box = index_or_series

    s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
    t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
    # align by hand first; concatenating the aligned pair is the reference result
    sa, ta = s.align(t, join=join)
    expected = sa.str.cat(ta, na_rep="-")

    if box == Index:
        # only the caller and the expectation need re-boxing; ``sa`` is not
        # read again after ``expected`` has been computed
        s = Index(s)
        expected = Index(expected)

    result = s.str.cat(t, join=join, na_rep="-")
    assert_series_or_index_equal(result, expected)


@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_mixed_inputs(join):
    s = Series(["a", "b", "c", "d"])
    t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
    d = concat([t, t], axis=1)

    expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
    expected = expected_outer.loc[s.index.join(t.index, how=join)]

    # list of Series
    result = s.str.cat([t, t], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # DataFrame
    result = s.str.cat(d, join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # mixed list of indexed/unindexed
    u = np.array(["A", "B", "C", "D"])
    expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
    # joint index of rhs [t, u]; u will be forced have index of s
    rhs_idx = (
        t.index.intersection(s.index) if join == "inner" else t.index.union(s.index)
    )

    expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
    result = s.str.cat([t, u], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    with pytest.raises(TypeError, match="others must be Series,.*"):
        # nested lists are forbidden
        s.str.cat([t, list(u)], join=join)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    z = Series(["1", "2", "3"]).values

    # unindexed object of wrong length
    with pytest.raises(ValueError, match=rgx):
        s.str.cat(z, join=join)

    # unindexed object of wrong length in list
    with pytest.raises(ValueError, match=rgx):
        s.str.cat([t, z], join=join)


def test_str_cat_all_na(index_or_series, index_or_series2):
    """``str.cat`` gives an all-NA result when caller or target is all-NA.

    GH 24044
    """
    caller_box, target_box = index_or_series, index_or_series2

    letters = Index(["a", "b", "c", "d"])
    if caller_box != Index:
        letters = Series(letters, index=letters)

    missing = target_box([np.nan] * 4, dtype=object)
    if target_box != Index:
        # a Series target gets the caller as its index, for alignment
        missing = Series(missing, index=letters)

    # all-NA target
    expected = (
        Series([np.nan] * 4, index=letters.index, dtype=object)
        if caller_box == Series
        else Index([np.nan] * 4, dtype=object)
    )
    assert_series_or_index_equal(letters.str.cat(missing, join="left"), expected)

    # all-NA caller; only exercised when the target is a Series
    if target_box == Series:
        expected = Series([np.nan] * 4, dtype=object, index=missing.index)
        tm.assert_series_equal(missing.str.cat(letters, join="left"), expected)


def test_str_cat_special_cases():
    """Cover an iterator of mixed ``others`` and a right join over split labels."""
    caller = Series(["a", "b", "c", "d"])
    shuffled = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])

    # iterator whose elements have different types (Series and ndarray)
    mixed = iter([shuffled, caller.values])
    result = caller.str.cat(mixed, join="outer", na_rep="-")
    tm.assert_series_equal(result, Series(["aaa", "bbb", "c-c", "ddd", "-e-"]))

    # right-align when the elements of ``others`` have different indexes
    pieces = [shuffled.loc[[0]], shuffled.loc[[3]]]
    result = caller.str.cat(pieces, join="right", na_rep="-")
    tm.assert_series_equal(result, Series(["aa-", "d-d"], index=[0, 3]))


def test_cat_on_filtered_index():
    """``str.cat`` on Series taken from a row-filtered DataFrame."""
    grid = MultiIndex.from_product(
        [[2011, 2012], [1, 2, 3]], names=["year", "month"]
    )
    frame = DataFrame(index=grid).reset_index()
    # dropping the month == 1 rows leaves the remaining row labels untouched
    frame = frame[frame.month > 1]

    year_str = frame.year.astype("str")
    month_str = frame.month.astype("str")

    # a single Series as ``others``
    joined = year_str.str.cat(month_str, sep=" ")
    assert joined.loc[1] == "2011 2"

    # a list of Series as ``others``
    joined_twice = year_str.str.cat([month_str, month_str], sep=" ")
    assert joined_twice.loc[1] == "2011 2 2"


@pytest.mark.parametrize("klass", [tuple, list, np.array, pd.Series, pd.Index])
def test_cat_different_classes(klass):
    """``str.cat`` gives the same result for each list-like container type.

    See https://github.com/pandas-dev/pandas/issues/33425
    """
    caller = Series(["a", "b", "c"])
    others = klass(["x", "y", "z"])
    tm.assert_series_equal(caller.str.cat(others), Series(["ax", "by", "cz"]))