diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py new file mode 100644 index 0000000000000..2f9228bc84394 --- /dev/null +++ b/pandas/tests/reshape/concat/test_append.py @@ -0,0 +1,383 @@ +import datetime as dt +from datetime import datetime +from itertools import combinations + +import dateutil +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, concat, isna +import pandas._testing as tm + + +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param + + +class TestAppend: + def test_append(self, sort, float_frame): + mixed_frame = float_frame.copy() + mixed_frame["foo"] = "bar" + + begin_index = float_frame.index[:5] + end_index = float_frame.index[5:] + + begin_frame = float_frame.reindex(begin_index) + end_frame = float_frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + tm.assert_almost_equal(appended["A"], float_frame["A"]) + + del end_frame["A"] + partial_appended = begin_frame.append(end_frame, sort=sort) + assert "A" in partial_appended + + partial_appended = end_frame.append(begin_frame, sort=sort) + assert "A" in partial_appended + + # mixed type handling + appended = mixed_frame[:5].append(mixed_frame[5:]) + tm.assert_frame_equal(appended, mixed_frame) + + # what to test here + mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort) + mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort) + + # all equal except 'foo' column + tm.assert_frame_equal( + mixed_appended.reindex(columns=["A", "B", "C", "D"]), + mixed_appended2.reindex(columns=["A", "B", "C", "D"]), + ) + + def test_append_empty(self, float_frame): + empty = DataFrame() + + appended = float_frame.append(empty) + tm.assert_frame_equal(float_frame, appended) + assert appended is not float_frame + + appended = empty.append(float_frame) + tm.assert_frame_equal(float_frame, appended) + assert appended is not float_frame + + def test_append_overlap_raises(self, float_frame): + msg = "Indexes have overlapping values" + with pytest.raises(ValueError, match=msg): + float_frame.append(float_frame, verify_integrity=True) + + def test_append_new_columns(self): + # see gh-6129: new columns + df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) + row = Series([5, 6, 7], index=["a", "b", "c"], name="z") + expected = DataFrame( + { + "a": {"x": 1, "y": 2, "z": 5}, + "b": {"x": 3, "y": 4, "z": 6}, + "c": {"z": 7}, + } + ) + result = df.append(row) + tm.assert_frame_equal(result, expected) + + def test_append_length0_frame(self, sort): + df = DataFrame(columns=["A", "B", "C"]) + df3 = DataFrame(index=[0, 1], columns=["A", "B"]) + df5 = df.append(df3, sort=sort) + + expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) + tm.assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] + + arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + tm.assert_frame_equal(result, expected) + + # rewrite sort fixture, since we also want to test default of None + def test_append_sorts(self, sort): + df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) + + with tm.assert_produces_warning(None): + result = df1.append(df2, sort=sort) + + # for None / True + expected = DataFrame( + {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, + columns=["a", "b", "c"], + ) + if sort is False: + expected = expected[["b", "a", "c"]] + tm.assert_frame_equal(result, expected) + + def test_append_different_columns(self, sort): + df = DataFrame( + { + "bools": np.random.randn(10) > 0, + "ints": np.random.randint(0, 10, 10), + "floats": np.random.randn(10), + "strings": ["foo", "bar"] * 5, + } + ) + + a = df[:5].loc[:, ["bools", "ints", "floats"]] + b = df[5:].loc[:, ["strings", "ints", "floats"]] + + appended = a.append(b, sort=sort) + assert isna(appended["strings"][0:4]).all() + assert isna(appended["bools"][5:]).all() + + def test_append_many(self, sort, float_frame): + chunks = [ + float_frame[:5], + float_frame[5:10], + float_frame[10:15], + float_frame[15:], + ] + + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result, float_frame) + + chunks[-1] = chunks[-1].copy() + chunks[-1]["foo"] = "bar" + result = chunks[0].append(chunks[1:], sort=sort) + tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) + assert (result["foo"][15:] == "bar").all() + assert result["foo"][:15].isna().all() + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(columns=["A", "B", "C"]) + df1 = df1.set_index(["A"]) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) + df2 = df2.set_index(["A"]) + + result = df1.append(df2) + assert result.index.name == "A" + + indexes_can_append = [ + pd.RangeIndex(3), + Index([4, 5, 6]), + Index([4.5, 5.5, 6.5]), + Index(list("abc")), + pd.CategoricalIndex("A B C".split()), + pd.CategoricalIndex("D E F".split(), ordered=True), + pd.IntervalIndex.from_breaks([7, 8, 9, 10]), + pd.DatetimeIndex( + [ + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12), + ] + ), + ] + + indexes_cannot_append_with_other = [ + pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) + ] + + all_indexes = indexes_can_append + indexes_cannot_append_with_other + + @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) + def test_append_same_columns_type(self, index): + # GH18359 + + # df wider than ser + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) + ser_index = index[:2] + ser = Series([7, 8], index=ser_index, name=2) + result = df.append(ser) + expected = DataFrame( + [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index + ) + tm.assert_frame_equal(result, expected) + + # ser wider than df + ser_index = index + index = index[:2] + df = DataFrame([[1, 2], [4, 5]], columns=index) + ser = Series([7, 8, 9], index=ser_index, name=2) + result = df.append(ser) + expected = DataFrame( + [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: type(x).__name__, + ) + def test_append_different_columns_types(self, df_columns, series_index): + # GH18359 + # See also test 'test_append_different_columns_types_raises' below + # for errors raised when appending + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) + ser = Series([7, 8, 9], index=series_index, name=2) + + result = df.append(ser) + idx_diff = ser.index.difference(df_columns) + combined_columns = Index(df_columns.tolist()).append(idx_diff) + expected = DataFrame( + [ + [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], + [4, 5, 6, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 7, 8, 9], + ], + index=[0, 1, 2], + columns=combined_columns, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "index_cannot_append_with_other", + indexes_cannot_append_with_other, + ids=lambda x: type(x).__name__, + ) + def test_append_different_columns_types_raises( + self, index_can_append, index_cannot_append_with_other + ): + # GH18359 + # Dataframe.append will raise if MultiIndex appends + # or is appended to a different index type + # + # See also test 'test_append_different_columns_types' above for + # appending without raising. + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) + ser = Series([7, 8, 9], index=index_cannot_append_with_other, name=2) + msg = ( + r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|" + ) + with pytest.raises(TypeError, match=msg): + df.append(ser) + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) + ser = Series([7, 8, 9], index=index_can_append, name=2) + + with pytest.raises(TypeError, match=msg): + df.append(ser) + + def test_append_dtype_coerce(self, sort): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + + df1 = DataFrame( + index=[1, 2], + data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], + columns=["start_time"], + ) + df2 = DataFrame( + index=[4, 5], + data=[ + [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], + ], + columns=["start_time", "end_time"], + ) + + expected = concat( + [ + Series( + [ + pd.NaT, + pd.NaT, + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10), + ], + name="end_time", + ), + Series( + [ + dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0), + ], + name="start_time", + ), + ], + axis=1, + sort=sort, + ) + result = df1.append(df2, ignore_index=True, sort=sort) + if sort: + expected = expected[["end_time", "start_time"]] + else: + expected = expected[["start_time", "end_time"]] + + tm.assert_frame_equal(result, expected) + + def test_append_missing_column_proper_upcast(self, sort): + df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) + df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) + + appended = df1.append(df2, ignore_index=True, sort=sort) + assert appended["A"].dtype == "f8" + assert appended["B"].dtype == "O" + + def test_append_empty_frame_to_series_with_dateutil_tz(self): + # GH 23682 + date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) + s = Series({"date": date, "a": 1.0, "b": 2.0}) + df = DataFrame(columns=["c", "d"]) + result_a = df.append(s, ignore_index=True) + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] + ) + # These columns get cast to object after append + expected["c"] = expected["c"].astype(object) + expected["d"] = expected["d"].astype(object) + tm.assert_frame_equal(result_a, expected) + + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"] + ) + expected["c"] = expected["c"].astype(object) + expected["d"] = expected["d"].astype(object) + + result_b = result_a.append(s, ignore_index=True) + tm.assert_frame_equal(result_b, expected) + + # column order is different + expected = expected[["c", "d", "date", "a", "b"]] + result = df.append([s, s], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_append_empty_tz_frame_with_datetime64ns(self): + # https://github.com/pandas-dev/pandas/issues/35460 + df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + + # pd.NaT gets inferred as tz-naive, so append result is tz-naive + result = df.append({"a": pd.NaT}, ignore_index=True) + expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # also test with typed value to append + df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + result = df.append( + Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True + ) + expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py new file mode 100644 index 0000000000000..7ca3ae65706fe --- /dev/null +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -0,0 +1,727 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, Index, Series +import pandas._testing as tm + + +class TestConcatAppendCommon: + """ + Test common dtype coercion rules between concat and append. + """ + + def setup_method(self, method): + + dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ] + tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ] + + td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ] + + period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + self.data = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": pd.Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, + } + + def _check_expected_dtype(self, obj, label): + """ + Check whether obj has expected dtype depending on label + considering not-supported dtypes + """ + if isinstance(obj, Index): + if label == "bool": + assert obj.dtype == "object" + else: + assert obj.dtype == label + elif isinstance(obj, Series): + if label.startswith("period"): + assert obj.dtype == "Period[M]" + else: + assert obj.dtype == label + else: + raise ValueError + + def test_dtypes(self): + # to confirm test case covers intended dtypes + for typ, vals in self.data.items(): + self._check_expected_dtype(Index(vals), typ) + self._check_expected_dtype(Series(vals), typ) + + def test_concatlike_same_dtypes(self): + # GH 13660 + for typ1, vals1 in self.data.items(): + + vals2 = vals1 + vals3 = vals1 + + if typ1 == "category": + exp_data = pd.Categorical(list(vals1) + list(vals2)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="y") + res = i1.append(i2) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="x") + res = i1.append(i2) + exp = Index(exp_data, name="x") + tm.assert_index_equal(res, exp) + + # cannot append non-index + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append(vals2) + + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append([Index(vals2), vals3]) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append( + [Series(vals2), Series(vals3)], ignore_index=True + ) + exp = Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="y") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="x") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data, name="x") + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = ( + r"cannot concatenate object of type '.+'; " + "only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + Series(vals1).append(vals2) + + with pytest.raises(TypeError, match=msg): + Series(vals1).append([Series(vals2), vals3]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), vals2]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self): + # GH 13660 + for typ1, vals1 in self.data.items(): + for typ2, vals2 in self.data.items(): + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + continue + elif typ1 == "category" or typ2 == "category": + # TODO: suspicious + continue + + # specify expected dtype + if typ1 == "bool" and typ2 in ("int64", "float64"): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == "bool" and typ1 in ("int64", "float64"): + exp_series_dtype = typ1 + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == "datetime64[ns, US/Eastern]" + or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append( + [Series(vals2), Series(vals3)], ignore_index=True + ) + exp = Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + def test_concatlike_common_coerce_to_pandas_object(self): + # GH 13626 + # result must be Timestamp/Timedelta, not datetime.datetime/timedelta + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + + exp = Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ] + ) + + res = dti.append(tdi) + tm.assert_index_equal(res, exp) + assert isinstance(res[0], pd.Timestamp) + assert isinstance(res[-1], pd.Timedelta) + + dts = Series(dti) + tds = Series(tdi) + res = dts.append(tds) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + assert isinstance(res.iloc[0], pd.Timestamp) + assert isinstance(res.iloc[-1], pd.Timedelta) + + res = pd.concat([dts, tds]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + assert isinstance(res.iloc[0], pd.Timestamp) + assert isinstance(res.iloc[-1], pd.Timedelta) + + def test_concatlike_datetimetz(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH 7795 + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz + ) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = Series(dti1) + dts2 = Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) + def test_concatlike_datetimetz_short(self, tz): + # GH#7795 + ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) + ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) + df1 = DataFrame(0, index=ix1, columns=["A", "B"]) + df2 = DataFrame(0, index=ix2, columns=["A", "B"]) + + exp_idx = pd.DatetimeIndex( + ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], + tz=tz, + ) + exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) + + tm.assert_frame_equal(df1.append(df2), exp) + tm.assert_frame_equal(pd.concat([df1, df2]), exp) + + def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH 13660 + + # different tz coerces to object + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) + + exp = Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + ], + dtype=object, + ) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = Series(dti1) + dts2 = Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + # different tz + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + + exp = Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01", tz="US/Pacific"), + pd.Timestamp("2012-01-02", tz="US/Pacific"), + ], + dtype=object, + ) + + res = dti1.append(dti3) + # tm.assert_index_equal(res, exp) + + dts1 = Series(dti1) + dts3 = Series(dti3) + res = dts1.append(dts3) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts3]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period(self): + # GH 13660 + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") + + exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + ps2 = Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_diff_freq_to_object(self): + # GH 13221 + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") + + exp = Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ], + dtype=object, + ) + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + ps2 = Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_mixed_dt_to_object(self): + # GH 13221 + # different datetimelike + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + exp = Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ], + dtype=object, + ) + + res = pi1.append(tdi) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + tds = Series(tdi) + res = ps1.append(tds) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, tds]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + # inverse + exp = Index( + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + ], + dtype=object, + ) + + res = tdi.append(pi1) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + tds = Series(tdi) + res = tds.append(ps1) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([tds, ps1]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concat_categorical(self): + # GH 13524 + + # same categories -> category + s1 = Series([1, 2, np.nan], dtype="category") + s2 = Series([2, 1, 2], dtype="category") + + exp = Series([1, 2, np.nan, 2, 1, 2], dtype="category") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories => not-category + s1 = Series([3, 2], dtype="category") + s2 = Series([2, 1], dtype="category") + + exp = Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # completely different categories (same dtype) => not-category + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series([np.nan, 1, 3, 2], dtype="category") + + exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_union_categorical_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19096 + a = Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) + b = Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) + result = pd.concat([a, b], ignore_index=True) + expected = Series( + Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) + ) + tm.assert_series_equal(result, expected) + + def test_concat_categorical_coercion(self): + # GH 13524 + + # category + not-category => not-category + s1 = Series([1, 2, np.nan], dtype="category") + s2 = Series([2, 1, 2]) + + exp = Series([1, 2, np.nan, 2, 1, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # result shouldn't be affected by 1st elem dtype + exp = Series([2, 1, 2, 1, 2, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all values are not in category => not-category + s1 = Series([3, 2], dtype="category") + s2 = Series([2, 1]) + + exp = Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series([2, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completely different categories => not-category + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series([1, 3, 2]) + + exp = Series([10, 11, np.nan, 1, 3, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series([1, 3, 2, 10, 11, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # different dtype => not-category + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series(["a", "b", "c"]) + + exp = Series([10, 11, np.nan, "a", "b", "c"]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series(["a", "b", "c", 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # if normal series only contains NaN-likes => not-category + s1 = Series([10, 11], dtype="category") + s2 = Series([np.nan, np.nan, np.nan]) + + exp = Series([10, 11, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series([np.nan, np.nan, np.nan, 10, 11]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_concat_categorical_3elem_coercion(self): + # GH 13524 + + # mixed dtypes => not-category + s1 = Series([1, 2, np.nan], dtype="category") + s2 = Series([2, 1, 2], dtype="category") + s3 = Series([1, 2, 1, 2, np.nan]) + + exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = Series([4, 5, 6], dtype="category") + s2 = Series([1, 2, 3], dtype="category") + s3 = Series([1, 3, 4]) + + exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = Series([4, 5, 6], dtype="category") + s2 = Series([1, 2, 3], dtype="category") + s3 = Series([10, 11, 12]) + + exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = Series([1, 3], dtype="category") + s2 = Series([3, 4], dtype="category") + s3 = Series([2, 3]) + s4 = Series([2, 2], dtype="category") + s5 = Series([1, np.nan]) + s6 = Series([1, 3, 2], dtype="category") + + # mixed dtype, values are all in categories => not-category + exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series( + pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) + ) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + + def test_concat_categorical_coercion_nan(self): + # GH 13524 + + # some edge cases + # category + not-category => not category + s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") + s2 = Series([np.nan, 1]) + + exp = Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = Series([1, np.nan], dtype="category") + s2 = Series([np.nan, np.nan]) + + exp = Series([1, np.nan, np.nan, np.nan], dtype="float") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = Series([np.nan, np.nan], dtype="category") + s2 = Series([np.nan, np.nan]) + + exp = Series([np.nan, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all category nan-likes => category + s1 = Series([np.nan, np.nan], dtype="category") + s2 = Series([np.nan, np.nan], dtype="category") + + exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_empty(self): + # GH 13524 + + s1 = Series([], dtype="category") + s2 = Series([1, 2], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = Series([], dtype="category") + s2 = Series([], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + s1 = Series([], dtype="category") + s2 = Series([], dtype="object") + + # different dtype => not-category + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = Series([], dtype="category") + s2 = Series([np.nan, np.nan]) + + # empty Series is ignored + exp = Series([np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/concat/test_concat.py similarity index 55% rename from pandas/tests/reshape/test_concat.py rename to pandas/tests/reshape/concat/test_concat.py index 33048c0e0e2df..6fa4419f90138 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -3,7 +3,6 @@ from datetime import datetime from decimal import Decimal from io import StringIO -from itertools import combinations from warnings import catch_warnings import dateutil @@ -24,7 +23,6 @@ Timestamp, concat, date_range, - isna, read_csv, ) import pandas._testing as tm @@ -39,1093 +37,6 @@ def sort(request): return request.param -class TestConcatAppendCommon: - """ - Test common dtype coercion rules between concat and append. - """ - - def setup_method(self, method): - - dt_data = [ - pd.Timestamp("2011-01-01"), - pd.Timestamp("2011-01-02"), - pd.Timestamp("2011-01-03"), - ] - tz_data = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - pd.Timestamp("2011-01-03", tz="US/Eastern"), - ] - - td_data = [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - ] - - period_data = [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - ] - - self.data = { - "bool": [True, False, True], - "int64": [1, 2, 3], - "float64": [1.1, np.nan, 3.3], - "category": pd.Categorical(["X", "Y", "Z"]), - "object": ["a", "b", "c"], - "datetime64[ns]": dt_data, - "datetime64[ns, US/Eastern]": tz_data, - "timedelta64[ns]": td_data, - "period[M]": period_data, - } - - def _check_expected_dtype(self, obj, label): - """ - Check whether obj has expected dtype depending on label - considering not-supported dtypes - """ - if isinstance(obj, Index): - if label == "bool": - assert obj.dtype == "object" - else: - assert obj.dtype == label - elif isinstance(obj, Series): - if label.startswith("period"): - assert obj.dtype == "Period[M]" - else: - assert obj.dtype == label - else: - raise ValueError - - def test_dtypes(self): - # to confirm test case covers intended dtypes - for typ, vals in self.data.items(): - self._check_expected_dtype(Index(vals), typ) - self._check_expected_dtype(Series(vals), typ) - - def test_concatlike_same_dtypes(self): - # GH 13660 - for typ1, vals1 in self.data.items(): - - vals2 = vals1 - vals3 = vals1 - - if typ1 == "category": - exp_data = pd.Categorical(list(vals1) + list(vals2)) - exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) - else: - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = Index(vals1).append(Index(vals2)) - exp = Index(exp_data) - tm.assert_index_equal(res, exp) - - # 3 elements - res = Index(vals1).append([Index(vals2), Index(vals3)]) - exp = Index(exp_data3) - tm.assert_index_equal(res, exp) - - # index.append name mismatch - i1 = Index(vals1, name="x") - i2 = Index(vals2, name="y") - res = i1.append(i2) - exp = Index(exp_data) - tm.assert_index_equal(res, exp) - - # index.append name match - i1 = Index(vals1, name="x") - i2 = Index(vals2, name="x") - res = i1.append(i2) - exp = Index(exp_data, name="x") - tm.assert_index_equal(res, exp) - - # cannot append non-index - with pytest.raises(TypeError, match="all inputs must be Index"): - Index(vals1).append(vals2) - - with pytest.raises(TypeError, match="all inputs must be Index"): - Index(vals1).append([Index(vals2), vals3]) - - # ----- Series ----- # - - # series.append - res = Series(vals1).append(Series(vals2), ignore_index=True) - exp = Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = Series(vals1).append( - [Series(vals2), Series(vals3)], ignore_index=True - ) - exp = Series(exp_data3) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [Series(vals1), Series(vals2), Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) - - # name mismatch - s1 = Series(vals1, name="x") - s2 = Series(vals2, name="y") - res = s1.append(s2, ignore_index=True) - exp = Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # name match - s1 = Series(vals1, name="x") - s2 = Series(vals2, name="x") - res = s1.append(s2, ignore_index=True) - exp = Series(exp_data, name="x") - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # cannot append non-index - msg = ( - r"cannot concatenate object of type '.+'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - Series(vals1).append(vals2) - - with pytest.raises(TypeError, match=msg): - Series(vals1).append([Series(vals2), vals3]) - - with pytest.raises(TypeError, match=msg): - pd.concat([Series(vals1), vals2]) - - with pytest.raises(TypeError, match=msg): - pd.concat([Series(vals1), Series(vals2), vals3]) - - def test_concatlike_dtypes_coercion(self): - # GH 13660 - for typ1, vals1 in self.data.items(): - for typ2, vals2 in self.data.items(): - - vals3 = vals2 - - # basically infer - exp_index_dtype = None - exp_series_dtype = None - - if typ1 == typ2: - # same dtype is tested in test_concatlike_same_dtypes - continue - elif typ1 == "category" or typ2 == "category": - # TODO: suspicious - continue - - # specify expected dtype - if typ1 == "bool" and typ2 in ("int64", "float64"): - # series coerces to numeric based on numpy rule - # index doesn't because bool is object dtype - exp_series_dtype = typ2 - elif typ2 == "bool" and typ1 in ("int64", "float64"): - exp_series_dtype = typ1 - elif ( - typ1 == "datetime64[ns, US/Eastern]" - or typ2 == "datetime64[ns, US/Eastern]" - or typ1 == "timedelta64[ns]" - or typ2 == "timedelta64[ns]" - ): - exp_index_dtype = object - exp_series_dtype = object - - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = Index(vals1).append(Index(vals2)) - exp = Index(exp_data, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # 3 elements - res = Index(vals1).append([Index(vals2), Index(vals3)]) - exp = Index(exp_data3, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # ----- Series ----- # - - # series.append - res = Series(vals1).append(Series(vals2), ignore_index=True) - exp = Series(exp_data, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = Series(vals1).append( - [Series(vals2), Series(vals3)], ignore_index=True - ) - exp = Series(exp_data3, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [Series(vals1), Series(vals2), Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) - - def test_concatlike_common_coerce_to_pandas_object(self): - # GH 13626 - # result must be Timestamp/Timedelta, not datetime.datetime/timedelta - dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) - tdi = pd.TimedeltaIndex(["1 days", "2 days"]) - - exp = Index( - [ - pd.Timestamp("2011-01-01"), - pd.Timestamp("2011-01-02"), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - ] - ) - - res = dti.append(tdi) - tm.assert_index_equal(res, exp) - assert isinstance(res[0], pd.Timestamp) - assert isinstance(res[-1], pd.Timedelta) - - dts = Series(dti) - tds = Series(tdi) - res = dts.append(tds) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - assert isinstance(res.iloc[0], pd.Timestamp) - assert isinstance(res.iloc[-1], pd.Timedelta) - - res = pd.concat([dts, tds]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - assert isinstance(res.iloc[0], pd.Timestamp) - assert isinstance(res.iloc[-1], pd.Timedelta) - - def test_concatlike_datetimetz(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH 7795 - dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) - dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) - - exp = pd.DatetimeIndex( - ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz - ) - - res = dti1.append(dti2) - tm.assert_index_equal(res, exp) - - dts1 = Series(dti1) - dts2 = Series(dti2) - res = dts1.append(dts2) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([dts1, dts2]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) - def test_concatlike_datetimetz_short(self, tz): - # GH#7795 - ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) - ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) - df1 = DataFrame(0, index=ix1, columns=["A", "B"]) - df2 = DataFrame(0, index=ix2, columns=["A", "B"]) - - exp_idx = pd.DatetimeIndex( - ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], - tz=tz, - ) - exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) - - tm.assert_frame_equal(df1.append(df2), exp) - tm.assert_frame_equal(pd.concat([df1, df2]), exp) - - def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH 13660 - - # different tz coerces to object - dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) - dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) - - exp = Index( - [ - pd.Timestamp("2011-01-01", tz=tz), - pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-02"), - ], - dtype=object, - ) - - res = dti1.append(dti2) - tm.assert_index_equal(res, exp) - - dts1 = Series(dti1) - dts2 = Series(dti2) - res = dts1.append(dts2) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([dts1, dts2]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - # different tz - dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") - - exp = Index( - [ - pd.Timestamp("2011-01-01", tz=tz), - pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01", tz="US/Pacific"), - pd.Timestamp("2012-01-02", tz="US/Pacific"), - ], - dtype=object, - ) - - res = dti1.append(dti3) - # tm.assert_index_equal(res, exp) - - dts1 = Series(dti1) - dts3 = Series(dti3) - res = dts1.append(dts3) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([dts1, dts3]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - def test_concatlike_common_period(self): - # GH 13660 - pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") - pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") - - exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") - - res = pi1.append(pi2) - tm.assert_index_equal(res, exp) - - ps1 = Series(pi1) - ps2 = Series(pi2) - res = ps1.append(ps2) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([ps1, ps2]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - def test_concatlike_common_period_diff_freq_to_object(self): - # GH 13221 - pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") - pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") - - exp = Index( - [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2012-01-01", freq="D"), - pd.Period("2012-02-01", freq="D"), - ], - dtype=object, - ) - - res = pi1.append(pi2) - tm.assert_index_equal(res, exp) - - ps1 = Series(pi1) - ps2 = Series(pi2) - res = ps1.append(ps2) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([ps1, ps2]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - def test_concatlike_common_period_mixed_dt_to_object(self): - # GH 13221 - # different datetimelike - pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") - tdi = pd.TimedeltaIndex(["1 days", "2 days"]) - exp = Index( - [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - ], - dtype=object, - ) - - res = pi1.append(tdi) - tm.assert_index_equal(res, exp) - - ps1 = Series(pi1) - tds = Series(tdi) - res = ps1.append(tds) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([ps1, tds]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - # inverse - exp = Index( - [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - ], - dtype=object, - ) - - res = tdi.append(pi1) - tm.assert_index_equal(res, exp) - - ps1 = Series(pi1) - tds = Series(tdi) - res = tds.append(ps1) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([tds, ps1]) - tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) - - def test_concat_categorical(self): - # GH 13524 - - # same categories -> category - s1 = Series([1, 2, np.nan], dtype="category") - s2 = Series([2, 1, 2], dtype="category") - - exp = Series([1, 2, np.nan, 2, 1, 2], dtype="category") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # partially different categories => not-category - s1 = Series([3, 2], dtype="category") - s2 = Series([2, 1], dtype="category") - - exp = Series([3, 2, 2, 1]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # completely different categories (same dtype) => not-category - s1 = Series([10, 11, np.nan], dtype="category") - s2 = Series([np.nan, 1, 3, 2], dtype="category") - - exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - def test_union_categorical_same_categories_different_order(self): - # https://github.com/pandas-dev/pandas/issues/19096 - a = Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) - b = Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) - result = pd.concat([a, b], ignore_index=True) - expected = Series( - Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) - ) - tm.assert_series_equal(result, expected) - - def test_concat_categorical_coercion(self): - # GH 13524 - - # category + not-category => not-category - s1 = Series([1, 2, np.nan], dtype="category") - s2 = Series([2, 1, 2]) - - exp = Series([1, 2, np.nan, 2, 1, 2], dtype="object") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # result shouldn't be affected by 1st elem dtype - exp = Series([2, 1, 2, 1, 2, np.nan], dtype="object") - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # all values are not in category => not-category - s1 = Series([3, 2], dtype="category") - s2 = Series([2, 1]) - - exp = Series([3, 2, 2, 1]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = Series([2, 1, 3, 2]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # completely different categories => not-category - s1 = Series([10, 11, np.nan], dtype="category") - s2 = Series([1, 3, 2]) - - exp = Series([10, 11, np.nan, 1, 3, 2], dtype="object") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = Series([1, 3, 2, 10, 11, np.nan], dtype="object") - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # different dtype => not-category - s1 = Series([10, 11, np.nan], dtype="category") - s2 = Series(["a", "b", "c"]) - - exp = Series([10, 11, np.nan, "a", "b", "c"]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = Series(["a", "b", "c", 10, 11, np.nan]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # if normal series only contains NaN-likes => not-category - s1 = Series([10, 11], dtype="category") - s2 = Series([np.nan, np.nan, np.nan]) - - exp = Series([10, 11, np.nan, np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = Series([np.nan, np.nan, np.nan, 10, 11]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - def test_concat_categorical_3elem_coercion(self): - # GH 13524 - - # mixed dtypes => not-category - s1 = Series([1, 2, np.nan], dtype="category") - s2 = Series([2, 1, 2], dtype="category") - s3 = Series([1, 2, 1, 2, np.nan]) - - exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - - exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - - # values are all in either category => not-category - s1 = Series([4, 5, 6], dtype="category") - s2 = Series([1, 2, 3], dtype="category") - s3 = Series([1, 3, 4]) - - exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - - exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - - # values are all in either category => not-category - s1 = Series([4, 5, 6], dtype="category") - s2 = Series([1, 2, 3], dtype="category") - s3 = Series([10, 11, 12]) - - exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - - exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - - def test_concat_categorical_multi_coercion(self): - # GH 13524 - - s1 = Series([1, 3], dtype="category") - s2 = Series([3, 4], dtype="category") - s3 = Series([2, 3]) - s4 = Series([2, 2], dtype="category") - s5 = Series([1, np.nan]) - s6 = Series([1, 3, 2], dtype="category") - - # mixed dtype, values are all in categories => not-category - exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) - res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) - tm.assert_series_equal(res, exp) - res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) - tm.assert_series_equal(res, exp) - - exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) - res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) - tm.assert_series_equal(res, exp) - res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) - tm.assert_series_equal(res, exp) - - def test_concat_categorical_ordered(self): - # GH 13524 - - s1 = Series(pd.Categorical([1, 2, np.nan], ordered=True)) - s2 = Series(pd.Categorical([2, 1, 2], ordered=True)) - - exp = Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = Series( - pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) - ) - tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) - - def test_concat_categorical_coercion_nan(self): - # GH 13524 - - # some edge cases - # category + not-category => not category - s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") - s2 = Series([np.nan, 1]) - - exp = Series([np.nan, np.nan, np.nan, 1]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - s1 = Series([1, np.nan], dtype="category") - s2 = Series([np.nan, np.nan]) - - exp = Series([1, np.nan, np.nan, np.nan], dtype="float") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # mixed dtype, all nan-likes => not-category - s1 = Series([np.nan, np.nan], dtype="category") - s2 = Series([np.nan, np.nan]) - - exp = Series([np.nan, np.nan, np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # all category nan-likes => category - s1 = Series([np.nan, np.nan], dtype="category") - s2 = Series([np.nan, np.nan], dtype="category") - - exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category") - - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - def test_concat_categorical_empty(self): - # GH 13524 - - s1 = Series([], dtype="category") - s2 = Series([1, 2], dtype="category") - - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - - s1 = Series([], dtype="category") - s2 = Series([], dtype="category") - - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - - s1 = Series([], dtype="category") - s2 = Series([], dtype="object") - - # different dtype => not-category - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - - s1 = Series([], dtype="category") - s2 = Series([np.nan, np.nan]) - - # empty Series is ignored - exp = Series([np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - -class TestAppend: - def test_append(self, sort, float_frame): - mixed_frame = float_frame.copy() - mixed_frame["foo"] = "bar" - - begin_index = float_frame.index[:5] - end_index = float_frame.index[5:] - - begin_frame = float_frame.reindex(begin_index) - end_frame = float_frame.reindex(end_index) - - appended = begin_frame.append(end_frame) - tm.assert_almost_equal(appended["A"], float_frame["A"]) - - del end_frame["A"] - partial_appended = begin_frame.append(end_frame, sort=sort) - assert "A" in partial_appended - - partial_appended = end_frame.append(begin_frame, sort=sort) - assert "A" in partial_appended - - # mixed type handling - appended = mixed_frame[:5].append(mixed_frame[5:]) - tm.assert_frame_equal(appended, mixed_frame) - - # what to test here - mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort) - mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort) - - # all equal except 'foo' column - tm.assert_frame_equal( - mixed_appended.reindex(columns=["A", "B", "C", "D"]), - mixed_appended2.reindex(columns=["A", "B", "C", "D"]), - ) - - def test_append_empty(self, float_frame): - empty = DataFrame() - - appended = float_frame.append(empty) - tm.assert_frame_equal(float_frame, appended) - assert appended is not float_frame - - appended = empty.append(float_frame) - tm.assert_frame_equal(float_frame, appended) - assert appended is not float_frame - - def test_append_overlap_raises(self, float_frame): - msg = "Indexes have overlapping values" - with pytest.raises(ValueError, match=msg): - float_frame.append(float_frame, verify_integrity=True) - - def test_append_new_columns(self): - # see gh-6129: new columns - df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) - row = Series([5, 6, 7], index=["a", "b", "c"], name="z") - expected = DataFrame( - { - "a": {"x": 1, "y": 2, "z": 5}, - "b": {"x": 3, "y": 4, "z": 6}, - "c": {"z": 7}, - } - ) - result = df.append(row) - tm.assert_frame_equal(result, expected) - - def test_append_length0_frame(self, sort): - df = DataFrame(columns=["A", "B", "C"]) - df3 = DataFrame(index=[0, 1], columns=["A", "B"]) - df5 = df.append(df3, sort=sort) - - expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) - tm.assert_frame_equal(df5, expected) - - def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) - arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) - arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] - - df1 = DataFrame(arr1) - df2 = DataFrame(arr2) - - result = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate((arr1, arr2))) - tm.assert_frame_equal(result, expected) - - # rewrite sort fixture, since we also want to test default of None - def test_append_sorts(self, sort): - df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) - df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) - - with tm.assert_produces_warning(None): - result = df1.append(df2, sort=sort) - - # for None / True - expected = DataFrame( - {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, - columns=["a", "b", "c"], - ) - if sort is False: - expected = expected[["b", "a", "c"]] - tm.assert_frame_equal(result, expected) - - def test_append_different_columns(self, sort): - df = DataFrame( - { - "bools": np.random.randn(10) > 0, - "ints": np.random.randint(0, 10, 10), - "floats": np.random.randn(10), - "strings": ["foo", "bar"] * 5, - } - ) - - a = df[:5].loc[:, ["bools", "ints", "floats"]] - b = df[5:].loc[:, ["strings", "ints", "floats"]] - - appended = a.append(b, sort=sort) - assert isna(appended["strings"][0:4]).all() - assert isna(appended["bools"][5:]).all() - - def test_append_many(self, sort, float_frame): - chunks = [ - float_frame[:5], - float_frame[5:10], - float_frame[10:15], - float_frame[15:], - ] - - result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result, float_frame) - - chunks[-1] = chunks[-1].copy() - chunks[-1]["foo"] = "bar" - result = chunks[0].append(chunks[1:], sort=sort) - tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) - assert (result["foo"][15:] == "bar").all() - assert result["foo"][:15].isna().all() - - def test_append_preserve_index_name(self): - # #980 - df1 = DataFrame(columns=["A", "B", "C"]) - df1 = df1.set_index(["A"]) - df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) - df2 = df2.set_index(["A"]) - - result = df1.append(df2) - assert result.index.name == "A" - - indexes_can_append = [ - pd.RangeIndex(3), - Index([4, 5, 6]), - Index([4.5, 5.5, 6.5]), - Index(list("abc")), - pd.CategoricalIndex("A B C".split()), - pd.CategoricalIndex("D E F".split(), ordered=True), - pd.IntervalIndex.from_breaks([7, 8, 9, 10]), - pd.DatetimeIndex( - [ - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 3, 7, 12), - ] - ), - ] - - indexes_cannot_append_with_other = [ - pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) - ] - - all_indexes = indexes_can_append + indexes_cannot_append_with_other - - @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) - def test_append_same_columns_type(self, index): - # GH18359 - - # df wider than ser - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) - ser_index = index[:2] - ser = Series([7, 8], index=ser_index, name=2) - result = df.append(ser) - expected = DataFrame( - [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index - ) - tm.assert_frame_equal(result, expected) - - # ser wider than df - ser_index = index - index = index[:2] - df = DataFrame([[1, 2], [4, 5]], columns=index) - ser = Series([7, 8, 9], index=ser_index, name=2) - result = df.append(ser) - expected = DataFrame( - [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], - index=[0, 1, 2], - columns=ser_index, - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "df_columns, series_index", - combinations(indexes_can_append, r=2), - ids=lambda x: type(x).__name__, - ) - def test_append_different_columns_types(self, df_columns, series_index): - # GH18359 - # See also test 'test_append_different_columns_types_raises' below - # for errors raised when appending - - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) - ser = Series([7, 8, 9], index=series_index, name=2) - - result = df.append(ser) - idx_diff = ser.index.difference(df_columns) - combined_columns = Index(df_columns.tolist()).append(idx_diff) - expected = DataFrame( - [ - [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], - [4, 5, 6, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 7, 8, 9], - ], - index=[0, 1, 2], - columns=combined_columns, - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ - ) - @pytest.mark.parametrize( - "index_cannot_append_with_other", - indexes_cannot_append_with_other, - ids=lambda x: type(x).__name__, - ) - def test_append_different_columns_types_raises( - self, index_can_append, index_cannot_append_with_other - ): - # GH18359 - # Dataframe.append will raise if MultiIndex appends - # or is appended to a different index type - # - # See also test 'test_append_different_columns_types' above for - # appending without raising. - - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) - ser = Series([7, 8, 9], index=index_cannot_append_with_other, name=2) - msg = ( - r"Expected tuple, got (int|long|float|str|" - r"pandas._libs.interval.Interval)|" - r"object of type '(int|float|Timestamp|" - r"pandas._libs.interval.Interval)' has no len\(\)|" - ) - with pytest.raises(TypeError, match=msg): - df.append(ser) - - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) - ser = Series([7, 8, 9], index=index_can_append, name=2) - - with pytest.raises(TypeError, match=msg): - df.append(ser) - - def test_append_dtype_coerce(self, sort): - - # GH 4993 - # appending with datetime will incorrectly convert datetime64 - - df1 = DataFrame( - index=[1, 2], - data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], - columns=["start_time"], - ) - df2 = DataFrame( - index=[4, 5], - data=[ - [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], - ], - columns=["start_time", "end_time"], - ) - - expected = concat( - [ - Series( - [ - pd.NaT, - pd.NaT, - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10), - ], - name="end_time", - ), - Series( - [ - dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0), - ], - name="start_time", - ), - ], - axis=1, - sort=sort, - ) - result = df1.append(df2, ignore_index=True, sort=sort) - if sort: - expected = expected[["end_time", "start_time"]] - else: - expected = expected[["start_time", "end_time"]] - - tm.assert_frame_equal(result, expected) - - def test_append_missing_column_proper_upcast(self, sort): - df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) - df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) - - appended = df1.append(df2, ignore_index=True, sort=sort) - assert appended["A"].dtype == "f8" - assert appended["B"].dtype == "O" - - def test_append_empty_frame_to_series_with_dateutil_tz(self): - # GH 23682 - date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) - s = Series({"date": date, "a": 1.0, "b": 2.0}) - df = DataFrame(columns=["c", "d"]) - result_a = df.append(s, ignore_index=True) - expected = DataFrame( - [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] - ) - # These columns get cast to object after append - expected["c"] = expected["c"].astype(object) - expected["d"] = expected["d"].astype(object) - tm.assert_frame_equal(result_a, expected) - - expected = DataFrame( - [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"] - ) - expected["c"] = expected["c"].astype(object) - expected["d"] = expected["d"].astype(object) - - result_b = result_a.append(s, ignore_index=True) - tm.assert_frame_equal(result_b, expected) - - # column order is different - expected = expected[["c", "d", "date", "a", "b"]] - result = df.append([s, s], ignore_index=True) - tm.assert_frame_equal(result, expected) - - def test_append_empty_tz_frame_with_datetime64ns(self): - # https://github.com/pandas-dev/pandas/issues/35460 - df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - - # pd.NaT gets inferred as tz-naive, so append result is tz-naive - result = df.append({"a": pd.NaT}, ignore_index=True) - expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - # also test with typed value to append - df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - result = df.append( - Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True - ) - expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - class TestConcatenate: def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) @@ -2932,349 +1843,3 @@ def test_concat_preserves_extension_int64_dtype(): result = pd.concat([df_a, df_b], ignore_index=True) expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64") tm.assert_frame_equal(result, expected) - - -class TestSeriesConcat: - @pytest.mark.parametrize( - "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] - ) - def test_concat_empty_series_dtypes_match_roundtrips(self, dtype): - dtype = np.dtype(dtype) - - result = pd.concat([Series(dtype=dtype)]) - assert result.dtype == dtype - - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)]) - assert result.dtype == dtype - - def test_concat_empty_series_dtypes_roundtrips(self): - - # round-tripping with self & like self - dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) - - def int_result_type(dtype, dtype2): - typs = {dtype.kind, dtype2.kind} - if not len(typs - {"i", "u", "b"}) and ( - dtype.kind == "i" or dtype2.kind == "i" - ): - return "i" - elif not len(typs - {"u", "b"}) and ( - dtype.kind == "u" or dtype2.kind == "u" - ): - return "u" - return None - - def float_result_type(dtype, dtype2): - typs = {dtype.kind, dtype2.kind} - if not len(typs - {"f", "i", "u"}) and ( - dtype.kind == "f" or dtype2.kind == "f" - ): - return "f" - return None - - def get_result_type(dtype, dtype2): - result = float_result_type(dtype, dtype2) - if result is not None: - return result - result = int_result_type(dtype, dtype2) - if result is not None: - return result - return "O" - - for dtype in dtypes: - for dtype2 in dtypes: - if dtype == dtype2: - continue - - expected = get_result_type(dtype, dtype2) - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype - assert result.kind == expected - - @pytest.mark.parametrize( - "left,right,expected", - [ - # booleans - (np.bool_, np.int32, np.int32), - (np.bool_, np.float32, np.object_), - # datetime-like - ("m8[ns]", np.bool_, np.object_), - ("m8[ns]", np.int64, np.object_), - ("M8[ns]", np.bool_, np.object_), - ("M8[ns]", np.int64, np.object_), - # categorical - ("category", "category", "category"), - ("category", "object", "object"), - ], - ) - def test_concat_empty_series_dtypes(self, left, right, expected): - result = pd.concat([Series(dtype=left), Series(dtype=right)]) - assert result.dtype == expected - - def test_concat_empty_series_dtypes_triple(self): - - assert ( - pd.concat( - [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] - ).dtype - == np.object_ - ) - - def test_concat_empty_series_dtype_category_with_array(self): - # GH#18515 - assert ( - pd.concat( - [Series(np.array([]), dtype="category"), Series(dtype="float64")] - ).dtype - == "float64" - ) - - def test_concat_empty_series_dtypes_sparse(self): - result = pd.concat( - [ - Series(dtype="float64").astype("Sparse"), - Series(dtype="float64").astype("Sparse"), - ] - ) - assert result.dtype == "Sparse[float64]" - - result = pd.concat( - [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] - ) - # TODO: release-note: concat sparse dtype - expected = pd.SparseDtype(np.float64) - assert result.dtype == expected - - result = pd.concat( - [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] - ) - # TODO: release-note: concat sparse dtype - expected = pd.SparseDtype("object") - assert result.dtype == expected - - -class TestDataFrameConcat: - def test_concat_multiple_frames_dtypes(self): - - # GH#2759 - A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64) - B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) - results = pd.concat((A, B), axis=1).dtypes - expected = Series( - [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2, - index=["foo", "bar", 0, 1], - ) - tm.assert_series_equal(results, expected) - - def test_concat_multiple_tzs(self): - # GH#12467 - # combining datetime tz-aware and naive DataFrames - ts1 = Timestamp("2015-01-01", tz=None) - ts2 = Timestamp("2015-01-01", tz="UTC") - ts3 = Timestamp("2015-01-01", tz="EST") - - df1 = DataFrame(dict(time=[ts1])) - df2 = DataFrame(dict(time=[ts2])) - df3 = DataFrame(dict(time=[ts3])) - - results = pd.concat([df1, df2]).reset_index(drop=True) - expected = DataFrame(dict(time=[ts1, ts2]), dtype=object) - tm.assert_frame_equal(results, expected) - - results = pd.concat([df1, df3]).reset_index(drop=True) - expected = DataFrame(dict(time=[ts1, ts3]), dtype=object) - tm.assert_frame_equal(results, expected) - - results = pd.concat([df2, df3]).reset_index(drop=True) - expected = DataFrame(dict(time=[ts2, ts3])) - tm.assert_frame_equal(results, expected) - - @pytest.mark.parametrize( - "t1", - [ - "2015-01-01", - pytest.param( - pd.NaT, - marks=pytest.mark.xfail( - reason="GH23037 incorrect dtype when concatenating" - ), - ), - ], - ) - def test_concat_tz_NaT(self, t1): - # GH#22796 - # Concating tz-aware multicolumn DataFrames - ts1 = Timestamp(t1, tz="UTC") - ts2 = Timestamp("2015-01-01", tz="UTC") - ts3 = Timestamp("2015-01-01", tz="UTC") - - df1 = DataFrame([[ts1, ts2]]) - df2 = DataFrame([[ts3]]) - - result = pd.concat([df1, df2]) - expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0]) - - tm.assert_frame_equal(result, expected) - - def test_concat_tz_not_aligned(self): - # GH#22796 - ts = pd.to_datetime([1, 2]).tz_localize("UTC") - a = DataFrame({"A": ts}) - b = DataFrame({"A": ts, "B": ts}) - result = pd.concat([a, b], sort=True, ignore_index=True) - expected = DataFrame( - {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} - ) - tm.assert_frame_equal(result, expected) - - def test_concat_tuple_keys(self): - # GH#14438 - df1 = DataFrame(np.ones((2, 2)), columns=list("AB")) - df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) - results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")]) - expected = DataFrame( - { - "A": { - ("bee", "bah", 0): 1.0, - ("bee", "bah", 1): 1.0, - ("bee", "boo", 0): 2.0, - ("bee", "boo", 1): 2.0, - ("bee", "boo", 2): 2.0, - }, - "B": { - ("bee", "bah", 0): 1.0, - ("bee", "bah", 1): 1.0, - ("bee", "boo", 0): 2.0, - ("bee", "boo", 1): 2.0, - ("bee", "boo", 2): 2.0, - }, - } - ) - tm.assert_frame_equal(results, expected) - - def test_concat_named_keys(self): - # GH#14252 - df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) - index = Index(["a", "b"], name="baz") - concatted_named_from_keys = pd.concat([df, df], keys=index) - expected_named = DataFrame( - {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, - index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]), - ) - tm.assert_frame_equal(concatted_named_from_keys, expected_named) - - index_no_name = Index(["a", "b"], name=None) - concatted_named_from_names = pd.concat( - [df, df], keys=index_no_name, names=["baz"] - ) - tm.assert_frame_equal(concatted_named_from_names, expected_named) - - concatted_unnamed = pd.concat([df, df], keys=index_no_name) - expected_unnamed = DataFrame( - {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, - index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]), - ) - tm.assert_frame_equal(concatted_unnamed, expected_unnamed) - - def test_concat_axis_parameter(self): - # GH#14369 - df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2)) - df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2)) - - # Index/row/0 DataFrame - expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) - - concatted_index = pd.concat([df1, df2], axis="index") - tm.assert_frame_equal(concatted_index, expected_index) - - concatted_row = pd.concat([df1, df2], axis="rows") - tm.assert_frame_equal(concatted_row, expected_index) - - concatted_0 = pd.concat([df1, df2], axis=0) - tm.assert_frame_equal(concatted_0, expected_index) - - # Columns/1 DataFrame - expected_columns = DataFrame( - [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"] - ) - - concatted_columns = pd.concat([df1, df2], axis="columns") - tm.assert_frame_equal(concatted_columns, expected_columns) - - concatted_1 = pd.concat([df1, df2], axis=1) - tm.assert_frame_equal(concatted_1, expected_columns) - - series1 = Series([0.1, 0.2]) - series2 = Series([0.3, 0.4]) - - # Index/row/0 Series - expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) - - concatted_index_series = pd.concat([series1, series2], axis="index") - tm.assert_series_equal(concatted_index_series, expected_index_series) - - concatted_row_series = pd.concat([series1, series2], axis="rows") - tm.assert_series_equal(concatted_row_series, expected_index_series) - - concatted_0_series = pd.concat([series1, series2], axis=0) - tm.assert_series_equal(concatted_0_series, expected_index_series) - - # Columns/1 Series - expected_columns_series = DataFrame( - [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1] - ) - - concatted_columns_series = pd.concat([series1, series2], axis="columns") - tm.assert_frame_equal(concatted_columns_series, expected_columns_series) - - concatted_1_series = pd.concat([series1, series2], axis=1) - tm.assert_frame_equal(concatted_1_series, expected_columns_series) - - # Testing ValueError - with pytest.raises(ValueError, match="No axis named"): - pd.concat([series1, series2], axis="something") - - def test_concat_numerical_names(self): - # GH#15262, GH#12223 - df = DataFrame( - {"col": range(9)}, - dtype="int32", - index=( - pd.MultiIndex.from_product( - [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2] - ) - ), - ) - result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) - expected = DataFrame( - {"col": [0, 1, 7, 8]}, - dtype="int32", - index=pd.MultiIndex.from_tuples( - [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2] - ), - ) - tm.assert_frame_equal(result, expected) - - def test_concat_astype_dup_col(self): - # GH#23049 - df = DataFrame([{"a": "b"}]) - df = pd.concat([df, df], axis=1) - - result = df.astype("category") - expected = DataFrame( - np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] - ).astype("category") - tm.assert_frame_equal(result, expected) - - def test_concat_datetime_datetime64_frame(self): - # GH#2624 - rows = [] - rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), "hi"]) - - df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) - - ind = date_range(start="2000/1/1", freq="D", periods=10) - df1 = DataFrame({"date": ind, "test": range(10)}) - - # it works! - pd.concat([df1, df2_obj]) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py new file mode 100644 index 0000000000000..21abd1bed7cbc --- /dev/null +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -0,0 +1,236 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameConcat: + def test_concat_multiple_frames_dtypes(self): + + # GH#2759 + A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64) + B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) + results = pd.concat((A, B), axis=1).dtypes + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2, + index=["foo", "bar", 0, 1], + ) + tm.assert_series_equal(results, expected) + + def test_concat_multiple_tzs(self): + # GH#12467 + # combining datetime tz-aware and naive DataFrames + ts1 = Timestamp("2015-01-01", tz=None) + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="EST") + + df1 = DataFrame(dict(time=[ts1])) + df2 = DataFrame(dict(time=[ts2])) + df3 = DataFrame(dict(time=[ts3])) + + results = pd.concat([df1, df2]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts1, ts2]), dtype=object) + tm.assert_frame_equal(results, expected) + + results = pd.concat([df1, df3]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts1, ts3]), dtype=object) + tm.assert_frame_equal(results, expected) + + results = pd.concat([df2, df3]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts2, ts3])) + tm.assert_frame_equal(results, expected) + + @pytest.mark.parametrize( + "t1", + [ + "2015-01-01", + pytest.param( + pd.NaT, + marks=pytest.mark.xfail( + reason="GH23037 incorrect dtype when concatenating" + ), + ), + ], + ) + def test_concat_tz_NaT(self, t1): + # GH#22796 + # Concating tz-aware multicolumn DataFrames + ts1 = Timestamp(t1, tz="UTC") + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="UTC") + + df1 = DataFrame([[ts1, ts2]]) + df2 = DataFrame([[ts3]]) + + result = pd.concat([df1, df2]) + expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0]) + + tm.assert_frame_equal(result, expected) + + def test_concat_tz_not_aligned(self): + # GH#22796 + ts = pd.to_datetime([1, 2]).tz_localize("UTC") + a = DataFrame({"A": ts}) + b = DataFrame({"A": ts, "B": ts}) + result = pd.concat([a, b], sort=True, ignore_index=True) + expected = DataFrame( + {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} + ) + tm.assert_frame_equal(result, expected) + + def test_concat_tuple_keys(self): + # GH#14438 + df1 = DataFrame(np.ones((2, 2)), columns=list("AB")) + df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) + results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")]) + expected = DataFrame( + { + "A": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + "B": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + } + ) + tm.assert_frame_equal(results, expected) + + def test_concat_named_keys(self): + # GH#14252 + df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) + index = Index(["a", "b"], name="baz") + concatted_named_from_keys = pd.concat([df, df], keys=index) + expected_named = DataFrame( + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]), + ) + tm.assert_frame_equal(concatted_named_from_keys, expected_named) + + index_no_name = Index(["a", "b"], name=None) + concatted_named_from_names = pd.concat( + [df, df], keys=index_no_name, names=["baz"] + ) + tm.assert_frame_equal(concatted_named_from_names, expected_named) + + concatted_unnamed = pd.concat([df, df], keys=index_no_name) + expected_unnamed = DataFrame( + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]), + ) + tm.assert_frame_equal(concatted_unnamed, expected_unnamed) + + def test_concat_axis_parameter(self): + # GH#14369 + df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2)) + df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2)) + + # Index/row/0 DataFrame + expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) + + concatted_index = pd.concat([df1, df2], axis="index") + tm.assert_frame_equal(concatted_index, expected_index) + + concatted_row = pd.concat([df1, df2], axis="rows") + tm.assert_frame_equal(concatted_row, expected_index) + + concatted_0 = pd.concat([df1, df2], axis=0) + tm.assert_frame_equal(concatted_0, expected_index) + + # Columns/1 DataFrame + expected_columns = DataFrame( + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"] + ) + + concatted_columns = pd.concat([df1, df2], axis="columns") + tm.assert_frame_equal(concatted_columns, expected_columns) + + concatted_1 = pd.concat([df1, df2], axis=1) + tm.assert_frame_equal(concatted_1, expected_columns) + + series1 = Series([0.1, 0.2]) + series2 = Series([0.3, 0.4]) + + # Index/row/0 Series + expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) + + concatted_index_series = pd.concat([series1, series2], axis="index") + tm.assert_series_equal(concatted_index_series, expected_index_series) + + concatted_row_series = pd.concat([series1, series2], axis="rows") + tm.assert_series_equal(concatted_row_series, expected_index_series) + + concatted_0_series = pd.concat([series1, series2], axis=0) + tm.assert_series_equal(concatted_0_series, expected_index_series) + + # Columns/1 Series + expected_columns_series = DataFrame( + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1] + ) + + concatted_columns_series = pd.concat([series1, series2], axis="columns") + tm.assert_frame_equal(concatted_columns_series, expected_columns_series) + + concatted_1_series = pd.concat([series1, series2], axis=1) + tm.assert_frame_equal(concatted_1_series, expected_columns_series) + + # Testing ValueError + with pytest.raises(ValueError, match="No axis named"): + pd.concat([series1, series2], axis="something") + + def test_concat_numerical_names(self): + # GH#15262, GH#12223 + df = DataFrame( + {"col": range(9)}, + dtype="int32", + index=( + pd.MultiIndex.from_product( + [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2] + ) + ), + ) + result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) + expected = DataFrame( + {"col": [0, 1, 7, 8]}, + dtype="int32", + index=pd.MultiIndex.from_tuples( + [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_concat_astype_dup_col(self): + # GH#23049 + df = DataFrame([{"a": "b"}]) + df = pd.concat([df, df], axis=1) + + result = df.astype("category") + expected = DataFrame( + np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] + ).astype("category") + tm.assert_frame_equal(result, expected) + + def test_concat_datetime_datetime64_frame(self): + # GH#2624 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), "hi"]) + + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) + + ind = date_range(start="2000/1/1", freq="D", periods=10) + df1 = DataFrame({"date": ind, "test": range(10)}) + + # it works! + pd.concat([df1, df2_obj]) diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py new file mode 100644 index 0000000000000..7f84e937736ac --- /dev/null +++ b/pandas/tests/reshape/concat/test_series.py @@ -0,0 +1,123 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series + + +class TestSeriesConcat: + @pytest.mark.parametrize( + "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] + ) + def test_concat_empty_series_dtypes_match_roundtrips(self, dtype): + dtype = np.dtype(dtype) + + result = pd.concat([Series(dtype=dtype)]) + assert result.dtype == dtype + + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)]) + assert result.dtype == dtype + + def test_concat_empty_series_dtypes_roundtrips(self): + + # round-tripping with self & like self + dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) + + def int_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"i", "u", "b"}) and ( + dtype.kind == "i" or dtype2.kind == "i" + ): + return "i" + elif not len(typs - {"u", "b"}) and ( + dtype.kind == "u" or dtype2.kind == "u" + ): + return "u" + return None + + def float_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"f", "i", "u"}) and ( + dtype.kind == "f" or dtype2.kind == "f" + ): + return "f" + return None + + def get_result_type(dtype, dtype2): + result = float_result_type(dtype, dtype2) + if result is not None: + return result + result = int_result_type(dtype, dtype2) + if result is not None: + return result + return "O" + + for dtype in dtypes: + for dtype2 in dtypes: + if dtype == dtype2: + continue + + expected = get_result_type(dtype, dtype2) + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype + assert result.kind == expected + + @pytest.mark.parametrize( + "left,right,expected", + [ + # booleans + (np.bool_, np.int32, np.int32), + (np.bool_, np.float32, np.object_), + # datetime-like + ("m8[ns]", np.bool_, np.object_), + ("m8[ns]", np.int64, np.object_), + ("M8[ns]", np.bool_, np.object_), + ("M8[ns]", np.int64, np.object_), + # categorical + ("category", "category", "category"), + ("category", "object", "object"), + ], + ) + def test_concat_empty_series_dtypes(self, left, right, expected): + result = pd.concat([Series(dtype=left), Series(dtype=right)]) + assert result.dtype == expected + + def test_concat_empty_series_dtypes_triple(self): + + assert ( + pd.concat( + [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] + ).dtype + == np.object_ + ) + + def test_concat_empty_series_dtype_category_with_array(self): + # GH#18515 + assert ( + pd.concat( + [Series(np.array([]), dtype="category"), Series(dtype="float64")] + ).dtype + == "float64" + ) + + def test_concat_empty_series_dtypes_sparse(self): + result = pd.concat( + [ + Series(dtype="float64").astype("Sparse"), + Series(dtype="float64").astype("Sparse"), + ] + ) + assert result.dtype == "Sparse[float64]" + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] + ) + # TODO: release-note: concat sparse dtype + expected = pd.SparseDtype(np.float64) + assert result.dtype == expected + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] + ) + # TODO: release-note: concat sparse dtype + expected = pd.SparseDtype("object") + assert result.dtype == expected