Skip to content

Commit a5c02d5

Browse files
committed
rework tests & implementation in concat of tz-aware dataframes
1 parent 1dcddba commit a5c02d5

File tree

4 files changed

+105
-174
lines changed

4 files changed

+105
-174
lines changed

pandas/core/dtypes/concat.py

+51-48
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,13 @@ def _maybe_unwrap(x):
416416
fastpath=True)
417417

418418

419+
def _concatenate_2d(to_concat, axis):
420+
# coerce to 2d if needed & concatenate
421+
if axis == 1:
422+
to_concat = [np.atleast_2d(x) for x in to_concat]
423+
return np.concatenate(to_concat, axis=axis)
424+
425+
419426
def _concat_datetime(to_concat, axis=0, typs=None):
420427
"""
421428
provide concatenation of a datetimelike array of arrays each of which is a
@@ -432,61 +439,57 @@ def _concat_datetime(to_concat, axis=0, typs=None):
432439
a single array, preserving the combined dtypes
433440
"""
434441

435-
def convert_to_pydatetime(x, axis):
436-
# coerce to an object dtype
442+
if typs is None:
443+
typs = get_dtype_kinds(to_concat)
437444

438-
# if dtype is of datetimetz or timezone
439-
if x.dtype.kind == _NS_DTYPE.kind:
440-
if getattr(x, 'tz', None) is not None:
441-
x = x.astype(object).values
442-
else:
443-
shape = x.shape
444-
x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(),
445-
box="timestamp")
446-
x = x.reshape(shape)
445+
# multiple types, need to coerce to object
446+
if len(typs) != 1:
447+
return _concatenate_2d([_convert_datetimelike_to_object(x)
448+
for x in to_concat],
449+
axis=axis)
447450

448-
elif x.dtype == _TD_DTYPE:
449-
shape = x.shape
450-
x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
451-
x = x.reshape(shape)
451+
# must be single dtype
452+
if any(typ.startswith('datetime') for typ in typs):
452453

453-
if axis == 1:
454-
x = np.atleast_2d(x)
455-
return x
454+
if 'datetime' in typs:
455+
to_concat = [np.array(x, copy=False).view(np.int64)
456+
for x in to_concat]
457+
return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE)
458+
else:
459+
# when to_concat has different tz, len(typs) > 1.
460+
# thus no need to handle the mixed-tz case here
461+
return _concat_datetimetz(to_concat)
456462

457-
if typs is None:
458-
typs = get_dtype_kinds(to_concat)
463+
elif 'timedelta' in typs:
464+
return _concatenate_2d([x.view(np.int64) for x in to_concat],
465+
axis=axis).view(_TD_DTYPE)
459466

460-
# must be single dtype
461-
if len(typs) == 1:
462-
_contains_datetime = any(typ.startswith('datetime') for typ in typs)
463-
_contains_period = any(typ.startswith('period') for typ in typs)
467+
elif any(typ.startswith('period') for typ in typs):
468+
# PeriodIndex must be handled by PeriodIndex,
469+
# Thus can't meet this condition ATM
470+
# Must be changed when we add PeriodDtype
471+
raise NotImplementedError("unable to concat PeriodDtype")
464472

465-
if _contains_datetime:
466473

467-
if 'datetime' in typs:
468-
new_values = np.concatenate([x.view(np.int64) for x in
469-
to_concat], axis=axis)
470-
return new_values.view(_NS_DTYPE)
471-
else:
472-
# when to_concat has different tz, len(typs) > 1.
473-
# thus no need to handle the mixed-tz case here
474-
return _concat_datetimetz(to_concat)
475-
476-
elif 'timedelta' in typs:
477-
new_values = np.concatenate([x.view(np.int64) for x in to_concat],
478-
axis=axis)
479-
return new_values.view(_TD_DTYPE)
480-
481-
elif _contains_period:
482-
# PeriodIndex must be handled by PeriodIndex,
483-
# Thus can't meet this condition ATM
484-
# Must be changed when we add PeriodDtype
485-
raise NotImplementedError
486-
487-
# need to coerce to object
488-
to_concat = [convert_to_pydatetime(x, axis) for x in to_concat]
489-
return np.concatenate(to_concat, axis=axis)
474+
def _convert_datetimelike_to_object(x):
475+
# coerce datetimelike array to object dtype
476+
477+
# if dtype is of datetimetz or timezone
478+
if x.dtype.kind == _NS_DTYPE.kind:
479+
if getattr(x, 'tz', None) is not None:
480+
x = x.astype(object).values
481+
else:
482+
shape = x.shape
483+
x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(),
484+
box="timestamp")
485+
x = x.reshape(shape)
486+
487+
elif x.dtype == _TD_DTYPE:
488+
shape = x.shape
489+
x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
490+
x = x.reshape(shape)
491+
492+
return x
490493

491494

492495
def _concat_datetimetz(to_concat, name=None):

pandas/core/indexes/base.py

+5-7
Original file line numberDiff line numberDiff line change
@@ -2183,19 +2183,17 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
21832183
fill_value=None, na_value=np.nan):
21842184
""" Internal method to handle NA filling of take """
21852185
indices = _ensure_platform_int(indices)
2186+
21862187
# only fill if we are passing a non-None fill_value
21872188
if allow_fill and fill_value is not None:
21882189
if (indices < -1).any():
21892190
msg = ('When allow_fill=True and fill_value is not None, '
21902191
'all indices must be >= -1')
21912192
raise ValueError(msg)
2192-
mask = indices == -1
2193-
if mask.all():
2194-
taken = np.full(indices.shape, fill_value=na_value)
2195-
else:
2196-
taken = values.take(indices)
2197-
if mask.any():
2198-
taken[mask] = na_value
2193+
taken = algos.take(values,
2194+
indices,
2195+
allow_fill=allow_fill,
2196+
fill_value=na_value)
21992197
else:
22002198
taken = values.take(indices)
22012199
return taken

pandas/core/internals.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -5837,8 +5837,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
58375837

58385838
if getattr(self.block, 'is_datetimetz', False) or \
58395839
is_datetimetz(empty_dtype):
5840-
missing_arr = np.full(np.prod(self.shape), fill_value)
5841-
return DatetimeIndex(missing_arr, dtype=empty_dtype)
5840+
pass
58425841
elif getattr(self.block, 'is_categorical', False):
58435842
pass
58445843
elif getattr(self.block, 'is_sparse', False):

pandas/tests/reshape/test_concat.py

+48-117
Original file line numberDiff line numberDiff line change
@@ -1917,131 +1917,73 @@ def test_concat_tz_series_tzlocal(self):
19171917
tm.assert_series_equal(result, pd.Series(x + y))
19181918
assert result.dtype == 'datetime64[ns, tzlocal()]'
19191919

1920-
def test_concat_NaT_dataframes_all_NaT_axis_0(self):
1920+
@pytest.mark.parametrize('tz1', [None, 'UTC'])
1921+
@pytest.mark.parametrize('tz2', [None, 'UTC'])
1922+
@pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')])
1923+
def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
19211924
# GH 12396
19221925

19231926
# tz-naive
1924-
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
1925-
second = pd.DataFrame([[pd.NaT]])
1927+
first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply(
1928+
lambda x: x.dt.tz_localize(tz1))
1929+
second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2))
19261930

19271931
result = pd.concat([first, second], axis=0)
1928-
expected = pd.DataFrame([pd.NaT, pd.NaT, pd.NaT], index=[0, 1, 0])
1929-
assert_frame_equal(result, expected)
1932+
expected = pd.DataFrame(pd.Series(
1933+
[pd.NaT, pd.NaT, s], index=[0, 1, 0]))
1934+
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
1935+
if tz1 != tz2:
1936+
expected = expected.astype(object)
19301937

1931-
# one side timezone-aware
1932-
# upcasts for mixed case
1933-
first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
1934-
result = pd.concat([first, second], axis=0)
1935-
expected = pd.DataFrame(
1936-
pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
1937-
index=[0, 1, 0]
1938-
)
19391938
assert_frame_equal(result, expected)
19401939

1941-
# both sides timezone-aware
1942-
# upcasts to tz-aware
1943-
second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize('UTC'))
1944-
result = pd.concat([first, second], axis=0)
1945-
assert_frame_equal(result, expected)
1946-
1947-
def test_concat_NaT_dataframes_all_NaT_axis_1(self):
1940+
@pytest.mark.parametrize('tz1', [None, 'UTC'])
1941+
@pytest.mark.parametrize('tz2', [None, 'UTC'])
1942+
def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
19481943
# GH 12396
19491944

1950-
# tz-naive
1951-
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
1952-
second = pd.DataFrame([[pd.NaT]], columns=[1])
1953-
expected = pd.DataFrame([[pd.NaT, pd.NaT], [pd.NaT, pd.NaT]],
1954-
columns=[0, 1])
1955-
result = pd.concat([first, second], axis=1)
1956-
assert_frame_equal(result, expected)
1957-
1958-
# one side timezone-aware
1959-
# upcasts result to tz-aware
1960-
first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
1945+
first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
1946+
second = pd.DataFrame(pd.Series(
1947+
[pd.NaT]).dt.tz_localize(tz2), columns=[1])
19611948
expected = pd.DataFrame(
1962-
{0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
1963-
1: pd.Series([pd.NaT, pd.NaT])}
1949+
{0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
1950+
1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)}
19641951
)
19651952
result = pd.concat([first, second], axis=1)
19661953
assert_frame_equal(result, expected)
19671954

1968-
# both sides timezone-aware
1969-
# upcasts result to tz-aware
1970-
second[1] = second[1].dt.tz_localize('UTC')
1971-
expected = pd.DataFrame(
1972-
{0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
1973-
1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC')}
1974-
)
1975-
result = pd.concat([first, second], axis=1)
1976-
assert_frame_equal(result, expected)
1977-
1978-
def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self):
1955+
@pytest.mark.parametrize('tz1', [None, 'UTC'])
1956+
@pytest.mark.parametrize('tz2', [None, 'UTC'])
1957+
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
19791958
# GH 12396
19801959

19811960
# tz-naive
1982-
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
1983-
second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
1984-
[pd.Timestamp('2016/01/01')]],
1985-
index=[2, 3])
1986-
expected = pd.DataFrame([pd.NaT, pd.NaT,
1987-
pd.Timestamp('2015/01/01'),
1988-
pd.Timestamp('2016/01/01')])
1989-
1990-
result = pd.concat([first, second], axis=0)
1991-
assert_frame_equal(result, expected)
1992-
1993-
# one side timezone-aware
1994-
second = second[0].dt.tz_localize('UTC')
1995-
expected = pd.DataFrame(
1996-
pd.Series([pd.NaT, pd.NaT,
1997-
pd.Timestamp('2015/01/01'),
1998-
pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
1999-
)
2000-
result = pd.concat([first, second], axis=0)
2001-
assert_frame_equal(result, expected)
2002-
2003-
def test_concat_NaT_series_dataframe_all_NaT(self):
2004-
# GH 12396
2005-
2006-
# tz-naive
2007-
first = pd.Series([pd.NaT, pd.NaT])
2008-
second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
2009-
[pd.Timestamp('2016/01/01')]],
1961+
first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
1962+
second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)],
1963+
[pd.Timestamp('2016/01/01', tz=tz2)]],
20101964
index=[2, 3])
20111965

20121966
expected = pd.DataFrame([pd.NaT, pd.NaT,
2013-
pd.Timestamp('2015/01/01'),
2014-
pd.Timestamp('2016/01/01')])
1967+
pd.Timestamp('2015/01/01', tz=tz2),
1968+
pd.Timestamp('2016/01/01', tz=tz2)])
1969+
if tz1 != tz2:
1970+
expected = expected.astype(object)
20151971

20161972
result = pd.concat([first, second])
20171973
assert_frame_equal(result, expected)
20181974

2019-
# one side timezone-aware
2020-
second[0] = second[0].dt.tz_localize('UTC')
2021-
result = pd.concat([first, second])
2022-
2023-
expected = pd.DataFrame(
2024-
pd.Series([pd.NaT, pd.NaT,
2025-
pd.Timestamp('2015/01/01'),
2026-
pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
2027-
)
2028-
assert_frame_equal(result, expected)
2029-
2030-
# both sides timezone-aware
2031-
first = first.dt.tz_localize('UTC')
2032-
result = pd.concat([first, second])
2033-
assert_frame_equal(result, expected)
1975+
@pytest.mark.parametrize('tz', [None, 'UTC'])
1976+
def test_concat_NaT_dataframes(self, tz):
1977+
# GH 12396
20341978

2035-
# mixed tz
20361979
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
2037-
second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz='UTC')],
2038-
[pd.Timestamp('2016/01/01', tz='US/Eastern')]],
1980+
first = first.apply(lambda x: x.dt.tz_localize(tz))
1981+
second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)],
1982+
[pd.Timestamp('2016/01/01', tz=tz)]],
20391983
index=[2, 3])
2040-
2041-
expected = pd.DataFrame([pd.NaT,
2042-
pd.NaT,
2043-
pd.Timestamp('2015/01/01', tz='UTC'),
2044-
pd.Timestamp('2016/01/01', tz='US/Eastern')])
1984+
expected = pd.DataFrame([pd.NaT, pd.NaT,
1985+
pd.Timestamp('2015/01/01', tz=tz),
1986+
pd.Timestamp('2016/01/01', tz=tz)])
20451987

20461988
result = pd.concat([first, second], axis=0)
20471989
assert_frame_equal(result, expected)
@@ -2107,32 +2049,21 @@ def test_concat_empty_series(self):
21072049
columns=['x', 0])
21082050
tm.assert_frame_equal(res, exp)
21092051

2052+
@pytest.mark.parametrize('tz', [None, 'UTC'])
2053+
@pytest.mark.parametrize('values', [[], [1, 2, 3]])
2054+
def test_concat_empty_series_timelike(self, tz, values):
21102055
# GH 18447
2111-
# tz-naive
2112-
first = Series(pd.to_datetime([], utc=False))
2113-
second = Series([1, 2, 3])
2114-
expected = DataFrame([[pd.NaT, 1], [pd.NaT, 2], [pd.NaT, 3]])
2115-
result = concat([first, second], axis=1)
2116-
assert_frame_equal(result, expected)
21172056

2118-
# timezone-aware
2119-
first = Series(pd.to_datetime([], utc=True))
2120-
second = Series([1, 2, 3])
2057+
first = Series([], dtype='M8[ns]').dt.tz_localize(tz)
2058+
second = Series(values)
21212059
expected = DataFrame(
2122-
{0: pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
2123-
1: pd.Series([1, 2, 3])}
2124-
)
2060+
{0: pd.Series([pd.NaT] * len(values),
2061+
dtype='M8[ns]'
2062+
).dt.tz_localize(tz),
2063+
1: values})
21252064
result = concat([first, second], axis=1)
21262065
assert_frame_equal(result, expected)
21272066

2128-
# both empty
2129-
first = Series(pd.to_datetime([], utc=True))
2130-
second = Series([])
2131-
result = concat([first, second], axis=1)
2132-
assert result.size == 0
2133-
assert result.dtypes[0] == 'datetime64[ns, UTC]'
2134-
assert result.dtypes[1] == 'float64'
2135-
21362067
def test_default_index(self):
21372068
# is_series and ignore_index
21382069
s1 = pd.Series([1, 2, 3], name='x')

0 commit comments

Comments
 (0)