Skip to content

Commit 1dcddba

Browse files
paul-mannino authored and jreback committed
BUG: Concatenation of TZ-aware dataframes
closes pandas-dev#12396 closes pandas-dev#18447
1 parent 3283ae8 commit 1dcddba

File tree

4 files changed: +167 -6 lines changed

doc/source/whatsnew/v0.23.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -1373,6 +1373,8 @@ Reshaping
13731373
- Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
13741374
- Bug in :func:`get_dummies`, and :func:`select_dtypes`, where duplicate column names caused incorrect behavior (:issue:`20848`)
13751375
- Bug in :func:`isna`, which cannot handle ambiguous typed lists (:issue:`20675`)
1376+
- Bug in :func:`concat` which raises an error when concatenating TZ-aware dataframes and all-NaT dataframes (:issue:`12396`)
1377+
- Bug in :func:`concat` which raises an error when concatenating empty TZ-aware series (:issue:`18447`)
13761378

13771379
Other
13781380
^^^^^

pandas/core/indexes/base.py

+6 -4
Original file line number | Diff line number | Diff line change
@@ -2183,17 +2183,19 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
21832183
fill_value=None, na_value=np.nan):
21842184
""" Internal method to handle NA filling of take """
21852185
indices = _ensure_platform_int(indices)
2186-
21872186
# only fill if we are passing a non-None fill_value
21882187
if allow_fill and fill_value is not None:
21892188
if (indices < -1).any():
21902189
msg = ('When allow_fill=True and fill_value is not None, '
21912190
'all indices must be >= -1')
21922191
raise ValueError(msg)
2193-
taken = values.take(indices)
21942192
mask = indices == -1
2195-
if mask.any():
2196-
taken[mask] = na_value
2193+
if mask.all():
2194+
taken = np.full(indices.shape, fill_value=na_value)
2195+
else:
2196+
taken = values.take(indices)
2197+
if mask.any():
2198+
taken[mask] = na_value
21972199
else:
21982200
taken = values.take(indices)
21992201
return taken

pandas/core/internals.py

+4 -2
Original file line number | Diff line number | Diff line change
@@ -5835,8 +5835,10 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
58355835
if len(values) and values[0] is None:
58365836
fill_value = None
58375837

5838-
if getattr(self.block, 'is_datetimetz', False):
5839-
pass
5838+
if getattr(self.block, 'is_datetimetz', False) or \
5839+
is_datetimetz(empty_dtype):
5840+
missing_arr = np.full(np.prod(self.shape), fill_value)
5841+
return DatetimeIndex(missing_arr, dtype=empty_dtype)
58405842
elif getattr(self.block, 'is_categorical', False):
58415843
pass
58425844
elif getattr(self.block, 'is_sparse', False):

pandas/tests/reshape/test_concat.py

+155
Original file line number | Diff line number | Diff line change
@@ -1917,6 +1917,135 @@ def test_concat_tz_series_tzlocal(self):
19171917
tm.assert_series_equal(result, pd.Series(x + y))
19181918
assert result.dtype == 'datetime64[ns, tzlocal()]'
19191919

1920+
def test_concat_NaT_dataframes_all_NaT_axis_0(self):
1921+
# GH 12396
1922+
1923+
# tz-naive
1924+
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
1925+
second = pd.DataFrame([[pd.NaT]])
1926+
1927+
result = pd.concat([first, second], axis=0)
1928+
expected = pd.DataFrame([pd.NaT, pd.NaT, pd.NaT], index=[0, 1, 0])
1929+
assert_frame_equal(result, expected)
1930+
1931+
# one side timezone-aware
1932+
# upcasts for mixed case
1933+
first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
1934+
result = pd.concat([first, second], axis=0)
1935+
expected = pd.DataFrame(
1936+
pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
1937+
index=[0, 1, 0]
1938+
)
1939+
assert_frame_equal(result, expected)
1940+
1941+
# both sides timezone-aware
1942+
# upcasts to tz-aware
1943+
second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize('UTC'))
1944+
result = pd.concat([first, second], axis=0)
1945+
assert_frame_equal(result, expected)
1946+
1947+
def test_concat_NaT_dataframes_all_NaT_axis_1(self):
1948+
# GH 12396
1949+
1950+
# tz-naive
1951+
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
1952+
second = pd.DataFrame([[pd.NaT]], columns=[1])
1953+
expected = pd.DataFrame([[pd.NaT, pd.NaT], [pd.NaT, pd.NaT]],
1954+
columns=[0, 1])
1955+
result = pd.concat([first, second], axis=1)
1956+
assert_frame_equal(result, expected)
1957+
1958+
# one side timezone-aware
1959+
# upcasts result to tz-aware
1960+
first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
1961+
expected = pd.DataFrame(
1962+
{0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
1963+
1: pd.Series([pd.NaT, pd.NaT])}
1964+
)
1965+
result = pd.concat([first, second], axis=1)
1966+
assert_frame_equal(result, expected)
1967+
1968+
# both sides timezone-aware
1969+
# upcasts result to tz-aware
1970+
second[1] = second[1].dt.tz_localize('UTC')
1971+
expected = pd.DataFrame(
1972+
{0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
1973+
1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC')}
1974+
)
1975+
result = pd.concat([first, second], axis=1)
1976+
assert_frame_equal(result, expected)
1977+
1978+
def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self):
1979+
# GH 12396
1980+
1981+
# tz-naive
1982+
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
1983+
second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
1984+
[pd.Timestamp('2016/01/01')]],
1985+
index=[2, 3])
1986+
expected = pd.DataFrame([pd.NaT, pd.NaT,
1987+
pd.Timestamp('2015/01/01'),
1988+
pd.Timestamp('2016/01/01')])
1989+
1990+
result = pd.concat([first, second], axis=0)
1991+
assert_frame_equal(result, expected)
1992+
1993+
# one side timezone-aware
1994+
second = second[0].dt.tz_localize('UTC')
1995+
expected = pd.DataFrame(
1996+
pd.Series([pd.NaT, pd.NaT,
1997+
pd.Timestamp('2015/01/01'),
1998+
pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
1999+
)
2000+
result = pd.concat([first, second], axis=0)
2001+
assert_frame_equal(result, expected)
2002+
2003+
def test_concat_NaT_series_dataframe_all_NaT(self):
2004+
# GH 12396
2005+
2006+
# tz-naive
2007+
first = pd.Series([pd.NaT, pd.NaT])
2008+
second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
2009+
[pd.Timestamp('2016/01/01')]],
2010+
index=[2, 3])
2011+
2012+
expected = pd.DataFrame([pd.NaT, pd.NaT,
2013+
pd.Timestamp('2015/01/01'),
2014+
pd.Timestamp('2016/01/01')])
2015+
2016+
result = pd.concat([first, second])
2017+
assert_frame_equal(result, expected)
2018+
2019+
# one side timezone-aware
2020+
second[0] = second[0].dt.tz_localize('UTC')
2021+
result = pd.concat([first, second])
2022+
2023+
expected = pd.DataFrame(
2024+
pd.Series([pd.NaT, pd.NaT,
2025+
pd.Timestamp('2015/01/01'),
2026+
pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
2027+
)
2028+
assert_frame_equal(result, expected)
2029+
2030+
# both sides timezone-aware
2031+
first = first.dt.tz_localize('UTC')
2032+
result = pd.concat([first, second])
2033+
assert_frame_equal(result, expected)
2034+
2035+
# mixed tz
2036+
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
2037+
second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz='UTC')],
2038+
[pd.Timestamp('2016/01/01', tz='US/Eastern')]],
2039+
index=[2, 3])
2040+
2041+
expected = pd.DataFrame([pd.NaT,
2042+
pd.NaT,
2043+
pd.Timestamp('2015/01/01', tz='UTC'),
2044+
pd.Timestamp('2016/01/01', tz='US/Eastern')])
2045+
2046+
result = pd.concat([first, second], axis=0)
2047+
assert_frame_equal(result, expected)
2048+
19202049
def test_concat_period_series(self):
19212050
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
19222051
y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D'))
@@ -1978,6 +2107,32 @@ def test_concat_empty_series(self):
19782107
columns=['x', 0])
19792108
tm.assert_frame_equal(res, exp)
19802109

2110+
# GH 18447
2111+
# tz-naive
2112+
first = Series(pd.to_datetime([], utc=False))
2113+
second = Series([1, 2, 3])
2114+
expected = DataFrame([[pd.NaT, 1], [pd.NaT, 2], [pd.NaT, 3]])
2115+
result = concat([first, second], axis=1)
2116+
assert_frame_equal(result, expected)
2117+
2118+
# timezone-aware
2119+
first = Series(pd.to_datetime([], utc=True))
2120+
second = Series([1, 2, 3])
2121+
expected = DataFrame(
2122+
{0: pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
2123+
1: pd.Series([1, 2, 3])}
2124+
)
2125+
result = concat([first, second], axis=1)
2126+
assert_frame_equal(result, expected)
2127+
2128+
# both empty
2129+
first = Series(pd.to_datetime([], utc=True))
2130+
second = Series([])
2131+
result = concat([first, second], axis=1)
2132+
assert result.size == 0
2133+
assert result.dtypes[0] == 'datetime64[ns, UTC]'
2134+
assert result.dtypes[1] == 'float64'
2135+
19812136
def test_default_index(self):
19822137
# is_series and ignore_index
19832138
s1 = pd.Series([1, 2, 3], name='x')

0 commit comments

Comments
 (0)