Skip to content

Commit 4d51aff

Browse files
author
Artemy Kolchinsky
committed
BUG: Make .iloc and .loc indexing consistent on empty dataframes
Tests Fix Test reorder Doc update Tests fix Tests fix SQL tests fix Testing update Fixes Testing fix Test fix
1 parent 7eb5668 commit 4d51aff

File tree

10 files changed

+64
-22
lines changed

10 files changed

+64
-22
lines changed

doc/source/whatsnew/v0.16.1.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -217,9 +217,8 @@ Bug Fixes
217217
- Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`)
218218
- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)
219219

220-
221220
- Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`)
222-
221+
- Bug where ``.iloc`` and ``.loc`` behavior is not consistent on empty dataframes (:issue:`9964`)
223222

224223
- Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`)
225224

pandas/core/frame.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1740,17 +1740,19 @@ def _ixs(self, i, axis=0):
17401740
lab_slice = slice(label[0], label[-1])
17411741
return self.ix[:, lab_slice]
17421742
else:
1743-
label = self.columns[i]
17441743
if isinstance(label, Index):
17451744
return self.take(i, axis=1, convert=True)
17461745

1746+
index_len = len(self.index)
1747+
17471748
# if the values returned are not the same length
17481749
# as the index (iow a not found value), iget returns
17491750
# a 0-len ndarray. This is effectively catching
17501751
# a numpy error (as numpy should really raise)
17511752
values = self._data.iget(i)
1752-
if not len(values):
1753-
values = np.array([np.nan] * len(self.index), dtype=object)
1753+
1754+
if index_len and not len(values):
1755+
values = np.array([np.nan] * index_len, dtype=object)
17541756
result = self._constructor_sliced.from_array(
17551757
values, index=self.index,
17561758
name=label, fastpath=True)

pandas/io/tests/test_json/test_pandas.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,14 @@ def test_frame_to_json_except(self):
324324
def test_frame_empty(self):
325325
df = DataFrame(columns=['jim', 'joe'])
326326
self.assertFalse(df._is_mixed_type)
327-
assert_frame_equal(read_json(df.to_json()), df)
327+
assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df)
328328

329+
def test_frame_empty_mixedtype(self):
329330
# mixed type
331+
df = DataFrame(columns=['jim', 'joe'])
330332
df['joe'] = df['joe'].astype('i8')
331333
self.assertTrue(df._is_mixed_type)
332-
assert_frame_equal(read_json(df.to_json()), df)
334+
assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df)
333335

334336
def test_v12_compat(self):
335337
df = DataFrame(

pandas/io/tests/test_sql.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1256,10 +1256,14 @@ def test_transactions(self):
12561256
self._transaction_test()
12571257

12581258
def test_get_schema_create_table(self):
1259-
self._load_test2_data()
1259+
# Use a dataframe without a bool column, since MySQL converts bool to
1260+
# TINYINT (which read_sql_table returns as an int and causes a dtype
1261+
# mismatch)
1262+
1263+
self._load_test3_data()
12601264
tbl = 'test_get_schema_create_table'
1261-
create_sql = sql.get_schema(self.test_frame2, tbl, con=self.conn)
1262-
blank_test_df = self.test_frame2.iloc[:0]
1265+
create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn)
1266+
blank_test_df = self.test_frame3.iloc[:0]
12631267

12641268
self.drop_table(tbl)
12651269
self.conn.execute(create_sql)

pandas/stats/tests/test_moments.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,7 @@ def _non_null_values(x):
862862
if mock_mean:
863863
# check that mean equals mock_mean
864864
expected = mock_mean(x)
865-
assert_equal(mean_x, expected)
865+
assert_equal(mean_x, expected.astype('float64'))
866866

867867
# check that correlation of a series with itself is either 1 or NaN
868868
corr_x_x = corr(x, x)
@@ -1550,6 +1550,7 @@ def test_moment_functions_zero_length(self):
15501550
df1_expected = df1
15511551
df1_expected_panel = Panel(items=df1.index, major_axis=df1.columns, minor_axis=df1.columns)
15521552
df2 = DataFrame(columns=['a'])
1553+
df2['a'] = df2['a'].astype('float64')
15531554
df2_expected = df2
15541555
df2_expected_panel = Panel(items=df2.index, major_axis=df2.columns, minor_axis=df2.columns)
15551556

pandas/tests/test_frame.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -11864,12 +11864,10 @@ def test_mode(self):
1186411864
"E": [8, 8, 1, 1, 3, 3]})
1186511865
assert_frame_equal(df[["A"]].mode(),
1186611866
pd.DataFrame({"A": [12]}))
11867-
assert_frame_equal(df[["D"]].mode(),
11868-
pd.DataFrame(pd.Series([], dtype="int64"),
11869-
columns=["D"]))
11870-
assert_frame_equal(df[["E"]].mode(),
11871-
pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"),
11872-
columns=["E"]))
11867+
expected = pd.Series([], dtype='int64', name='D').to_frame()
11868+
assert_frame_equal(df[["D"]].mode(), expected)
11869+
expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
11870+
assert_frame_equal(df[["E"]].mode(), expected)
1187311871
assert_frame_equal(df[["A", "B"]].mode(),
1187411872
pd.DataFrame({"A": [12], "B": [10.]}))
1187511873
assert_frame_equal(df.mode(),

pandas/tests/test_groupby.py

+4
Original file line numberDiff line numberDiff line change
@@ -1728,6 +1728,8 @@ def test_groupby_head_tail(self):
17281728
assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
17291729

17301730
empty_not_as = DataFrame(columns=df.columns)
1731+
empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
1732+
empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
17311733
assert_frame_equal(empty_not_as, g_not_as.head(0))
17321734
assert_frame_equal(empty_not_as, g_not_as.tail(0))
17331735
assert_frame_equal(empty_not_as, g_not_as.head(-1))
@@ -1743,6 +1745,8 @@ def test_groupby_head_tail(self):
17431745
assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
17441746

17451747
empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
1748+
empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
1749+
empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
17461750
assert_frame_equal(empty_as, g_as.head(0))
17471751
assert_frame_equal(empty_as, g_as.tail(0))
17481752
assert_frame_equal(empty_as, g_as.head(-1))

pandas/tests/test_indexing.py

+28-5
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,7 @@ def test_loc_setitem_consistency(self):
10631063

10641064
# empty (essentially noops)
10651065
expected = DataFrame(columns=['x', 'y'])
1066+
expected['x'] = expected['x'].astype(np.int64)
10661067
df = DataFrame(columns=['x', 'y'])
10671068
df.loc[:, 'x'] = 1
10681069
assert_frame_equal(df,expected)
@@ -3376,7 +3377,7 @@ def f():
33763377
expected = DataFrame(columns=['foo'])
33773378
def f():
33783379
df = DataFrame()
3379-
df['foo'] = Series([])
3380+
df['foo'] = Series([], dtype='object')
33803381
return df
33813382
assert_frame_equal(f(), expected)
33823383
def f():
@@ -3386,17 +3387,20 @@ def f():
33863387
assert_frame_equal(f(), expected)
33873388
def f():
33883389
df = DataFrame()
3389-
df['foo'] = Series(range(len(df)))
3390+
df['foo'] = df.index
33903391
return df
33913392
assert_frame_equal(f(), expected)
3393+
3394+
expected = DataFrame(columns=['foo'])
3395+
expected['foo'] = expected['foo'].astype('float64')
33923396
def f():
33933397
df = DataFrame()
33943398
df['foo'] = []
33953399
return df
33963400
assert_frame_equal(f(), expected)
33973401
def f():
33983402
df = DataFrame()
3399-
df['foo'] = df.index
3403+
df['foo'] = Series(range(len(df)))
34003404
return df
34013405
assert_frame_equal(f(), expected)
34023406
def f():
@@ -3429,21 +3433,31 @@ def f():
34293433

34303434
# GH5720, GH5744
34313435
# don't create rows when empty
3436+
expected = DataFrame(columns=['A','B','New'])
3437+
expected['A'] = expected['A'].astype('int64')
3438+
expected['B'] = expected['B'].astype('float64')
3439+
expected['New'] = expected['New'].astype('float64')
34323440
df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]})
34333441
y = df[df.A > 5]
34343442
y['New'] = np.nan
3435-
assert_frame_equal(y,DataFrame(columns=['A','B','New']))
3443+
assert_frame_equal(y,expected)
3444+
#assert_frame_equal(y,expected)
34363445

3446+
expected = DataFrame(columns=['a','b','c c','d'])
3447+
expected['d'] = expected['d'].astype('int64')
34373448
df = DataFrame(columns=['a', 'b', 'c c'])
34383449
df['d'] = 3
3439-
assert_frame_equal(df,DataFrame(columns=['a','b','c c','d']))
3450+
assert_frame_equal(df,expected)
34403451
assert_series_equal(df['c c'],Series(name='c c',dtype=object))
34413452

34423453
# reindex columns is ok
34433454
df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]})
34443455
y = df[df.A > 5]
34453456
result = y.reindex(columns=['A','B','C'])
34463457
expected = DataFrame(columns=['A','B','C'])
3458+
expected['A'] = expected['A'].astype('int64')
3459+
expected['B'] = expected['B'].astype('float64')
3460+
expected['C'] = expected['C'].astype('float64')
34473461
assert_frame_equal(result,expected)
34483462

34493463
# GH 5756
@@ -4429,6 +4443,15 @@ def test_indexing_assignment_dict_already_exists(self):
44294443
expected.loc[5] = [9, 99]
44304444
tm.assert_frame_equal(df, expected)
44314445

4446+
def test_indexing_dtypes_on_empty(self):
4447+
# Check that .loc, .iloc and .ix return the same, correct dtypes on an empty frame GH9983
4448+
df = DataFrame({'a':[1,2,3],'b':['b','b2','b3']})
4449+
df2 = df.ix[[],:]
4450+
4451+
self.assertEqual(df2.loc[:,'a'].dtype, int)
4452+
assert_series_equal(df2.loc[:,'a'], df2.iloc[:,0])
4453+
assert_series_equal(df2.loc[:,'a'], df2.ix[:,0])
4454+
44324455

44334456

44344457
class TestCategoricalIndex(tm.TestCase):

pandas/tests/test_testing.py

+8
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,14 @@ def test_multiindex_dtype(self):
215215
{'a':[1.0,2.0],'b':[2.1,1.5],'c':['l1','l2']}, index=['a','b'])
216216
self._assert_not_equal(df1, df2, check_index_type=True)
217217

218+
def test_empty_dtypes(self):
219+
df1=pd.DataFrame(columns=["col1","col2"])
220+
df1["col1"] = df1["col1"].astype('int64')
221+
df2=pd.DataFrame(columns=["col1","col2"])
222+
self._assert_equal(df1, df2, check_dtype=False)
223+
self._assert_not_equal(df1, df2, check_dtype=True)
224+
225+
218226
class TestRNGContext(unittest.TestCase):
219227

220228
def test_RNGContext(self):

pandas/tseries/tests/test_period.py

+1
Original file line numberDiff line numberDiff line change
@@ -2118,6 +2118,7 @@ def test_range_slice_outofbounds(self):
21182118
for idx in [didx, pidx]:
21192119
df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx)
21202120
empty = DataFrame(index=idx.__class__([], freq='D'), columns=['units'])
2121+
empty['units'] = empty['units'].astype('int64')
21212122

21222123
tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty)
21232124
tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2])

0 commit comments

Comments
 (0)