|
13 | 13 | from pandas.util.testing import (assert_frame_equal,
|
14 | 14 | assert_series_equal,
|
15 | 15 | slow)
|
| 16 | +from pandas.types.dtypes import CategoricalDtype |
| 17 | +from pandas.types.common import is_categorical_dtype, is_object_dtype |
16 | 18 | from pandas import DataFrame, Index, MultiIndex, Series, Categorical
|
17 | 19 | import pandas.util.testing as tm
|
18 | 20 |
|
@@ -1018,38 +1020,6 @@ def test_left_join_index_multi_match(self):
|
1018 | 1020 | expected.index = np.arange(len(expected))
|
1019 | 1021 | tm.assert_frame_equal(result, expected)
|
1020 | 1022 |
|
1021 |
| - def test_join_multi_dtypes(self): |
1022 |
| - |
1023 |
| - # test with multi dtypes in the join index |
1024 |
| - def _test(dtype1, dtype2): |
1025 |
| - left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), |
1026 |
| - 'k2': ['foo', 'bar'] * 12, |
1027 |
| - 'v': np.array(np.arange(24), dtype=np.int64)}) |
1028 |
| - |
1029 |
| - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) |
1030 |
| - right = DataFrame( |
1031 |
| - {'v2': np.array([5, 7], dtype=dtype2)}, index=index) |
1032 |
| - |
1033 |
| - result = left.join(right, on=['k1', 'k2']) |
1034 |
| - |
1035 |
| - expected = left.copy() |
1036 |
| - |
1037 |
| - if dtype2.kind == 'i': |
1038 |
| - dtype2 = np.dtype('float64') |
1039 |
| - expected['v2'] = np.array(np.nan, dtype=dtype2) |
1040 |
| - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 |
1041 |
| - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 |
1042 |
| - |
1043 |
| - tm.assert_frame_equal(result, expected) |
1044 |
| - |
1045 |
| - result = left.join(right, on=['k1', 'k2'], sort=True) |
1046 |
| - expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) |
1047 |
| - tm.assert_frame_equal(result, expected) |
1048 |
| - |
1049 |
| - for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]: |
1050 |
| - for d2 in [np.int64, np.float64, np.float32, np.float16]: |
1051 |
| - _test(np.dtype(d1), np.dtype(d2)) |
1052 |
| - |
1053 | 1023 | def test_left_merge_na_buglet(self):
|
1054 | 1024 | left = DataFrame({'id': list('abcde'), 'v1': randn(5),
|
1055 | 1025 | 'v2': randn(5), 'dummy': list('abcde'),
|
@@ -1367,3 +1337,140 @@ def f():
|
1367 | 1337 | def f():
|
1368 | 1338 | household.join(log_return, how='outer')
|
1369 | 1339 | self.assertRaises(NotImplementedError, f)
|
| 1340 | + |
| 1341 | + |
class TestMergeDtypes(tm.TestCase):
    """Checks on the dtypes produced when merging / joining on keys."""

    def setUp(self):
        # Fixture frame: one two-row column per dtype flavour exercised
        # below (plain strings, category, default int, default float,
        # uint64 and int32).
        columns = {
            'A': ['foo', 'bar'],
            'B': Series(['foo', 'bar']).astype('category'),
            'C': [1, 2],
            'D': [1.0, 2.0],
            'E': Series([1, 2], dtype='uint64'),
            'F': Series([1, 2], dtype='int32'),
        }
        self.df = DataFrame(columns)

    def test_different(self):
        # Merge the fixture on its string column 'A' against a right-hand
        # frame whose 'A' is, in turn, each column of the fixture.  The
        # merged key column is asserted to be object dtype in every case.
        left = self.df
        for name in left.columns:
            right = DataFrame({'A': left[name]})
            merged = pd.merge(left, right, on='A')
            self.assertTrue(is_object_dtype(merged['A'].dtype))

    def test_join_multi_dtypes(self):
        # Join a frame on two key columns against a MultiIndex-ed frame
        # for a grid of key dtypes (k1) and value dtypes (v2).
        key_dtypes = [np.int64, np.int32, np.int16, np.int8, np.uint8]
        value_dtypes = [np.int64, np.float64, np.float32, np.float16]

        def check(key_dtype, value_dtype):
            left = DataFrame({
                'k1': np.array([0, 1, 2] * 8, dtype=key_dtype),
                'k2': ['foo', 'bar'] * 12,
                'v': np.arange(24, dtype=np.int64),
            })
            right = DataFrame(
                {'v2': np.array([5, 7], dtype=value_dtype)},
                index=MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]))

            joined = left.join(right, on=['k1', 'k2'])

            # Integer values are expected back as float64: rows without a
            # match in ``right`` hold NaN.
            if value_dtype.kind == 'i':
                filled_dtype = np.dtype('float64')
            else:
                filled_dtype = value_dtype

            expected = left.copy()
            expected['v2'] = np.array(np.nan, dtype=filled_dtype)
            matches_2_bar = (expected.k1 == 2) & (expected.k2 == 'bar')
            matches_1_foo = (expected.k1 == 1) & (expected.k2 == 'foo')
            expected.loc[matches_2_bar, 'v2'] = 5
            expected.loc[matches_1_foo, 'v2'] = 7
            tm.assert_frame_equal(joined, expected)

            # Same join with sort=True: compare against the expectation
            # stably sorted by the join keys.
            joined_sorted = left.join(right, on=['k1', 'k2'], sort=True)
            expected.sort_values(['k1', 'k2'], kind='mergesort',
                                 inplace=True)
            tm.assert_frame_equal(joined_sorted, expected)

        for key_type in key_dtypes:
            for value_type in value_dtypes:
                check(np.dtype(key_type), np.dtype(value_type))
| 1396 | + |
| 1397 | + |
class TestMergeCategorical(tm.TestCase):
    """Dtype preservation when merging frames that hold categoricals."""

    _multiprocess_can_split_ = True

    def setUp(self):
        # Seeded so the random fixture is reproducible.  X is drawn before
        # Y, which fixes the order in which random numbers are consumed.
        np.random.seed(1234)
        x_raw = np.random.choice(['foo', 'bar'], size=(10,))
        y_raw = np.random.choice(['one', 'two', 'three'], size=(10,))

        x_left = Series(x_raw).astype('category',
                                      categories=['foo', 'bar'])
        self.left = DataFrame({'X': x_left, 'Y': y_raw})

        x_right = Series(['foo', 'bar']).astype('category',
                                                categories=['foo', 'bar'])
        self.right = DataFrame({'X': x_right, 'Z': [1, 2]})

    def test_identical(self):
        # Self-merge on X: the key is expected to stay categorical and
        # both suffixed Y columns to stay object.
        merged = pd.merge(self.left, self.left, on='X')
        dtypes = merged.dtypes.sort_index()

        wanted = Series(
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')],
            index=['X', 'Y_x', 'Y_y'])
        assert_series_equal(dtypes, wanted)

    def test_basic(self):
        # Both frames carry X as a categorical with the same categories,
        # so the merged key is expected to remain categorical.
        merged = pd.merge(self.left, self.right, on='X')
        dtypes = merged.dtypes.sort_index()

        wanted = Series(
            [CategoricalDtype(), np.dtype('O'), np.dtype('int64')],
            index=['X', 'Y', 'Z'])
        assert_series_equal(dtypes, wanted)

    def test_other_columns(self):
        # A categorical column that is not a merge key (Z) is expected to
        # come through the merge still categorical.
        left = self.left
        z_as_category = self.right.Z.astype('category')
        right = self.right.assign(Z=z_as_category)

        merged = pd.merge(left, right, on='X')
        dtypes = merged.dtypes.sort_index()

        wanted = Series(
            [CategoricalDtype(), np.dtype('O'), CategoricalDtype()],
            index=['X', 'Y', 'Z'])
        assert_series_equal(dtypes, wanted)

        # The merged categoricals must be dtype-equal to their inputs.
        self.assertTrue(left.X.values.is_dtype_equal(merged.X.values))
        self.assertTrue(right.Z.values.is_dtype_equal(merged.Z.values))

    def test_dtype_on_merged_different(self):
        # When the two X key columns are not dtype-equal, the merged X is
        # expected to be object, for every join type.

        def keep_object(x):
            return x

        def reorder_categories(x):
            return x.astype('category', categories=['bar', 'foo'])

        def add_category(x):
            return x.astype('category', categories=['foo', 'bar', 'bah'])

        def make_ordered(x):
            return x.astype('category', ordered=True)

        changes = [keep_object, reorder_categories, add_category,
                   make_ordered]
        join_types = ['inner', 'outer', 'left', 'right']
        cases = [(change, how)
                 for change in changes
                 for how in join_types]

        left = self.left
        for change, how in cases:
            right_x = change(self.right.X.astype('object'))
            right = self.right.assign(X=right_x)

            # Precondition: left key is categorical and differs in dtype
            # from the modified right key.
            self.assertTrue(is_categorical_dtype(left.X.values))
            self.assertFalse(
                left.X.values.is_dtype_equal(right.X.values))

            merged = pd.merge(left, right, on='X', how=how)
            dtypes = merged.dtypes.sort_index()

            wanted = Series(
                [np.dtype('O'), np.dtype('O'), np.dtype('int64')],
                index=['X', 'Y', 'Z'])
            assert_series_equal(dtypes, wanted)
0 commit comments