|
12 | 12 | from pandas.util.testing import (assert_frame_equal,
|
13 | 13 | assert_series_equal,
|
14 | 14 | slow)
|
| 15 | +from pandas.types.dtypes import CategoricalDtype |
15 | 16 | from pandas import DataFrame, Index, MultiIndex, Series, Categorical
|
16 | 17 | import pandas.util.testing as tm
|
17 | 18 |
|
@@ -1368,3 +1369,118 @@ def f():
|
1368 | 1369 | def f():
|
1369 | 1370 | household.join(log_return, how='outer')
|
1370 | 1371 | self.assertRaises(NotImplementedError, f)
|
| 1372 | + |
| 1373 | + |
class TestMergeCategorical(tm.TestCase):
    """Dtype checks for ``pd.merge`` when categorical columns are involved.

    The cases cover merging on a categorical key column (GH 10409) and
    carrying categorical non-key columns through a merge.
    """

    _multiprocess_can_split_ = True

    def setUp(self):
        # Seed the RNG so the fixtures are reproducible.  The four draws
        # below must stay in this order to generate the same data.
        np.random.seed(1234)

        left_x = np.random.choice(['foo', 'bar'], size=(10,))
        left_y = np.random.choice(['one', 'two', 'three'], size=(10,))
        self.left = DataFrame({'X': left_x, 'Y': left_y})

        right_x = np.random.choice(['foo', 'bar'], size=(10,))
        right_z = np.random.choice(['jjj', 'kkk', 'sss'], size=(10,))
        self.right = pd.DataFrame({'X': right_x, 'Z': right_z})

    def _check_dtypes(self, frame, columns, dtypes):
        """Assert that the label-sorted dtypes of ``frame`` equal ``dtypes``.

        ``columns`` are the expected column labels in sorted order and
        ``dtypes`` the expected dtype for each of them, position by position.
        """
        expected = Series(dtypes, index=columns)
        assert_series_equal(frame.dtypes.sort_index(), expected)

    def test_identical(self):
        # GH 10409
        # merge a frame with itself on a categorical key column
        categorical_key = self.left.X.astype('category')
        frame = self.left.assign(X=categorical_key)

        self._check_dtypes(
            pd.merge(frame, frame, on='X'),
            ['X', 'Y_x', 'Y_y'],
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')])

    def test_other_columns(self):
        # non-merge columns should be preserved if possible
        key = self.left.X.astype('category')
        frame = DataFrame({'X': key, 'Y': key})

        self._check_dtypes(
            pd.merge(frame, frame, on='X'),
            ['X', 'Y_x', 'Y_y'],
            [CategoricalDtype(), CategoricalDtype(), CategoricalDtype()])

        # the non-merge 'Y' columns carry different categories on each side
        key = self.left.X.astype('category')
        left_y = key.cat.set_categories(['bar', 'foo', 'bah'])
        left = DataFrame({'X': key, 'Y': left_y})

        right_y = pd.Series(['foo', 'foo']).astype(
            'category', categories=['foo', 'bar', 'baz'])
        right = self.right.drop_duplicates(['X'])
        right = right.assign(Y=right_y)

        self._check_dtypes(
            pd.merge(left, right, on='X'),
            ['X', 'Y_x', 'Y_y', 'Z'],
            [CategoricalDtype(), CategoricalDtype(), CategoricalDtype(),
             np.dtype('O')])

    def test_categories_same(self):
        # GH 10409
        # both key columns are converted with a plain astype('category')
        left = self.left.assign(X=self.left.X.astype('category'))
        right = self.right.assign(X=self.right.X.astype('category'))

        self._check_dtypes(
            pd.merge(left, right, on='X'),
            ['X', 'Y', 'Z'],
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')])

    def test_categories_different(self):
        deduped = self.right.drop_duplicates(['X'])
        left = self.left.assign(X=self.left.X.astype('category'))

        # reference result: the right key uses its original categories
        reference_right = deduped.assign(X=deduped.X.astype('category'))
        reference = pd.merge(left, reference_right, on='X')

        # swap the categories on the right key, then additionally mark them
        # as ordered; both should still work (and return a categorical key)
        swapped_variants = (
            {'categories': ['foo', 'bar']},
            {'categories': ['foo', 'bar'], 'ordered': True},
        )
        for astype_kwargs in swapped_variants:
            right_key = deduped.X.astype('category', **astype_kwargs)
            right = deduped.assign(X=right_key)

            result = pd.merge(left, right, on='X')
            tm.assert_index_equal(result.X.cat.categories,
                                  pd.Index(['bar', 'foo']))

            assert_frame_equal(result, reference)

            self._check_dtypes(
                result,
                ['X', 'Y', 'Z'],
                [CategoricalDtype(), np.dtype('O'), np.dtype('O')])
0 commit comments