|
14 | 14 | from pandas.util.testing import (assert_frame_equal,
|
15 | 15 | assert_series_equal,
|
16 | 16 | slow)
|
| 17 | +from pandas.types.dtypes import CategoricalDtype |
17 | 18 | from pandas import DataFrame, Index, MultiIndex, Series, Categorical
|
18 | 19 | import pandas.util.testing as tm
|
19 | 20 |
|
@@ -1372,6 +1373,121 @@ def f():
|
1372 | 1373 | self.assertRaises(NotImplementedError, f)
|
1373 | 1374 |
|
1374 | 1375 |
|
class TestMergeCategorical(tm.TestCase):
    """Checks on the column dtypes produced by ``pd.merge`` when the
    key and/or the non-key columns are categorical."""

    _multiprocess_can_split_ = True

    def setUp(self):
        # Fixed seed: the fixture frames are drawn at random, and the
        # draws below must stay in this order to yield the same data.
        np.random.seed(1234)

        left_x = np.random.choice(['foo', 'bar'], size=(10,))
        left_y = np.random.choice(['one', 'two', 'three'], size=(10,))
        self.left = DataFrame({'X': left_x, 'Y': left_y})

        right_x = np.random.choice(['foo', 'bar'], size=(10,))
        right_z = np.random.choice(['jjj', 'kkk', 'sss'], size=(10,))
        self.right = pd.DataFrame({'X': right_x, 'Z': right_z})

    def _assert_dtypes(self, frame, expected):
        """Assert the label-sorted dtypes of ``frame``.

        ``expected`` is a list of ``(column, dtype)`` pairs, given in
        sorted column order.
        """
        labels = [label for label, _ in expected]
        dtypes = [dtype for _, dtype in expected]
        actual = frame.dtypes.sort_index()
        assert_series_equal(actual, Series(dtypes, index=labels))

    def test_identical(self):
        # GH 10409
        categorical_key = self.left.X.astype('category')
        frame = self.left.assign(X=categorical_key)

        self._assert_dtypes(pd.merge(frame, frame, on='X'),
                            [('X', CategoricalDtype()),
                             ('Y_x', np.dtype('O')),
                             ('Y_y', np.dtype('O'))])

    def test_other_columns(self):
        # non-merge columns should be preserved if possible
        key = self.left.X.astype('category')
        frame = DataFrame({'X': key, 'Y': key})

        self._assert_dtypes(pd.merge(frame, frame, on='X'),
                            [('X', CategoricalDtype()),
                             ('Y_x', CategoricalDtype()),
                             ('Y_y', CategoricalDtype())])

        # different categories on the non-merge column of each side
        key = self.left.X.astype('category')
        lhs = DataFrame(
            {'X': key,
             'Y': key.cat.set_categories(['bar', 'foo', 'bah'])})
        rhs = self.right.drop_duplicates(['X'])
        rhs_y = pd.Series(['foo', 'foo']).astype(
            'category', categories=['foo', 'bar', 'baz'])
        rhs = rhs.assign(Y=rhs_y)

        self._assert_dtypes(pd.merge(lhs, rhs, on='X'),
                            [('X', CategoricalDtype()),
                             ('Y_x', CategoricalDtype()),
                             ('Y_y', CategoricalDtype()),
                             ('Z', np.dtype('O'))])

    def test_categories_same(self):
        # GH 10409
        lhs = self.left.assign(X=self.left.X.astype('category'))
        rhs = self.right.assign(X=self.right.X.astype('category'))

        self._assert_dtypes(pd.merge(lhs, rhs, on='X'),
                            [('X', CategoricalDtype()),
                             ('Y', np.dtype('O')),
                             ('Z', np.dtype('O'))])

    def test_categories_different(self):
        unique_right = self.right.drop_duplicates(['X'])

        # baseline: both keys converted with their original categories
        lhs = self.left.assign(X=self.left.X.astype('category'))
        baseline_rhs = unique_right.assign(
            X=unique_right.X.astype('category'))
        baseline = pd.merge(lhs, baseline_rhs, on='X')

        # swap the categories on the right key, first unordered and then
        # ordered; both should still work (and return a categorical).
        # Each variant is built only when its turn comes.
        for extra_kwargs in ({}, {'ordered': True}):
            swapped_key = unique_right.X.astype(
                'category', categories=['foo', 'bar'], **extra_kwargs)
            rhs = unique_right.assign(X=swapped_key)

            outcome = pd.merge(lhs, rhs, on='X')
            tm.assert_index_equal(outcome.X.cat.categories,
                                  pd.Index(['bar', 'foo']))

            assert_frame_equal(outcome, baseline)

            self._assert_dtypes(outcome,
                                [('X', CategoricalDtype()),
                                 ('Y', np.dtype('O')),
                                 ('Z', np.dtype('O'))])
| 1489 | + |
| 1490 | + |
if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script:
    # verbose, stop at the first failure, and drop into pdb on errors
    # and failures. ``exit=False`` keeps nose from calling sys.exit.
    runner_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=runner_argv, exit=False)
|
0 commit comments