|
1 | 1 | # pylint: disable=E1103
|
2 | 2 |
|
| 3 | +import pytest |
3 | 4 | from datetime import datetime
|
4 | 5 | from numpy.random import randn
|
5 | 6 | from numpy import nan
|
|
11 | 12 | from pandas.tools.concat import concat
|
12 | 13 | from pandas.tools.merge import merge, MergeError
|
13 | 14 | from pandas.util.testing import assert_frame_equal, assert_series_equal
|
| 15 | +from pandas.types.dtypes import CategoricalDtype |
| 16 | +from pandas.types.common import is_categorical_dtype, is_object_dtype |
14 | 17 | from pandas import DataFrame, Index, MultiIndex, Series, Categorical
|
15 | 18 | import pandas.util.testing as tm
|
16 | 19 |
|
@@ -1024,38 +1027,6 @@ def test_left_join_index_multi_match(self):
|
1024 | 1027 | expected.index = np.arange(len(expected))
|
1025 | 1028 | tm.assert_frame_equal(result, expected)
|
1026 | 1029 |
|
1027 |
| - def test_join_multi_dtypes(self): |
1028 |
| - |
1029 |
| - # test with multi dtypes in the join index |
1030 |
| - def _test(dtype1, dtype2): |
1031 |
| - left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), |
1032 |
| - 'k2': ['foo', 'bar'] * 12, |
1033 |
| - 'v': np.array(np.arange(24), dtype=np.int64)}) |
1034 |
| - |
1035 |
| - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) |
1036 |
| - right = DataFrame( |
1037 |
| - {'v2': np.array([5, 7], dtype=dtype2)}, index=index) |
1038 |
| - |
1039 |
| - result = left.join(right, on=['k1', 'k2']) |
1040 |
| - |
1041 |
| - expected = left.copy() |
1042 |
| - |
1043 |
| - if dtype2.kind == 'i': |
1044 |
| - dtype2 = np.dtype('float64') |
1045 |
| - expected['v2'] = np.array(np.nan, dtype=dtype2) |
1046 |
| - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 |
1047 |
| - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 |
1048 |
| - |
1049 |
| - tm.assert_frame_equal(result, expected) |
1050 |
| - |
1051 |
| - result = left.join(right, on=['k1', 'k2'], sort=True) |
1052 |
| - expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) |
1053 |
| - tm.assert_frame_equal(result, expected) |
1054 |
| - |
1055 |
| - for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]: |
1056 |
| - for d2 in [np.int64, np.float64, np.float32, np.float16]: |
1057 |
| - _test(np.dtype(d1), np.dtype(d2)) |
1058 |
| - |
1059 | 1030 | def test_left_merge_na_buglet(self):
|
1060 | 1031 | left = DataFrame({'id': list('abcde'), 'v1': randn(5),
|
1061 | 1032 | 'v2': randn(5), 'dummy': list('abcde'),
|
@@ -1242,3 +1213,145 @@ def f():
|
1242 | 1213 | def f():
|
1243 | 1214 | household.join(log_return, how='outer')
|
1244 | 1215 | self.assertRaises(NotImplementedError, f)
|
| 1216 | + |
| 1217 | + |
@pytest.fixture
def df():
    # Two-row frame with one column per input kind used by the dtype
    # merge tests: plain strings (A), the same strings cast to category
    # (B), Python ints (C), Python floats (D), uint64 (E) and int32 (F).
    columns = {
        'A': ['foo', 'bar'],
        'B': Series(['foo', 'bar']).astype('category'),
        'C': [1, 2],
        'D': [1.0, 2.0],
        'E': Series([1, 2], dtype='uint64'),
        'F': Series([1, 2], dtype='int32'),
    }
    return DataFrame(columns)
| 1227 | + |
| 1228 | + |
class TestMergeDtypes(object):

    def test_different(self, df):

        # Merge the fixture frame on 'A' against a one-column frame built
        # from each of its own columns in turn (including 'A' itself).
        # Every merged key column is asserted to be object dtype.
        for name in df.columns:
            other = DataFrame({'A': df[name]})
            merged = pd.merge(df, other, on='A')
            assert is_object_dtype(merged.A.dtype)

    @pytest.mark.parametrize('d1', [np.int64, np.int32,
                                    np.int16, np.int8, np.uint8])
    @pytest.mark.parametrize('d2', [np.int64, np.float64,
                                    np.float32, np.float16])
    def test_join_multi_dtypes(self, d1, d2):

        # d1 is the dtype of the integer join key 'k1' on the left,
        # d2 the dtype of the value column 'v2' on the right.
        key_dtype = np.dtype(d1)
        value_dtype = np.dtype(d2)

        left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=key_dtype),
                          'k2': ['foo', 'bar'] * 12,
                          'v': np.array(np.arange(24), dtype=np.int64)})

        right_index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': np.array([5, 7], dtype=value_dtype)},
                          index=right_index)

        result = left.join(right, on=['k1', 'k2'])

        # The expected 'v2' starts out all-NaN; for an integer d2 the
        # expectation is built as float64 instead.
        if value_dtype.kind == 'i':
            expected_dtype = np.dtype('float64')
        else:
            expected_dtype = value_dtype

        expected = left.copy()
        expected['v2'] = np.array(np.nan, dtype=expected_dtype)
        # fill in the two key pairs that exist in the right-hand index
        for (k1, k2), value in [((2, 'bar'), 5), ((1, 'foo'), 7)]:
            matches = (expected.k1 == k1) & (expected.k2 == k2)
            expected.loc[matches, 'v2'] = value

        tm.assert_frame_equal(result, expected)

        # same join again, this time asking for the result sorted by key
        result = left.join(right, on=['k1', 'k2'], sort=True)
        expected = expected.sort_values(['k1', 'k2'], kind='mergesort')
        tm.assert_frame_equal(result, expected)
| 1273 | + |
| 1274 | + |
@pytest.fixture
def left():
    # Ten-row frame: 'X' is a categorical (categories foo, bar) and 'Y'
    # holds strings drawn from one/two/three.  The global NumPy RNG is
    # seeded first, and 'X' is drawn before 'Y'.
    np.random.seed(1234)
    x_values = np.random.choice(['foo', 'bar'], size=(10,))
    y_values = np.random.choice(['one', 'two', 'three'], size=(10,))

    x_column = Series(x_values).astype('category',
                                       categories=['foo', 'bar'])
    return DataFrame({'X': x_column, 'Y': y_values})
| 1283 | + |
| 1284 | + |
@pytest.fixture
def right():
    # Two-row frame: 'X' is a categorical (categories foo, bar) holding
    # one row per category, 'Z' holds the ints 1 and 2.
    # NOTE(review): nothing random is drawn after this seed call, so it
    # looks unnecessary here - confirm before removing.
    np.random.seed(1234)
    x_column = Series(['foo', 'bar']).astype('category',
                                             categories=['foo', 'bar'])
    return DataFrame({'X': x_column, 'Z': [1, 2]})
| 1292 | + |
| 1293 | + |
class TestMergeCategorical(object):

    @staticmethod
    def _assert_dtypes(merged, dtypes, columns):
        # Compare the merged frame's dtypes, sorted by column name,
        # against the given dtypes labelled with the given column names.
        result = merged.dtypes.sort_index()
        expected = Series(dtypes, index=columns)
        assert_series_equal(result, expected)

    def test_identical(self, left):
        # merging a frame with itself on the categorical key:
        # the key is expected to stay categorical
        merged = pd.merge(left, left, on='X')
        self._assert_dtypes(
            merged,
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')],
            ['X', 'Y_x', 'Y_y'])

    def test_basic(self, left, right):
        # both sides carry the same categorical dtype in X, so the
        # merged key column is expected to stay categorical
        merged = pd.merge(left, right, on='X')
        self._assert_dtypes(
            merged,
            [CategoricalDtype(), np.dtype('O'), np.dtype('int64')],
            ['X', 'Y', 'Z'])

    def test_other_columns(self, left, right):
        # a categorical column that is not a merge key (Z) is expected
        # to come through the merge still categorical
        categorical_z = right.Z.astype('category')
        right = right.assign(Z=categorical_z)

        merged = pd.merge(left, right, on='X')
        self._assert_dtypes(
            merged,
            [CategoricalDtype(), np.dtype('O'), CategoricalDtype()],
            ['X', 'Y', 'Z'])

        # the categorical dtypes of the inputs match those of the output
        assert left.X.values.is_dtype_equal(merged.X.values)
        assert right.Z.values.is_dtype_equal(merged.Z.values)

    @pytest.mark.parametrize(
        'change', [lambda x: x,
                   lambda x: x.astype('category',
                                      categories=['bar', 'foo']),
                   lambda x: x.astype('category',
                                      categories=['foo', 'bar', 'bah']),
                   lambda x: x.astype('category', ordered=True)])
    @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right'])
    def test_dtype_on_merged_different(self, change, how, left, right):
        # The right-hand key is converted to object and then passed
        # through `change`, so the two X columns no longer share a
        # dtype; the merged key is expected to be object.
        right = right.assign(X=change(right.X.astype('object')))

        # sanity-check the premise before merging
        assert is_categorical_dtype(left.X.values)
        assert not left.X.values.is_dtype_equal(right.X.values)

        merged = pd.merge(left, right, on='X', how=how)
        self._assert_dtypes(
            merged,
            [np.dtype('O'), np.dtype('O'), np.dtype('int64')],
            ['X', 'Y', 'Z'])
0 commit comments