8
8
import numpy as np
9
9
import pytest
10
10
11
- from pandas .core .dtypes .common import is_object_dtype
11
+ from pandas .core .dtypes .common import (
12
+ is_object_dtype ,
13
+ is_string_dtype ,
14
+ )
12
15
from pandas .core .dtypes .dtypes import CategoricalDtype
13
16
14
17
import pandas as pd
@@ -316,14 +319,15 @@ def test_merge_copy(self):
316
319
merged ["d" ] = "peekaboo"
317
320
assert (right ["d" ] == "bar" ).all ()
318
321
319
- def test_merge_nocopy (self , using_array_manager ):
322
+ def test_merge_nocopy (self , using_array_manager , using_infer_string ):
320
323
left = DataFrame ({"a" : 0 , "b" : 1 }, index = range (10 ))
321
324
right = DataFrame ({"c" : "foo" , "d" : "bar" }, index = range (10 ))
322
325
323
326
merged = merge (left , right , left_index = True , right_index = True , copy = False )
324
327
325
328
assert np .shares_memory (merged ["a" ]._values , left ["a" ]._values )
326
- assert np .shares_memory (merged ["d" ]._values , right ["d" ]._values )
329
+ if not using_infer_string :
330
+ assert np .shares_memory (merged ["d" ]._values , right ["d" ]._values )
327
331
328
332
def test_intelligently_handle_join_key (self ):
329
333
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
@@ -667,11 +671,13 @@ def test_merge_nan_right(self):
667
671
"i1_" : {0 : 0 , 1 : np .nan },
668
672
"i3" : {0 : 0.0 , 1 : np .nan },
669
673
None : {0 : 0 , 1 : 0 },
670
- }
674
+ },
675
+ columns = Index (["i1" , "i2" , "i1_" , "i3" , None ], dtype = object ),
671
676
)
672
677
.set_index (None )
673
678
.reset_index ()[["i1" , "i2" , "i1_" , "i3" ]]
674
679
)
680
+ result .columns = result .columns .astype ("object" )
675
681
tm .assert_frame_equal (result , expected , check_dtype = False )
676
682
677
683
def test_merge_nan_right2 (self ):
@@ -820,7 +826,7 @@ def test_overlapping_columns_error_message(self):
820
826
821
827
# #2649, #10639
822
828
df2 .columns = ["key1" , "foo" , "foo" ]
823
# The alternation must be grouped: a bare ``object|string`` splits the whole
# pattern in two, so it would match either the text up to ``dtype='object`` or
# just ``string'\)`` instead of the full message for either dtype repr.
msg = r"Data columns not unique: Index\(\['foo'\], dtype='(object|string)'\)"
824
830
with pytest .raises (MergeError , match = msg ):
825
831
merge (df , df2 )
826
832
@@ -1498,7 +1504,7 @@ def test_different(self, right_vals):
1498
1504
# We allow merging on object and categorical cols and cast
1499
1505
# categorical cols to object
1500
1506
result = merge (left , right , on = "A" )
1501
- assert is_object_dtype (result .A .dtype )
1507
+ assert is_object_dtype (result .A .dtype ) or is_string_dtype ( result . A . dtype )
1502
1508
1503
1509
@pytest .mark .parametrize (
1504
1510
"d1" , [np .int64 , np .int32 , np .intc , np .int16 , np .int8 , np .uint8 ]
@@ -1637,7 +1643,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
1637
1643
result = merge (df1 , df2 , on = ["A" ])
1638
1644
assert is_object_dtype (result .A .dtype )
1639
1645
result = merge (df2 , df1 , on = ["A" ])
1640
- assert is_object_dtype (result .A .dtype )
1646
+ assert is_object_dtype (result .A .dtype ) or is_string_dtype ( result . A . dtype )
1641
1647
1642
1648
@pytest .mark .parametrize (
1643
1649
"df1_vals, df2_vals" ,
@@ -1867,25 +1873,27 @@ def right():
1867
1873
1868
1874
1869
1875
class TestMergeCategorical :
1870
- def test_identical (self , left ):
1876
+ def test_identical (self , left , using_infer_string ):
1871
1877
# merging on the same, should preserve dtypes
1872
1878
merged = merge (left , left , on = "X" )
1873
1879
result = merged .dtypes .sort_index ()
1880
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
1874
1881
expected = Series (
1875
- [CategoricalDtype (categories = ["foo" , "bar" ]), np . dtype ( "O" ), np . dtype ( "O" ) ],
1882
+ [CategoricalDtype (categories = ["foo" , "bar" ]), dtype , dtype ],
1876
1883
index = ["X" , "Y_x" , "Y_y" ],
1877
1884
)
1878
1885
tm .assert_series_equal (result , expected )
1879
1886
1880
- def test_basic (self , left , right ):
1887
+ def test_basic (self , left , right , using_infer_string ):
1881
1888
# we have matching Categorical dtypes in X
1882
1889
# so should preserve the merged column
1883
1890
merged = merge (left , right , on = "X" )
1884
1891
result = merged .dtypes .sort_index ()
1892
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
1885
1893
expected = Series (
1886
1894
[
1887
1895
CategoricalDtype (categories = ["foo" , "bar" ]),
1888
- np . dtype ( "O" ) ,
1896
+ dtype ,
1889
1897
np .dtype ("int64" ),
1890
1898
],
1891
1899
index = ["X" , "Y" , "Z" ],
@@ -1989,16 +1997,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered):
1989
1997
).set_index (["id" , "p" ])
1990
1998
tm .assert_frame_equal (result , expected )
1991
1999
1992
- def test_other_columns (self , left , right ):
2000
+ def test_other_columns (self , left , right , using_infer_string ):
1993
2001
# non-merge columns should preserve if possible
1994
2002
right = right .assign (Z = right .Z .astype ("category" ))
1995
2003
1996
2004
merged = merge (left , right , on = "X" )
1997
2005
result = merged .dtypes .sort_index ()
2006
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
1998
2007
expected = Series (
1999
2008
[
2000
2009
CategoricalDtype (categories = ["foo" , "bar" ]),
2001
- np . dtype ( "O" ) ,
2010
+ dtype ,
2002
2011
CategoricalDtype (categories = [1 , 2 ]),
2003
2012
],
2004
2013
index = ["X" , "Y" , "Z" ],
@@ -2017,7 +2026,9 @@ def test_other_columns(self, left, right):
2017
2026
lambda x : x .astype (CategoricalDtype (ordered = True )),
2018
2027
],
2019
2028
)
2020
- def test_dtype_on_merged_different (self , change , join_type , left , right ):
2029
+ def test_dtype_on_merged_different (
2030
+ self , change , join_type , left , right , using_infer_string
2031
+ ):
2021
2032
# our merging columns, X now has 2 different dtypes
2022
2033
# so we must be object as a result
2023
2034
@@ -2029,9 +2040,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
2029
2040
merged = merge (left , right , on = "X" , how = join_type )
2030
2041
2031
2042
result = merged .dtypes .sort_index ()
2032
- expected = Series (
2033
- [np .dtype ("O" ), np .dtype ("O" ), np .dtype ("int64" )], index = ["X" , "Y" , "Z" ]
2034
- )
2043
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
2044
+ expected = Series ([dtype , dtype , np .dtype ("int64" )], index = ["X" , "Y" , "Z" ])
2035
2045
tm .assert_series_equal (result , expected )
2036
2046
2037
2047
def test_self_join_multiple_categories (self ):
@@ -2499,7 +2509,7 @@ def test_merge_multiindex_columns():
2499
2509
expected_index = MultiIndex .from_tuples (tuples , names = ["outer" , "inner" ])
2500
2510
expected = DataFrame (columns = expected_index )
2501
2511
2502
- tm .assert_frame_equal (result , expected )
2512
+ tm .assert_frame_equal (result , expected , check_dtype = False )
2503
2513
2504
2514
2505
2515
def test_merge_datetime_upcast_dtype ():
0 commit comments