8
8
import numpy as np
9
9
import pytest
10
10
11
- from pandas .core .dtypes .common import is_object_dtype
11
+ from pandas .core .dtypes .common import (
12
+ is_object_dtype ,
13
+ is_string_dtype ,
14
+ )
12
15
from pandas .core .dtypes .dtypes import CategoricalDtype
13
16
14
17
import pandas as pd
@@ -265,14 +268,15 @@ def test_merge_copy(self):
265
268
merged ["d" ] = "peekaboo"
266
269
assert (right ["d" ] == "bar" ).all ()
267
270
268
- def test_merge_nocopy (self ):
271
+ def test_merge_nocopy (self , using_infer_string ):
269
272
left = DataFrame ({"a" : 0 , "b" : 1 }, index = range (10 ))
270
273
right = DataFrame ({"c" : "foo" , "d" : "bar" }, index = range (10 ))
271
274
272
275
merged = merge (left , right , left_index = True , right_index = True , copy = False )
273
276
274
277
assert np .shares_memory (merged ["a" ]._values , left ["a" ]._values )
275
- assert np .shares_memory (merged ["d" ]._values , right ["d" ]._values )
278
+ if not using_infer_string :
279
+ assert np .shares_memory (merged ["d" ]._values , right ["d" ]._values )
276
280
277
281
def test_intelligently_handle_join_key (self ):
278
282
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
@@ -660,11 +664,13 @@ def test_merge_nan_right(self):
660
664
"i1_" : {0 : 0 , 1 : np .nan },
661
665
"i3" : {0 : 0.0 , 1 : np .nan },
662
666
None : {0 : 0 , 1 : 0 },
663
- }
667
+ },
668
+ columns = Index (["i1" , "i2" , "i1_" , "i3" , None ], dtype = object ),
664
669
)
665
670
.set_index (None )
666
671
.reset_index ()[["i1" , "i2" , "i1_" , "i3" ]]
667
672
)
673
+ result .columns = result .columns .astype ("object" )
668
674
tm .assert_frame_equal (result , expected , check_dtype = False )
669
675
670
676
def test_merge_nan_right2 (self ):
@@ -808,7 +814,7 @@ def test_overlapping_columns_error_message(self):
808
814
809
815
# #2649, #10639
810
816
df2 .columns = ["key1" , "foo" , "foo" ]
811
- msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
817
+ msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)"
812
818
with pytest .raises (MergeError , match = msg ):
813
819
merge (df , df2 )
814
820
@@ -1485,7 +1491,7 @@ def test_different(self, dtype):
1485
1491
# We allow merging on object and categorical cols and cast
1486
1492
# categorical cols to object
1487
1493
result = merge (left , right , on = "A" )
1488
- assert is_object_dtype (result .A .dtype )
1494
+ assert is_object_dtype (result .A .dtype ) or is_string_dtype ( result . A . dtype )
1489
1495
1490
1496
@pytest .mark .parametrize ("d2" , [np .int64 , np .float64 , np .float32 , np .float16 ])
1491
1497
def test_join_multi_dtypes (self , any_int_numpy_dtype , d2 ):
@@ -1621,7 +1627,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
1621
1627
result = merge (df1 , df2 , on = ["A" ])
1622
1628
assert is_object_dtype (result .A .dtype )
1623
1629
result = merge (df2 , df1 , on = ["A" ])
1624
- assert is_object_dtype (result .A .dtype )
1630
+ assert is_object_dtype (result .A .dtype ) or is_string_dtype ( result . A . dtype )
1625
1631
1626
1632
@pytest .mark .parametrize (
1627
1633
"df1_vals, df2_vals" ,
@@ -1850,25 +1856,27 @@ def right():
1850
1856
1851
1857
1852
1858
class TestMergeCategorical :
1853
- def test_identical (self , left ):
1859
+ def test_identical (self , left , using_infer_string ):
1854
1860
# merging on the same, should preserve dtypes
1855
1861
merged = merge (left , left , on = "X" )
1856
1862
result = merged .dtypes .sort_index ()
1863
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
1857
1864
expected = Series (
1858
- [CategoricalDtype (categories = ["foo" , "bar" ]), np . dtype ( "O" ), np . dtype ( "O" ) ],
1865
+ [CategoricalDtype (categories = ["foo" , "bar" ]), dtype , dtype ],
1859
1866
index = ["X" , "Y_x" , "Y_y" ],
1860
1867
)
1861
1868
tm .assert_series_equal (result , expected )
1862
1869
1863
- def test_basic (self , left , right ):
1870
+ def test_basic (self , left , right , using_infer_string ):
1864
1871
# we have matching Categorical dtypes in X
1865
1872
# so should preserve the merged column
1866
1873
merged = merge (left , right , on = "X" )
1867
1874
result = merged .dtypes .sort_index ()
1875
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
1868
1876
expected = Series (
1869
1877
[
1870
1878
CategoricalDtype (categories = ["foo" , "bar" ]),
1871
- np . dtype ( "O" ) ,
1879
+ dtype ,
1872
1880
np .dtype ("int64" ),
1873
1881
],
1874
1882
index = ["X" , "Y" , "Z" ],
@@ -1972,16 +1980,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered):
1972
1980
).set_index (["id" , "p" ])
1973
1981
tm .assert_frame_equal (result , expected )
1974
1982
1975
- def test_other_columns (self , left , right ):
1983
+ def test_other_columns (self , left , right , using_infer_string ):
1976
1984
# non-merge columns should preserve if possible
1977
1985
right = right .assign (Z = right .Z .astype ("category" ))
1978
1986
1979
1987
merged = merge (left , right , on = "X" )
1980
1988
result = merged .dtypes .sort_index ()
1989
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
1981
1990
expected = Series (
1982
1991
[
1983
1992
CategoricalDtype (categories = ["foo" , "bar" ]),
1984
- np . dtype ( "O" ) ,
1993
+ dtype ,
1985
1994
CategoricalDtype (categories = [1 , 2 ]),
1986
1995
],
1987
1996
index = ["X" , "Y" , "Z" ],
@@ -2000,7 +2009,9 @@ def test_other_columns(self, left, right):
2000
2009
lambda x : x .astype (CategoricalDtype (ordered = True )),
2001
2010
],
2002
2011
)
2003
- def test_dtype_on_merged_different (self , change , join_type , left , right ):
2012
+ def test_dtype_on_merged_different (
2013
+ self , change , join_type , left , right , using_infer_string
2014
+ ):
2004
2015
# our merging columns, X now has 2 different dtypes
2005
2016
# so we must be object as a result
2006
2017
@@ -2012,9 +2023,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
2012
2023
merged = merge (left , right , on = "X" , how = join_type )
2013
2024
2014
2025
result = merged .dtypes .sort_index ()
2015
- expected = Series (
2016
- [np .dtype ("O" ), np .dtype ("O" ), np .dtype ("int64" )], index = ["X" , "Y" , "Z" ]
2017
- )
2026
+ dtype = np .dtype ("O" ) if not using_infer_string else "string"
2027
+ expected = Series ([dtype , dtype , np .dtype ("int64" )], index = ["X" , "Y" , "Z" ])
2018
2028
tm .assert_series_equal (result , expected )
2019
2029
2020
2030
def test_self_join_multiple_categories (self ):
@@ -2471,7 +2481,7 @@ def test_merge_multiindex_columns():
2471
2481
expected_index = MultiIndex .from_tuples (tuples , names = ["outer" , "inner" ])
2472
2482
expected = DataFrame (columns = expected_index )
2473
2483
2474
- tm .assert_frame_equal (result , expected )
2484
+ tm .assert_frame_equal (result , expected , check_dtype = False )
2475
2485
2476
2486
2477
2487
def test_merge_datetime_upcast_dtype ():
0 commit comments