11
11
import numpy as np
12
12
import pytest
13
13
14
- from pandas ._config import using_string_dtype
15
-
16
14
import pandas .util ._test_decorators as td
17
15
18
16
import pandas as pd
@@ -347,9 +345,8 @@ def test_write_dta6(self, datapath):
347
345
check_index_type = False ,
348
346
)
349
347
350
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
351
348
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
352
- def test_read_write_dta10 (self , version ):
349
+ def test_read_write_dta10 (self , version , using_infer_string ):
353
350
original = DataFrame (
354
351
data = [["string" , "object" , 1 , 1.1 , np .datetime64 ("2003-12-25" )]],
355
352
columns = ["string" , "object" , "integer" , "floating" , "datetime" ],
@@ -362,12 +359,17 @@ def test_read_write_dta10(self, version):
362
359
with tm .ensure_clean () as path :
363
360
original .to_stata (path , convert_dates = {"datetime" : "tc" }, version = version )
364
361
written_and_read_again = self .read_dta (path )
365
- # original.index is np.int32, read index is np.int64
366
- tm .assert_frame_equal (
367
- written_and_read_again .set_index ("index" ),
368
- original ,
369
- check_index_type = False ,
370
- )
362
+
363
+ expected = original .copy ()
364
+ if using_infer_string :
365
+ expected ["object" ] = expected ["object" ].astype ("str" )
366
+
367
+ # original.index is np.int32, read index is np.int64
368
+ tm .assert_frame_equal (
369
+ written_and_read_again .set_index ("index" ),
370
+ expected ,
371
+ check_index_type = False ,
372
+ )
371
373
372
374
def test_stata_doc_examples (self ):
373
375
with tm .ensure_clean () as path :
@@ -1153,7 +1155,6 @@ def test_categorical_ordering(self, file, datapath):
1153
1155
assert parsed [col ].cat .ordered
1154
1156
assert not parsed_unordered [col ].cat .ordered
1155
1157
1156
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" , strict = False )
1157
1158
@pytest .mark .filterwarnings ("ignore::UserWarning" )
1158
1159
@pytest .mark .parametrize (
1159
1160
"file" ,
@@ -1215,6 +1216,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame:
1215
1216
if cat .categories .dtype == object :
1216
1217
categories = pd .Index ._with_infer (cat .categories ._values )
1217
1218
cat = cat .set_categories (categories )
1219
+ elif cat .categories .dtype == "string" and len (cat .categories ) == 0 :
1220
+ # if the read categories are empty, it comes back as object dtype
1221
+ categories = cat .categories .astype (object )
1222
+ cat = cat .set_categories (categories )
1218
1223
from_frame [col ] = cat
1219
1224
return from_frame
1220
1225
@@ -1244,7 +1249,6 @@ def test_iterator(self, datapath):
1244
1249
from_chunks = pd .concat (itr )
1245
1250
tm .assert_frame_equal (parsed , from_chunks )
1246
1251
1247
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" , strict = False )
1248
1252
@pytest .mark .filterwarnings ("ignore::UserWarning" )
1249
1253
@pytest .mark .parametrize (
1250
1254
"file" ,
@@ -1548,12 +1552,11 @@ def test_inf(self, infval):
1548
1552
with tm .ensure_clean () as path :
1549
1553
df .to_stata (path )
1550
1554
1551
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1552
1555
def test_path_pathlib (self ):
1553
1556
df = DataFrame (
1554
1557
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1555
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1556
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1558
+ columns = pd .Index (list ("ABCD" )),
1559
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1557
1560
)
1558
1561
df .index .name = "index"
1559
1562
reader = lambda x : read_stata (x ).set_index ("index" )
@@ -1584,13 +1587,12 @@ def test_value_labels_iterator(self, write_index):
1584
1587
value_labels = dta_iter .value_labels ()
1585
1588
assert value_labels == {"A" : {0 : "A" , 1 : "B" , 2 : "C" , 3 : "E" }}
1586
1589
1587
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1588
1590
def test_set_index (self ):
1589
1591
# GH 17328
1590
1592
df = DataFrame (
1591
1593
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1592
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1593
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1594
+ columns = pd .Index (list ("ABCD" )),
1595
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1594
1596
)
1595
1597
df .index .name = "index"
1596
1598
with tm .ensure_clean () as path :
@@ -1618,8 +1620,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
1618
1620
formatted = df .loc [0 , column + "_fmt" ]
1619
1621
assert unformatted == formatted
1620
1622
1621
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1622
- def test_writer_117 (self ):
1623
+ def test_writer_117 (self , using_infer_string ):
1623
1624
original = DataFrame (
1624
1625
data = [
1625
1626
[
@@ -1682,13 +1683,17 @@ def test_writer_117(self):
1682
1683
version = 117 ,
1683
1684
)
1684
1685
written_and_read_again = self .read_dta (path )
1685
- # original.index is np.int32, read index is np.int64
1686
- tm .assert_frame_equal (
1687
- written_and_read_again .set_index ("index" ),
1688
- original ,
1689
- check_index_type = False ,
1690
- )
1691
- tm .assert_frame_equal (original , copy )
1686
+
1687
+ expected = original [:]
1688
+ if using_infer_string :
1689
+ # object dtype (with only strings/None) comes back as string dtype
1690
+ expected ["object" ] = expected ["object" ].astype ("str" )
1691
+
1692
+ tm .assert_frame_equal (
1693
+ written_and_read_again .set_index ("index" ),
1694
+ expected ,
1695
+ )
1696
+ tm .assert_frame_equal (original , copy )
1692
1697
1693
1698
def test_convert_strl_name_swap (self ):
1694
1699
original = DataFrame (
@@ -1725,15 +1730,14 @@ def test_invalid_date_conversion(self):
1725
1730
with pytest .raises (ValueError , match = msg ):
1726
1731
original .to_stata (path , convert_dates = {"wrong_name" : "tc" })
1727
1732
1728
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1729
1733
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
1730
1734
def test_nonfile_writing (self , version ):
1731
1735
# GH 21041
1732
1736
bio = io .BytesIO ()
1733
1737
df = DataFrame (
1734
1738
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1735
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1736
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1739
+ columns = pd .Index (list ("ABCD" )),
1740
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1737
1741
)
1738
1742
df .index .name = "index"
1739
1743
with tm .ensure_clean () as path :
@@ -1744,13 +1748,12 @@ def test_nonfile_writing(self, version):
1744
1748
reread = read_stata (path , index_col = "index" )
1745
1749
tm .assert_frame_equal (df , reread )
1746
1750
1747
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1748
1751
def test_gzip_writing (self ):
1749
1752
# writing version 117 requires seek and cannot be used with gzip
1750
1753
df = DataFrame (
1751
1754
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1752
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1753
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1755
+ columns = pd .Index (list ("ABCD" )),
1756
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1754
1757
)
1755
1758
df .index .name = "index"
1756
1759
with tm .ensure_clean () as path :
@@ -1777,8 +1780,7 @@ def test_unicode_dta_118(self, datapath):
1777
1780
1778
1781
tm .assert_frame_equal (unicode_df , expected )
1779
1782
1780
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1781
- def test_mixed_string_strl (self ):
1783
+ def test_mixed_string_strl (self , using_infer_string ):
1782
1784
# GH 23633
1783
1785
output = [{"mixed" : "string" * 500 , "number" : 0 }, {"mixed" : None , "number" : 1 }]
1784
1786
output = DataFrame (output )
@@ -1796,7 +1798,10 @@ def test_mixed_string_strl(self):
1796
1798
path , write_index = False , convert_strl = ["mixed" ], version = 117
1797
1799
)
1798
1800
reread = read_stata (path )
1799
- expected = output .fillna ("" )
1801
+ expected = output .copy ()
1802
+ if using_infer_string :
1803
+ expected ["mixed" ] = expected ["mixed" ].astype ("str" )
1804
+ expected = expected .fillna ("" )
1800
1805
tm .assert_frame_equal (reread , expected )
1801
1806
1802
1807
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
@@ -1875,7 +1880,7 @@ def test_stata_119(self, datapath):
1875
1880
reader ._ensure_open ()
1876
1881
assert reader ._nvar == 32999
1877
1882
1878
- @pytest .mark .xfail ( using_string_dtype (), reason = "TODO(infer_string) " )
1883
+ @pytest .mark .filterwarnings ( "ignore:Downcasting behavior:FutureWarning " )
1879
1884
@pytest .mark .parametrize ("version" , [118 , 119 , None ])
1880
1885
def test_utf8_writer (self , version ):
1881
1886
cat = pd .Categorical (["a" , "β" , "ĉ" ], ordered = True )
@@ -2143,14 +2148,13 @@ def test_iterator_errors(datapath, chunksize):
2143
2148
pass
2144
2149
2145
2150
2146
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
2147
2151
def test_iterator_value_labels ():
2148
2152
# GH 31544
2149
2153
values = ["c_label" , "b_label" ] + ["a_label" ] * 500
2150
2154
df = DataFrame ({f"col{ k } " : pd .Categorical (values , ordered = True ) for k in range (2 )})
2151
2155
with tm .ensure_clean () as path :
2152
2156
df .to_stata (path , write_index = False )
2153
- expected = pd .Index (["a_label" , "b_label" , "c_label" ], dtype = "object" )
2157
+ expected = pd .Index (["a_label" , "b_label" , "c_label" ])
2154
2158
with read_stata (path , chunksize = 100 ) as reader :
2155
2159
for j , chunk in enumerate (reader ):
2156
2160
for i in range (2 ):
0 commit comments