39
39
)
40
40
from pandas ._libs .lib import is_string_array
41
41
from pandas ._libs .tslibs import timezones
42
+ from pandas .compat import HAS_PYARROW
42
43
from pandas .compat ._optional import import_optional_dependency
43
44
from pandas .compat .pickle_compat import patch_pickle
44
45
from pandas .errors import (
@@ -381,6 +382,13 @@ def read_hdf(
381
382
DataFrame.to_hdf : Write a HDF file from a DataFrame.
382
383
HDFStore : Low-level access to HDF files.
383
384
385
+ Notes
386
+ -----
387
+ When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
388
+ and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
389
+ to UTF-8, the resulting dtype will be
390
+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
391
+
384
392
Examples
385
393
--------
386
394
>>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP
@@ -2257,6 +2265,20 @@ def convert(
2257
2265
# making an Index instance could throw a number of different errors
2258
2266
try :
2259
2267
new_pd_index = factory (values , ** kwargs )
2268
+ except UnicodeEncodeError as err :
2269
+ if (
2270
+ errors == "surrogatepass"
2271
+ and get_option ("future.infer_string" )
2272
+ and str (err ).endswith ("surrogates not allowed" )
2273
+ and HAS_PYARROW
2274
+ ):
2275
+ new_pd_index = factory (
2276
+ values ,
2277
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
2278
+ ** kwargs ,
2279
+ )
2280
+ else :
2281
+ raise
2260
2282
except ValueError :
2261
2283
# if the output freq is different than what we recorded,
2262
2284
# it should be None (see also 'doc example part 2')
@@ -3170,12 +3192,29 @@ def read_index_node(
3170
3192
** kwargs ,
3171
3193
)
3172
3194
else :
3173
- index = factory (
3174
- _unconvert_index (
3175
- data , kind , encoding = self .encoding , errors = self .errors
3176
- ),
3177
- ** kwargs ,
3178
- )
3195
+ try :
3196
+ index = factory (
3197
+ _unconvert_index (
3198
+ data , kind , encoding = self .encoding , errors = self .errors
3199
+ ),
3200
+ ** kwargs ,
3201
+ )
3202
+ except UnicodeEncodeError as err :
3203
+ if (
3204
+ self .errors == "surrogatepass"
3205
+ and get_option ("future.infer_string" )
3206
+ and str (err ).endswith ("surrogates not allowed" )
3207
+ and HAS_PYARROW
3208
+ ):
3209
+ index = factory (
3210
+ _unconvert_index (
3211
+ data , kind , encoding = self .encoding , errors = self .errors
3212
+ ),
3213
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3214
+ ** kwargs ,
3215
+ )
3216
+ else :
3217
+ raise
3179
3218
3180
3219
index .name = name
3181
3220
@@ -3311,13 +3350,24 @@ def read(
3311
3350
self .validate_read (columns , where )
3312
3351
index = self .read_index ("index" , start = start , stop = stop )
3313
3352
values = self .read_array ("values" , start = start , stop = stop )
3314
- result = Series (values , index = index , name = self .name , copy = False )
3315
- if (
3316
- using_string_dtype ()
3317
- and isinstance (values , np .ndarray )
3318
- and is_string_array (values , skipna = True )
3319
- ):
3320
- result = result .astype (StringDtype (na_value = np .nan ))
3353
+ try :
3354
+ result = Series (values , index = index , name = self .name , copy = False )
3355
+ except UnicodeEncodeError as err :
3356
+ if (
3357
+ self .errors == "surrogatepass"
3358
+ and get_option ("future.infer_string" )
3359
+ and str (err ).endswith ("surrogates not allowed" )
3360
+ and HAS_PYARROW
3361
+ ):
3362
+ result = Series (
3363
+ values ,
3364
+ index = index ,
3365
+ name = self .name ,
3366
+ copy = False ,
3367
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3368
+ )
3369
+ else :
3370
+ raise
3321
3371
return result
3322
3372
3323
3373
def write (self , obj , ** kwargs ) -> None :
@@ -4764,7 +4814,24 @@ def read(
4764
4814
values = values .reshape ((1 , values .shape [0 ]))
4765
4815
4766
4816
if isinstance (values , (np .ndarray , DatetimeArray )):
4767
- df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4817
+ try :
4818
+ df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4819
+ except UnicodeEncodeError as err :
4820
+ if (
4821
+ self .errors == "surrogatepass"
4822
+ and get_option ("future.infer_string" )
4823
+ and str (err ).endswith ("surrogates not allowed" )
4824
+ and HAS_PYARROW
4825
+ ):
4826
+ df = DataFrame (
4827
+ values .T ,
4828
+ columns = cols_ ,
4829
+ index = index_ ,
4830
+ copy = False ,
4831
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
4832
+ )
4833
+ else :
4834
+ raise
4768
4835
elif isinstance (values , Index ):
4769
4836
df = DataFrame (values , columns = cols_ , index = index_ )
4770
4837
else :
@@ -4774,23 +4841,10 @@ def read(
4774
4841
assert (df .dtypes == values .dtype ).all (), (df .dtypes , values .dtype )
4775
4842
4776
4843
# If str / string dtype is stored in meta, use that.
4777
- converted = False
4778
4844
for column in cols_ :
4779
4845
dtype = getattr (self .table .attrs , f"{ column } _meta" , None )
4780
4846
if dtype in ["str" , "string" ]:
4781
4847
df [column ] = df [column ].astype (dtype )
4782
- converted = True
4783
- # Otherwise try inference.
4784
- if (
4785
- not converted
4786
- and using_string_dtype ()
4787
- and isinstance (values , np .ndarray )
4788
- and is_string_array (
4789
- values ,
4790
- skipna = True ,
4791
- )
4792
- ):
4793
- df = df .astype (StringDtype (na_value = np .nan ))
4794
4848
frames .append (df )
4795
4849
4796
4850
if len (frames ) == 1 :
@@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
5224
5278
# encode if needed
5225
5279
if len (data ):
5226
5280
data = (
5227
- Series (data .ravel (), copy = False )
5281
+ Series (data .ravel (), copy = False , dtype = "object" )
5228
5282
.str .encode (encoding , errors )
5229
5283
._values .reshape (data .shape )
5230
5284
)
@@ -5264,7 +5318,9 @@ def _unconvert_string_array(
5264
5318
dtype = f"U{ itemsize } "
5265
5319
5266
5320
if isinstance (data [0 ], bytes ):
5267
- ser = Series (data , copy = False ).str .decode (encoding , errors = errors )
5321
+ ser = Series (data , copy = False ).str .decode (
5322
+ encoding , errors = errors , dtype = "object"
5323
+ )
5268
5324
data = ser .to_numpy ()
5269
5325
data .flags .writeable = True
5270
5326
else :
0 commit comments