@@ -2313,111 +2313,34 @@ def set_kind(self):
2313
2313
if self .typ is None :
2314
2314
self .typ = getattr (self .description , self .cname , None )
2315
2315
2316
def set_atom(self, block, itemsize: int, data_converted, use_str: bool):
    """
    Create and set up this column's atom from ``block``.

    The block's kind flags are checked in a fixed order and the first one
    that is set decides which ``set_atom_*`` helper runs.

    Parameters
    ----------
    block
        Object exposing the ``is_categorical``, ``is_datetimetz``,
        ``is_datetime``, ``is_timedelta`` and ``is_complex`` flags.
    itemsize : int
        Forwarded to ``set_atom_string``; only used when ``use_str`` is true.
    data_converted
        Forwarded to ``set_atom_string``; only used when ``use_str`` is true.
    use_str : bool
        When no kind flag matches, store the column as a string atom
        instead of a plain data atom.
    """
    # Short-cut certain block types; each one has a dedicated helper.
    if block.is_categorical:
        self.set_atom_categorical(block)
        return
    if block.is_datetimetz:
        self.set_atom_datetime64tz(block)
        return
    if block.is_datetime:
        self.set_atom_datetime64(block)
        return
    if block.is_timedelta:
        self.set_atom_timedelta64(block)
        return
    if block.is_complex:
        self.set_atom_complex(block)
        return

    if use_str:
        # The caller has already converted the values and chosen the width.
        self.set_atom_string(itemsize, data_converted)
        return

    # Anything else is set as a data block.
    self.set_atom_data(block)
2370
2336
2371
- def get_atom_string (self , block , itemsize ):
2372
- return _tables ().StringCol (itemsize = itemsize , shape = block .shape [0 ])
2373
-
2374
- def set_atom_string (
2375
- self , block , existing_col , min_itemsize , nan_rep , encoding , errors
2376
- ):
2377
- # fill nan items with myself, don't disturb the blocks by
2378
- # trying to downcast
2379
- block = block .fillna (nan_rep , downcast = False )
2380
- if isinstance (block , list ):
2381
- block = block [0 ]
2382
- data = block .values
2383
-
2384
- # see if we have a valid string type
2385
- inferred_type = lib .infer_dtype (data .ravel (), skipna = False )
2386
- if inferred_type != "string" :
2387
-
2388
- # we cannot serialize this data, so report an exception on a column
2389
- # by column basis
2390
- for i in range (len (block .shape [0 ])):
2391
-
2392
- col = block .iget (i )
2393
- inferred_type = lib .infer_dtype (col .ravel (), skipna = False )
2394
- if inferred_type != "string" :
2395
- iloc = block .mgr_locs .indexer [i ]
2396
- raise TypeError (
2397
- f"Cannot serialize the column [{ iloc } ] because\n "
2398
- f"its data contents are [{ inferred_type } ] object dtype"
2399
- )
2400
-
2401
- # itemsize is the maximum length of a string (along any dimension)
2402
- data_converted = _convert_string_array (data , encoding , errors )
2403
- itemsize = data_converted .itemsize
2404
-
2405
- # specified min_itemsize?
2406
- if isinstance (min_itemsize , dict ):
2407
- min_itemsize = int (
2408
- min_itemsize .get (self .name ) or min_itemsize .get ("values" ) or 0
2409
- )
2410
- itemsize = max (min_itemsize or 0 , itemsize )
2411
-
2412
- # check for column in the values conflicts
2413
- if existing_col is not None :
2414
- eci = existing_col .validate_col (itemsize )
2415
- if eci > itemsize :
2416
- itemsize = eci
2337
def get_atom_string(self, shape, itemsize):
    """
    Return a ``StringCol`` of width ``itemsize``.

    Only the first entry of ``shape`` is passed on as the column's
    ``shape`` argument.
    """
    # NOTE(review): _tables() is defined elsewhere in this file; presumably
    # it returns the PyTables module -- confirm.
    string_col = _tables().StringCol
    return string_col(itemsize=itemsize, shape=shape[0])
2417
2339
2340
def set_atom_string(self, itemsize: int, data_converted: np.ndarray):
    """
    Configure this column as a string atom of width ``itemsize``.

    Records the item size and kind, builds the column type from the shape
    of ``data_converted``, then stores the values cast to a fixed-width
    bytes dtype.
    """
    # Order matters: the attributes are set before the helpers run, in case
    # those helpers read them.
    self.itemsize = itemsize
    self.kind = "string"
    self.typ = self.get_atom_string(data_converted.shape, itemsize)

    bytes_dtype = f"|S{itemsize}"
    # copy=False lets astype skip the copy when the dtype already matches.
    fixed_width = data_converted.astype(bytes_dtype, copy=False)
    self.set_data(fixed_width)
2422
2345
2423
2346
def get_atom_coltype (self , kind = None ):
@@ -2621,7 +2544,7 @@ def validate_names(self):
2621
2544
# TODO: should the message here be more specifically non-str?
2622
2545
raise ValueError ("cannot have non-object label DataIndexableCol" )
2623
2546
2624
def get_atom_string(self, shape, itemsize):
    """
    Return a ``StringCol`` of width ``itemsize``.

    ``shape`` is accepted but not used here; no ``shape`` argument is
    passed to ``StringCol``.
    """
    # NOTE(review): _tables() is defined elsewhere in this file; presumably
    # it returns the PyTables module -- confirm.
    string_col = _tables().StringCol
    return string_col(itemsize=itemsize)
2626
2549
2627
2550
def get_atom_data (self , block , kind = None ):
@@ -3972,17 +3895,26 @@ def get_blk_items(mgr, blocks):
3972
3895
else :
3973
3896
existing_col = None
3974
3897
3975
- col = klass . create_for_block ( i = i , name = name , version = self . version )
3976
- col . values = list ( b_items )
3977
- col . set_atom (
3978
- block = b ,
3898
+ new_name = name or f"values_block_ { i } "
3899
+ itemsize , data_converted , use_str = _maybe_convert_for_string_atom (
3900
+ new_name ,
3901
+ b ,
3979
3902
existing_col = existing_col ,
3980
3903
min_itemsize = min_itemsize ,
3981
3904
nan_rep = nan_rep ,
3982
3905
encoding = self .encoding ,
3983
3906
errors = self .errors ,
3984
- info = self .info ,
3985
3907
)
3908
+
3909
+ col = klass .create_for_block (i = i , name = new_name , version = self .version )
3910
+ col .values = list (b_items )
3911
+ col .set_atom (
3912
+ block = b ,
3913
+ itemsize = itemsize ,
3914
+ data_converted = data_converted ,
3915
+ use_str = use_str ,
3916
+ )
3917
+ col .update_info (self .info )
3986
3918
col .set_pos (j )
3987
3919
3988
3920
vaxes .append (col )
@@ -4847,6 +4779,74 @@ def _unconvert_index(data, kind: str, encoding=None, errors="strict"):
4847
4779
return index
4848
4780
4849
4781
4782
+ def _maybe_convert_for_string_atom (
4783
+ name : str , block , existing_col , min_itemsize , nan_rep , encoding , errors
4784
+ ):
4785
+ use_str = False
4786
+
4787
+ if not block .is_object :
4788
+ return block .dtype .itemsize , block .values , use_str
4789
+
4790
+ dtype_name = block .dtype .name
4791
+ inferred_type = lib .infer_dtype (block .values , skipna = False )
4792
+
4793
+ if inferred_type == "date" :
4794
+ raise TypeError ("[date] is not implemented as a table column" )
4795
+ elif inferred_type == "datetime" :
4796
+ # after GH#8260
4797
+ # this only would be hit for a multi-timezone dtype which is an error
4798
+ raise TypeError (
4799
+ "too many timezones in this block, create separate data columns"
4800
+ )
4801
+
4802
+ elif not (inferred_type == "string" or dtype_name == "object" ):
4803
+ return block .dtype .itemsize , block .values , use_str
4804
+
4805
+ use_str = True
4806
+
4807
+ block = block .fillna (nan_rep , downcast = False )
4808
+ if isinstance (block , list ):
4809
+ # Note: because block is always object dtype, fillna goes
4810
+ # through a path such that the result is always a 1-element list
4811
+ block = block [0 ]
4812
+ data = block .values
4813
+
4814
+ # see if we have a valid string type
4815
+ inferred_type = lib .infer_dtype (data .ravel (), skipna = False )
4816
+ if inferred_type != "string" :
4817
+
4818
+ # we cannot serialize this data, so report an exception on a column
4819
+ # by column basis
4820
+ for i in range (len (block .shape [0 ])):
4821
+
4822
+ col = block .iget (i )
4823
+ inferred_type = lib .infer_dtype (col .ravel (), skipna = False )
4824
+ if inferred_type != "string" :
4825
+ iloc = block .mgr_locs .indexer [i ]
4826
+ raise TypeError (
4827
+ f"Cannot serialize the column [{ iloc } ] because\n "
4828
+ f"its data contents are [{ inferred_type } ] object dtype"
4829
+ )
4830
+
4831
+ # itemsize is the maximum length of a string (along any dimension)
4832
+ data_converted = _convert_string_array (data , encoding , errors ).reshape (data .shape )
4833
+ assert data_converted .shape == block .shape , (data_converted .shape , block .shape )
4834
+ itemsize = data_converted .itemsize
4835
+
4836
+ # specified min_itemsize?
4837
+ if isinstance (min_itemsize , dict ):
4838
+ min_itemsize = int (min_itemsize .get (name ) or min_itemsize .get ("values" ) or 0 )
4839
+ itemsize = max (min_itemsize or 0 , itemsize )
4840
+
4841
+ # check for column in the values conflicts
4842
+ if existing_col is not None :
4843
+ eci = existing_col .validate_col (itemsize )
4844
+ if eci > itemsize :
4845
+ itemsize = eci
4846
+
4847
+ return itemsize , data_converted , use_str
4848
+
4849
+
4850
4850
def _convert_string_array (data , encoding , errors , itemsize = None ):
4851
4851
"""
4852
4852
we take a string-like that is object dtype and coerce to a fixed size
0 commit comments