@@ -94,6 +94,7 @@ from pandas._libs.khash cimport (
94
94
)
95
95
96
96
from pandas.errors import (
97
+ DtypeWarning,
97
98
EmptyDataError,
98
99
ParserError,
99
100
ParserWarning,
@@ -107,7 +108,9 @@ from pandas.core.dtypes.common import (
107
108
is_float_dtype,
108
109
is_integer_dtype,
109
110
is_object_dtype,
111
+ pandas_dtype,
110
112
)
113
+ from pandas.core.dtypes.concat import union_categoricals
111
114
112
115
cdef:
113
116
float64_t INF = < float64_t> np.inf
@@ -314,7 +317,7 @@ cdef class TextReader:
314
317
315
318
cdef public:
316
319
int64_t leading_cols, table_width, skipfooter, buffer_lines
317
- bint allow_leading_cols, mangle_dupe_cols
320
+ bint allow_leading_cols, mangle_dupe_cols, low_memory
318
321
bint delim_whitespace
319
322
object delimiter # bytes or str
320
323
object converters
@@ -359,6 +362,7 @@ cdef class TextReader:
359
362
true_values = None ,
360
363
false_values = None ,
361
364
bint allow_leading_cols = True ,
365
+ bint low_memory = False ,
362
366
skiprows = None ,
363
367
skipfooter = 0 , # int64_t
364
368
bint verbose = False ,
@@ -475,6 +479,7 @@ cdef class TextReader:
475
479
self .na_filter = na_filter
476
480
477
481
self .verbose = verbose
482
+ self .low_memory = low_memory
478
483
479
484
if float_precision == " round_trip" :
480
485
# see gh-15140
@@ -487,10 +492,12 @@ cdef class TextReader:
487
492
raise ValueError (f' Unrecognized float_precision option: '
488
493
f' {float_precision}' )
489
494
490
- # Caller is responsible for ensuring we have one of
491
- # - None
492
- # - DtypeObj
493
- # - dict[Any, DtypeObj]
495
+ if isinstance (dtype, dict ):
496
+ dtype = {k: pandas_dtype(dtype[k])
497
+ for k in dtype}
498
+ elif dtype is not None :
499
+ dtype = pandas_dtype(dtype)
500
+
494
501
self .dtype = dtype
495
502
496
503
# XXX
@@ -700,8 +707,7 @@ cdef class TextReader:
700
707
ic = (len (self .index_col) if self .index_col
701
708
is not None else 0 )
702
709
703
- # if wrong number of blanks or no index, not our format
704
- if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0 :
710
+ if lc != unnamed_count and lc - ic > unnamed_count:
705
711
hr -= 1
706
712
self .parser_start -= 1
707
713
this_header = [None ] * lc
@@ -767,18 +773,17 @@ cdef class TextReader:
767
773
"""
768
774
rows = None -- > read all rows
769
775
"""
770
- # Don't care about memory usage
771
- columns = self ._read_rows(rows, 1 )
776
+ if self.low_memory:
777
+ # Conserve intermediate space
778
+ columns = self ._read_low_memory(rows)
779
+ else:
780
+ # Don't care about memory usage
781
+ columns = self ._read_rows(rows, 1 )
772
782
773
783
return columns
774
784
775
- def read_low_memory(self , rows: int | None )-> list[dict[int , "ArrayLike"]]:
776
- """
777
- rows = None -- > read all rows
778
- """
779
- # Conserve intermediate space
780
- # Caller is responsible for concatenating chunks ,
781
- # see c_parser_wrapper._concatenatve_chunks
785
+ # -> dict[int , "ArrayLike"]
786
+ cdef _read_low_memory(self , rows ):
782
787
cdef:
783
788
size_t rows_read = 0
784
789
list chunks = []
@@ -813,7 +818,8 @@ cdef class TextReader:
813
818
if len (chunks) == 0 :
814
819
raise StopIteration
815
820
816
- return chunks
821
+ # destructive to chunks
822
+ return _concatenate_chunks(chunks)
817
823
818
824
cdef _tokenize_rows(self , size_t nrows):
819
825
cdef:
@@ -1901,6 +1907,49 @@ cdef raise_parser_error(object base, parser_t *parser):
1901
1907
raise ParserError(message)
1902
1908
1903
1909
1910
+ # chunks: list[dict[int, "ArrayLike"]]
1911
+ # -> dict[int, "ArrayLike"]
1912
def _concatenate_chunks(list chunks) -> dict:
    """
    Merge per-chunk column mappings into one mapping of full-length columns.

    Parameters
    ----------
    chunks : list of dict
        One mapping of column key -> array per parsed chunk. The keys of the
        first chunk define the columns; every other chunk must contain them.
        The mappings are consumed: each column is popped out of every chunk.

    Returns
    -------
    dict
        Column key -> concatenation of that column across all chunks.

    Raises
    ------
    IndexError
        If ``chunks`` is empty.
    KeyError
        If a chunk is missing one of the first chunk's keys.

    Warns
    -----
    DtypeWarning
        If the non-categorical dtypes seen for a column only have ``object``
        as their common type.
    """
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns = []
        object warning_names
        object common_type

    result = {}
    for name in names:
        # pop() drops each chunk's reference to the piece as it is collected
        arrs = [chunk.pop(name) for chunk in chunks]

        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                # remember the column; a single warning is emitted at the end
                warning_columns.append(str(name))

        # NOTE: with several distinct dtypes this picks an arbitrary one
        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            # NOTE(review): dtype comes from ``a.dtype`` above, so this is
            # presumably never a str -- confirm before simplifying
            sort_categories = isinstance(dtype, str)
            result[name] = union_categoricals(
                arrs, sort_categories=sort_categories
            )
        elif is_extension_array_dtype(dtype):
            array_type = dtype.construct_array_type()
            result[name] = array_type._concat_same_type(arrs)
        else:
            result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ",".join(warning_columns)
        # The two sentences were previously adjacent string literals inside a
        # one-element list, so they were glued together without a space.
        warning_message = (
            f"Columns ({warning_names}) have mixed types. "
            "Specify dtype option on import or set low_memory=False."
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result
1951
+
1952
+
1904
1953
# ----------------------------------------------------------------------
1905
1954
# NA values
1906
1955
def _compute_na_values ():
0 commit comments