Skip to content

Commit 38af11c

Browse files
jbrockmendel authored and yeshsurya committed
TYP: parsers.pyi (pandas-dev#40508)
1 parent 66027bf commit 38af11c

File tree

2 files changed

+67
-18
lines changed

2 files changed

+67
-18
lines changed

pandas/_libs/parsers.pyi

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ class TextReader:
5858
true_values=...,
5959
false_values=...,
6060
allow_leading_cols: bool = ...,
61+
low_memory: bool = ...,
6162
skiprows=...,
6263
skipfooter: int = ..., # int64_t
6364
verbose: bool = ...,
@@ -74,4 +75,3 @@ class TextReader:
7475
def close(self) -> None: ...
7576

7677
def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
77-
def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...

pandas/_libs/parsers.pyx

+66-17
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ from pandas._libs.khash cimport (
9494
)
9595

9696
from pandas.errors import (
97+
DtypeWarning,
9798
EmptyDataError,
9899
ParserError,
99100
ParserWarning,
@@ -107,7 +108,9 @@ from pandas.core.dtypes.common import (
107108
is_float_dtype,
108109
is_integer_dtype,
109110
is_object_dtype,
111+
pandas_dtype,
110112
)
113+
from pandas.core.dtypes.concat import union_categoricals
111114

112115
cdef:
113116
float64_t INF = <float64_t>np.inf
@@ -314,7 +317,7 @@ cdef class TextReader:
314317

315318
cdef public:
316319
int64_t leading_cols, table_width, skipfooter, buffer_lines
317-
bint allow_leading_cols, mangle_dupe_cols
320+
bint allow_leading_cols, mangle_dupe_cols, low_memory
318321
bint delim_whitespace
319322
object delimiter # bytes or str
320323
object converters
@@ -359,6 +362,7 @@ cdef class TextReader:
359362
true_values=None,
360363
false_values=None,
361364
bint allow_leading_cols=True,
365+
bint low_memory=False,
362366
skiprows=None,
363367
skipfooter=0, # int64_t
364368
bint verbose=False,
@@ -475,6 +479,7 @@ cdef class TextReader:
475479
self.na_filter = na_filter
476480

477481
self.verbose = verbose
482+
self.low_memory = low_memory
478483

479484
if float_precision == "round_trip":
480485
# see gh-15140
@@ -487,10 +492,12 @@ cdef class TextReader:
487492
raise ValueError(f'Unrecognized float_precision option: '
488493
f'{float_precision}')
489494

490-
# Caller is responsible for ensuring we have one of
491-
# - None
492-
# - DtypeObj
493-
# - dict[Any, DtypeObj]
495+
if isinstance(dtype, dict):
496+
dtype = {k: pandas_dtype(dtype[k])
497+
for k in dtype}
498+
elif dtype is not None:
499+
dtype = pandas_dtype(dtype)
500+
494501
self.dtype = dtype
495502

496503
# XXX
@@ -700,8 +707,7 @@ cdef class TextReader:
700707
ic = (len(self.index_col) if self.index_col
701708
is not None else 0)
702709

703-
# if wrong number of blanks or no index, not our format
704-
if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
710+
if lc != unnamed_count and lc - ic > unnamed_count:
705711
hr -= 1
706712
self.parser_start -= 1
707713
this_header = [None] * lc
@@ -767,18 +773,17 @@ cdef class TextReader:
767773
"""
768774
rows=None --> read all rows
769775
"""
770-
# Don't care about memory usage
771-
columns = self._read_rows(rows, 1)
776+
if self.low_memory:
777+
# Conserve intermediate space
778+
columns = self._read_low_memory(rows)
779+
else:
780+
# Don't care about memory usage
781+
columns = self._read_rows(rows, 1)
772782

773783
return columns
774784

775-
def read_low_memory(self, rows: int | None)-> list[dict[int, "ArrayLike"]]:
776-
"""
777-
rows=None --> read all rows
778-
"""
779-
# Conserve intermediate space
780-
# Caller is responsible for concatenating chunks,
781-
# see c_parser_wrapper._concatenatve_chunks
785+
# -> dict[int, "ArrayLike"]
786+
cdef _read_low_memory(self, rows):
782787
cdef:
783788
size_t rows_read = 0
784789
list chunks = []
@@ -813,7 +818,8 @@ cdef class TextReader:
813818
if len(chunks) == 0:
814819
raise StopIteration
815820

816-
return chunks
821+
# destructive to chunks
822+
return _concatenate_chunks(chunks)
817823

818824
cdef _tokenize_rows(self, size_t nrows):
819825
cdef:
@@ -1901,6 +1907,49 @@ cdef raise_parser_error(object base, parser_t *parser):
19011907
raise ParserError(message)
19021908

19031909

1910+
# chunks: list[dict[int, "ArrayLike"]]
1911+
# -> dict[int, "ArrayLike"]
1912+
def _concatenate_chunks(list chunks) -> dict:
    """
    Merge per-chunk column dicts into one dict of concatenated arrays.

    Parameters
    ----------
    chunks : list of dict
        One dict per chunk, mapping a column key to that chunk's array.
        The column keys are taken from the first chunk. Every column is
        popped out of each chunk dict, so this is destructive to ``chunks``.

    Returns
    -------
    dict
        Column key -> the concatenation of that column across all chunks.

    Warns
    -----
    DtypeWarning
        When the non-categorical dtypes seen for a column differ between
        chunks and their common type is ``object``.
    """
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns = []
        object warning_names
        object common_type

    result = {}
    for name in names:
        # pop() releases each chunk's reference as soon as it is collected
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                warning_columns.append(str(name))

        # Arbitrary representative of the dtypes seen for this column.
        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            # NOTE(review): dtype comes from ``a.dtype`` above, so this
            # isinstance check looks like it is always False -- confirm.
            sort_categories = isinstance(dtype, str)
            result[name] = union_categoricals(arrs,
                                              sort_categories=sort_categories)
        else:
            if is_extension_array_dtype(dtype):
                array_type = dtype.construct_array_type()
                result[name] = array_type._concat_same_type(arrs)
            else:
                result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        # The two sentences must be separated by a space; previously the
        # adjacent literals were fused into "...mixed types.Specify ...".
        warning_message = (
            f"Columns ({warning_names}) have mixed types. "
            "Specify dtype option on import or set low_memory=False."
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result
1951+
1952+
19041953
# ----------------------------------------------------------------------
19051954
# NA values
19061955
def _compute_na_values():

0 commit comments

Comments
 (0)