TYP: parsers.pyi (pandas-dev#40508)

jbrockmendel · yeshsurya · commit 38af11c4412a · 2021-05-06T14:25:04.000+05:30
diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi
@@ -58,6 +58,7 @@ class TextReader:
         true_values=...,
         false_values=...,
         allow_leading_cols: bool = ...,
+        low_memory: bool = ...,
         skiprows=...,
         skipfooter: int = ...,  # int64_t
         verbose: bool = ...,
@@ -74,4 +75,3 @@ class TextReader:
     def close(self) -> None: ...
 
     def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
-    def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -94,6 +94,7 @@ from pandas._libs.khash cimport (
 )
 
 from pandas.errors import (
+    DtypeWarning,
     EmptyDataError,
     ParserError,
     ParserWarning,
@@ -107,7 +108,9 @@ from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
     is_object_dtype,
+    pandas_dtype,
 )
+from pandas.core.dtypes.concat import union_categoricals
 
 cdef:
     float64_t INF = <float64_t>np.inf
@@ -314,7 +317,7 @@ cdef class TextReader:
 
     cdef public:
         int64_t leading_cols, table_width, skipfooter, buffer_lines
-        bint allow_leading_cols, mangle_dupe_cols
+        bint allow_leading_cols, mangle_dupe_cols, low_memory
         bint delim_whitespace
         object delimiter  # bytes or str
         object converters
@@ -359,6 +362,7 @@ cdef class TextReader:
                   true_values=None,
                   false_values=None,
                   bint allow_leading_cols=True,
+                  bint low_memory=False,
                   skiprows=None,
                   skipfooter=0,         # int64_t
                   bint verbose=False,
@@ -475,6 +479,7 @@ cdef class TextReader:
         self.na_filter = na_filter
 
         self.verbose = verbose
+        self.low_memory = low_memory
 
         if float_precision == "round_trip":
             # see gh-15140
@@ -487,10 +492,12 @@ cdef class TextReader:
             raise ValueError(f'Unrecognized float_precision option: '
                              f'{float_precision}')
 
-        # Caller is responsible for ensuring we have one of
-        # - None
-        # - DtypeObj
-        # - dict[Any, DtypeObj]
+        if isinstance(dtype, dict):
+            dtype = {k: pandas_dtype(dtype[k])
+                     for k in dtype}
+        elif dtype is not None:
+            dtype = pandas_dtype(dtype)
+
         self.dtype = dtype
 
         # XXX
@@ -700,8 +707,7 @@ cdef class TextReader:
                         ic = (len(self.index_col) if self.index_col
                               is not None else 0)
 
-                        # if wrong number of blanks or no index, not our format
-                        if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
+                        if lc != unnamed_count and lc - ic > unnamed_count:
                             hr -= 1
                             self.parser_start -= 1
                             this_header = [None] * lc
@@ -767,18 +773,17 @@ cdef class TextReader:
         """
         rows=None --> read all rows
         """
-        # Don't care about memory usage
-        columns = self._read_rows(rows, 1)
+        if self.low_memory:
+            # Conserve intermediate space
+            columns = self._read_low_memory(rows)
+        else:
+            # Don't care about memory usage
+            columns = self._read_rows(rows, 1)
 
         return columns
 
-    def read_low_memory(self, rows: int | None)-> list[dict[int, "ArrayLike"]]:
-        """
-        rows=None --> read all rows
-        """
-        # Conserve intermediate space
-        # Caller is responsible for concatenating chunks,
-        #  see c_parser_wrapper._concatenatve_chunks
+    #  -> dict[int, "ArrayLike"]
+    cdef _read_low_memory(self, rows):
         cdef:
             size_t rows_read = 0
             list chunks = []
@@ -813,7 +818,8 @@ cdef class TextReader:
         if len(chunks) == 0:
             raise StopIteration
 
-        return chunks
+        # destructive to chunks
+        return _concatenate_chunks(chunks)
 
     cdef _tokenize_rows(self, size_t nrows):
         cdef:
@@ -1901,6 +1907,49 @@ cdef raise_parser_error(object base, parser_t *parser):
     raise ParserError(message)
 
 
+# chunks: list[dict[int, "ArrayLike"]]
+# -> dict[int, "ArrayLike"]
+def _concatenate_chunks(list chunks) -> dict:
+    cdef:
+        list names = list(chunks[0].keys())
+        object name
+        list warning_columns = []
+        object warning_names
+        object common_type
+
+    result = {}
+    for name in names:
+        arrs = [chunk.pop(name) for chunk in chunks]
+        # Check each arr for consistent types.
+        dtypes = {a.dtype for a in arrs}
+        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
+        if len(numpy_dtypes) > 1:
+            common_type = np.find_common_type(numpy_dtypes, [])
+            if common_type == object:
+                warning_columns.append(str(name))
+
+        dtype = dtypes.pop()
+        if is_categorical_dtype(dtype):
+            sort_categories = isinstance(dtype, str)
+            result[name] = union_categoricals(arrs,
+                                              sort_categories=sort_categories)
+        else:
+            if is_extension_array_dtype(dtype):
+                array_type = dtype.construct_array_type()
+                result[name] = array_type._concat_same_type(arrs)
+            else:
+                result[name] = np.concatenate(arrs)
+
+    if warning_columns:
+        warning_names = ','.join(warning_columns)
+        warning_message = " ".join([
+            f"Columns ({warning_names}) have mixed types."
+            f"Specify dtype option on import or set low_memory=False."
+          ])
+        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
+    return result
+
+
 # ----------------------------------------------------------------------
 # NA values
 def _compute_na_values():