Fix low_memory C engine parser for extension array dtypes

kprestel · kprestel · commit b1aaa367fcb6 · 2018-12-09T13:10:26.000-05:00
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -888,12 +888,14 @@ cdef class TextReader:
         cdef:
             int status
 
+        print(f'low mem: {self.low_memory}')
         if self.low_memory:
             # Conserve intermediate space
             columns = self._read_low_memory(rows)
         else:
             # Don't care about memory usage
             columns = self._read_rows(rows, 1)
+        print(f'columns: {columns}')
 
         return columns
 
@@ -933,7 +935,10 @@ cdef class TextReader:
             raise StopIteration
 
         # destructive to chunks
-        return _concatenate_chunks(chunks)
+        print(f'chunks: {chunks}')
+        tmp = _concatenate_chunks(chunks)
+        print(f'chunks: {tmp}')
+        return tmp
 
     cdef _tokenize_rows(self, size_t nrows):
         cdef int status
@@ -986,7 +991,7 @@ cdef class TextReader:
                                             footer=footer,
                                             upcast_na=True)
         self._end_clock('Type conversion')
-
+        print(f'columns after type conversion: {columns}')
         self._start_clock()
         if len(columns) > 0:
             rows_read = len(list(columns.values())[0])
@@ -997,6 +1002,7 @@ cdef class TextReader:
             self.parser_start -= rows_read
 
         self._end_clock('Parser memory cleanup')
+        print(f'returning columns: {columns}')
 
         return columns
 
@@ -1241,7 +1247,7 @@ cdef class TextReader:
                     try:
                         # use _from_sequence_of_strings if the class defines it
                         result = array_type._from_sequence_of_strings(result,
-                                                                    dtype=dtype) # noqa
+                                                                      dtype=dtype) # noqa
                     except AbstractMethodError:
                         result = array_type._from_sequence(result, dtype=dtype)
                 else:
@@ -2201,7 +2207,11 @@ def _concatenate_chunks(list chunks):
             result[name] = union_categoricals(arrs,
                                               sort_categories=sort_categories)
         else:
-            result[name] = np.concatenate(arrs)
+            if is_extension_array_dtype(dtype):
+                result[name] = dtype \
+                    .construct_array_type()._concat_same_type(arrs)
+            else:
+                result[name] = np.concatenate(arrs)
 
     if warning_columns:
         warning_names = ','.join(warning_columns)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -559,7 +559,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
 
             # possibility of nan -> garbage
             if is_float_dtype(data.dtype) and is_integer_dtype(dtype) \
-                and not is_extension_array_dtype(dtype):
+                    and not is_extension_array_dtype(dtype):
                 if not isna(data).any():
                     subarr = _try_cast(data, True, dtype, copy,
                                        raise_cast_failure)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1669,8 +1669,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                     try_num_bool)
 
                 # type specified in dtype param
-                if cast_type and not is_dtype_equal(cvals, cast_type):
-                        # or is_extension_array_dtype(cast_type)):
+                if cast_type and (not is_dtype_equal(cvals, cast_type)
+                                  or is_extension_array_dtype(cast_type)):
                     try:
                         if (is_bool_dtype(cast_type) and
                                 not is_categorical_dtype(cast_type)
diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py
@@ -1,6 +1,7 @@
 import pytest
 import pandas as pd
 import numpy as np
+import pandas.util.testing as tm
 from pandas.compat import StringIO
 from pandas.core.arrays.integer import (
     Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype,
@@ -26,15 +27,21 @@ def data(dtype):
 
 
 class ExtensionParsingTests(BaseExtensionTests):
-    def test_EA_types(self):
+
+    @pytest.mark.parametrize('engine', ['c', 'python'])
+    def test_EA_types(self, engine):
         df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'),
                            'A': [1, 2, 1]})
         data = df.to_csv(index=False)
-        result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype})
+        result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype},
+                             engine=engine)
         assert result is not None
+        tm.assert_frame_equal(df, result)
 
         df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'),
                            'A': [1, 2, 1]})
         data = df.to_csv(index=False)
-        result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'})
+        result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'},
+                             engine=engine)
         assert result is not None
+        tm.assert_frame_equal(df, result)