Skip to content

Commit b1aaa36

Browse files
committed
Fix low_memory C engine parser
1 parent 9746e4b commit b1aaa36

File tree

4 files changed

+27
-10
lines changed

4 files changed

+27
-10
lines changed

pandas/_libs/parsers.pyx

+14 -4
Original file line number | Diff line number | Diff line change
@@ -888,12 +888,14 @@ cdef class TextReader:
888888
cdef:
889889
int status
890890

891+
print(f'low mem: {self.low_memory}')
891892
if self.low_memory:
892893
# Conserve intermediate space
893894
columns = self._read_low_memory(rows)
894895
else:
895896
# Don't care about memory usage
896897
columns = self._read_rows(rows, 1)
898+
print(f'columns: {columns}')
897899

898900
return columns
899901

@@ -933,7 +935,10 @@ cdef class TextReader:
933935
raise StopIteration
934936

935937
# destructive to chunks
936-
return _concatenate_chunks(chunks)
938+
print(f'chunks: {chunks}')
939+
tmp = _concatenate_chunks(chunks)
940+
print(f'chunks: {tmp}')
941+
return tmp
937942

938943
cdef _tokenize_rows(self, size_t nrows):
939944
cdef int status
@@ -986,7 +991,7 @@ cdef class TextReader:
986991
footer=footer,
987992
upcast_na=True)
988993
self._end_clock('Type conversion')
989-
994+
print(f'columns after type conversion: {columns}')
990995
self._start_clock()
991996
if len(columns) > 0:
992997
rows_read = len(list(columns.values())[0])
@@ -997,6 +1002,7 @@ cdef class TextReader:
9971002
self.parser_start -= rows_read
9981003

9991004
self._end_clock('Parser memory cleanup')
1005+
print(f'returning columns: {columns}')
10001006

10011007
return columns
10021008

@@ -1241,7 +1247,7 @@ cdef class TextReader:
12411247
try:
12421248
# use _from_sequence_of_strings if the class defines it
12431249
result = array_type._from_sequence_of_strings(result,
1244-
dtype=dtype) # noqa
1250+
dtype=dtype) # noqa
12451251
except AbstractMethodError:
12461252
result = array_type._from_sequence(result, dtype=dtype)
12471253
else:
@@ -2201,7 +2207,11 @@ def _concatenate_chunks(list chunks):
22012207
result[name] = union_categoricals(arrs,
22022208
sort_categories=sort_categories)
22032209
else:
2204-
result[name] = np.concatenate(arrs)
2210+
if is_extension_array_dtype(dtype):
2211+
result[name] = dtype \
2212+
.construct_array_type()._concat_same_type(arrs)
2213+
else:
2214+
result[name] = np.concatenate(arrs)
22052215

22062216
if warning_columns:
22072217
warning_names = ','.join(warning_columns)

pandas/core/internals/construction.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -559,7 +559,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
559559

560560
# possibility of nan -> garbage
561561
if is_float_dtype(data.dtype) and is_integer_dtype(dtype) \
562-
and not is_extension_array_dtype(dtype):
562+
and not is_extension_array_dtype(dtype):
563563
if not isna(data).any():
564564
subarr = _try_cast(data, True, dtype, copy,
565565
raise_cast_failure)

pandas/io/parsers.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -1669,8 +1669,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
16691669
try_num_bool)
16701670

16711671
# type specified in dtype param
1672-
if cast_type and not is_dtype_equal(cvals, cast_type):
1673-
# or is_extension_array_dtype(cast_type)):
1672+
if cast_type and (not is_dtype_equal(cvals, cast_type)
1673+
or is_extension_array_dtype(cast_type)):
16741674
try:
16751675
if (is_bool_dtype(cast_type) and
16761676
not is_categorical_dtype(cast_type)

pandas/tests/extension/base/io.py

+10 -3
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import pytest
22
import pandas as pd
33
import numpy as np
4+
import pandas.util.testing as tm
45
from pandas.compat import StringIO
56
from pandas.core.arrays.integer import (
67
Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype,
@@ -26,15 +27,21 @@ def data(dtype):
2627

2728

2829
class ExtensionParsingTests(BaseExtensionTests):
29-
def test_EA_types(self):
30+
31+
@pytest.mark.parametrize('engine', ['c', 'python'])
32+
def test_EA_types(self, engine):
3033
df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'),
3134
'A': [1, 2, 1]})
3235
data = df.to_csv(index=False)
33-
result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype})
36+
result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype},
37+
engine=engine)
3438
assert result is not None
39+
tm.assert_frame_equal(df, result)
3540

3641
df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'),
3742
'A': [1, 2, 1]})
3843
data = df.to_csv(index=False)
39-
result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'})
44+
result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'},
45+
engine=engine)
4046
assert result is not None
47+
tm.assert_frame_equal(df, result)

0 commit comments

Comments
 (0)