Skip to content

Commit 7acd629

Browse files
authored
BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (#59316)
* BUG: Avoid RangeIndex conversion in read_csv if dtype is specified
* Undo change
* Typing
1 parent 12c8ec4 commit 7acd629

File tree

4 files changed

+46
-17
lines changed

4 files changed

+46
-17
lines changed

pandas/io/parsers/base_parser.py

+27-12
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from copy import copy
55
import csv
66
from enum import Enum
7+
import itertools
78
from typing import (
89
TYPE_CHECKING,
910
Any,
@@ -271,7 +272,7 @@ def _maybe_make_multi_index_columns(
271272

272273
@final
273274
def _make_index(
274-
self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
275+
self, alldata, columns, indexnamerow: list[Scalar] | None = None
275276
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
276277
index: Index | None
277278
if isinstance(self.index_col, list) and len(self.index_col):
@@ -326,7 +327,11 @@ def _agg_index(self, index) -> Index:
326327
converters = self._clean_mapping(self.converters)
327328
clean_dtypes = self._clean_mapping(self.dtype)
328329

329-
for i, arr in enumerate(index):
330+
if self.index_names is not None:
331+
names: Iterable = self.index_names
332+
else:
333+
names = itertools.cycle([None])
334+
for i, (arr, name) in enumerate(zip(index, names)):
330335
if self._should_parse_dates(i):
331336
arr = date_converter(
332337
arr,
@@ -369,12 +374,17 @@ def _agg_index(self, index) -> Index:
369374
arr, _ = self._infer_types(
370375
arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
371376
)
372-
arrays.append(arr)
373-
374-
names = self.index_names
375-
index = ensure_index_from_sequences(arrays, names)
377+
if cast_type is not None:
378+
# Don't perform RangeIndex inference
379+
idx = Index(arr, name=name, dtype=cast_type)
380+
else:
381+
idx = ensure_index_from_sequences([arr], [name])
382+
arrays.append(idx)
376383

377-
return index
384+
if len(arrays) == 1:
385+
return arrays[0]
386+
else:
387+
return MultiIndex.from_arrays(arrays)
378388

379389
@final
380390
def _set_noconvert_dtype_columns(
@@ -704,12 +714,11 @@ def _get_empty_meta(
704714
dtype_dict: defaultdict[Hashable, Any]
705715
if not is_dict_like(dtype):
706716
# if dtype == None, default will be object.
707-
default_dtype = dtype or object
708-
dtype_dict = defaultdict(lambda: default_dtype)
717+
dtype_dict = defaultdict(lambda: dtype)
709718
else:
710719
dtype = cast(dict, dtype)
711720
dtype_dict = defaultdict(
712-
lambda: object,
721+
lambda: None,
713722
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
714723
)
715724

@@ -726,8 +735,14 @@ def _get_empty_meta(
726735
if (index_col is None or index_col is False) or index_names is None:
727736
index = default_index(0)
728737
else:
729-
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
730-
index = ensure_index_from_sequences(data, names=index_names)
738+
# TODO: We could return default_index(0) if dtype_dict[name] is None
739+
data = [
740+
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
741+
]
742+
if len(data) == 1:
743+
index = data[0]
744+
else:
745+
index = MultiIndex.from_arrays(data)
731746
index_col.sort()
732747

733748
for i, n in enumerate(index_col):

pandas/io/parsers/c_parser_wrapper.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -338,7 +338,7 @@ def read(
338338
data = {k: v for k, (i, v) in zip(names, data_tups)}
339339

340340
date_data = self._do_date_conversions(names, data)
341-
index, column_names = self._make_index(date_data, alldata, names)
341+
index, column_names = self._make_index(alldata, names)
342342

343343
return index, column_names, date_data
344344

pandas/io/parsers/python_parser.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -312,9 +312,7 @@ def read(
312312
conv_data = self._convert_data(data)
313313
conv_data = self._do_date_conversions(columns, conv_data)
314314

315-
index, result_columns = self._make_index(
316-
conv_data, alldata, columns, indexnamerow
317-
)
315+
index, result_columns = self._make_index(alldata, columns, indexnamerow)
318316

319317
return index, result_columns, conv_data
320318

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+17-1
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,8 @@
2929
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
3030
)
3131

32+
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
33+
3234

3335
@pytest.mark.parametrize("dtype", [str, object])
3436
@pytest.mark.parametrize("check_orig", [True, False])
@@ -614,6 +616,7 @@ def test_string_inference_object_dtype(all_parsers, dtype):
614616
tm.assert_frame_equal(result, expected)
615617

616618

619+
@xfail_pyarrow
617620
def test_accurate_parsing_of_large_integers(all_parsers):
618621
# GH#52505
619622
data = """SYMBOL,MOMENT,ID,ID_DEAL
@@ -624,7 +627,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
624627
AMZN,20230301181139587,2023552585717889759,2023552585717263360
625628
MSFT,20230301181139587,2023552585717889863,2023552585717263361
626629
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
627-
orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
630+
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
628631
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
629632
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
630633
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
@@ -646,3 +649,16 @@ def test_dtypes_with_usecols(all_parsers):
646649
values = ["1", "4"]
647650
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
648651
tm.assert_frame_equal(result, expected)
652+
653+
654+
def test_index_col_with_dtype_no_rangeindex(all_parsers):
655+
data = StringIO("345.5,519.5,0\n519.5,726.5,1")
656+
result = all_parsers.read_csv(
657+
data,
658+
header=None,
659+
names=["start", "stop", "bin_id"],
660+
dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
661+
index_col="bin_id",
662+
).index
663+
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
664+
tm.assert_index_equal(result, expected)

0 commit comments

Comments
 (0)