Skip to content

Commit 7f43278

Browse files
authored
PERF: Improve performance in read_csv with numeric index col (#44610)
1 parent fc6b441 commit 7f43278

File tree

3 files changed

+14
-3
lines changed

3 files changed

+14
-3
lines changed

asv_bench/benchmarks/io/csv.py

+10
Original file line numberDiff line numberDiff line change
@@ -525,4 +525,14 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates):
525525
to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y")
526526

527527

528+
class ReadCSVIndexCol(StringIORewind):
529+
def setup(self):
530+
count_elem = 100_000
531+
data = "a,b\n" + "1,2\n" * count_elem
532+
self.StringIO_input = StringIO(data)
533+
534+
def time_read_csv_index_col(self):
535+
read_csv(self.StringIO_input, index_col="a")
536+
537+
528538
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,7 @@ Performance improvements
521521
- Performance improvement in :meth:`Series.to_frame` (:issue:`43558`)
522522
- Performance improvement in :meth:`Series.mad` (:issue:`43010`)
523523
- Performance improvement in :func:`merge` (:issue:`43332`)
524+
- Performance improvement in :func:`read_csv` when ``index_col`` refers to a numeric column (:issue:`44158`)
524525
- Performance improvement in :func:`concat` (:issue:`43354`)
525526
-
526527

pandas/io/parsers/base_parser.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -702,9 +702,9 @@ def _infer_types(self, values, na_values, try_num_bool=True):
702702
"""
703703
na_count = 0
704704
if issubclass(values.dtype.type, (np.number, np.bool_)):
705-
# error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
706-
# "Union[Union[ExtensionArray, ndarray], Index, Series]"
707-
mask = algorithms.isin(values, list(na_values)) # type: ignore[arg-type]
705+
# If our array has numeric dtype, we don't have to check for strings in isin
706+
na_values = np.array([val for val in na_values if not isinstance(val, str)])
707+
mask = algorithms.isin(values, na_values)
708708
na_count = mask.astype("uint8", copy=False).sum()
709709
if na_count > 0:
710710
if is_integer_dtype(values):

0 commit comments

Comments
 (0)