Skip to content

Commit 6fcd114

Browse files
committed
Merge remote-tracking branch 'upstream/main' into stata-read-dta110
2 parents 21e00d9 + b18a142 commit 6fcd114

File tree

11 files changed

+60
-49
lines changed

11 files changed

+60
-49
lines changed

asv_bench/asv.conf.json

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
// pip (with all the conda available packages installed first,
4242
// followed by the pip installed packages).
4343
"matrix": {
44+
"pip+build": [],
4445
"Cython": ["3.0"],
4546
"matplotlib": [],
4647
"sqlalchemy": [],

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ Bug fixes
350350
- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
351351
- Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
352352
- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
353+
- Fixed bug in :func:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
353354

354355
Categorical
355356
^^^^^^^^^^^

pandas/_libs/tslib.pyx

-34
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ from pandas._libs.tslibs.conversion cimport (
7070
from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev
7171
from pandas._libs.tslibs.nattype cimport (
7272
NPY_NAT,
73-
c_NaT as NaT,
7473
c_nat_strings as nat_strings,
7574
)
7675
from pandas._libs.tslibs.timestamps cimport _Timestamp
@@ -346,39 +345,6 @@ def array_with_unit_to_datetime(
346345
return result, tz
347346

348347

349-
cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str unit):
350-
cdef:
351-
Py_ssize_t i, n = len(values)
352-
ndarray[object] oresult
353-
tzinfo tz = None
354-
355-
# TODO: fix subtle differences between this and no-unit code
356-
oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
357-
for i in range(n):
358-
val = values[i]
359-
360-
if checknull_with_nat_and_na(val):
361-
oresult[i] = <object>NaT
362-
elif is_integer_object(val) or is_float_object(val):
363-
364-
if val != val or val == NPY_NAT:
365-
oresult[i] = <object>NaT
366-
else:
367-
try:
368-
oresult[i] = Timestamp(val, unit=unit)
369-
except OutOfBoundsDatetime:
370-
oresult[i] = val
371-
372-
elif isinstance(val, str):
373-
if len(val) == 0 or val in nat_strings:
374-
oresult[i] = <object>NaT
375-
376-
else:
377-
oresult[i] = val
378-
379-
return oresult, tz
380-
381-
382348
@cython.wraparound(False)
383349
@cython.boundscheck(False)
384350
def first_non_null(values: ndarray) -> int:

pandas/core/apply.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1710,9 +1710,9 @@ def normalize_keyword_aggregation(
17101710
# TODO: aggspec type: typing.Dict[str, List[AggScalar]]
17111711
aggspec = defaultdict(list)
17121712
order = []
1713-
columns, pairs = list(zip(*kwargs.items()))
1713+
columns = tuple(kwargs.keys())
17141714

1715-
for column, aggfunc in pairs:
1715+
for column, aggfunc in kwargs.values():
17161716
aggspec[column].append(aggfunc)
17171717
order.append((column, com.get_callable_name(aggfunc) or aggfunc))
17181718

pandas/core/frame.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -6168,12 +6168,13 @@ class max type
61686168
names = self.index._get_default_index_names(names, default)
61696169

61706170
if isinstance(self.index, MultiIndex):
6171-
to_insert = zip(self.index.levels, self.index.codes)
6171+
to_insert = zip(reversed(self.index.levels), reversed(self.index.codes))
61726172
else:
61736173
to_insert = ((self.index, None),)
61746174

61756175
multi_col = isinstance(self.columns, MultiIndex)
6176-
for i, (lev, lab) in reversed(list(enumerate(to_insert))):
6176+
for j, (lev, lab) in enumerate(to_insert, start=1):
6177+
i = self.index.nlevels - j
61776178
if level is not None and i not in level:
61786179
continue
61796180
name = names[i]

pandas/core/groupby/ops.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ def groups(self) -> dict[Hashable, Index]:
706706
return self.groupings[0].groups
707707
result_index, ids = self.result_index_and_ids
708708
values = result_index._values
709-
categories = Categorical(ids, categories=np.arange(len(result_index)))
709+
categories = Categorical(ids, categories=range(len(result_index)))
710710
result = {
711711
# mypy is not aware that group has to be an integer
712712
values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload]

pandas/core/indexing.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,7 @@ def __setitem__(self, key, value) -> None:
899899

900900
check_dict_or_set_indexers(key)
901901
if isinstance(key, tuple):
902-
key = tuple(list(x) if is_iterator(x) else x for x in key)
902+
key = (list(x) if is_iterator(x) else x for x in key)
903903
key = tuple(com.apply_if_callable(x, self.obj) for x in key)
904904
else:
905905
maybe_callable = com.apply_if_callable(key, self.obj)
@@ -1177,7 +1177,7 @@ def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T:
11771177
def __getitem__(self, key):
11781178
check_dict_or_set_indexers(key)
11791179
if type(key) is tuple:
1180-
key = tuple(list(x) if is_iterator(x) else x for x in key)
1180+
key = (list(x) if is_iterator(x) else x for x in key)
11811181
key = tuple(com.apply_if_callable(x, self.obj) for x in key)
11821182
if self._is_scalar_access(key):
11831183
return self.obj._get_value(*key, takeable=self._takeable)

pandas/core/sorting.py

-2
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,6 @@ def maybe_lift(lab, size: int) -> tuple[np.ndarray, int]:
172172
for i, (lab, size) in enumerate(zip(labels, shape)):
173173
labels[i], lshape[i] = maybe_lift(lab, size)
174174

175-
labels = list(labels)
176-
177175
# Iteratively process all the labels in chunks sized so less
178176
# than lib.i8max unique int ids will be required for each chunk
179177
while True:

pandas/io/parsers/base_parser.py

+2
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,8 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
487487
col_na_values, col_na_fvalues = _get_na_values(
488488
col_name, self.na_values, self.na_fvalues, self.keep_default_na
489489
)
490+
else:
491+
col_na_values, col_na_fvalues = set(), set()
490492

491493
clean_dtypes = self._clean_mapping(self.dtype)
492494

pandas/io/parsers/python_parser.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -354,14 +354,15 @@ def _convert_data(
354354

355355
if isinstance(self.na_values, dict):
356356
for col in self.na_values:
357-
na_value = self.na_values[col]
358-
na_fvalue = self.na_fvalues[col]
357+
if col is not None:
358+
na_value = self.na_values[col]
359+
na_fvalue = self.na_fvalues[col]
359360

360-
if isinstance(col, int) and col not in self.orig_names:
361-
col = self.orig_names[col]
361+
if isinstance(col, int) and col not in self.orig_names:
362+
col = self.orig_names[col]
362363

363-
clean_na_values[col] = na_value
364-
clean_na_fvalues[col] = na_fvalue
364+
clean_na_values[col] = na_value
365+
clean_na_fvalues[col] = na_fvalue
365366
else:
366367
clean_na_values = self.na_values
367368
clean_na_fvalues = self.na_fvalues

pandas/tests/io/parser/test_na_values.py

+41
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,47 @@ def test_na_values_dict_aliasing(all_parsers):
532532
tm.assert_dict_equal(na_values, na_values_copy)
533533

534534

535+
def test_na_values_dict_null_column_name(all_parsers):
536+
# see gh-57547
537+
parser = all_parsers
538+
data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3"
539+
names = [None, "x", "y"]
540+
na_values = {name: STR_NA_VALUES for name in names}
541+
dtype = {None: "object", "x": "float64", "y": "float64"}
542+
543+
if parser.engine == "pyarrow":
544+
msg = "The pyarrow engine doesn't support passing a dict for na_values"
545+
with pytest.raises(ValueError, match=msg):
546+
parser.read_csv(
547+
StringIO(data),
548+
index_col=0,
549+
header=0,
550+
dtype=dtype,
551+
names=names,
552+
na_values=na_values,
553+
keep_default_na=False,
554+
)
555+
return
556+
557+
expected = DataFrame(
558+
{None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}
559+
)
560+
561+
expected = expected.set_index(None)
562+
563+
result = parser.read_csv(
564+
StringIO(data),
565+
index_col=0,
566+
header=0,
567+
dtype=dtype,
568+
names=names,
569+
na_values=na_values,
570+
keep_default_na=False,
571+
)
572+
573+
tm.assert_frame_equal(result, expected)
574+
575+
535576
def test_na_values_dict_col_index(all_parsers):
536577
# see gh-14203
537578
data = "a\nfoo\n1"

0 commit comments

Comments
 (0)