
Commit 5115f09

Rename duplicate column names in read_json(orient='split') (#50370)
* Rename duplicate column names in read_json(orient='split')
* Add issue number to TODO
* Finish renaming of _dedup_names and check keys before column renaming
* Black
* Support multiindex and fix linters
* Restoring commented code
* isort
* Fix doctest
* Being more specific with xfail
* Update whatsnew issue number
1 parent ef0eaa4 commit 5115f09

7 files changed (+138 −61 lines)

doc/source/whatsnew/v2.0.0.rst (+1)

@@ -543,6 +543,7 @@ Other API changes
   new DataFrame (shallow copy) instead of the original DataFrame, consistent with other
   methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`)
 - Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`)
+- Loading a JSON file with duplicate columns using ``read_json(orient='split')`` renames columns to avoid duplicates, as :func:`read_csv` and the other readers do (:issue:`50370`)
 - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`)
 -
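
For reference, the behavior this whatsnew entry describes can be exercised roughly as follows (a minimal sketch; the inline payload and the StringIO wrapping are illustrative, not part of this commit):

    from io import StringIO

    import pandas as pd

    # A split-format payload with a duplicated column name "x".
    payload = '{"columns":["x","x"],"index":[0,1],"data":[[1,2],[3,4]]}'

    df = pd.read_json(StringIO(payload), orient="split")
    print(list(df.columns))  # ['x', 'x.1'] after this change, matching read_csv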

pandas/io/common.py (+70)

@@ -6,6 +6,7 @@
     abstractmethod,
 )
 import codecs
+from collections import defaultdict
 import dataclasses
 import functools
 import gzip
@@ -26,7 +27,9 @@
     IO,
     Any,
     AnyStr,
+    DefaultDict,
     Generic,
+    Hashable,
     Literal,
     Mapping,
     Sequence,
@@ -67,6 +70,7 @@
     is_list_like,
 )

+from pandas.core.indexes.api import MultiIndex
 from pandas.core.shared_docs import _shared_docs

 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
@@ -1181,3 +1185,69 @@ def _get_binary_io_classes() -> tuple[type, ...]:
             binary_classes += (type(reader),)

     return binary_classes
+
+
+def is_potential_multi_index(
+    columns: Sequence[Hashable] | MultiIndex,
+    index_col: bool | Sequence[int] | None = None,
+) -> bool:
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+    index_col : None, bool or list, optional
+        Column or columns to use as the (possibly hierarchical) index
+
+    Returns
+    -------
+    bool : Whether or not columns could become a MultiIndex
+    """
+    if index_col is None or isinstance(index_col, bool):
+        index_col = []
+
+    return bool(
+        len(columns)
+        and not isinstance(columns, MultiIndex)
+        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
+    )
+
+
+def dedup_names(
+    names: Sequence[Hashable], is_potential_multiindex: bool
+) -> Sequence[Hashable]:
+    """
+    Rename column names if duplicates exist.
+
+    Currently the renaming is done by appending a period and an autonumeric,
+    but a custom pattern may be supported in the future.
+
+    Examples
+    --------
+    >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
+    ['x', 'y', 'x.1', 'x.2']
+    """
+    names = list(names)  # so we can index
+    counts: DefaultDict[Hashable, int] = defaultdict(int)
+
+    for i, col in enumerate(names):
+        cur_count = counts[col]
+
+        while cur_count > 0:
+            counts[col] = cur_count + 1
+
+            if is_potential_multiindex:
+                # for mypy
+                assert isinstance(col, tuple)
+                col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+            else:
+                col = f"{col}.{cur_count}"
+            cur_count = counts[col]
+
+        names[i] = col
+        counts[col] = cur_count + 1
+
+    return names
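
A quick illustration of the flat-name path through the helpers added above (a hedged sketch; pandas.io.common is an internal module, so this import path is not public API):

    from pandas.io.common import dedup_names, is_potential_multi_index

    flat = ["x", "y", "x", "x"]
    # Plain string names cannot form a MultiIndex, so duplicates just get ".<n>" suffixes.
    print(is_potential_multi_index(flat, None))              # False
    print(dedup_names(flat, is_potential_multiindex=False))  # ['x', 'y', 'x.1', 'x.2']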

pandas/io/json/_json.py (+10)

@@ -57,10 +57,12 @@

 from pandas.io.common import (
     IOHandles,
+    dedup_names,
     extension_to_compression,
     file_exists,
     get_handle,
     is_fsspec_url,
+    is_potential_multi_index,
     is_url,
     stringify_path,
 )
@@ -1246,6 +1248,14 @@ def _parse(self) -> None:
                 for k, v in loads(json, precise_float=self.precise_float).items()
             }
             self.check_keys_split(decoded)
+            orig_names = [
+                (tuple(col) if isinstance(col, list) else col)
+                for col in decoded["columns"]
+            ]
+            decoded["columns"] = dedup_names(
+                orig_names,
+                is_potential_multi_index(orig_names, None),
+            )
             self.obj = DataFrame(dtype=None, **decoded)
         elif orient == "index":
             self.obj = DataFrame.from_dict(
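
The list-to-tuple conversion above matters because a column MultiIndex comes back from json.loads as lists of lists; converting each entry to a tuple lets is_potential_multi_index recognize it, so dedup_names suffixes only the last level. A sketch of that step in isolation (assuming a decoded payload of the shape shown, not the reader's full code path):

    from pandas.io.common import dedup_names, is_potential_multi_index

    decoded_columns = [["2022", "JAN"], ["2022", "JAN"]]  # as decoded from orient="split"

    orig_names = [tuple(col) if isinstance(col, list) else col for col in decoded_columns]
    print(is_potential_multi_index(orig_names, None))  # True
    print(dedup_names(orig_names, is_potential_multi_index(orig_names, None)))
    # [('2022', 'JAN'), ('2022', 'JAN.1')]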

pandas/io/parsers/base_parser.py (+3 −56)

@@ -10,7 +10,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    DefaultDict,
     Hashable,
     Iterable,
     List,
@@ -89,6 +88,8 @@
 from pandas.core.series import Series
 from pandas.core.tools import datetimes as tools

+from pandas.io.common import is_potential_multi_index
+
 if TYPE_CHECKING:
     from pandas import DataFrame

@@ -333,39 +334,14 @@ def extract(r):

         return names, index_names, col_names, passed_names

-    @final
-    def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
-        names = list(names)  # so we can index
-        counts: DefaultDict[Hashable, int] = defaultdict(int)
-        is_potential_mi = _is_potential_multi_index(names, self.index_col)
-
-        for i, col in enumerate(names):
-            cur_count = counts[col]
-
-            while cur_count > 0:
-                counts[col] = cur_count + 1
-
-                if is_potential_mi:
-                    # for mypy
-                    assert isinstance(col, tuple)
-                    col = col[:-1] + (f"{col[-1]}.{cur_count}",)
-                else:
-                    col = f"{col}.{cur_count}"
-                cur_count = counts[col]
-
-            names[i] = col
-            counts[col] = cur_count + 1
-
-        return names
-
     @final
     def _maybe_make_multi_index_columns(
         self,
         columns: Sequence[Hashable],
         col_names: Sequence[Hashable] | None = None,
     ) -> Sequence[Hashable] | MultiIndex:
         # possibly create a column mi here
-        if _is_potential_multi_index(columns):
+        if is_potential_multi_index(columns):
             list_columns = cast(List[Tuple], columns)
             return MultiIndex.from_tuples(list_columns, names=col_names)
         return columns
@@ -1326,35 +1302,6 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
     return na_values, na_fvalues


-def _is_potential_multi_index(
-    columns: Sequence[Hashable] | MultiIndex,
-    index_col: bool | Sequence[int] | None = None,
-) -> bool:
-    """
-    Check whether or not the `columns` parameter
-    could be converted into a MultiIndex.
-
-    Parameters
-    ----------
-    columns : array-like
-        Object which may or may not be convertible into a MultiIndex
-    index_col : None, bool or list, optional
-        Column or columns to use as the (possibly hierarchical) index
-
-    Returns
-    -------
-    bool : Whether or not columns could become a MultiIndex
-    """
-    if index_col is None or isinstance(index_col, bool):
-        index_col = []
-
-    return bool(
-        len(columns)
-        and not isinstance(columns, MultiIndex)
-        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
-    )
-
-
 def _validate_parse_dates_arg(parse_dates):
     """
     Check whether or not the 'parse_dates' parameter

pandas/io/parsers/c_parser_wrapper.py (+10 −3)

@@ -30,6 +30,10 @@

 from pandas.core.indexes.api import ensure_index_from_sequences

+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     ParserError,
@@ -227,7 +231,10 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
-                names = self._dedup_names(self.orig_names)
+                names = dedup_names(
+                    self.orig_names,
+                    is_potential_multi_index(self.orig_names, self.index_col),
+                )
                 index, columns, col_dict = self._get_empty_meta(
                     names,
                     self.index_col,
@@ -281,7 +288,7 @@ def read(
             if self.usecols is not None:
                 names = self._filter_usecols(names)

-            names = self._dedup_names(names)
+            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

             # rename dict keys
             data_tups = sorted(data.items())
@@ -303,7 +310,7 @@ def read(
             # assert for mypy, orig_names is List or None, None would error in list(...)
             assert self.orig_names is not None
             names = list(self.orig_names)
-            names = self._dedup_names(names)
+            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

             if self.usecols is not None:
                 names = self._filter_usecols(names)

pandas/io/parsers/python_parser.py (+21 −2)

@@ -37,6 +37,10 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like

+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
@@ -259,7 +263,14 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._dedup_names(self.orig_names)
+            # error: Cannot determine type of 'index_col'
+            names = dedup_names(
+                self.orig_names,
+                is_potential_multi_index(
+                    self.orig_names,
+                    self.index_col,  # type: ignore[has-type]
+                ),
+            )
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,
@@ -293,7 +304,14 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._dedup_names(self.orig_names)
+        # error: Cannot determine type of 'index_col'
+        names = dedup_names(
+            self.orig_names,
+            is_potential_multi_index(
+                self.orig_names,
+                self.index_col,  # type: ignore[has-type]
+            ),
+        )

         offset = 0
         if self._implicit_index:
@@ -434,6 +452,7 @@ def _infer_columns(
                         if i not in this_unnamed_cols
                     ] + this_unnamed_cols

+                    # TODO: Use pandas.io.common.dedup_names instead (see #50371)
                     for i in col_loop_order:
                         col = this_columns[i]
                         old_col = col

pandas/tests/io/json/test_pandas.py (+23)

@@ -118,6 +118,7 @@ def test_frame_non_unique_columns(self, orient, data):
             expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
         elif orient == "split":
             expected = df
+            expected.columns = ["x", "x.1"]

         tm.assert_frame_equal(result, expected)

@@ -258,6 +259,28 @@ def test_roundtrip_mixed(self, orient, convert_axes):

         assert_json_roundtrip_equal(result, expected, orient)

+    @pytest.mark.xfail(
+        reason="#50456 Column multiindex is stored and loaded differently",
+        raises=AssertionError,
+    )
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            [["2022", "2022"], ["JAN", "FEB"]],
+            [["2022", "2023"], ["JAN", "JAN"]],
+            [["2022", "2022"], ["JAN", "JAN"]],
+        ],
+    )
+    def test_roundtrip_multiindex(self, columns):
+        df = DataFrame(
+            [[1, 2], [3, 4]],
+            columns=pd.MultiIndex.from_arrays(columns),
+        )
+
+        result = read_json(df.to_json(orient="split"), orient="split")
+
+        tm.assert_frame_equal(result, df)
+
     @pytest.mark.parametrize(
         "data,msg,orient",
         [
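
The xfail above records a pre-existing gap (#50456): a column MultiIndex written with orient='split' is not yet read back identically. A standalone sketch of the roundtrip the test exercises (the StringIO wrapping is added here for newer pandas versions; the test itself passes the JSON string directly):

    from io import StringIO

    import pandas as pd
    import pandas._testing as tm

    df = pd.DataFrame(
        [[1, 2], [3, 4]],
        columns=pd.MultiIndex.from_arrays([["2022", "2022"], ["JAN", "FEB"]]),
    )
    result = pd.read_json(StringIO(df.to_json(orient="split")), orient="split")

    try:
        tm.assert_frame_equal(result, df)
    except AssertionError as err:
        # Expected until #50456 is resolved: the MultiIndex is stored and loaded differently.
        print(f"roundtrip mismatch: {err}")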
