diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 5b725eb4d2a98..129104b56dd4e 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -500,6 +500,7 @@ Other API changes
   new DataFrame (shallow copy) instead of the original DataFrame, consistent with other
   methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`)
 - Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`)
+- Loading a JSON file with duplicate columns using ``read_json(orient='split')`` renames columns to avoid duplicates, as :func:`read_csv` and the other readers do (:issue:`50370`)
 - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`)
 -
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6deaf40f00c69..13185603c7bac 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -6,6 +6,7 @@
     abstractmethod,
 )
 import codecs
+from collections import defaultdict
 import dataclasses
 import functools
 import gzip
@@ -26,7 +27,9 @@
     IO,
     Any,
     AnyStr,
+    DefaultDict,
     Generic,
+    Hashable,
     Literal,
     Mapping,
     Sequence,
@@ -67,6 +70,7 @@
     is_list_like,
 )
 
+from pandas.core.indexes.api import MultiIndex
 from pandas.core.shared_docs import _shared_docs
 
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
@@ -1181,3 +1185,84 @@ def _get_binary_io_classes() -> tuple[type, ...]:
         binary_classes += (type(reader),)
 
     return binary_classes
+
+
+def is_potential_multi_index(
+    columns: Sequence[Hashable] | MultiIndex,
+    index_col: bool | Sequence[int] | None = None,
+) -> bool:
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+    index_col : None, bool or list, optional
+        Column or columns to use as the (possibly hierarchical) index
+
+    Returns
+    -------
+    bool : Whether or not columns could become a MultiIndex
+    """
+    if index_col is None or isinstance(index_col, bool):
+        index_col = []
+
+    if not len(columns) or isinstance(columns, MultiIndex):
+        return False
+
+    # Build the exclusion list once instead of once per column label.
+    index_cols = list(index_col)
+    return all(isinstance(c, tuple) for c in columns if c not in index_cols)
+
+
+def dedup_names(
+    names: Sequence[Hashable], is_potential_multiindex: bool
+) -> Sequence[Hashable]:
+    """
+    Rename column names if duplicates exist.
+
+    Currently the renaming is done by appending a period and an autonumeric,
+    but a custom pattern may be supported in the future.
+
+    Parameters
+    ----------
+    names : sequence of hashable
+        Column names, possibly containing duplicates.
+    is_potential_multiindex : bool
+        Whether the names are tuples; if so, only the last element of a
+        duplicated tuple is renamed.
+
+    Returns
+    -------
+    list
+        The names in their original order, with duplicates renamed.
+
+    Examples
+    --------
+    >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
+    ['x', 'y', 'x.1', 'x.2']
+    """
+    names = list(names)  # so we can index
+    counts: DefaultDict[Hashable, int] = defaultdict(int)
+
+    for i, col in enumerate(names):
+        cur_count = counts[col]
+
+        while cur_count > 0:
+            counts[col] = cur_count + 1
+
+            if is_potential_multiindex:
+                # for mypy
+                assert isinstance(col, tuple)
+                col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+            else:
+                col = f"{col}.{cur_count}"
+            # The new label may itself already be in use; re-check it.
+            cur_count = counts[col]
+
+        names[i] = col
+        counts[col] = cur_count + 1
+
+    return names
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 88974f3ab4afa..c501cad721ef5 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -57,10 +57,12 @@
 
 from pandas.io.common import (
     IOHandles,
+    dedup_names,
     extension_to_compression,
     file_exists,
     get_handle,
     is_fsspec_url,
+    is_potential_multi_index,
     is_url,
     stringify_path,
 )
@@ -1246,6 +1248,14 @@ def _parse(self) -> None:
                 for k, v in loads(json, precise_float=self.precise_float).items()
             }
             self.check_keys_split(decoded)
+            orig_names = [
+                (tuple(col) if isinstance(col, list) else col)
+                for col in decoded["columns"]
+            ]
+            decoded["columns"] = dedup_names(
+                orig_names,
+                is_potential_multi_index(orig_names, None),
+            )
             self.obj = DataFrame(dtype=None, **decoded)
         elif orient == "index":
             self.obj = DataFrame.from_dict(
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index f38666ac52529..742a988526cd0 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -10,7 +10,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    DefaultDict,
     Hashable,
     Iterable,
     List,
@@ -89,6 +88,8 @@
 from pandas.core.series import Series
 from pandas.core.tools import datetimes as tools
 
+from pandas.io.common import is_potential_multi_index
+
 if TYPE_CHECKING:
     from pandas import DataFrame
 
@@ -333,31 +334,6 @@ def extract(r):
 
         return names, index_names, col_names, passed_names
 
-    @final
-    def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
-        names = list(names)  # so we can index
-        counts: DefaultDict[Hashable, int] = defaultdict(int)
-        is_potential_mi = _is_potential_multi_index(names, self.index_col)
-
-        for i, col in enumerate(names):
-            cur_count = counts[col]
-
-            while cur_count > 0:
-                counts[col] = cur_count + 1
-
-                if is_potential_mi:
-                    # for mypy
-                    assert isinstance(col, tuple)
-                    col = col[:-1] + (f"{col[-1]}.{cur_count}",)
-                else:
-                    col = f"{col}.{cur_count}"
-                cur_count = counts[col]
-
-            names[i] = col
-            counts[col] = cur_count + 1
-
-        return names
-
     @final
     def _maybe_make_multi_index_columns(
         self,
@@ -365,7 +341,7 @@ def _maybe_make_multi_index_columns(
         col_names: Sequence[Hashable] | None = None,
     ) -> Sequence[Hashable] | MultiIndex:
         # possibly create a column mi here
-        if _is_potential_multi_index(columns):
+        if is_potential_multi_index(columns):
             list_columns = cast(List[Tuple], columns)
             return MultiIndex.from_tuples(list_columns, names=col_names)
         return columns
@@ -1326,35 +1302,6 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
         return na_values, na_fvalues
 
 
-def _is_potential_multi_index(
-    columns: Sequence[Hashable] | MultiIndex,
-    index_col: bool | Sequence[int] | None = None,
-) -> bool:
-    """
-    Check whether or not the `columns` parameter
-    could be converted into a MultiIndex.
-
-    Parameters
-    ----------
-    columns : array-like
-        Object which may or may not be convertible into a MultiIndex
-    index_col : None, bool or list, optional
-        Column or columns to use as the (possibly hierarchical) index
-
-    Returns
-    -------
-    bool : Whether or not columns could become a MultiIndex
-    """
-    if index_col is None or isinstance(index_col, bool):
-        index_col = []
-
-    return bool(
-        len(columns)
-        and not isinstance(columns, MultiIndex)
-        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
-    )
-
-
 def _validate_parse_dates_arg(parse_dates):
     """
     Check whether or not the 'parse_dates' parameter
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index e0daf157d3d3a..551518b623836 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -30,6 +30,10 @@
 
 from pandas.core.indexes.api import ensure_index_from_sequences
 
+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     ParserError,
@@ -227,7 +231,10 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
-                names = self._dedup_names(self.orig_names)
+                names = dedup_names(
+                    self.orig_names,
+                    is_potential_multi_index(self.orig_names, self.index_col),
+                )
                 index, columns, col_dict = self._get_empty_meta(
                     names,
                     self.index_col,
@@ -281,7 +288,7 @@ def read(
             if self.usecols is not None:
                 names = self._filter_usecols(names)
 
-            names = self._dedup_names(names)
+            names = dedup_names(names, is_potential_multi_index(names, self.index_col))
 
             # rename dict keys
             data_tups = sorted(data.items())
@@ -303,7 +310,7 @@ def read(
             # assert for mypy, orig_names is List or None, None would error in list(...)
             assert self.orig_names is not None
             names = list(self.orig_names)
-            names = self._dedup_names(names)
+            names = dedup_names(names, is_potential_multi_index(names, self.index_col))
 
             if self.usecols is not None:
                 names = self._filter_usecols(names)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index aebf285e669bb..ec19cbf3c7cfd 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -37,6 +37,10 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like
 
+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
@@ -259,7 +263,14 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._dedup_names(self.orig_names)
+            # error: Cannot determine type of 'index_col'
+            names = dedup_names(
+                self.orig_names,
+                is_potential_multi_index(
+                    self.orig_names,
+                    self.index_col,  # type: ignore[has-type]
+                ),
+            )
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,
@@ -293,7 +304,14 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._dedup_names(self.orig_names)
+        # error: Cannot determine type of 'index_col'
+        names = dedup_names(
+            self.orig_names,
+            is_potential_multi_index(
+                self.orig_names,
+                self.index_col,  # type: ignore[has-type]
+            ),
+        )
 
         offset = 0
         if self._implicit_index:
@@ -434,6 +452,7 @@ def _infer_columns(
                         if i not in this_unnamed_cols
                     ] + this_unnamed_cols
 
+                    # TODO: Use pandas.io.common.dedup_names instead (see #50371)
                     for i in col_loop_order:
                         col = this_columns[i]
                         old_col = col
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index aff09a62b0df3..f37c7dcec0f0a 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -117,6 +117,7 @@ def test_frame_non_unique_columns(self, orient, data):
                 expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
         elif orient == "split":
             expected = df
+            expected.columns = ["x", "x.1"]
 
         tm.assert_frame_equal(result, expected)
 
@@ -257,6 +258,28 @@ def test_roundtrip_mixed(self, orient, convert_axes):
 
         assert_json_roundtrip_equal(result, expected, orient)
 
+    @pytest.mark.xfail(
+        reason="#50456 Column multiindex is stored and loaded differently",
+        raises=AssertionError,
+    )
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            [["2022", "2022"], ["JAN", "FEB"]],
+            [["2022", "2023"], ["JAN", "JAN"]],
+            [["2022", "2022"], ["JAN", "JAN"]],
+        ],
+    )
+    def test_roundtrip_multiindex(self, columns):
+        df = DataFrame(
+            [[1, 2], [3, 4]],
+            columns=pd.MultiIndex.from_arrays(columns),
+        )
+
+        result = read_json(df.to_json(orient="split"), orient="split")
+
+        tm.assert_frame_equal(result, df)
+
     @pytest.mark.parametrize(
         "data,msg,orient",
         [