
Commit 5115f09

Rename duplicate column names in read_json(orient='split') (#50370)
* Rename duplicate column names in read_json(orient='split')
* Add issue number to TODO
* Finish renaming of _dedup_names and check keys before column renaming
* Black
* Support multiindex and fix linters
* Restoring commented code
* isort
* Fix doctest
* Being more specific with xfail
* Update whatsnew issue number
1 parent ef0eaa4 commit 5115f09

7 files changed (+138 −61 lines)

doc/source/whatsnew/v2.0.0.rst (+1)

@@ -543,6 +543,7 @@ Other API changes
   new DataFrame (shallow copy) instead of the original DataFrame, consistent with other
   methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`)
 - Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`)
+- Loading a JSON file with duplicate columns using ``read_json(orient='split')`` renames columns to avoid duplicates, as :func:`read_csv` and the other readers do (:issue:`50370`)
 - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`)
 -
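
For reference, the behavior this whatsnew entry describes can be exercised roughly as follows (a minimal sketch; the inline payload and the StringIO wrapping are illustrative, not part of this commit):

    from io import StringIO

    import pandas as pd

    # A split-format payload with a duplicated column name "x".
    payload = '{"columns":["x","x"],"index":[0,1],"data":[[1,2],[3,4]]}'

    df = pd.read_json(StringIO(payload), orient="split")
    print(list(df.columns))  # ['x', 'x.1'] after this change, matching read_csv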

pandas/io/common.py (+70)

@@ -6,6 +6,7 @@
     abstractmethod,
 )
 import codecs
+from collections import defaultdict
 import dataclasses
 import functools
 import gzip
@@ -26,7 +27,9 @@
     IO,
     Any,
     AnyStr,
+    DefaultDict,
     Generic,
+    Hashable,
     Literal,
     Mapping,
     Sequence,
@@ -67,6 +70,7 @@
     is_list_like,
 )

+from pandas.core.indexes.api import MultiIndex
 from pandas.core.shared_docs import _shared_docs

 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
@@ -1181,3 +1185,69 @@ def _get_binary_io_classes() -> tuple[type, ...]:
             binary_classes += (type(reader),)

     return binary_classes
+
+
+def is_potential_multi_index(
+    columns: Sequence[Hashable] | MultiIndex,
+    index_col: bool | Sequence[int] | None = None,
+) -> bool:
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+    index_col : None, bool or list, optional
+        Column or columns to use as the (possibly hierarchical) index
+
+    Returns
+    -------
+    bool : Whether or not columns could become a MultiIndex
+    """
+    if index_col is None or isinstance(index_col, bool):
+        index_col = []
+
+    return bool(
+        len(columns)
+        and not isinstance(columns, MultiIndex)
+        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
+    )
+
+
+def dedup_names(
+    names: Sequence[Hashable], is_potential_multiindex: bool
+) -> Sequence[Hashable]:
+    """
+    Rename column names if duplicates exist.
+
+    Currently the renaming is done by appending a period and an autonumeric,
+    but a custom pattern may be supported in the future.
+
+    Examples
+    --------
+    >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
+    ['x', 'y', 'x.1', 'x.2']
+    """
+    names = list(names)  # so we can index
+    counts: DefaultDict[Hashable, int] = defaultdict(int)
+
+    for i, col in enumerate(names):
+        cur_count = counts[col]
+
+        while cur_count > 0:
+            counts[col] = cur_count + 1
+
+            if is_potential_multiindex:
+                # for mypy
+                assert isinstance(col, tuple)
+                col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+            else:
+                col = f"{col}.{cur_count}"
+            cur_count = counts[col]
+
+        names[i] = col
+        counts[col] = cur_count + 1
+
+    return names
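
A quick illustration of the flat-name path through the helpers added above (a hedged sketch; pandas.io.common is an internal module, so this import path is not public API):

    from pandas.io.common import dedup_names, is_potential_multi_index

    flat = ["x", "y", "x", "x"]
    # Plain string names cannot form a MultiIndex, so duplicates just get ".<n>" suffixes.
    print(is_potential_multi_index(flat, None))              # False
    print(dedup_names(flat, is_potential_multiindex=False))  # ['x', 'y', 'x.1', 'x.2']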

pandas/io/json/_json.py (+10)

@@ -57,10 +57,12 @@

 from pandas.io.common import (
     IOHandles,
+    dedup_names,
     extension_to_compression,
     file_exists,
     get_handle,
     is_fsspec_url,
+    is_potential_multi_index,
     is_url,
     stringify_path,
 )
@@ -1246,6 +1248,14 @@ def _parse(self) -> None:
                 for k, v in loads(json, precise_float=self.precise_float).items()
             }
             self.check_keys_split(decoded)
+            orig_names = [
+                (tuple(col) if isinstance(col, list) else col)
+                for col in decoded["columns"]
+            ]
+            decoded["columns"] = dedup_names(
+                orig_names,
+                is_potential_multi_index(orig_names, None),
+            )
             self.obj = DataFrame(dtype=None, **decoded)
         elif orient == "index":
             self.obj = DataFrame.from_dict(
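
The list-to-tuple conversion above matters because a column MultiIndex comes back from json.loads as lists of lists; converting each entry to a tuple lets is_potential_multi_index recognize it, so dedup_names suffixes only the last level. A sketch of that step in isolation (assuming a decoded payload of the shape shown, not the reader's full code path):

    from pandas.io.common import dedup_names, is_potential_multi_index

    decoded_columns = [["2022", "JAN"], ["2022", "JAN"]]  # as decoded from orient="split"

    orig_names = [tuple(col) if isinstance(col, list) else col for col in decoded_columns]
    print(is_potential_multi_index(orig_names, None))  # True
    print(dedup_names(orig_names, is_potential_multi_index(orig_names, None)))
    # [('2022', 'JAN'), ('2022', 'JAN.1')]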

pandas/io/parsers/base_parser.py (+3 −56)

@@ -10,7 +10,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    DefaultDict,
     Hashable,
     Iterable,
     List,
@@ -89,6 +88,8 @@
 from pandas.core.series import Series
 from pandas.core.tools import datetimes as tools

+from pandas.io.common import is_potential_multi_index
+
 if TYPE_CHECKING:
     from pandas import DataFrame

@@ -333,39 +334,14 @@ def extract(r):

         return names, index_names, col_names, passed_names

-    @final
-    def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
-        names = list(names)  # so we can index
-        counts: DefaultDict[Hashable, int] = defaultdict(int)
-        is_potential_mi = _is_potential_multi_index(names, self.index_col)
-
-        for i, col in enumerate(names):
-            cur_count = counts[col]
-
-            while cur_count > 0:
-                counts[col] = cur_count + 1
-
-                if is_potential_mi:
-                    # for mypy
-                    assert isinstance(col, tuple)
-                    col = col[:-1] + (f"{col[-1]}.{cur_count}",)
-                else:
-                    col = f"{col}.{cur_count}"
-                cur_count = counts[col]
-
-            names[i] = col
-            counts[col] = cur_count + 1
-
-        return names
-
     @final
     def _maybe_make_multi_index_columns(
         self,
         columns: Sequence[Hashable],
         col_names: Sequence[Hashable] | None = None,
     ) -> Sequence[Hashable] | MultiIndex:
         # possibly create a column mi here
-        if _is_potential_multi_index(columns):
+        if is_potential_multi_index(columns):
             list_columns = cast(List[Tuple], columns)
             return MultiIndex.from_tuples(list_columns, names=col_names)
         return columns
@@ -1326,35 +1302,6 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
     return na_values, na_fvalues


-def _is_potential_multi_index(
-    columns: Sequence[Hashable] | MultiIndex,
-    index_col: bool | Sequence[int] | None = None,
-) -> bool:
-    """
-    Check whether or not the `columns` parameter
-    could be converted into a MultiIndex.
-
-    Parameters
-    ----------
-    columns : array-like
-        Object which may or may not be convertible into a MultiIndex
-    index_col : None, bool or list, optional
-        Column or columns to use as the (possibly hierarchical) index
-
-    Returns
-    -------
-    bool : Whether or not columns could become a MultiIndex
-    """
-    if index_col is None or isinstance(index_col, bool):
-        index_col = []
-
-    return bool(
-        len(columns)
-        and not isinstance(columns, MultiIndex)
-        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
-    )
-
-
 def _validate_parse_dates_arg(parse_dates):
     """
     Check whether or not the 'parse_dates' parameter

pandas/io/parsers/c_parser_wrapper.py (+10 −3)

@@ -30,6 +30,10 @@

 from pandas.core.indexes.api import ensure_index_from_sequences

+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     ParserError,
@@ -227,7 +231,10 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
-                names = self._dedup_names(self.orig_names)
+                names = dedup_names(
+                    self.orig_names,
+                    is_potential_multi_index(self.orig_names, self.index_col),
+                )
                 index, columns, col_dict = self._get_empty_meta(
                     names,
                     self.index_col,
@@ -281,7 +288,7 @@ def read(
             if self.usecols is not None:
                 names = self._filter_usecols(names)

-            names = self._dedup_names(names)
+            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

             # rename dict keys
             data_tups = sorted(data.items())
@@ -303,7 +310,7 @@ def read(
             # assert for mypy, orig_names is List or None, None would error in list(...)
             assert self.orig_names is not None
             names = list(self.orig_names)
-            names = self._dedup_names(names)
+            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

             if self.usecols is not None:
                 names = self._filter_usecols(names)

pandas/io/parsers/python_parser.py (+21 −2)

@@ -37,6 +37,10 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like

+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
@@ -259,7 +263,14 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._dedup_names(self.orig_names)
+            # error: Cannot determine type of 'index_col'
+            names = dedup_names(
+                self.orig_names,
+                is_potential_multi_index(
+                    self.orig_names,
+                    self.index_col,  # type: ignore[has-type]
+                ),
+            )
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,
@@ -293,7 +304,14 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._dedup_names(self.orig_names)
+        # error: Cannot determine type of 'index_col'
+        names = dedup_names(
+            self.orig_names,
+            is_potential_multi_index(
+                self.orig_names,
+                self.index_col,  # type: ignore[has-type]
+            ),
+        )

         offset = 0
         if self._implicit_index:
@@ -434,6 +452,7 @@ def _infer_columns(
                         if i not in this_unnamed_cols
                     ] + this_unnamed_cols

+                    # TODO: Use pandas.io.common.dedup_names instead (see #50371)
                     for i in col_loop_order:
                         col = this_columns[i]
                         old_col = col

pandas/tests/io/json/test_pandas.py (+23)

@@ -118,6 +118,7 @@ def test_frame_non_unique_columns(self, orient, data):
             expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
         elif orient == "split":
             expected = df
+            expected.columns = ["x", "x.1"]

         tm.assert_frame_equal(result, expected)

@@ -258,6 +259,28 @@ def test_roundtrip_mixed(self, orient, convert_axes):

         assert_json_roundtrip_equal(result, expected, orient)

+    @pytest.mark.xfail(
+        reason="#50456 Column multiindex is stored and loaded differently",
+        raises=AssertionError,
+    )
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            [["2022", "2022"], ["JAN", "FEB"]],
+            [["2022", "2023"], ["JAN", "JAN"]],
+            [["2022", "2022"], ["JAN", "JAN"]],
+        ],
+    )
+    def test_roundtrip_multiindex(self, columns):
+        df = DataFrame(
+            [[1, 2], [3, 4]],
+            columns=pd.MultiIndex.from_arrays(columns),
+        )
+
+        result = read_json(df.to_json(orient="split"), orient="split")
+
+        tm.assert_frame_equal(result, df)
+
     @pytest.mark.parametrize(
         "data,msg,orient",
         [
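
The xfail above records a pre-existing gap (#50456): a column MultiIndex written with orient='split' is not yet read back identically. A standalone sketch of the roundtrip the test exercises (the StringIO wrapping is added here for newer pandas versions; the test itself passes the JSON string directly):

    from io import StringIO

    import pandas as pd
    import pandas._testing as tm

    df = pd.DataFrame(
        [[1, 2], [3, 4]],
        columns=pd.MultiIndex.from_arrays([["2022", "2022"], ["JAN", "FEB"]]),
    )
    result = pd.read_json(StringIO(df.to_json(orient="split")), orient="split")

    try:
        tm.assert_frame_equal(result, df)
    except AssertionError as err:
        # Expected until #50456 is resolved: the MultiIndex is stored and loaded differently.
        print(f"roundtrip mismatch: {err}")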
