Rename duplicate column names in read_json(orient='split') #50370


Merged Jan 9, 2023 (16 commits)
Changes from 12 commits
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
@@ -496,7 +496,7 @@ Other API changes
new DataFrame (shallow copy) instead of the original DataFrame, consistent with other
methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`)
- Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`)
-
- Loading a JSON file with duplicate columns using ``read_json(orient='split')`` renames columns to avoid duplicates, as :func:`read_csv` and the other readers do (:issue:`XXX`)
Member:

PR number here? Other than that, looks good.

Member Author:

Added, thanks!
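
For reference, a sketch of the behavior the new whatsnew entry describes (hypothetical session; the renamed label follows the dedup_names rules added in this PR, and the test change further down confirms the "x.1" suffix):

>>> import pandas as pd
>>> pd.read_json('{"columns":["x","x"],"index":[0],"data":[[1,2]]}', orient="split")
   x  x.1
0  1    2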


.. ---------------------------------------------------------------------------
.. _whatsnew_200.deprecations:
70 changes: 70 additions & 0 deletions pandas/io/common.py
@@ -6,6 +6,7 @@
abstractmethod,
)
import codecs
from collections import defaultdict
import dataclasses
import functools
import gzip
@@ -26,7 +27,9 @@
IO,
Any,
AnyStr,
DefaultDict,
Generic,
Hashable,
Literal,
Mapping,
Sequence,
@@ -67,6 +70,7 @@
is_list_like,
)

from pandas.core.indexes.api import MultiIndex
from pandas.core.shared_docs import _shared_docs

_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
@@ -1181,3 +1185,69 @@ def _get_binary_io_classes() -> tuple[type, ...]:
binary_classes += (type(reader),)

return binary_classes


def is_potential_multi_index(
columns: Sequence[Hashable] | MultiIndex,
index_col: bool | Sequence[int] | None = None,
) -> bool:
"""
Check whether or not the `columns` parameter
could be converted into a MultiIndex.

Parameters
----------
columns : array-like
Object which may or may not be convertible into a MultiIndex
index_col : None, bool or list, optional
Column or columns to use as the (possibly hierarchical) index

Returns
-------
bool : Whether or not columns could become a MultiIndex
"""
if index_col is None or isinstance(index_col, bool):
index_col = []

return bool(
len(columns)
and not isinstance(columns, MultiIndex)
and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
)


def dedup_names(
names: Sequence[Hashable], is_potential_multiindex: bool
) -> Sequence[Hashable]:
"""
Rename column names if duplicates exist.

Currently the renaming is done by appending a period and an auto-incremented number,
but a custom pattern may be supported in the future.

Examples
--------
>>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
['x', 'y', 'x.1', 'x.2']
"""
names = list(names) # so we can index
counts: DefaultDict[Hashable, int] = defaultdict(int)

for i, col in enumerate(names):
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1

if is_potential_multiindex:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
cur_count = counts[col]

names[i] = col
counts[col] = cur_count + 1

return names
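
As a quick sanity check on the helpers now living in pandas/io/common.py, a hypothetical session whose outputs follow directly from the implementations above:

>>> from pandas.io.common import dedup_names, is_potential_multi_index
>>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
['x', 'y', 'x.1', 'x.2']
>>> is_potential_multi_index([("a", "x"), ("a", "y")])
True
>>> dedup_names([("a", "x"), ("a", "x")], is_potential_multiindex=True)
[('a', 'x'), ('a', 'x.1')]

For potential MultiIndex columns, only the last level of a duplicated tuple gets the numeric suffix.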
10 changes: 10 additions & 0 deletions pandas/io/json/_json.py
@@ -58,9 +58,11 @@
from pandas.io.common import (
IOHandles,
_extension_to_compression,
dedup_names,
file_exists,
get_handle,
is_fsspec_url,
is_potential_multi_index,
is_url,
stringify_path,
)
@@ -1246,6 +1248,14 @@ def _parse(self) -> None:
for k, v in loads(json, precise_float=self.precise_float).items()
}
Contributor:

need tests for this!

Member Author:

The test I updated already covers this. There was a test specific to duplicate columns, parametrized for orient='split', that checks the column names. Am I missing something?

Member Author:

@jreback do you mind having another look and letting me know whether the added test and my previous comment address your concern? This is ready to be merged pending your review. Thanks!

self.check_keys_split(decoded)
orig_names = [
(tuple(col) if isinstance(col, list) else col)
for col in decoded["columns"]
]
Comment on lines +1251 to +1254
Member:

Currently, this isn't necessary, because the test is xfailed, so it will pass even if an exception is thrown (which, I believe, is what this line was meant to address?)

I'm not too keen on adding a line of code which doesn't have any effect on the test suite (someone else might accidentally remove it as part of a refactor, and the test suite would still pass), so how about either:

  • remove this, and just letting the code raise: as your example shows, multiindex with columns='split' wasn't supported to begin with, so it's OK for it to throw an error;
  • OR keep this, but change the test to check that the result value has tuples as column names, with a comment saying that the return type should be changed (instead of xfailing)

Member Author:

Good point, though it's a bit more complex. This line does have an effect, and removing it makes other tests fail. A MultiIndex is represented internally in some places as a list of tuples, but JSON has no concept of tuples, so in the JSON it's represented as a list of lists. When a list of lists is passed to dedup_names and the DataFrame constructor, it's interpreted as a regular index whose values are lists; lists are unhashable, so a TypeError is raised. Converting them to tuples lets the values form a MultiIndex, so the JSON is loaded with a MultiIndex.

The part still missing in this PR is that the loaded MultiIndex is not the same as the original one if the JSON was saved with to_json. That's what the other issue will fix, and what the xfailed test checks.

What does make sense based on your comment is being more explicit about when the test should xfail. I restricted the xfail to AssertionError (the compared values being different). Now, if this line is removed, the test fails: trying to create an index from a list of lists raises a TypeError, whereas the xfail requires that the code doesn't raise and returns a value, just one different from the expected.

Does this make sense to you? I wouldn't implement anything more complex than this here, since fixing the other bug shouldn't be difficult. We could also merge the other PR first, but it's probably not worth it.
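
To make the tuple point concrete, a minimal sketch mirroring the conversion added in this diff (hypothetical decoded payload):

>>> decoded = {"columns": [["2022", "JAN"], ["2022", "FEB"]]}  # JSON has no tuples
>>> [tuple(col) if isinstance(col, list) else col for col in decoded["columns"]]
[('2022', 'JAN'), ('2022', 'FEB')]

Unlike the inner lists, the tuples are hashable, so they are valid column labels and can later be combined into a MultiIndex.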

Member:

looks good, thanks!

decoded["columns"] = dedup_names(
orig_names,
is_potential_multi_index(orig_names, None),
)
self.obj = DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame.from_dict(
59 changes: 3 additions & 56 deletions pandas/io/parsers/base_parser.py
@@ -10,7 +10,6 @@
TYPE_CHECKING,
Any,
Callable,
DefaultDict,
Hashable,
Iterable,
List,
@@ -89,6 +88,8 @@
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
from pandas import DataFrame

@@ -333,39 +334,14 @@ def extract(r):

return names, index_names, col_names, passed_names

@final
def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
names = list(names) # so we can index
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1

if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
cur_count = counts[col]

names[i] = col
counts[col] = cur_count + 1

return names

@final
def _maybe_make_multi_index_columns(
self,
columns: Sequence[Hashable],
col_names: Sequence[Hashable] | None = None,
) -> Sequence[Hashable] | MultiIndex:
# possibly create a column mi here
if _is_potential_multi_index(columns):
if is_potential_multi_index(columns):
list_columns = cast(List[Tuple], columns)
return MultiIndex.from_tuples(list_columns, names=col_names)
return columns
@@ -1326,35 +1302,6 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
return na_values, na_fvalues


def _is_potential_multi_index(
columns: Sequence[Hashable] | MultiIndex,
index_col: bool | Sequence[int] | None = None,
) -> bool:
"""
Check whether or not the `columns` parameter
could be converted into a MultiIndex.

Parameters
----------
columns : array-like
Object which may or may not be convertible into a MultiIndex
index_col : None, bool or list, optional
Column or columns to use as the (possibly hierarchical) index

Returns
-------
bool : Whether or not columns could become a MultiIndex
"""
if index_col is None or isinstance(index_col, bool):
index_col = []

return bool(
len(columns)
and not isinstance(columns, MultiIndex)
and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
)


def _validate_parse_dates_arg(parse_dates):
"""
Check whether or not the 'parse_dates' parameter
13 changes: 10 additions & 3 deletions pandas/io/parsers/c_parser_wrapper.py
@@ -30,6 +30,10 @@

from pandas.core.indexes.api import ensure_index_from_sequences

from pandas.io.common import (
dedup_names,
is_potential_multi_index,
)
from pandas.io.parsers.base_parser import (
ParserBase,
ParserError,
@@ -227,7 +231,10 @@ def read(
except StopIteration:
if self._first_chunk:
self._first_chunk = False
names = self._dedup_names(self.orig_names)
names = dedup_names(
self.orig_names,
is_potential_multi_index(self.orig_names, self.index_col),
)
index, columns, col_dict = self._get_empty_meta(
names,
self.index_col,
@@ -281,7 +288,7 @@ def read(
if self.usecols is not None:
names = self._filter_usecols(names)

names = self._dedup_names(names)
names = dedup_names(names, is_potential_multi_index(names, self.index_col))

# rename dict keys
data_tups = sorted(data.items())
@@ -303,7 +310,7 @@
# assert for mypy, orig_names is List or None, None would error in list(...)
assert self.orig_names is not None
names = list(self.orig_names)
names = self._dedup_names(names)
names = dedup_names(names, is_potential_multi_index(names, self.index_col))

if self.usecols is not None:
names = self._filter_usecols(names)
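
A sketch of the call pattern that replaces the old self._dedup_names method throughout the parsers (hypothetical values; both helpers as defined in pandas/io/common.py above):

>>> from pandas.io.common import dedup_names, is_potential_multi_index
>>> names = ["a", "a", "b"]
>>> dedup_names(names, is_potential_multi_index(names, index_col=None))
['a', 'a.1', 'b']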
23 changes: 21 additions & 2 deletions pandas/io/parsers/python_parser.py
@@ -37,6 +37,10 @@
from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.inference import is_dict_like

from pandas.io.common import (
dedup_names,
is_potential_multi_index,
)
from pandas.io.parsers.base_parser import (
ParserBase,
parser_defaults,
@@ -259,7 +263,14 @@ def read(
columns: Sequence[Hashable] = list(self.orig_names)
if not len(content): # pragma: no cover
# DataFrame with the right metadata, even though it's length 0
names = self._dedup_names(self.orig_names)
# error: Cannot determine type of 'index_col'
names = dedup_names(
self.orig_names,
is_potential_multi_index(
self.orig_names,
self.index_col, # type: ignore[has-type]
),
)
# error: Cannot determine type of 'index_col'
index, columns, col_dict = self._get_empty_meta(
names,
@@ -293,7 +304,14 @@ def _exclude_implicit_index(
self,
alldata: list[np.ndarray],
) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
names = self._dedup_names(self.orig_names)
# error: Cannot determine type of 'index_col'
names = dedup_names(
self.orig_names,
is_potential_multi_index(
self.orig_names,
self.index_col, # type: ignore[has-type]
),
)

offset = 0
if self._implicit_index:
@@ -434,6 +452,7 @@ def _infer_columns(
if i not in this_unnamed_cols
] + this_unnamed_cols

# TODO: Use pandas.io.common.dedup_names instead (see #50371)


@datapythonista Is this the TODO you mentioned? Just to be sure I'm not redoing work. See #50371.

Member Author:

Yes, correct.

for i in col_loop_order:
col = this_columns[i]
old_col = col
23 changes: 23 additions & 0 deletions pandas/tests/io/json/test_pandas.py
@@ -117,6 +117,7 @@ def test_frame_non_unique_columns(self, orient, data):
expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
elif orient == "split":
expected = df
expected.columns = ["x", "x.1"]
Member:

Should there also be a test for when the columns are a MultiIndex, the reader is read_json, and orient='split'?

Member Author:

Good point. I'm not fully sure whether read_json(orient='split') can even produce a MultiIndex. What would the JSON look like?

I agree dedup_names can probably be better tested, making sure the MultiIndex cases work as expected. In my opinion that's better addressed in #50371; for this PR, multiindexing is irrelevant if I'm not missing something. #50371 should consolidate the two different versions of dedup_names and, if the implementations are not exactly the same, make sure they work for all use cases. Ensuring proper testing makes more sense there: here the function is not modified, and I don't think this PR can introduce any bug in it, as it's just been moved from a method to a function.

Member:

Here's an example which passes on 1.5.2, but which fails here:

import pandas as pd
from pandas import DataFrame, read_json

data = [["a", "b"], ["c", "d"]]
df = DataFrame(data, index=[1, 2], columns=pd.MultiIndex.from_arrays([["x", "x"], ["a", "x"]]))
read_json(df.to_json(orient="split"), orient="split")

Member Author:

Thanks @MarcoGorelli, this is very helpful. It looks like the JSON IO with orient='split' does try to support MultiIndexes, but the implementation in main seems buggy:

>>> data = [["a", "b"], ["c", "d"]]
>>> df = DataFrame(data, index=[1, 2], columns=pd.MultiIndex.from_arrays([["2022", "2022"], ["JAN", "FEB"]]))
>>> df
  2022    
   JAN FEB
1    a   b
2    c   d
>>> print(df.to_json(orient='split'))
{"columns":[["2022","JAN"],["2022","FEB"]],"index":[1,2],"data":[["a","b"],["c","d"]]}
>>> read_json(df.to_json(orient='split'), orient='split')
  2022 JAN
  2022 FEB
1    a   b
2    c   d

I opened #50456 to track the problem. I fixed the MultiIndex use case and added the test, but xfailed it for now, so the existing bug can be fixed in a separate PR.


tm.assert_frame_equal(result, expected)

@@ -257,6 +258,28 @@ def test_roundtrip_mixed(self, orient, convert_axes):

assert_json_roundtrip_equal(result, expected, orient)

@pytest.mark.xfail(
reason="#50456 Column multiindex is stored and loaded differently",
raises=AssertionError,
)
@pytest.mark.parametrize(
"columns",
[
[["2022", "2022"], ["JAN", "FEB"]],
[["2022", "2023"], ["JAN", "JAN"]],
[["2022", "2022"], ["JAN", "JAN"]],
],
)
def test_roundtrip_multiindex(self, columns):
df = DataFrame(
[[1, 2], [3, 4]],
columns=pd.MultiIndex.from_arrays(columns),
)

result = read_json(df.to_json(orient="split"), orient="split")

tm.assert_frame_equal(result, df)

@pytest.mark.parametrize(
"data,msg,orient",
[