Rename duplicate column names in read_json(orient='split') #50370
```diff
@@ -57,10 +57,12 @@
 from pandas.io.common import (
     IOHandles,
+    dedup_names,
     extension_to_compression,
     file_exists,
     get_handle,
     is_fsspec_url,
+    is_potential_multi_index,
     is_url,
     stringify_path,
 )
```
```diff
@@ -1246,6 +1248,14 @@ def _parse(self) -> None:
                 for k, v in loads(json, precise_float=self.precise_float).items()
             }
             self.check_keys_split(decoded)
+            orig_names = [
+                (tuple(col) if isinstance(col, list) else col)
+                for col in decoded["columns"]
+            ]
```
Comment on lines +1251 to +1254

currently, this isn't necessary, because the test is xfailed, so it will pass even if there's an exception thrown (which I think is what this line was meant to address?). I'm not too keen on adding a line of code which doesn't have any effect on the test suite (someone else might accidentally remove it as part of a refactor, and the test suite would still pass), so how about either:

Good point. It's a bit more complex: this line does have an effect, and other tests will fail if it is removed. A MultiIndex is represented internally in some places as a list of tuples, but JSON has no concept of tuples, so in the JSON it is represented as a list of lists. Passing a list of lists to dedup_names and to the DataFrame constructor is interpreted as a normal index with lists as values, which are not hashable, so it raises a TypeError. Converting them to tuples turns the values into a MultiIndex, so the JSON is loaded with a MultiIndex.

The part that is still missing in this PR is that the loaded MultiIndex is not the same as the original one if the JSON was saved with

What does make sense based on your comment is to be more explicit about when the test should xfail. I added that the xfail needs to be for

Does this make sense to you? I wouldn't implement anything more complex than this here, since fixing the other bug shouldn't be difficult. We could implement the other PR first too, but it's probably not worth it.

looks good, thanks!
```diff
+            decoded["columns"] = dedup_names(
+                orig_names,
+                is_potential_multi_index(orig_names, None),
+            )
             self.obj = DataFrame(dtype=None, **decoded)
         elif orient == "index":
             self.obj = DataFrame.from_dict(
```
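As a minimal sketch (not part of the diff) of why the tuple conversion discussed in the thread above matters: JSON carries the column MultiIndex as a list of lists, and only after converting each entry to a tuple can the labels form a MultiIndex. The JSON literal below is illustrative only.

```python
import json

import pandas as pd

# JSON has no tuple type, so a column MultiIndex round-trips as a list of lists.
decoded = json.loads('{"columns": [["2022", "JAN"], ["2022", "FEB"]]}')

# Same conversion as the added code in _parse: lists become tuples so the
# labels are hashable and can be assembled into a MultiIndex.
orig_names = [
    tuple(col) if isinstance(col, list) else col for col in decoded["columns"]
]

print(pd.MultiIndex.from_tuples(orig_names))
# MultiIndex([('2022', 'JAN'), ('2022', 'FEB')], )
```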
```diff
@@ -37,6 +37,10 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like

+from pandas.io.common import (
+    dedup_names,
+    is_potential_multi_index,
+)
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
```
```diff
@@ -259,7 +263,14 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._dedup_names(self.orig_names)
+            # error: Cannot determine type of 'index_col'
+            names = dedup_names(
+                self.orig_names,
+                is_potential_multi_index(
+                    self.orig_names,
+                    self.index_col,  # type: ignore[has-type]
+                ),
+            )
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,
```
```diff
@@ -293,7 +304,14 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._dedup_names(self.orig_names)
+        # error: Cannot determine type of 'index_col'
+        names = dedup_names(
+            self.orig_names,
+            is_potential_multi_index(
+                self.orig_names,
+                self.index_col,  # type: ignore[has-type]
+            ),
+        )

         offset = 0
         if self._implicit_index:
```
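Not part of the diff: a small illustration of what the shared helpers are expected to do here, assuming dedup_names mangles repeated labels with ".N" suffixes, which is what the updated json test below asserts ("x", "x.1"); the sample names are made up.

```python
from pandas.io.common import dedup_names, is_potential_multi_index

names = ["x", "x", "y"]

# Plain string labels, so is_potential_multi_index(names, None) should be
# False and dedup_names falls back to simple ".N" suffixing of duplicates.
deduped = dedup_names(names, is_potential_multi_index(names, None))
print(deduped)  # expected: ['x', 'x.1', 'y']
```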
```diff
@@ -434,6 +452,7 @@ def _infer_columns(
                         if i not in this_unnamed_cols
                     ] + this_unnamed_cols

+                    # TODO: Use pandas.io.common.dedup_names instead (see #50371)
```
@datapythonista This is one

Yes, correct.
```diff
                     for i in col_loop_order:
                         col = this_columns[i]
                         old_col = col
```
```diff
@@ -117,6 +117,7 @@ def test_frame_non_unique_columns(self, orient, data):
             expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
         elif orient == "split":
             expected = df
+            expected.columns = ["x", "x.1"]
```
should there also be a test for when the columns are a multiindex, the reader is

Good point. I'm not fully sure if it can be the case that

I agree

here's an example which passes on 1.5.2, but which fails here:

```python
data = [["a", "b"], ["c", "d"]]
df = DataFrame(data, index=[1, 2], columns=pd.MultiIndex.from_arrays([["x", "x"], ['a', 'x']]))
read_json(df.to_json(orient='split'), orient='split')
```

Thanks @MarcoGorelli, this is very helpful. Looks like the json IO with

```python
>>> df = DataFrame(data, index=[1, 2], columns=pd.MultiIndex.from_arrays([["2022", "2022"], ['JAN', 'FEB']]))
>>> df
  2022
   JAN FEB
1    a   b
2    c   d
>>> print(df.to_json(orient='split'))
{"columns":[["2022","JAN"],["2022","FEB"]],"index":[1,2],"data":[["a","b"],["c","d"]]}
>>> read_json(df.to_json(orient='split'), orient='split')
  2022 JAN
  2022 FEB
1    a   b
2    c   d
```

I opened #50456 to address the problem, and fixed the use case with a multiindex and added the test, but xfailed it for now, so the current bug can be fixed in a separate PR.
```diff

         tm.assert_frame_equal(result, expected)
```
```diff
@@ -257,6 +258,28 @@ def test_roundtrip_mixed(self, orient, convert_axes):

         assert_json_roundtrip_equal(result, expected, orient)

+    @pytest.mark.xfail(
+        reason="#50456 Column multiindex is stored and loaded differently",
+        raises=AssertionError,
+    )
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            [["2022", "2022"], ["JAN", "FEB"]],
+            [["2022", "2023"], ["JAN", "JAN"]],
+            [["2022", "2022"], ["JAN", "JAN"]],
+        ],
+    )
+    def test_roundtrip_multiindex(self, columns):
+        df = DataFrame(
+            [[1, 2], [3, 4]],
+            columns=pd.MultiIndex.from_arrays(columns),
+        )
+
+        result = read_json(df.to_json(orient="split"), orient="split")
+
+        tm.assert_frame_equal(result, df)
+
     @pytest.mark.parametrize(
         "data,msg,orient",
         [
```
need tests for this!

The test I updated is already testing this. There was already a test specific to duplicate columns, parametrized for the split orient, and it checks the column names. Am I missing something?
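For reference, a minimal sketch of the round-trip behavior the updated test asserts; the frame below is made up, and only the deduplicated names "x" and "x.1" come from the test above.

```python
from pandas import DataFrame, read_json

# With this change, duplicate column names read back via orient="split"
# are deduplicated through pandas.io.common.dedup_names.
df = DataFrame([[1, 2], [3, 4]], columns=["x", "x"])

result = read_json(df.to_json(orient="split"), orient="split")
print(list(result.columns))  # expected: ['x', 'x.1']
```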
@jreback do you mind having another look and letting me know if the added test and my previous comment address your concern? This is ready to be merged pending your review. Thanks!