-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Rename duplicate column names in read_json(orient='split') #50370
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
482a87f
c89cba5
f701a92
fbc093d
6d30477
4252e2f
2c59ef8
a6ea294
5572574
10a9036
792ec14
5a0cd30
83a94e2
8bad27c
0b865fe
3d49073
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,7 @@ | |
from pandas.io.common import ( | ||
IOHandles, | ||
_extension_to_compression, | ||
dedup_names, | ||
file_exists, | ||
get_handle, | ||
is_fsspec_url, | ||
|
@@ -1245,6 +1246,9 @@ def _parse(self) -> None: | |
str(k): v | ||
for k, v in loads(json, precise_float=self.precise_float).items() | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need tests for this! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test I updated is already testing this. There was a test specific for duplicate columns that was parametrized for orient split, and check the column names. Am I missing something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback do you mind having another look, and letting me know if the added test and my previous comment address your concern. This is ready to be merged pending your review. Thanks! |
||
decoded["columns"] = dedup_names( | ||
names=decoded["columns"], is_potential_multi_index=False | ||
) | ||
self.check_keys_split(decoded) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
self.obj = DataFrame(dtype=None, **decoded) | ||
elif orient == "index": | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,10 @@ | |
from pandas.core.dtypes.common import is_integer | ||
from pandas.core.dtypes.inference import is_dict_like | ||
|
||
from pandas.io.common import ( | ||
dedup_names, | ||
is_potential_multi_index, | ||
) | ||
from pandas.io.parsers.base_parser import ( | ||
ParserBase, | ||
parser_defaults, | ||
|
@@ -259,7 +263,10 @@ def read( | |
columns: Sequence[Hashable] = list(self.orig_names) | ||
if not len(content): # pragma: no cover | ||
# DataFrame with the right metadata, even though it's length 0 | ||
names = self._dedup_names(self.orig_names) | ||
names = dedup_names( | ||
self.orig_names, | ||
is_potential_multi_index(self.orig_names, self.index_col), | ||
) | ||
# error: Cannot determine type of 'index_col' | ||
index, columns, col_dict = self._get_empty_meta( | ||
names, | ||
|
@@ -293,7 +300,9 @@ def _exclude_implicit_index( | |
self, | ||
alldata: list[np.ndarray], | ||
) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]: | ||
names = self._dedup_names(self.orig_names) | ||
names = self._dedup_names( | ||
self.orig_names, is_potential_multi_index(self.orig_names, self.index_col) | ||
) | ||
|
||
offset = 0 | ||
if self._implicit_index: | ||
|
@@ -434,6 +443,7 @@ def _infer_columns( | |
if i not in this_unnamed_cols | ||
] + this_unnamed_cols | ||
|
||
# TODO: Use pandas.io.common.dedup_names instead (see #50371) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @datapythonista This is one There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, correct. |
||
for i in col_loop_order: | ||
col = this_columns[i] | ||
old_col = col | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -117,6 +117,7 @@ def test_frame_non_unique_columns(self, orient, data): | |
expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000 | ||
elif orient == "split": | ||
expected = df | ||
expected.columns = ["x", "x.1"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should there also be a test for when the columns are a multiindex, the reader is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. I'm not fully sure if it can be the case that I agree There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here's an example which passes on 1.5.2, but which fails here: data = [["a", "b"], ["c", "d"]]
df = DataFrame(data, index=[1, 2], columns=pd.MultiIndex.from_arrays([["x", "x"], ['a', 'x']]))
read_json(df.to_json(orient='split'), orient='split') There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @MarcoGorelli, this is very helpful. Looks like the json IO with >>> df = DataFrame(data, index=[1, 2], columns=pd.MultiIndex.from_arrays([["2022", "2022"], ['JAN', 'FEB']]))
>>> df
2022
JAN FEB
1 a b
2 c d
>>> print(df.to_json(orient='split'))
{"columns":[["2022","JAN"],["2022","FEB"]],"index":[1,2],"data":[["a","b"],["c","d"]]}
>>> read_json(df.to_json(orient='split'), orient='split')
2022 JAN
2022 FEB
1 a b
2 c d I opened #50456 to address the problem, and fixed the use case with multiindex and added the test, but xfailed it for now, so the current bug can be fixed in an separate PR. |
||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PR number here? other than that looks good
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!