-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Fix Issue 34748 - read in datetime as MultiIndex for column headers #34954
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
7143555
8b63b89
1a68eef
ab4b49d
d3cd371
9f3cdb4
c930a72
c744e35
c94a9b3
2ceb77e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1143,3 +1143,22 @@ def test_header_with_index_col(self, engine, filename): | |
filename, sheet_name="Sheet1", index_col=0, header=[0, 1] | ||
) | ||
tm.assert_frame_equal(expected, result) | ||
|
||
def test_read_datetime_multiindex(self, engine, read_ext): | ||
# GH 34748 | ||
if engine == "pyxlsb": | ||
pytest.xfail("Sheets containing datetimes not supported by pyxlsb") | ||
|
||
f = "test_datetime_mi" + read_ext | ||
with pd.ExcelFile(f) as excel: | ||
actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) | ||
expected_column_index = pd.MultiIndex.from_tuples( | ||
[(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))], | ||
names=[ | ||
pd.to_datetime("02/29/2020").to_pydatetime(), | ||
pd.to_datetime("03/01/2020").to_pydatetime(), | ||
], | ||
) | ||
# Cannot create a DataFrame with `expected_column_index` as the columns | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does this comment mean? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So there is a bug in master (but not in 1.0.5). See below. Should I create a separate issue for this? In [1]: import pandas as pd
In [2]: pd.__version__
Out[2]: '1.1.0.dev0+1952.gc744e35d0.dirty'
In [3]: expected_column_index = pd.MultiIndex.from_tuples(
...: [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],
...: names=[
...: pd.to_datetime("02/29/2020").to_pydatetime(),
...: pd.to_datetime("03/01/2020").to_pydatetime(),
...: ],
...: )
In [4]: pd.DataFrame([], columns=expected_column_index)
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-4-e63e2d18b57f> in <module>
----> 1 pd.DataFrame([], columns=expected_column_index)
c:\Code\pandas_dev\pandas\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
515 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
516 else:
--> 517 mgr = init_dict({}, index, columns, dtype=dtype)
518 else:
519 try:
c:\Code\pandas_dev\pandas\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
267 nan_dtype = dtype
268 val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
--> 269 arrays.loc[missing] = [val] * missing.sum()
270
271 else:
c:\Code\pandas_dev\pandas\pandas\core\indexing.py in __setitem__(self, key, value)
662 else:
663 key = com.apply_if_callable(key, self.obj)
--> 664 indexer = self._get_setitem_indexer(key)
665 self._has_valid_setitem_indexer(key)
666
c:\Code\pandas_dev\pandas\pandas\core\indexing.py in _get_setitem_indexer(self, key)
613
614 try:
--> 615 return self._convert_to_indexer(key, axis=0, is_setter=True)
616 except TypeError as e:
617
c:\Code\pandas_dev\pandas\pandas\core\indexing.py in _convert_to_indexer(self, key, axis, is_setter)
1158 # if we are a label return me
1159 try:
-> 1160 return labels.get_loc(key)
1161 except LookupError:
1162 if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex):
c:\Code\pandas_dev\pandas\pandas\core\indexes\multi.py in get_loc(self, key, method)
2694 if not isinstance(key, (tuple, list)):
2695 # not including list here breaks some indexing, xref #30892
-> 2696 loc = self._get_level_indexer(key, level=0)
2697 return _maybe_to_slice(loc)
2698
c:\Code\pandas_dev\pandas\pandas\core\indexes\multi.py in _get_level_indexer(self, key, level, indexer)
2959 else:
2960
-> 2961 code = self._get_loc_single_level_index(level_index, key)
2962
2963 if level > 0 or self.lexsort_depth == 0:
c:\Code\pandas_dev\pandas\pandas\core\indexes\multi.py in _get_loc_single_level_index(self, level_index, key)
2628 return -1
2629 else:
-> 2630 return level_index.get_loc(key)
2631
2632 def get_loc(self, key, method=None):
c:\Code\pandas_dev\pandas\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
569 """
570 if not is_scalar(key):
--> 571 raise InvalidIndexError(key)
572
573 orig_key = key
InvalidIndexError: 2020-02-29 00:00:00 2020-03-01 00:00:00
2020-02-29 2020-03-01 True
dtype: bool There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see above, i think that should work, but these should be coerced to Timestamp, we don't support datetime in Index at all (even as object), they are always coerced. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be ok with handling this as a pre-cursor PR bug fix i think. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In my example, the index values are There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right, how does the Excel reader do that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
So do you mean we hold up on this PR, create an issue for the bug above, and get that fixed first? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So if the other issue can be addressed first, that would be best. So I would create an issue. Ideally we merge the patch first. If it turns into a quagmire, then we can revisit. |
||
|
||
tm.assert_index_equal(actual.columns, expected_column_index) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are the to_pydatetime calls required?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because when the Excel reader creates the names of the index, they are of type
dt.datetime,
not the pandas datetime type.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, where is this done? This is unfortunate, as these should actually be Timestamp.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So is this a separate issue: that we don't want the names to be dt.datetime? If so, I will create an issue for that.