Skip to content

BUG: Interchange protocol implementation allows non-string column names #57174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting columns names to strings (:issue:`55069`)
- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/interchange/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None:
Constructor - an instance of this (private) class is returned from
`pd.DataFrame.__dataframe__`.
"""
self._df = df
self._df = df.rename(columns=str, copy=False)
self._allow_copy = allow_copy

def __dataframe__(
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,6 @@ def test_missing_from_masked():
}
)

df2 = df.__dataframe__()

Comment on lines -165 to -166
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what this line was ever meant to achieve, as nothing is done with df2, it's just re-assigned to in L172

keeping it in results in

>           df.loc[null_idx, col] = None

pandas/tests/interchange/test_impl.py:172: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
pandas/core/indexing.py:[91](https://github.com/pandas-dev/pandas/actions/runs/7727513398/job/21066166133#step:8:93)2: in __setitem__
    iloc._setitem_with_indexer(indexer, value, self.name)
pandas/core/indexing.py:1[94](https://github.com/pandas-dev/pandas/actions/runs/7727513398/job/21066166133#step:8:96)6: in _setitem_with_indexer
    self._setitem_with_indexer_split_path(indexer, value, name)
pandas/core/indexing.py:2039: in _setitem_with_indexer_split_path
    self._setitem_single_column(loc, value, pi)
pandas/core/indexing.py:2170: in _setitem_single_column
    self.obj._mgr.column_setitem(loc, plane_indexer, value)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = BlockManager
Items: Index(['x', 'y', 'z'], dtype='object')
Axis 1: RangeIndex(start=0, stop=5, step=1)
NumpyBlock: slice(0, 3, 1), 3 x 5, dtype: float64
loc = 0, idx = array([2, 3, 1, 0]), value = None, inplace_only = False

    def column_setitem(
        self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
    ) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).
    
        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        needs_to_warn = False
        if warn_copy_on_write() and not self._has_no_reference(loc):
            if not isinstance(
                self.blocks[self.blknos[loc]].values,
                (ArrowExtensionArray, ArrowStringArray),
            ):
                # We might raise if we are in an expansion case, so defer
                # warning till we actually updated
                needs_to_warn = True
    
        elif using_copy_on_write() and not self._has_no_reference(loc):
            blkno = self.blknos[loc]
            # Split blocks to only copy the column we want to modify
            blk_loc = self.blklocs[loc]
            # Copy our values
            values = self.blocks[blkno].values
            if values.ndim == 1:
                values = values.copy()
            else:
                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
                # copy
                values = values[[blk_loc]]
            self._iset_split_block(blkno, [blk_loc], values)
    
        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        if inplace_only:
            col_mgr.setitem_inplace(idx, value)
        else:
            new_mgr = col_mgr.setitem((idx,), value)
            self.iset(loc, new_mgr._block.values, inplace=True)
    
        if needs_to_warn:
>           warnings.warn(
                COW_WARNING_GENERAL_MSG,
                FutureWarning,
                stacklevel=find_stack_level(),
            )
E           FutureWarning: Setting a value on a view: behaviour will change in pandas 3.0.
E           You are mutating a Series or DataFrame object, and currently this mutation will
E           also have effect on other Series or DataFrame objects that share data with this
E           object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
E           will never modify another.

I think the simplest thing is to just remove it?

rng = np.random.default_rng(2)
dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
for col, num_nulls in dict_null.items():
Expand Down Expand Up @@ -392,3 +390,10 @@ def test_large_string():
result = pd.api.interchange.from_dataframe(df.__dataframe__())
expected = pd.DataFrame({"a": ["x"]}, dtype="object")
tm.assert_frame_equal(result, expected)


def test_non_str_names():
# https://github.com/pandas-dev/pandas/issues/56701
df = pd.Series([1, 2, 3], name=0).to_frame()
names = df.__dataframe__().column_names()
assert names == ["0"]