-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: Create StorageExtensionDtype #46537
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,6 @@ | ||
from __future__ import annotations | ||
|
||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
) | ||
from typing import TYPE_CHECKING | ||
|
||
import numpy as np | ||
|
||
|
@@ -24,6 +21,7 @@ | |
|
||
from pandas.core.dtypes.base import ( | ||
ExtensionDtype, | ||
StorageExtensionDtype, | ||
register_extension_dtype, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
|
@@ -55,7 +53,7 @@ | |
|
||
|
||
@register_extension_dtype | ||
class StringDtype(ExtensionDtype): | ||
class StringDtype(StorageExtensionDtype): | ||
""" | ||
Extension dtype for string data. | ||
|
||
|
@@ -67,7 +65,7 @@ class StringDtype(ExtensionDtype): | |
parts of the API may change without warning. | ||
|
||
In particular, StringDtype.na_value may change to no longer be | ||
``numpy.nan``. | ||
``pd.NA``. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is also related to the comment I made in #47034 (comment) about pd.NA vs pa.NA, but so AFAIK we don't have any concrete plans to stop using pd.NA for the string dtype? If that is the case, I wouldn't state it here like that, or if there are concrete ideas, that's something we should have a discussion about. (the sentence was clearly outdated of course ;), since the np.nan wasn't even returned. But so I would rather remove the full sentence then?) |
||
|
||
Parameters | ||
---------- | ||
|
@@ -141,7 +139,6 @@ def construct_from_string(cls, string): | |
----- | ||
TypeError | ||
If the string is not a valid option. | ||
|
||
""" | ||
if not isinstance(string, str): | ||
raise TypeError( | ||
|
@@ -156,15 +153,6 @@ def construct_from_string(cls, string): | |
else: | ||
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") | ||
|
||
def __eq__(self, other: Any) -> bool: | ||
if isinstance(other, str) and other == "string": | ||
return True | ||
return super().__eq__(other) | ||
|
||
def __hash__(self) -> int: | ||
# custom __eq__ so have to override __hash__ | ||
return super().__hash__() | ||
|
||
# https://github.com/pandas-dev/pandas/issues/36126 | ||
# error: Signature of "construct_array_type" incompatible with supertype | ||
# "ExtensionDtype" | ||
|
@@ -185,12 +173,6 @@ def construct_array_type( # type: ignore[override] | |
else: | ||
return ArrowStringArray | ||
|
||
def __repr__(self): | ||
return f"string[{self.storage}]" | ||
|
||
def __str__(self): | ||
return self.name | ||
|
||
def __from_arrow__( | ||
self, array: pyarrow.Array | pyarrow.ChunkedArray | ||
) -> BaseStringArray: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,6 @@ | ||
""" | ||
Extend pandas with custom array types. | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import ( | ||
|
@@ -14,6 +13,7 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._libs import missing as libmissing | ||
from pandas._libs.hashtable import object_hash | ||
from pandas._typing import ( | ||
DtypeObj, | ||
|
@@ -391,6 +391,32 @@ def _can_hold_na(self) -> bool: | |
return True | ||
|
||
|
||
class StorageExtensionDtype(ExtensionDtype): | ||
"""ExtensionDtype that may be backed by more than one implementation.""" | ||
|
||
name: str | ||
na_value = libmissing.NA | ||
_metadata = ("storage",) | ||
|
||
def __init__(self, storage=None) -> None: | ||
self.storage = storage | ||
|
||
def __repr__(self): | ||
return f"{self.name}[{self.storage}]" | ||
|
||
def __str__(self): | ||
return self.name | ||
|
||
def __eq__(self, other: Any) -> bool: | ||
if isinstance(other, self.type) and other == self.name: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you remember why you added this line here? (it's different as the line in So I think you can actually keep the (from #47034 (comment)) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm yeah I am not sure why this changed. This was just supposed to be a copy-paste essentially to a new subclass and maybe make it more generic maybe since |
||
return True | ||
return super().__eq__(other) | ||
|
||
def __hash__(self) -> int: | ||
# custom __eq__ so have to override __hash__ | ||
return super().__hash__() | ||
|
||
|
||
def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]: | ||
""" | ||
Register an ExtensionType with pandas as class decorator. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is ExtensionType still neede?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's an
isinstance
check that uses it.