Skip to content

WIP: Bitarray backed Int EAs #22238

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
103 changes: 103 additions & 0 deletions pandas/core/arrays/_mask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import numpy as np


class NAMask(object):
    """Generic container for a boolean mask marking missing values.

    The mask is stored as a ``bitarray`` when the optional ``bitarray``
    package can be imported, and as a boolean numpy array otherwise.

    Most operations are only implemented for the numpy-backed storage;
    on the bitarray-backed storage they raise ``NotImplementedError``.
    """

    def __init__(self, mask):
        """
        Parameters
        ----------
        mask : numpy array or NAMask
            Mask of missing values. Anything exposing
            ``astype(bool, copy=False)`` is accepted.
        """
        # Keep the ``try`` body down to the import itself so that errors
        # raised while converting the mask are never mistaken for a missing
        # optional dependency.
        try:
            import bitarray  # noqa: F401
        except ImportError:
            self._has_bitarray = False
            self._data = mask.astype(bool, copy=False)
        else:
            self._has_bitarray = True
            self._data = self._numpy_to_bitarray(mask)

    def _numpy_to_bitarray(self, arr):
        """Pack a numpy array into a new ``bitarray`` (one bit per element).

        Parameters
        ----------
        arr : numpy array
            Values to pack; converted to bool first.

        Returns
        -------
        bitarray.bitarray
        """
        # Import the class, not the module: the module itself is not
        # callable.
        from bitarray import bitarray

        bit_arr = bitarray()
        # ``pack`` consumes one byte per bit; a bool array serialises to
        # exactly that layout (0x00 / 0x01).
        bit_arr.pack(arr.astype(bool, copy=False).tobytes())
        return bit_arr

    def _bitarray_to_numpy(self, arr):
        """Unpack a ``bitarray`` into a writable boolean numpy array.

        Parameters
        ----------
        arr : bitarray.bitarray

        Returns
        -------
        numpy array of bool
        """
        # Pin the byte values so each unpacked byte is a canonical bool.
        # ``frombuffer`` gives a read-only view of the bytes object, so copy
        # to hand back an independent, writable array.
        unpacked = arr.unpack(b'\x00', b'\x01')
        return np.frombuffer(unpacked, dtype=bool).copy()

    def __getitem__(self, item):
        """Return ``self._data[item]`` (numpy-backed storage only)."""
        if self._has_bitarray:
            raise NotImplementedError

        return self._data[item]

    def __setitem__(self, key, value):
        """Assign ``value`` at ``key`` (numpy-backed storage only)."""
        if self._has_bitarray:
            raise NotImplementedError

        self._data[key] = value

    def __array__(self, dtype=None, copy=None):
        """Expose the mask to numpy (numpy-backed storage only).

        Parameters
        ----------
        dtype : numpy dtype, optional
            Requested dtype; the stored array is returned as-is when omitted.
        copy : bool, optional
            When True, a copy is returned instead of the stored array.

        Returns
        -------
        numpy array
        """
        if self._has_bitarray:
            raise NotImplementedError

        result = self._data
        if dtype is not None:
            result = result.astype(dtype, copy=False)
        if copy and result is self._data:
            result = result.copy()
        return result

    def __iter__(self):
        """Iterate over the elements of the underlying storage."""
        return iter(self._data)

    def __invert__(self):
        """Return a new mask with every element negated
        (numpy-backed storage only)."""
        if self._has_bitarray:
            raise NotImplementedError

        return type(self)(~self._data)

    def __or__(self, other):
        """Element-wise OR with ``other`` (numpy-backed storage only).

        Returns the numpy result rather than a NAMask.
        """
        if self._has_bitarray:
            raise NotImplementedError

        if isinstance(other, NAMask):
            other = other.__array__()
        return self._data | other

    def __ior__(self, other):
        """In-place element-wise OR with ``other``
        (numpy-backed storage only).

        Returns ``self`` so that ``mask |= other`` keeps ``mask`` a NAMask
        instead of rebinding it to a bare numpy array.
        """
        if self._has_bitarray:
            raise NotImplementedError

        if isinstance(other, NAMask):
            other = other.__array__()
        # Rebind instead of mutating the buffer: the stored array may share
        # memory with the array the mask was constructed from.
        self._data = self._data | other
        return self

    @property
    def nbytes(self):
        """Number of bytes used by the underlying storage."""
        if self._has_bitarray:
            # buffer_info() -> (address, size in bytes, ...)
            return self._data.buffer_info()[1]

        return self._data.nbytes

    @property
    def size(self):
        """Number of elements in the mask (numpy-backed storage only)."""
        if self._has_bitarray:
            raise NotImplementedError

        return self._data.size

    def astype(self, dtype, copy=False):
        """Return the mask as a numpy array of ``dtype``
        (numpy-backed storage only)."""
        if self._has_bitarray:
            raise NotImplementedError

        return self._data.astype(dtype, copy=copy)

    def any(self):
        """Return whether any element of the mask is set."""
        return self._data.any()

    def copy(self):
        """Return a new NAMask holding a copy of the underlying storage."""
        if self._has_bitarray:
            # The constructor expects an array-like with ``astype``; a
            # bitarray has none, so build the duplicate directly.
            duplicate = type(self).__new__(type(self))
            duplicate._has_bitarray = True
            duplicate._data = self._data.copy()
            return duplicate

        return type(self)(self._data.copy())

    def sum(self):
        """Return the number of set elements (numpy-backed storage only)."""
        if self._has_bitarray:
            raise NotImplementedError

        return self._data.sum()
7 changes: 5 additions & 2 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pandas.core import nanops
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.arrays._mask import NAMask
from pandas.core.tools.numeric import to_numeric


Expand Down Expand Up @@ -247,7 +248,8 @@ def __init__(self, values, mask, copy=False):
and is_integer_dtype(values.dtype)):
raise TypeError("values should be integer numpy array. Use "
"the 'integer_array' function instead")
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
if not (isinstance(mask, NAMask) or (
isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype))):
raise TypeError("mask should be boolean numpy array. Use "
"the 'integer_array' function instead")

Expand All @@ -256,7 +258,7 @@ def __init__(self, values, mask, copy=False):
mask = mask.copy()

self._data = values
self._mask = mask
self._mask = NAMask(mask)

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
Expand All @@ -283,6 +285,7 @@ def __getitem__(self, item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self):
Expand Down