Skip to content

WIP: Bitarray backed Int EAs #22238

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
41 changes: 33 additions & 8 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import warnings
import copy
import numpy as np
from bitarray import bitarray

from pandas._libs.lib import infer_dtype
from pandas.util._decorators import cache_readonly
Expand Down Expand Up @@ -167,10 +168,12 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
else:
assert len(mask) == len(values)

# Work with bitarrays from here on out
if isinstance(mask, np.ndarray):
mask = _numpy_to_bitarray(mask)

if not values.ndim == 1:
raise TypeError("values must be a 1D list-like")
if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")

# infer dtype if needed
if dtype is None:
Expand All @@ -195,6 +198,23 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
return values, mask


def _numpy_to_bitarray(arr):
    """
    Efficiently convert a NumPy array to a bitarray object.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to convert; it is cast to bool (without copying when it is
        already boolean) before being packed.

    Returns
    -------
    bitarray
        New bitarray extended with one bit per element of ``arr``.
    """
    # NOTE(review): this helper and _bitarray_to_numpy may belong in a
    # shared module; they are placed here for now for better change
    # visibility.
    barr = bitarray()
    # ndarray.tostring() is a deprecated alias of tobytes() and is gone in
    # recent NumPy releases; tobytes() yields the same raw bytes.
    barr.pack(arr.astype(bool, copy=False).tobytes())

    return barr


def _bitarray_to_numpy(arr):
"""
Efficiently convert a bitarray object to a NumPy array.
"""
return np.fromstring(arr.unpack(), dtype=bool)


class IntegerArray(ExtensionArray, ExtensionOpsMixin):
"""
We represent an IntegerArray with 2 numpy arrays
Expand Down Expand Up @@ -235,8 +255,9 @@ def __getitem__(self, item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

return type(self)(self._data[item],
mask=self._mask[item],
mask=_bitarray_to_numpy(self._mask)[item],
dtype=self.dtype)

def _coerce_to_ndarray(self):
Expand Down Expand Up @@ -317,7 +338,10 @@ def __setitem__(self, key, value):
mask = mask[0]

self._data[key] = value
self._mask[key] = mask
# Coerce to numpy array to leverage advanced indexing, then coerce back
arr = _bitarray_to_numpy(self._mask)
arr[key] = mask
self._mask = _numpy_to_bitarray(arr)

def __len__(self):
return len(self._data)
Expand All @@ -343,10 +367,10 @@ def __repr__(self):

@property
def nbytes(self):
return self._data.nbytes + self._mask.nbytes
return self._data.nbytes + self._mask.buffer_info()[1]

def isna(self):
return self._mask
return _bitarray_to_numpy(self._mask)

@property
def _na_value(self):
Expand Down Expand Up @@ -444,7 +468,7 @@ def value_counts(self, dropna=True):
# TODO(extension)
# appending to an Index *always* infers
# w/o passing the dtype
array = np.append(array, [self._mask.sum()])
array = np.append(array, [_bitarray_to_numpy(self._mask).sum()])
index = Index(np.concatenate(
[index.values,
np.array([np.nan], dtype=object)]), dtype=object)
Expand Down Expand Up @@ -513,7 +537,8 @@ def _maybe_mask_result(self, result, mask, other, op_name):
# may need to fill infs
# and mask wraparound
if is_float_dtype(result):
mask |= (result == np.inf) | (result == -np.inf)
arr = _numpy_to_bitarray((result == np.inf) | (result == -np.inf))
mask |= arr

# if we have a float operand we are by-definition
# a float result
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,10 @@ def _set_with(self, key, value):
else:
return self._set_values(key, value)
elif key_type == 'boolean':
self._set_values(key.astype(np.bool_), value)
try:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are quite a few ways to do this, but this was raising whenever a non-NumPy array hit this branch. It is not directly related to this change, since lists reaching this branch would fail the same way, but it came to light because bitarrays were hitting this condition.

self._set_values(key.astype(np.bool_), value)
except AttributeError:
self._set_values(key, value)
else:
self._set_labels(key, value)

Expand Down
8 changes: 8 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from functools import wraps
from contextlib import contextmanager

from bitarray import bitarray
from numpy.random import randn, rand
import numpy as np

Expand Down Expand Up @@ -1172,6 +1173,13 @@ def assert_extension_array_equal(left, right):
assert left.dtype == right.dtype
left_na = left.isna()
right_na = right.isna()

# TODO - maybe generate dedicated method for bitarray comparison?
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says it all: we may need a dedicated method for comparing bitarrays. At the very least, there has to be a more robust way of handling this in the extension check.

if isinstance(left_na, bitarray):
left_na = np.fromstring(left_na.unpack(), dtype=bool)
if isinstance(right_na, bitarray):
right_na = np.fromstring(right_na.unpack(), dtype=bool)

assert_numpy_array_equal(left_na, right_na)

left_valid = left[~left_na].astype(object)
Expand Down