Skip to content

Commit 6e084ad

Browse files
committed
Internals refactor
1 parent a9e0972 commit 6e084ad

File tree

5 files changed

+118
-38
lines changed

5 files changed

+118
-38
lines changed

pandas/core/arrays/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .categorical import Categorical
1+
from .base import ExtensionArray # noqa
2+
from .categorical import Categorical # noqa

pandas/core/arrays/base.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""An interface for extending pandas with custom arrays."""
22
import abc
3-
from typing import Tuple, Sequence, Optional, Any # noqa
3+
from typing import Tuple, Sequence, Optional, Any, Iterator # noqa
44

55
import numpy as np
66

@@ -20,19 +20,23 @@ class ExtensionArray(metaclass=abc.ABCMeta):
2020
# ------------------------------------------------------------------------
2121
@abc.abstractmethod
2222
def __getitem__(self, item):
23+
# type: (Any) -> Any
2324
pass
2425

2526
def __setitem__(self, key, value):
27+
# type: (Any, Any) -> None
2628
raise NotImplementedError(_not_implemented_message.format(
2729
type(self), '__setitem__')
2830
)
2931

3032
@abc.abstractmethod
3133
def __iter__(self):
34+
# type: () -> Iterator
3235
pass
3336

3437
@abc.abstractmethod
3538
def __len__(self):
39+
# type: () -> int
3640
pass
3741

3842
# ------------------------------------------------------------------------
@@ -41,6 +45,7 @@ def __len__(self):
4145
@property
4246
@abc.abstractmethod
4347
def dtype(self):
48+
"""An instance of 'ExtensionDtype'."""
4449
# type: () -> ExtensionDtype
4550
pass
4651

@@ -52,23 +57,23 @@ def shape(self):
5257
@property
5358
def ndim(self):
5459
# type: () -> int
55-
"""Extension Arrays are only allowed to be 1-dimensional"""
60+
"""Extension Arrays are only allowed to be 1-dimensional."""
5661
return 1
5762

5863
@property
5964
@abc.abstractmethod
6065
def nbytes(self):
66+
"""The number of bytes needed to store this object in memory."""
6167
# type: () -> int
62-
# TODO: default impl?
6368
pass
6469

6570
# ------------------------------------------------------------------------
6671
# Additional Methods
6772
# ------------------------------------------------------------------------
6873
@abc.abstractmethod
6974
def isna(self):
70-
# type: () -> Sequence[bool]
71-
# TODO: narrow this type?
75+
"""Boolean NumPy array indicating if each value is missing."""
76+
# type: () -> np.ndarray
7277
pass
7378

7479
# ------------------------------------------------------------------------

pandas/core/categorical.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# pickle compatibility, probably a better way
2+
from pandas.core.arrays import Categorical # noqa

pandas/core/internals.py

+102-30
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959

6060
from pandas.core.index import Index, MultiIndex, _ensure_index
6161
from pandas.core.indexing import maybe_convert_indices, length_of_indexer
62-
from pandas.core.arrays.categorical import Categorical, _maybe_to_categorical
62+
from pandas.core.arrays import Categorical, ExtensionArray
6363
from pandas.core.indexes.datetimes import DatetimeIndex
6464
from pandas.io.formats.printing import pprint_thing
6565

@@ -95,6 +95,7 @@ class Block(PandasObject):
9595
is_object = False
9696
is_categorical = False
9797
is_sparse = False
98+
is_extension = False
9899
_box_to_block_values = True
99100
_can_hold_na = False
100101
_downcast_dtype = None
@@ -108,14 +109,14 @@ class Block(PandasObject):
108109
def __init__(self, values, placement, ndim=None, fastpath=False):
109110
if ndim is None:
110111
ndim = values.ndim
111-
elif values.ndim != ndim:
112+
elif self._validate_ndim and values.ndim != ndim:
112113
raise ValueError('Wrong number of dimensions')
113114
self.ndim = ndim
114115

115116
self.mgr_locs = placement
116117
self.values = values
117118

118-
if ndim and len(self.mgr_locs) != len(self.values):
119+
if self._validate_ndim and ndim and len(self.mgr_locs) != len(self.values):
119120
raise ValueError(
120121
'Wrong number of items passed {val}, placement implies '
121122
'{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
@@ -274,7 +275,6 @@ def reshape_nd(self, labels, shape, ref_items, mgr=None):
274275
275276
return a new block that is transformed to a nd block
276277
"""
277-
278278
return _block2d_to_blocknd(values=self.get_values().T,
279279
placement=self.mgr_locs, shape=shape,
280280
labels=labels, ref_items=ref_items)
@@ -1697,24 +1697,19 @@ class NonConsolidatableMixIn(object):
16971697
_holder = None
16981698

16991699
def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs):
1700+
# Placement must be converted to BlockPlacement so that we can check
1701+
# its length
1702+
if not isinstance(placement, BlockPlacement):
1703+
placement = BlockPlacement(placement)
17001704

1701-
# Placement must be converted to BlockPlacement via property setter
1702-
# before ndim logic, because placement may be a slice which doesn't
1703-
# have a length.
1704-
self.mgr_locs = placement
1705-
1706-
# kludgetastic
1705+
# Maybe infer ndim from placement
17071706
if ndim is None:
1708-
if len(self.mgr_locs) != 1:
1707+
if len(placement) != 1:
17091708
ndim = 1
17101709
else:
17111710
ndim = 2
1712-
self.ndim = ndim
1713-
1714-
if not isinstance(values, self._holder):
1715-
raise TypeError("values must be {0}".format(self._holder.__name__))
1716-
1717-
self.values = values
1711+
super(NonConsolidatableMixIn, self).__init__(values, placement, ndim=ndim,
1712+
fastpath=fastpath)
17181713

17191714
@property
17201715
def shape(self):
@@ -1765,7 +1760,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
17651760
17661761
Returns
17671762
-------
1768-
a new block(s), the result of the putmask
1763+
a new block, the result of the putmask
17691764
"""
17701765
inplace = validate_bool_kwarg(inplace, 'inplace')
17711766

@@ -1823,6 +1818,91 @@ def _unstack(self, unstacker_func, new_columns):
18231818
return blocks, mask
18241819

18251820

1821+
class ExtensionBlock(NonConsolidatableMixIn, Block):
1822+
"""Block for holding extension types.
1823+
1824+
Notes
1825+
-----
1826+
This holds all 3rd-party extension types. It's also the immediate
1827+
parent class for our internal extension types' blocks, CategoricalBlock.
1828+
1829+
All extension arrays *must* be 1-D, which simplifies things a bit.
1830+
"""
1831+
# Some questions / notes as comments, will be removed.
1832+
#
1833+
# Currently inherited from NCB. We'll keep it around until SparseBlock
1834+
# and DatetimeTZBlock are refactored.
1835+
# - set
1836+
# - iget
1837+
# - should_store
1838+
# - putmask
1839+
# - _slice
1840+
# - _try_cast_result
1841+
# - unstack
1842+
1843+
# Think about overriding these methods from Block
1844+
# - _maybe_downcast: (never downcast)
1845+
1846+
# Methods we can (probably) ignore and just use Block's:
1847+
1848+
# * replace / replace_single
1849+
# Categorical got Object, but was hopefully unnecessary.
1850+
# DatetimeTZ, Sparse got Block
1851+
# * is_view
1852+
# Categorical overrides to say that it is not.
1853+
# DatetimeTZ, Sparse inherits Base anyway
1854+
1855+
is_extension = True
1856+
1857+
# XXX
1858+
# is_bool is a change for CategoricalBlock. Used to inherit
1859+
# from Object to infer from values. If this matters, we should
1860+
# override it directly in CategoricalBlock so that we infer from
1861+
# the categories, not the codes.
1862+
is_bool = False
1863+
1864+
def __init__(self, values, placement, ndim=None, fastpath=False):
1865+
self._holder = type(values)
1866+
super(ExtensionBlock, self).__init__(values, placement, ndim=ndim,
1867+
fastpath=fastpath)
1868+
1869+
def get_values(self, dtype=None):
1870+
# ExtensionArrays must be iterable, so this works.
1871+
values = np.asarray(self.values)
1872+
if values.ndim == self.ndim - 1:
1873+
values = values.reshape((1,) + values.shape)
1874+
return values
1875+
1876+
def _can_hold_element(self, element):
1877+
# XXX:
1878+
# Not defined on NCM.
1879+
# Categorical got True from ObjectBlock
1880+
# DatetimeTZ gets DatetimeBlock
1881+
# Sparse gets Block
1882+
# Let's just assume yes for now, but we can maybe push
1883+
# this onto the array.
1884+
return True
1885+
1886+
def convert(self, copy=True, **kwargs):
1887+
# We're dedicated to a type, we don't convert.
1888+
# Taken from CategoricalBlock / Block.
1889+
return self.copy() if copy else self
1890+
1891+
def _slice(self, slicer):
1892+
""" return a slice of my values """
1893+
1894+
# slice the category
1895+
# return same dims as we currently have
1896+
1897+
if isinstance(slicer, tuple) and len(slicer) == 2:
1898+
if not is_null_slice(slicer[0]):
1899+
raise AssertionError("invalid slicing for a 1-ndim "
1900+
"categorical")
1901+
slicer = slicer[1]
1902+
1903+
return self.values._slice(slicer)
1904+
1905+
18261906
class NumericBlock(Block):
18271907
__slots__ = ()
18281908
is_numeric = True
@@ -2337,7 +2417,7 @@ def re_replacer(s):
23372417
return block
23382418

23392419

2340-
class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
2420+
class CategoricalBlock(ExtensionBlock):
23412421
__slots__ = ()
23422422
is_categorical = True
23432423
_verify_integrity = True
@@ -2346,6 +2426,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
23462426
_concatenator = staticmethod(_concat._concat_categorical)
23472427

23482428
def __init__(self, values, placement, fastpath=False, **kwargs):
2429+
from pandas.core.arrays.categorical import _maybe_to_categorical
23492430

23502431
# coerce to categorical if we can
23512432
super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
@@ -2360,23 +2441,13 @@ def is_view(self):
23602441
def to_dense(self):
23612442
return self.values.to_dense().view()
23622443

2363-
def convert(self, copy=True, **kwargs):
2364-
return self.copy() if copy else self
2365-
23662444
@property
23672445
def array_dtype(self):
23682446
""" the dtype to return if I want to construct this block as an
23692447
array
23702448
"""
23712449
return np.object_
23722450

2373-
def _slice(self, slicer):
2374-
""" return a slice of my values """
2375-
2376-
# slice the category
2377-
# return same dims as we currently have
2378-
return self.values._slice(slicer)
2379-
23802451
def _try_coerce_result(self, result):
23812452
""" reverse of try_coerce_args """
23822453

@@ -2468,7 +2539,8 @@ class DatetimeBlock(DatetimeLikeBlockMixin, Block):
24682539
_can_hold_na = True
24692540

24702541
def __init__(self, values, placement, fastpath=False, **kwargs):
2471-
if values.dtype != _NS_DTYPE:
2542+
if values.dtype != _NS_DTYPE and values.dtype.base != _NS_DTYPE:
2543+
# not datetime64 or datetime64tz
24722544
values = conversion.ensure_datetime64ns(values)
24732545

24742546
super(DatetimeBlock, self).__init__(values, fastpath=True,

pandas/tests/internals/test_external_block.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
import pandas as pd
77
from pandas.core.internals import (
8-
Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn)
8+
BlockManager, SingleBlockManager, ExtensionBlock)
99

1010
import pytest
1111

1212

13-
class CustomBlock(NonConsolidatableMixIn, Block):
13+
class CustomBlock(ExtensionBlock):
1414

1515
_holder = np.ndarray
1616

0 commit comments

Comments
 (0)