Sync Fork from Upstream Repo #242


Merged · 6 commits · Jul 27, 2021
6 changes: 5 additions & 1 deletion asv_bench/benchmarks/reshape.py
@@ -102,6 +102,7 @@ def setup(self, dtype):
columns = np.arange(n)
if dtype == "int":
values = np.arange(m * m * n).reshape(m * m, n)
+ self.df = DataFrame(values, index, columns)
else:
# the category branch is ~20x slower than int. So we
# cut down the size a bit. Now it's only ~3x slower.
@@ -111,7 +112,10 @@ def setup(self, dtype):
values = np.take(list(string.ascii_letters), indices)
values = [pd.Categorical(v) for v in values.T]

- self.df = DataFrame({i: cat for i, cat in enumerate(values)}, index, columns)
+ self.df = DataFrame(
+ {i: cat for i, cat in enumerate(values)}, index, columns
+ )

self.df2 = self.df.iloc[:-1]

def time_full_product(self, dtype):
3 changes: 2 additions & 1 deletion ci/code_checks.sh
@@ -121,7 +121,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
pandas/io/parsers/ \
pandas/io/sas/ \
pandas/io/sql.py \
- pandas/tseries/
+ pandas/tseries/ \
+ pandas/io/formats/style_render.py
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi
1 change: 1 addition & 0 deletions ci/deps/actions-39-slow.yaml
@@ -23,6 +23,7 @@ dependencies:
- matplotlib
- moto>=1.3.14
- flask
+ - numba
- numexpr
- numpy
- openpyxl
1 change: 1 addition & 0 deletions ci/deps/actions-39.yaml
@@ -22,6 +22,7 @@ dependencies:
- matplotlib
- moto>=1.3.14
- flask
+ - numba
- numexpr
- numpy
- openpyxl
1 change: 1 addition & 0 deletions ci/deps/azure-windows-39.yaml
@@ -23,6 +23,7 @@ dependencies:
- matplotlib
- moto>=1.3.14
- flask
+ - numba
- numexpr
- numpy
- openpyxl
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.2.rst
@@ -16,7 +16,7 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
- -
+ - Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
-

.. ---------------------------------------------------------------------------
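A minimal sketch of the GH#42456 regression noted above (assumed repro, mirroring the new test added below): `DataFrame.from_records` on an empty numpy structured array should yield an empty frame with 1-D columns rather than choking on 2-D ones.

```python
import numpy as np
import pandas as pd

# Empty structured array with a single field "prop".
arr = np.empty((0, 1), dtype=[("prop", int)])

# With the fix, this produces an empty frame with a 1-D int64 column.
df = pd.DataFrame.from_records(arr)
print(df.dtypes)  # prop    int64
```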
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -166,6 +166,7 @@ Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
+ - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)

.. ---------------------------------------------------------------------------

2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
@@ -2836,7 +2836,7 @@ def _maybe_to_slice(loc):
try:
return self._engine.get_loc(key)
except TypeError:
- # e.g. partial string slicing
+ # e.g. test_partial_slicing_with_multiindex partial string slicing
loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
return loc

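For context, a hedged sketch (example data assumed, not from this diff) of the partial string slicing that reaches the `TypeError` branch above: a date string that is not a literal level label fails in `self._engine.get_loc` and is resolved by `get_loc_level` instead.

```python
import pandas as pd

mi = pd.MultiIndex.from_product(
    [pd.date_range("2021-07-01", periods=4, freq="12H"), ["a", "b"]]
)
ser = pd.Series(range(8), index=mi)

# "2021-07-01" matches two level-0 timestamps, so the lookup falls back
# to the partial-string path and returns all rows for that day.
print(ser.loc["2021-07-01"])
```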
15 changes: 2 additions & 13 deletions pandas/core/indexing.py
@@ -658,16 +658,7 @@ def _get_setitem_indexer(self, key):
if isinstance(key, range):
return list(key)

- try:
- return self._convert_to_indexer(key, axis=0, is_setter=True)
- except TypeError as e:
-
- # invalid indexer type vs 'other' indexing errors
- if "cannot do" in str(e):
- raise
- elif "unhashable type" in str(e):
- raise
- raise IndexingError(key) from e
+ return self._convert_to_indexer(key, axis=0, is_setter=True)

def _ensure_listlike_indexer(self, key, axis=None, value=None):
"""
@@ -1209,7 +1200,7 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
is_int_index = labels.is_integer()
is_int_positional = is_integer(key) and not is_int_index

- if is_scalar(key) or isinstance(labels, MultiIndex):
+ if is_scalar(key) or (isinstance(labels, MultiIndex) and is_hashable(key)):
# Otherwise get_loc will raise InvalidIndexError

# if we are a label return me
@@ -1224,8 +1215,6 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
# GH35015, using datetime as column indices raises exception
if not isinstance(labels, MultiIndex):
raise
- except TypeError:
- pass
except ValueError:
if not is_int_positional:
raise
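A short sketch (assumed data) of the distinction the new `is_hashable(key)` check draws for `MultiIndex` columns: a hashable tuple key may go through `get_loc`, while an unhashable list key falls through to list-like handling instead of raising `TypeError` inside `get_loc`.

```python
import pandas as pd

df = pd.DataFrame(
    [[1, 2], [3, 4]],
    columns=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]),
)

df.loc[:, ("a", "x")] = 0    # hashable tuple -> get_loc path
df.loc[:, [("a", "y")]] = 9  # unhashable list -> list-like path
print(df)
```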
15 changes: 9 additions & 6 deletions pandas/core/internals/blocks.py
@@ -25,7 +25,6 @@
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
ArrayLike,
- Dtype,
DtypeObj,
F,
Shape,
@@ -52,7 +51,6 @@
is_list_like,
is_sparse,
is_string_dtype,
- pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
@@ -100,6 +98,7 @@
TimedeltaArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+ from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
import pandas.core.computation.expressions as expressions
@@ -326,6 +325,8 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:

return type(self)(new_values, new_mgr_locs, self.ndim)

+ # NB: this cannot be made cache_readonly because in libreduction we pin
+ # new .values that can have different shape GH#42631
@property
def shape(self) -> Shape:
return self.values.shape
@@ -1843,7 +1844,7 @@ class CategoricalBlock(ExtensionBlock):
# Constructor Helpers


- def maybe_coerce_values(values) -> ArrayLike:
+ def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
"""
Input validation for values passed to __init__. Ensure that
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
@@ -1875,7 +1876,7 @@ def maybe_coerce_values(values) -> ArrayLike:
return values


- def get_block_type(values, dtype: Dtype | None = None):
+ def get_block_type(values, dtype: DtypeObj | None = None):
"""
Find the appropriate Block subclass to use for the given values and dtype.

@@ -1890,13 +1891,15 @@ def get_block_type(values, dtype: Dtype | None = None):
"""
# We use vtype and kind checks because they are much more performant
# than is_foo_dtype
- dtype = cast(np.dtype, pandas_dtype(dtype) if dtype else values.dtype)
+ if dtype is None:
+ dtype = values.dtype

vtype = dtype.type
kind = dtype.kind

cls: type[Block]

- if is_sparse(dtype):
+ if isinstance(dtype, SparseDtype):
# Need this first(ish) so that Sparse[datetime] is sparse
cls = ExtensionBlock
elif isinstance(dtype, CategoricalDtype):
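The `NB` comment added to `Block.shape` above is the heart of GH#42631/GH#42702; a toy illustration (not pandas internals) of why a cached shape would go stale when `.values` is pinned to a differently shaped array:

```python
import numpy as np

class TinyBlock:
    def __init__(self, values: np.ndarray):
        self.values = values

    @property
    def shape(self) -> tuple:
        # Recomputed on every access, so it tracks reassignment of .values.
        return self.values.shape

blk = TinyBlock(np.zeros((2, 3)))
blk.values = np.zeros((2, 5))  # libreduction-style pinning of new values
assert blk.shape == (2, 5)     # a cache_readonly would still report (2, 3)
```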
8 changes: 8 additions & 0 deletions pandas/core/internals/construction.py
@@ -757,6 +757,14 @@ def to_arrays(
# i.e. numpy structured array
columns = ensure_index(data.dtype.names)
arrays = [data[name] for name in columns]

+ if len(data) == 0:
+ # GH#42456 the indexing above results in list of 2D ndarrays
+ # TODO: is that an issue with numpy?
+ for i, arr in enumerate(arrays):
+ if arr.ndim == 2:
+ arrays[i] = arr[:, 0]

return arrays, columns
return [], ensure_index([])

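A small demonstration of the numpy quirk the loop above works around (hedged: this mirrors the new test rather than any numpy guarantee): field-indexing an empty 2-D structured array keeps the second dimension.

```python
import numpy as np

arr = np.empty((0, 1), dtype=[("prop", int)])
col = arr["prop"]
print(col.shape)        # (0, 1) -- still 2-D
print(col[:, 0].shape)  # (0,)   -- squeezed to 1-D, as in the fix above
```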
38 changes: 21 additions & 17 deletions pandas/core/internals/managers.py
@@ -148,7 +148,7 @@ class BaseBlockManager(DataManager):
_known_consolidated: bool
_is_consolidated: bool

- def __init__(self, blocks, axes, verify_integrity=True):
+ def __init__(self, blocks, axes, verify_integrity: bool = True):
raise NotImplementedError

@classmethod
@@ -889,7 +889,8 @@ def __init__(
):

if verify_integrity:
- assert all(isinstance(x, Index) for x in axes)
+ # Assertion disabled for performance
+ # assert all(isinstance(x, Index) for x in axes)

for block in blocks:
if self.ndim != block.ndim:
@@ -1563,8 +1564,9 @@ def __init__(
verify_integrity: bool = False,
fastpath=lib.no_default,
):
- assert isinstance(block, Block), type(block)
- assert isinstance(axis, Index), type(axis)
+ # Assertions disabled for performance
+ # assert isinstance(block, Block), type(block)
+ # assert isinstance(axis, Index), type(axis)

if fastpath is not lib.no_default:
warnings.warn(
@@ -1665,7 +1667,8 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
return type(self)(block, new_idx)

def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
- assert isinstance(slobj, slice), type(slobj)
+ # Assertion disabled for performance
+ # assert isinstance(slobj, slice), type(slobj)
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")

@@ -1783,9 +1786,10 @@ def create_block_manager_from_arrays(
axes: list[Index],
consolidate: bool = True,
) -> BlockManager:
- assert isinstance(names, Index)
- assert isinstance(axes, list)
- assert all(isinstance(x, Index) for x in axes)
+ # Assertions disabled for performance
+ # assert isinstance(names, Index)
+ # assert isinstance(axes, list)
+ # assert all(isinstance(x, Index) for x in axes)

arrays = [_extract_array(x) for x in arrays]

@@ -1840,7 +1844,8 @@ def _form_blocks(
if names_idx.equals(axes[0]):
names_indexer = np.arange(len(names_idx))
else:
- assert names_idx.intersection(axes[0]).is_unique
+ # Assertion disabled for performance
+ # assert names_idx.intersection(axes[0]).is_unique
names_indexer = names_idx.get_indexer_for(axes[0])

for i, name_idx in enumerate(names_indexer):
@@ -1868,10 +1873,9 @@

if len(items_dict["DatetimeTZBlock"]):
dttz_blocks = [
- new_block(
+ DatetimeTZBlock(
ensure_block_shape(extract_array(array), 2),
- klass=DatetimeTZBlock,
- placement=i,
+ placement=BlockPlacement(i),
ndim=2,
)
for i, array in items_dict["DatetimeTZBlock"]
@@ -1886,14 +1890,14 @@

if len(items_dict["CategoricalBlock"]) > 0:
cat_blocks = [
- new_block(array, klass=CategoricalBlock, placement=i, ndim=2)
+ CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
for i, array in items_dict["CategoricalBlock"]
]
blocks.extend(cat_blocks)

if len(items_dict["ExtensionBlock"]):
external_blocks = [
- new_block(array, klass=ExtensionBlock, placement=i, ndim=2)
+ ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
for i, array in items_dict["ExtensionBlock"]
]

@@ -1926,7 +1930,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
if dtype is not None and values.dtype != dtype: # pragma: no cover
values = values.astype(dtype)

- block = new_block(values, placement=placement, ndim=2)
+ block = new_block(values, placement=BlockPlacement(placement), ndim=2)
return [block]


@@ -1949,14 +1953,14 @@ def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = True):
list(tup_block), dtype # type: ignore[arg-type]
)

- block = new_block(values, placement=placement, ndim=2)
+ block = new_block(values, placement=BlockPlacement(placement), ndim=2)
new_blocks.append(block)

return new_blocks


def _tuples_to_blocks_no_consolidate(tuples, dtype: DtypeObj | None) -> list[Block]:
- # tuples produced within _form_blocks are of the form (placement, whatever, array)
+ # tuples produced within _form_blocks are of the form (placement, array)
if dtype is not None:
return [
new_block(
4 changes: 2 additions & 2 deletions pandas/io/formats/style_render.py
@@ -1334,9 +1334,9 @@ def _parse_latex_header_span(

Examples
--------
- >>> cell = {'display_vale':'text', 'attributes': 'colspan="3"'}
+ >>> cell = {'display_value':'text', 'attributes': 'colspan="3"'}
>>> _parse_latex_header_span(cell, 't', 'c')
- '\multicol{3}{c}{text}'
+ '\\multicolumn{3}{c}{text}'
"""
if "attributes" in cell:
attrs = cell["attributes"]
13 changes: 13 additions & 0 deletions pandas/tests/frame/constructors/test_from_records.py
@@ -457,3 +457,16 @@ def test_from_records_empty_with_nonempty_fields_gh3682(self):
b = a[:0]
df2 = DataFrame.from_records(b, index="id")
tm.assert_frame_equal(df2, df.iloc[:0])

+ def test_from_records_empty2(self):
+ # GH#42456
+ dtype = [("prop", int)]
+ shape = (0, len(dtype))
+ arr = np.empty(shape, dtype=dtype)
+
+ result = DataFrame.from_records(arr)
+ expected = DataFrame({"prop": np.array([], dtype=int)})
+ tm.assert_frame_equal(result, expected)
+
+ alt = DataFrame(arr)
+ tm.assert_frame_equal(alt, expected)
13 changes: 13 additions & 0 deletions pandas/tests/groupby/test_apply.py
@@ -1178,3 +1178,16 @@ def test_positional_slice_groups_datetimelike():
lambda x: x.iloc[0:]
)
tm.assert_frame_equal(result, expected)


+ def test_doctest_example2():
+ # GH#42702 this fails if we cache_readonly Block.shape
+ # TODO: more informative name
+ df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
+ gb = df.groupby("A")
+ result = gb[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
+
+ expected = DataFrame(
+ {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
+ )
+ tm.assert_frame_equal(result, expected)