Skip to content

Commit 781647a

Browse files
authored
Merge pull request #242 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents c54ea4a + b8d750f commit 781647a

File tree

15 files changed

+81
-42
lines changed

15 files changed

+81
-42
lines changed

asv_bench/benchmarks/reshape.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def setup(self, dtype):
102102
columns = np.arange(n)
103103
if dtype == "int":
104104
values = np.arange(m * m * n).reshape(m * m, n)
105+
self.df = DataFrame(values, index, columns)
105106
else:
106107
# the category branch is ~20x slower than int. So we
107108
# cut down the size a bit. Now it's only ~3x slower.
@@ -111,7 +112,10 @@ def setup(self, dtype):
111112
values = np.take(list(string.ascii_letters), indices)
112113
values = [pd.Categorical(v) for v in values.T]
113114

114-
self.df = DataFrame({i: cat for i, cat in enumerate(values)}, index, columns)
115+
self.df = DataFrame(
116+
{i: cat for i, cat in enumerate(values)}, index, columns
117+
)
118+
115119
self.df2 = self.df.iloc[:-1]
116120

117121
def time_full_product(self, dtype):

ci/code_checks.sh

+2-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
121121
pandas/io/parsers/ \
122122
pandas/io/sas/ \
123123
pandas/io/sql.py \
124-
pandas/tseries/
124+
pandas/tseries/ \
125+
pandas/io/formats/style_render.py
125126
RET=$(($RET + $?)) ; echo $MSG "DONE"
126127

127128
fi

ci/deps/actions-39-slow.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ dependencies:
2323
- matplotlib
2424
- moto>=1.3.14
2525
- flask
26+
- numba
2627
- numexpr
2728
- numpy
2829
- openpyxl

ci/deps/actions-39.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- matplotlib
2323
- moto>=1.3.14
2424
- flask
25+
- numba
2526
- numexpr
2627
- numpy
2728
- openpyxl

ci/deps/azure-windows-39.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ dependencies:
2323
- matplotlib
2424
- moto>=1.3.14
2525
- flask
26+
- numba
2627
- numexpr
2728
- numpy
2829
- openpyxl

doc/source/whatsnew/v1.3.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
1818
- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
19-
-
19+
- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
2020
-
2121

2222
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ Performance improvements
166166
~~~~~~~~~~~~~~~~~~~~~~~~
167167
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
168168
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
169+
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
169170

170171
.. ---------------------------------------------------------------------------
171172

pandas/core/indexes/multi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2836,7 +2836,7 @@ def _maybe_to_slice(loc):
28362836
try:
28372837
return self._engine.get_loc(key)
28382838
except TypeError:
2839-
# e.g. partial string slicing
2839+
# e.g. partial string slicing; see test_partial_slicing_with_multiindex
28402840
loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
28412841
return loc
28422842

pandas/core/indexing.py

+2-13
Original file line numberDiff line numberDiff line change
@@ -658,16 +658,7 @@ def _get_setitem_indexer(self, key):
658658
if isinstance(key, range):
659659
return list(key)
660660

661-
try:
662-
return self._convert_to_indexer(key, axis=0, is_setter=True)
663-
except TypeError as e:
664-
665-
# invalid indexer type vs 'other' indexing errors
666-
if "cannot do" in str(e):
667-
raise
668-
elif "unhashable type" in str(e):
669-
raise
670-
raise IndexingError(key) from e
661+
return self._convert_to_indexer(key, axis=0, is_setter=True)
671662

672663
def _ensure_listlike_indexer(self, key, axis=None, value=None):
673664
"""
@@ -1209,7 +1200,7 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
12091200
is_int_index = labels.is_integer()
12101201
is_int_positional = is_integer(key) and not is_int_index
12111202

1212-
if is_scalar(key) or isinstance(labels, MultiIndex):
1203+
if is_scalar(key) or (isinstance(labels, MultiIndex) and is_hashable(key)):
12131204
# Otherwise get_loc will raise InvalidIndexError
12141205

12151206
# if we are a label return me
@@ -1224,8 +1215,6 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
12241215
# GH35015, using datetime as column indices raises exception
12251216
if not isinstance(labels, MultiIndex):
12261217
raise
1227-
except TypeError:
1228-
pass
12291218
except ValueError:
12301219
if not is_int_positional:
12311220
raise

pandas/core/internals/blocks.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from pandas._libs.internals import BlockPlacement
2626
from pandas._typing import (
2727
ArrayLike,
28-
Dtype,
2928
DtypeObj,
3029
F,
3130
Shape,
@@ -52,7 +51,6 @@
5251
is_list_like,
5352
is_sparse,
5453
is_string_dtype,
55-
pandas_dtype,
5654
)
5755
from pandas.core.dtypes.dtypes import (
5856
CategoricalDtype,
@@ -100,6 +98,7 @@
10098
TimedeltaArray,
10199
)
102100
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
101+
from pandas.core.arrays.sparse import SparseDtype
103102
from pandas.core.base import PandasObject
104103
import pandas.core.common as com
105104
import pandas.core.computation.expressions as expressions
@@ -326,6 +325,8 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
326325

327326
return type(self)(new_values, new_mgr_locs, self.ndim)
328327

328+
# NB: this cannot be made cache_readonly because in libreduction we pin
329+
# new .values that can have different shape GH#42631
329330
@property
330331
def shape(self) -> Shape:
331332
return self.values.shape
@@ -1843,7 +1844,7 @@ class CategoricalBlock(ExtensionBlock):
18431844
# Constructor Helpers
18441845

18451846

1846-
def maybe_coerce_values(values) -> ArrayLike:
1847+
def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
18471848
"""
18481849
Input validation for values passed to __init__. Ensure that
18491850
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
@@ -1875,7 +1876,7 @@ def maybe_coerce_values(values) -> ArrayLike:
18751876
return values
18761877

18771878

1878-
def get_block_type(values, dtype: Dtype | None = None):
1879+
def get_block_type(values, dtype: DtypeObj | None = None):
18791880
"""
18801881
Find the appropriate Block subclass to use for the given values and dtype.
18811882
@@ -1890,13 +1891,15 @@ def get_block_type(values, dtype: Dtype | None = None):
18901891
"""
18911892
# We use vtype and kind checks because they are much more performant
18921893
# than is_foo_dtype
1893-
dtype = cast(np.dtype, pandas_dtype(dtype) if dtype else values.dtype)
1894+
if dtype is None:
1895+
dtype = values.dtype
1896+
18941897
vtype = dtype.type
18951898
kind = dtype.kind
18961899

18971900
cls: type[Block]
18981901

1899-
if is_sparse(dtype):
1902+
if isinstance(dtype, SparseDtype):
19001903
# Need this first(ish) so that Sparse[datetime] is sparse
19011904
cls = ExtensionBlock
19021905
elif isinstance(dtype, CategoricalDtype):

pandas/core/internals/construction.py

+8
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,14 @@ def to_arrays(
757757
# i.e. numpy structured array
758758
columns = ensure_index(data.dtype.names)
759759
arrays = [data[name] for name in columns]
760+
761+
if len(data) == 0:
762+
# GH#42456 the indexing above results in a list of 2D ndarrays
763+
# TODO: is that an issue with numpy?
764+
for i, arr in enumerate(arrays):
765+
if arr.ndim == 2:
766+
arrays[i] = arr[:, 0]
767+
760768
return arrays, columns
761769
return [], ensure_index([])
762770

pandas/core/internals/managers.py

+21-17
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ class BaseBlockManager(DataManager):
148148
_known_consolidated: bool
149149
_is_consolidated: bool
150150

151-
def __init__(self, blocks, axes, verify_integrity=True):
151+
def __init__(self, blocks, axes, verify_integrity: bool = True):
152152
raise NotImplementedError
153153

154154
@classmethod
@@ -889,7 +889,8 @@ def __init__(
889889
):
890890

891891
if verify_integrity:
892-
assert all(isinstance(x, Index) for x in axes)
892+
# Assertion disabled for performance
893+
# assert all(isinstance(x, Index) for x in axes)
893894

894895
for block in blocks:
895896
if self.ndim != block.ndim:
@@ -1563,8 +1564,9 @@ def __init__(
15631564
verify_integrity: bool = False,
15641565
fastpath=lib.no_default,
15651566
):
1566-
assert isinstance(block, Block), type(block)
1567-
assert isinstance(axis, Index), type(axis)
1567+
# Assertions disabled for performance
1568+
# assert isinstance(block, Block), type(block)
1569+
# assert isinstance(axis, Index), type(axis)
15681570

15691571
if fastpath is not lib.no_default:
15701572
warnings.warn(
@@ -1665,7 +1667,8 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
16651667
return type(self)(block, new_idx)
16661668

16671669
def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
1668-
assert isinstance(slobj, slice), type(slobj)
1670+
# Assertion disabled for performance
1671+
# assert isinstance(slobj, slice), type(slobj)
16691672
if axis >= self.ndim:
16701673
raise IndexError("Requested axis not found in manager")
16711674

@@ -1783,9 +1786,10 @@ def create_block_manager_from_arrays(
17831786
axes: list[Index],
17841787
consolidate: bool = True,
17851788
) -> BlockManager:
1786-
assert isinstance(names, Index)
1787-
assert isinstance(axes, list)
1788-
assert all(isinstance(x, Index) for x in axes)
1789+
# Assertions disabled for performance
1790+
# assert isinstance(names, Index)
1791+
# assert isinstance(axes, list)
1792+
# assert all(isinstance(x, Index) for x in axes)
17891793

17901794
arrays = [_extract_array(x) for x in arrays]
17911795

@@ -1840,7 +1844,8 @@ def _form_blocks(
18401844
if names_idx.equals(axes[0]):
18411845
names_indexer = np.arange(len(names_idx))
18421846
else:
1843-
assert names_idx.intersection(axes[0]).is_unique
1847+
# Assertion disabled for performance
1848+
# assert names_idx.intersection(axes[0]).is_unique
18441849
names_indexer = names_idx.get_indexer_for(axes[0])
18451850

18461851
for i, name_idx in enumerate(names_indexer):
@@ -1868,10 +1873,9 @@ def _form_blocks(
18681873

18691874
if len(items_dict["DatetimeTZBlock"]):
18701875
dttz_blocks = [
1871-
new_block(
1876+
DatetimeTZBlock(
18721877
ensure_block_shape(extract_array(array), 2),
1873-
klass=DatetimeTZBlock,
1874-
placement=i,
1878+
placement=BlockPlacement(i),
18751879
ndim=2,
18761880
)
18771881
for i, array in items_dict["DatetimeTZBlock"]
@@ -1886,14 +1890,14 @@ def _form_blocks(
18861890

18871891
if len(items_dict["CategoricalBlock"]) > 0:
18881892
cat_blocks = [
1889-
new_block(array, klass=CategoricalBlock, placement=i, ndim=2)
1893+
CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
18901894
for i, array in items_dict["CategoricalBlock"]
18911895
]
18921896
blocks.extend(cat_blocks)
18931897

18941898
if len(items_dict["ExtensionBlock"]):
18951899
external_blocks = [
1896-
new_block(array, klass=ExtensionBlock, placement=i, ndim=2)
1900+
ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
18971901
for i, array in items_dict["ExtensionBlock"]
18981902
]
18991903

@@ -1926,7 +1930,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
19261930
if dtype is not None and values.dtype != dtype: # pragma: no cover
19271931
values = values.astype(dtype)
19281932

1929-
block = new_block(values, placement=placement, ndim=2)
1933+
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
19301934
return [block]
19311935

19321936

@@ -1949,14 +1953,14 @@ def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = T
19491953
list(tup_block), dtype # type: ignore[arg-type]
19501954
)
19511955

1952-
block = new_block(values, placement=placement, ndim=2)
1956+
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
19531957
new_blocks.append(block)
19541958

19551959
return new_blocks
19561960

19571961

19581962
def _tuples_to_blocks_no_consolidate(tuples, dtype: DtypeObj | None) -> list[Block]:
1959-
# tuples produced within _form_blocks are of the form (placement, whatever, array)
1963+
# tuples produced within _form_blocks are of the form (placement, array)
19601964
if dtype is not None:
19611965
return [
19621966
new_block(

pandas/io/formats/style_render.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1334,9 +1334,9 @@ def _parse_latex_header_span(
13341334
13351335
Examples
13361336
--------
1337-
>>> cell = {'display_vale':'text', 'attributes': 'colspan="3"'}
1337+
>>> cell = {'display_value':'text', 'attributes': 'colspan="3"'}
13381338
>>> _parse_latex_header_span(cell, 't', 'c')
1339-
'\multicol{3}{c}{text}'
1339+
'\\multicolumn{3}{c}{text}'
13401340
"""
13411341
if "attributes" in cell:
13421342
attrs = cell["attributes"]

pandas/tests/frame/constructors/test_from_records.py

+13
Original file line numberDiff line numberDiff line change
@@ -457,3 +457,16 @@ def test_from_records_empty_with_nonempty_fields_gh3682(self):
457457
b = a[:0]
458458
df2 = DataFrame.from_records(b, index="id")
459459
tm.assert_frame_equal(df2, df.iloc[:0])
460+
461+
def test_from_records_empty2(self):
462+
# GH#42456
463+
dtype = [("prop", int)]
464+
shape = (0, len(dtype))
465+
arr = np.empty(shape, dtype=dtype)
466+
467+
result = DataFrame.from_records(arr)
468+
expected = DataFrame({"prop": np.array([], dtype=int)})
469+
tm.assert_frame_equal(result, expected)
470+
471+
alt = DataFrame(arr)
472+
tm.assert_frame_equal(alt, expected)

pandas/tests/groupby/test_apply.py

+13
Original file line numberDiff line numberDiff line change
@@ -1178,3 +1178,16 @@ def test_positional_slice_groups_datetimelike():
11781178
lambda x: x.iloc[0:]
11791179
)
11801180
tm.assert_frame_equal(result, expected)
1181+
1182+
1183+
def test_doctest_example2():
1184+
# GH#42702 this fails if we cache_readonly Block.shape
1185+
# TODO: give this test a more informative name
1186+
df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
1187+
gb = df.groupby("A")
1188+
result = gb[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
1189+
1190+
expected = DataFrame(
1191+
{"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
1192+
)
1193+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)